爬取内容:爬取市场上的手机品牌信息,并分析各品牌的流行程度
该论坛界面展示:
image.png
代码实现:
import requests
from bs4 import BeautifulSoup
import jieba
from collections import Counter
import pygal
import chardet
def getHTMLText(url, timeout=10):
    """Download *url* and return the response body as text.

    The response encoding is set to the one requests detects from the body
    (``apparent_encoding``) before the text is decoded; the detected
    encoding is printed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded page text, or ``""`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: network and HTTP failures yield an empty page,
        # but unrelated errors (e.g. KeyboardInterrupt) are no longer hidden.
        return ""
    r.encoding = r.apparent_encoding
    print(r.encoding)
    return r.text
def getImportantText(soup):
    """Concatenate the text of every link matched by the selector in *soup*.

    Args:
        soup: Parsed document exposing ``select``; elements matching
            ``li > div > span > a[target="_blank"]`` are collected.

    Returns:
        The texts of all matched elements joined without a separator, or
        ``""`` when nothing matches.
    """
    anchors = soup.select('li > div > span > a[target="_blank"]')
    # get_text() always returns a str, whereas .string is None for an element
    # with nested markup and made the old string concatenation raise TypeError.
    return "".join(anchor.get_text() for anchor in anchors)
def manageFirst(url):
    """Fetch the page at *url* and return the text extracted from it.

    The page source comes from getHTMLText, is parsed with the
    ``html.parser`` backend, and is reduced to text by getImportantText.
    """
    page_source = getHTMLText(url)
    parsed_page = BeautifulSoup(page_source, "html.parser")
    return getImportantText(parsed_page)
def draw(text):
    """Count phone-brand mentions in *text* and render them as a pie chart.

    The text is segmented with jieba, tokens shorter than two characters are
    dropped, and the remaining tokens that exactly match a known brand name
    are counted. The total number of kept tokens and each (brand, count)
    pair are printed, and the chart is written to ``pie.svg``.

    Args:
        text: The text to analyse.
    """
    brand_names = ['苹果', '华为', '荣耀', '魅族', '三星', '小米', 'vivo', 'oppo']
    # Register each brand through add_word; load_userdict is meant for a
    # dictionary file, not an in-memory list.
    for brand in brand_names:
        jieba.add_word(brand)

    words = [token for token in jieba.cut(text) if len(token) >= 2]
    print(len(words))

    # Count only brand tokens so a rarely mentioned brand is never lost to a
    # cutoff on the overall word ranking.
    # NOTE(review): matching is case-sensitive, so "OPPO" is not counted as
    # 'oppo' -- confirm whether that is intended.
    known_brands = frozenset(brand_names)
    brand_counts = Counter(token for token in words if token in known_brands)

    pie = pygal.Pie()
    # most_common() yields slices in descending order of mentions.
    for entry in brand_counts.most_common():
        print(entry)
        pie.add(entry[0], int(entry[1]))
    pie.render_to_file("pie.svg")
def main():
    """Scrape the two forum listing pages and chart the brand mentions.

    The text extracted from each page by manageFirst is concatenated in page
    order and handed to draw.
    """
    urls = (
        "https://itbbs.pconline.com.cn/es/f240027.html",
        "https://itbbs.pconline.com.cn/es/f240027_2.html",
    )
    text = "".join(manageFirst(url) for url in urls)
    draw(text)


# Run the scrape only when executed as a script, so importing this module
# does not trigger network requests or write pie.svg.
if __name__ == "__main__":
    main()
爬取结果展示:
image.png
网友评论