# Crawl the content of the 200 home-page listing pages of the blog site [https://www.cnblogs.com/].
# The data is scraped with lxml and XPath.
# -*- coding: utf-8 -*-
from urllib.parse import urljoin

import lxml
import requests
from lxml import etree
class GetTest:
    """Crawler for the paginated post listing on the cnblogs.com home page.

    ``getUrl`` walks the listing pages and collects the title and body text
    of every linked post; ``getPrint`` appends collected posts to
    ``cn-blog.csv``.
    """

    # Browser-like User-Agent sent with every request (listing pages and posts).
    _HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }

    # Record separator written after each post body.
    _SEPARATOR = "*" * 50 + "\n"

    # Lists filled by the most recent getUrl() call; getPrint() falls back to
    # them so it no longer depends on module-level globals being defined.
    _title = None
    _content = None

    def getUrl(self, url, title, content, max_pages=199):
        """Crawl listing pages starting at ``url`` and collect every post.

        Args:
            url: URL of the first listing page.
            title: list that receives one title string per post (mutated
                in place).
            content: list that receives one body-text string per post
                (mutated in place).
            max_pages: maximum number of listing pages to fetch. Defaults to
                199, the count that used to be hard-coded.

        Returns:
            The ``(title, content)`` pair. If a request or parse step fails,
            the error is printed, crawling stops, and whatever was collected
            up to that point is still returned.
        """
        # Remember the target lists so getPrint() can find them later.
        self._title, self._content = title, content
        page_url = url
        # Top-level boundary of the crawl: any network/parse error ends the
        # crawl (best effort) instead of propagating to the caller.
        try:
            for _ in range(max_pages):
                # headers= must be passed by keyword: the second positional
                # argument of requests.get is ``params`` (the query string).
                listing_html = requests.get(page_url, headers=self._HEADERS).text
                listing = etree.HTML(listing_html)
                post_links = listing.xpath('//h3/a/@href')
                # The last link inside the pager is followed as the next page.
                next_links = listing.xpath("//div[@class='pager']/a[last()]/@href")

                for link in post_links:
                    post_url = urljoin(page_url, link)
                    raw = requests.get(post_url, headers=self._HEADERS).content
                    post = etree.HTML(raw.decode("utf-8"))
                    title.append(
                        post.xpath('string(//a[@id="cb_post_title_url"]//text())'))
                    content.append(
                        post.xpath('string(//div[@id="cnblogs_post_body"])'))

                if not next_links:
                    # No pager link on this page: nothing further to follow.
                    break
                # urljoin copes with absolute and relative hrefs and with a
                # start URL that has no trailing slash.
                page_url = urljoin(page_url, next_links[0])
        except Exception as e:
            print(e)
        return title, content

    def getPrint(self, num, title=None, content=None):
        """Append the first ``num`` collected posts to ``cn-blog.csv``.

        Each record is written as the title, a newline, the body text, then a
        line of 50 asterisks. The file is opened in append mode with UTF-8
        encoding.

        Args:
            num: number of posts to write. Values larger than the number of
                available posts are clamped; negative values write nothing.
            title: optional list of titles. When omitted, the list passed to
                the most recent ``getUrl`` call on this instance is used,
                falling back to a module-level ``title`` for legacy callers.
            content: optional list of body texts, resolved like ``title``.

        I/O errors are printed rather than raised.
        """
        if title is None:
            title = (self._title if self._title is not None
                     else globals().get("title", []))
        if content is None:
            content = (self._content if self._content is not None
                       else globals().get("content", []))

        # Clamp so a negative num cannot turn into a "from the end" slice.
        count = max(num, 0)
        try:
            with open("cn-blog.csv", "a+", encoding="utf-8") as f:
                for post_title, post_body in zip(title[:count], content[:count]):
                    f.write(post_title)
                    f.write('\n')
                    f.write(post_body)
                    f.write(self._SEPARATOR)
        except IOError as e:
            print(e)
if __name__ == '__main__':
    # Script entry point: crawl the cnblogs home-page listing, then write
    # every collected post to disk.
    # `title` and `content` are kept as module-level names because
    # GetTest.getPrint may look them up as globals.
    title, content = [], []
    start_url = "https://www.cnblogs.com/"
    crawler = GetTest()
    crawler.getUrl(start_url, title, content)
    # Write exactly as many records as titles were collected.
    crawler.getPrint(len(title))
网友评论