美文网首页
爬取博客内容

爬取博客内容

作者: David5262 | 来源:发表于2019-11-03 22:18 被阅读0次

对博客园(https://www.cnblogs.com/)首页列表的 200 个分页逐页爬取文章内容
使用 lxml 和 XPath 提取数据

# -*- coding: utf-8 -*-
from urllib.parse import urljoin

import requests
import lxml
from lxml import etree

class GetTest:
    """Crawl blog listing pages, collect post titles/bodies, and dump them to a file.

    ``getUrl`` walks the paginated listing starting at a given URL and appends
    the title and body text of every linked post to the caller's lists.
    ``getPrint`` appends the collected posts to ``cn-blog.csv``.
    """

    # Sent with every request so the site receives a browser-like User-Agent.
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    # Seconds to wait for a response before a request is abandoned.
    TIMEOUT = 10

    def __init__(self):
        # Lists most recently handed to getUrl; getPrint falls back to these.
        self.title = []
        self.content = []

    def _get(self, url):
        """Fetch *url* with the class headers and timeout; raise on HTTP errors."""
        # headers must be passed by keyword: the second positional argument of
        # requests.get is ``params`` (query string), not the header dict.
        response = requests.get(url, headers=self.HEADERS, timeout=self.TIMEOUT)
        response.raise_for_status()
        return response

    def getUrl(self, url, title, content, pages=200):
        """Collect post titles and bodies from up to *pages* listing pages.

        Args:
            url: URL of the first listing page.
            title: list that receives one title string per scraped post
                (mutated in place).
            content: list that receives one body string per scraped post
                (mutated in place).
            pages: maximum number of listing pages to visit.

        Returns:
            The ``(title, content)`` lists that were passed in. They are
            returned even when the crawl stops early, so partial results are
            never lost.

        A post that cannot be fetched or decoded is reported with ``print``
        and skipped. A listing page that cannot be fetched, or that has no
        pager link, ends the crawl.
        """
        self.title = title
        self.content = content
        page_url = url
        for _ in range(pages):
            try:
                listing = etree.HTML(self._get(page_url).text)
            except (requests.RequestException, etree.LxmlError) as e:
                print(e)
                break
            if listing is None:
                # Nothing parseable came back, so there are no links to follow.
                break

            for post_url in listing.xpath('//h3/a/@href'):
                try:
                    raw = self._get(urljoin(page_url, post_url)).content
                    post = etree.HTML(raw.decode("utf-8"))
                except (requests.RequestException, etree.LxmlError, ValueError) as e:
                    # ValueError also covers UnicodeDecodeError from decode().
                    print(e)
                    continue
                if post is None:
                    continue
                title.append(post.xpath('string(//a[@id="cb_post_title_url"]//text())'))
                content.append(post.xpath('string(//div[@id="cnblogs_post_body"])'))

            # NOTE(review): the last pager anchor is taken to be the "next page"
            # link; confirm this still holds on the final listing page.
            next_links = listing.xpath("//div[@class='pager']/a[last()]/@href")
            if not next_links:
                break
            # urljoin resolves both site-relative and absolute hrefs.
            page_url = urljoin(page_url, next_links[0])
        return title, content

    def getPrint(self, num, title=None, content=None):
        """Append the first *num* collected posts to ``cn-blog.csv``.

        Args:
            num: maximum number of posts to write; values above the number of
                available posts are clamped, negative values write nothing.
            title: titles to write; defaults to the list last given to
                ``getUrl``.
            content: bodies to write, paired with *title* by position;
                defaults to the list last given to ``getUrl``.

        Each post is written as its title, a newline, its body, and then a
        line of 50 asterisks. I/O errors are reported with ``print``.
        """
        titles = self.title if title is None else title
        bodies = self.content if content is None else content
        limit = max(num, 0)
        try:
            with open("cn-blog.csv", "a+", encoding="utf-8") as f:
                for post_title, post_body in zip(titles[:limit], bodies[:limit]):
                    f.write(post_title)
                    f.write('\n')
                    f.write(post_body)
                    f.write("*" * 50 + "\n")
        except OSError as e:
            print(e)

if __name__ == '__main__':
    # Module-level accumulators; getUrl fills them in place, one entry per post.
    title, content = [], []
    url = "https://www.cnblogs.com/"

    # Crawl first, then dump everything that was collected to disk.
    test = GetTest()
    test.getUrl(url, title, content)
    collected = len(title)
    test.getPrint(collected)

相关文章

网友评论

      本文标题:爬取博客内容

      本文链接:https://www.haomeiwen.com/subject/hyglbctx.html