美文网首页
内容提取的源码

内容提取的源码

作者: pwld | 来源:发表于2018-02-10 22:53 被阅读0次
    # -*- coding: utf-8 -*-
    import scrapy
    import re
    
    class JobboleSpider(scrapy.Spider):
        """Spider that extracts the fields of a single blog.jobbole.com article.

        The XPath expressions below are anchored on the element id
        ``post-113560``, i.e. they only match the article listed in
        ``start_urls``.
        """

        name = "jobbole"
        allowed_domains = ["blog.jobbole.com"]
        start_urls = ['http://blog.jobbole.com/113560/']

        @staticmethod
        def _first_int(text):
            """Return the first run of digits in *text* as an int, or 0 if none.

            ``re.search(r"\\d+")`` is used instead of matching ``.*(\\d+).*``:
            with a greedy leading ``.*`` the group only ever captures the last
            digit, so a value such as "12" would be read as "2".
            """
            found = re.search(r"\d+", text)
            return int(found.group()) if found else 0

        def parse(self, response):
            """Extract title, date, counters, body and tags from the article page.

            Args:
                response: the downloaded page; only its ``xpath`` method is used.

            Returns:
                None. The extracted values are only bound to local names.
                NOTE(review): nothing is yielded or stored yet -- presumably
                item creation is meant to be added later; confirm.

            Raises:
                IndexError: if one of the single-value XPath queries matches
                    nothing (``extract()[0]`` on an empty result).
                ValueError: if the praise counter text is not a plain integer.
            """
            # Article title.
            title = response.xpath('//*[@id ="post-113560"]/div[1]/h1/text()').extract()[0]

            # Creation date: trim whitespace and drop the "·" separator.
            create_date = (
                response.xpath('//*[@id="post-113560"]/div[2]/p/text()')
                .extract()[0]
                .strip()
                .replace("·", "")
                .strip()
            )

            # Number of likes; the node text is expected to be a bare integer.
            praise_nums = int(
                response.xpath(
                    '//*[@id="post-113560"]/div[3]/div[3]/span[1]/h10/text()'
                ).extract()[0]
            )

            # Number of favourites: the number is embedded in surrounding text.
            fav_text = response.xpath(
                '//*[@id="post-113560"]/div[3]/div[3]/span[2]/text()'
            ).extract()[0]
            fav_nums = self._first_int(fav_text)

            # Number of comments: same "number inside text" format as above.
            comment_text = response.xpath(
                '//*[@id="post-113560"]/div[3]/div[3]/a/span/text()'
            ).extract()[0]
            comment_nums = self._first_int(comment_text)

            # Article body, kept as the serialized markup of the matched element.
            content = response.xpath('//*[@id="post-113560"]/div[3]').extract()[0]

            # Link texts that follow the date in the meta line; the entry that
            # ends with "评论" is the comment link, not a tag, so it is dropped.
            tag_list = response.xpath(
                "//p[@class='entry-meta-hide-on-mobile']/a/text()"
            ).extract()
            tag_list = [
                element for element in tag_list
                if not element.strip().endswith("评论")
            ]
            # Tags joined into one comma-separated string.
            tags = ",".join(tag_list)
    
    

    相关文章

      网友评论

          本文标题:内容提取的源码

          本文链接:https://www.haomeiwen.com/subject/plwhtftx.html