内容提取的源码

作者: pwld | 来源:发表于2018-02-10 22:53 被阅读0次

内容提取的源码
用 int 来储存 boolean 数组代码
golang标准错误输出用法
python爬虫实战：妈妈再也不担心我爬取不到淘宝商品信息了
python爬虫实战：妈妈再也不担心我爬取不到淘宝商品信息了
js提取内容
提取匹配内容
正则表达式提取符号间内容
PostgreSQL源码安装
单独抽取webRtc的AGC(增益)模块

# -*- coding: utf-8 -*-
import scrapy
import re

class JobboleSpider(scrapy.Spider):
    """Spider that extracts the metadata of one blog.jobbole.com article.

    The XPath expressions are tied to the element id ``post-113560``, so
    they only match the single page listed in ``start_urls``.
    """

    name = "jobbole"
    allowed_domains = ["blog.jobbole.com"]
    start_urls = ['http://blog.jobbole.com/113560/']

    @staticmethod
    def _first_int(text, default=0):
        """Return the first run of digits in *text* as an int.

        Args:
            text: String that may contain a number, e.g. a counter label.
            default: Value returned when *text* contains no digits.

        Returns:
            The parsed integer, or *default* if no digits are present.
        """
        # The earlier pattern ".*(\d+).*" was greedy: the leading ".*"
        # swallowed every digit but the last, so "12" was read as "2".
        # Searching for the digit run directly captures the whole number.
        found = re.search(r"\d+", text)
        return int(found.group()) if found else default

    def parse(self, response):
        """Extract the article fields from the downloaded page.

        Fields gathered:
            title         -- article headline
            create_date   -- publication date, with the "·" separator removed
            praise_nums   -- number of likes
            fav_nums      -- number of bookmarks
            comment_nums  -- number of comments
            content       -- HTML of the article body container
            tags          -- comma-joined tag names from the meta line

        Args:
            response: The Scrapy response for the article page.

        Raises:
            IndexError: If one of the expected nodes is missing from the page.
        """
        title = response.xpath('//*[@id ="post-113560"]/div[1]/h1/text()').extract()[0]

        raw_date = response.xpath('//*[@id="post-113560"]/div[2]/p/text()').extract()[0]
        create_date = raw_date.strip().replace("·", "").strip()

        # All three counters are normalised to int; a label that carries no
        # number (nothing counted yet) becomes 0 instead of the raw label text.
        praise_nums = self._first_int(
            response.xpath('//*[@id="post-113560"]/div[3]/div[3]/span[1]/h10/text()').extract()[0]
        )
        fav_nums = self._first_int(
            response.xpath('//*[@id="post-113560"]/div[3]/div[3]/span[2]/text()').extract()[0]
        )
        comment_nums = self._first_int(
            response.xpath('//*[@id="post-113560"]/div[3]/div[3]/a/span/text()').extract()[0]
        )

        content = response.xpath('//*[@id="post-113560"]/div[3]').extract()[0]

        # The meta line holds the tag links plus (presumably) the comment-count
        # link; entries whose text ends with "评论" are dropped from the tags.
        tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
        tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        tags = ",".join(tag_list)

        # TODO: the extracted values are not yet emitted as an item; this
        # callback currently returns None.