Scraping Douban Books and Reviews with Scrapy (CSS Selectors)

Author: z小志 | Published 2018-12-28 13:19
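
The spider below fills two item classes that the excerpt never shows. A minimal items.py sketch, inferred purely from the fields the spider assigns:

    # items.py -- field names inferred from the spider code below
    import scrapy

    class DoubanItem(scrapy.Item):
        id = scrapy.Field()
        name = scrapy.Field()
        img = scrapy.Field()
        public_info = scrapy.Field()
        des = scrapy.Field()
        detail_url = scrapy.Field()
        score = scrapy.Field()
        comment = scrapy.Field()

    class DoubanReviewsItem(scrapy.Item):
        id = scrapy.Field()
        name = scrapy.Field()
        content = scrapy.Field()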
        # Crawl the book list pages
        def parse(self, response):
            self.logger.debug(response)
            try:
                lis = response.css('ul.subject-list .subject-item')
                self.logger.debug(lis)
                for li in lis:
                    item = DoubanItem()
    
                    name = li.css('.info h2 a::attr(title)').extract_first()
                    item['name'] = name.strip() if name else ''
    
                    img = li.css('.pic img::attr(src)').extract_first()
                    item['img'] = img.strip() if img else ''
    
                    public_info = li.css('.pub::text').extract_first()
                    item['public_info'] = public_info.strip() if public_info else ''
    
                    des = li.css('.info p::text').extract_first()
                    item['des'] = des.strip() if des else ''
    
                    detail_url = li.css('.info h2 a::attr(href)').extract_first()
                    item['detail_url'] = detail_url.strip() if detail_url else ''
    
                    # Extract the numeric subject id from the detail URL,
                    # e.g. 'https://book.douban.com/subject/123456/' -> '123456'
                    book_id = self.txt_wrap_by('subject/', '/', detail_url)
                    item['id'] = book_id if book_id else ''
    
                    score = li.css('.rating_nums::text').extract_first()
                    item['score'] = score.strip() if score else ''
    
                    # '.pl' holds the ratings-count text, e.g. '(1234人评价)'
                    comment = li.css('.pl::text').extract_first()
                    item['comment'] = comment.strip() if comment else ''
                    # Follow the book's reviews page as well; the original
                    # `if reviews_url:` was always true since it checked the
                    # concatenated string, so guard on detail_url instead
                    if item['detail_url']:
                        reviews_url = item['detail_url'].rstrip('/') + '/reviews'
                        print('reviews_url=' + reviews_url)
                        yield scrapy.Request(url=reviews_url, callback=self.parse_reviews)
                    yield item
                # Douban's paginator exposes a <link rel="next"> inside span.next
                next_url = response.css('.next link::attr(href)').extract_first()
                if next_url:
                    print('next_url=' + next_url)
                    yield scrapy.Request(url=response.urljoin(next_url.strip()), callback=self.parse)
            except Exception as e:
                print(e)
                print("crawl finished")
    
    
        # Crawl the review list
        def parse_reviews(self, response):
            divs = response.css('div.review-list div.main.review-item')
            print(divs)
            for div in divs:
                item = DoubanReviewsItem()
                # Recover the subject id from the reviews page URL
                book_id = self.txt_wrap_by('subject/', '/', response.url)
                item['id'] = book_id if book_id else ''
                name = div.css('.name::text').extract_first()
                item['name'] = name.strip() if name else ''
                content = div.css('.short-content::text').extract_first()
                item['content'] = content.strip() if content else ''
                yield item
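
The spider also relies on a self.txt_wrap_by(...) helper that the excerpt never defines. A minimal sketch of such a helper, assuming it returns the substring between two markers (here, the subject id between 'subject/' and the following '/'):

    # Hypothetical helper assumed by the spider above: return the text
    # between `prefix` and the next `suffix` in `text`, or None if absent.
    def txt_wrap_by(self, prefix, suffix, text):
        if not text:
            return None
        start = text.find(prefix)
        if start == -1:
            return None
        start += len(prefix)
        end = text.find(suffix, start)
        if end == -1:
            return None
        return text[start:end]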
    
    

Partway through the crawl, the IP got banned, so I wrote a downloader middleware to rotate proxies:

    import logging

    # Proxy (defined below) must be importable here; the exact module
    # path, e.g. `from douban.proxy import Proxy`, is project-specific.

    class ProxyMiddleware(object):
        def __init__(self, settings):
            self.logger = logging.getLogger(__name__)
            self.is_first = False

        def process_request(self, request, spider):
            # Switch IP on the first request, or whenever Scrapy retries one
            if request.meta.get('retry_times') or not self.is_first:
                self.is_first = True
                proxy = Proxy().get_random_proxy()
                if proxy:
                    uri = 'https://{proxy}'.format(proxy=proxy)
                    self.logger.debug('using proxy ' + proxy)
                    request.meta['proxy'] = uri
    
        @classmethod
        def from_crawler(cls, crawler):
            return cls(
                settings=crawler.settings
            )
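
For the middleware to take effect it has to be registered in settings.py. A minimal sketch, assuming the class lives in douban/middlewares.py (the module path and the priority value are assumptions):

    # settings.py -- enable the proxy middleware (path and priority assumed)
    DOWNLOADER_MIDDLEWARES = {
        'douban.middlewares.ProxyMiddleware': 543,
    }

    # Retries stay enabled (Scrapy's default) so that retry_times in
    # request.meta triggers an IP switch in process_request
    RETRY_ENABLED = True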
    
    
    # Fetch an IP from a locally hosted proxy pool
    from douban.settings import *
    import requests
    
    class Proxy(object):
        def __init__(self):
            self.proxy_url = PROXY_URL
    
        def get_random_proxy(self):
            try:
                response = requests.get(self.proxy_url)
                if response.status_code == 200:
                    proxy = response.text
                    return proxy
            except requests.ConnectionError:
                return False
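
PROXY_URL is pulled in from settings.py by the wildcard import above; the pool is expected to answer each GET with a bare host:port string. A hedged example setting (the endpoint is an assumption about the local proxy-pool service):

    # settings.py -- endpoint of the local proxy pool (value is an assumption)
    PROXY_URL = 'http://127.0.0.1:5555/random'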
    
    
