美文网首页
scrapy爬取豆瓣热门电影信息

scrapy爬取豆瓣热门电影信息

作者: nice生活 | 来源:发表于2018-10-13 12:53 被阅读0次

    对豆瓣热门电影进行简单爬取

    豆瓣热门电影信息是动态加载的,通过浏览器开发者工具的Network面板,可以看到加载热门电影信息的api是https://movie.douban.com/j/search_subjects?type=movie&tag=热门&sort=time&page_limit=20&page_start=xx,其中page_limit是每页返回的条数,page_start是起始偏移量,翻页时每次增加20

    爬取的信息为电影标题,url,评分,是否为新电影

    编写items

    class DouItem(scrapy.Item):
        """Item holding the fields scraped for one Douban movie entry."""

        # Movie title.
        title = scrapy.Field()
        # Rating value as delivered by the API.
        rate = scrapy.Field()
        # Link to the movie's page.
        url = scrapy.Field()
        # Whether the movie is flagged as new (filled from the API's ``is_new``).
        new = scrapy.Field()
    

    编写spider

    class MoviespiderSpider(scrapy.Spider):
        """Crawl Douban's "hot movies" JSON API one page at a time.

        Each response is a JSON object whose ``subjects`` list holds up to 20
        movie entries. ``parse`` emits one ``DouItem`` per entry and then
        requests the next page, until a page comes back with no entries.
        """

        # Offset of the page currently being requested; advanced by 20 per page.
        start=0
        name = 'moviespider'
        allowed_domains = ['movie.douban.com']
        # API endpoint without the trailing offset value (appended per request).
        api_url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=热门&sort=time&page_limit=20&page_start='
        start_urls = [api_url+str(start)]

        def parse(self, response):
            """Yield a ``DouItem`` per movie in *response*, then the next page request.

            :param response: response whose body is the API's JSON document.
            :raises KeyError: if the JSON document has no ``subjects`` key.
            """
            # ``response.text`` is already a decoded str. The ``encoding``
            # keyword of json.loads() was removed in Python 3.9 and passing
            # it raises TypeError there, so it is omitted.
            subjects = json.loads(response.text)['subjects']

            # An empty ``subjects`` list means we ran past the last page.
            if not subjects:
                self.crawler.engine.close_spider(self, 'job done!')
                # Stop here: without this return the method would go on to
                # schedule yet another page request after asking to close.
                return

            for data in subjects:
                itemloader = ItemLoader(item=DouItem(), response=response)
                itemloader.add_value('title',data.get('title'))
                itemloader.add_value('rate',data.get('rate'))
                itemloader.add_value('url',data.get('url'))
                itemloader.add_value('new',data.get('is_new'))
                yield itemloader.load_item()

            # Advance the offset by one page (page_limit=20) and keep crawling.
            self.start += 20
            yield Request(url=self.api_url+str(self.start),callback=self.parse)
    

    编写Pipeline

    from concurrent.futures import ThreadPoolExecutor
    
    
    class DouPipeline(object):
        """Item pipeline that appends each item as one line to ``data.txt``.

        Writes are submitted to a small thread pool so ``process_item``
        returns without waiting for the file I/O.
        """

        def __init__(self):
            self.pool=ThreadPoolExecutor(max_workers=3)

        def process_item(self, item, spider):
            """Queue *item* for writing and return it unchanged.

            :param item: mapping with ``title``, ``rate``, ``url`` and ``new``
                keys, each holding a non-empty sequence.
            :param spider: the running spider; its ``logger`` is used to
                report a failed write.
            :returns: the same *item*, for the next pipeline stage.
            """
            future = self.pool.submit(self.save,item)
            # An exception raised inside a worker is stored on the future and
            # would otherwise never be seen; surface it through the log.
            future.add_done_callback(lambda done: self._report_failure(done, spider))
            return item

        def close_spider(self, spider):
            """Wait for all queued writes to finish and stop the worker threads."""
            self.pool.shutdown(wait=True)

        def save(self, item):
            """Append one tab-separated line describing *item* to ``data.txt``.

            :raises KeyError: if a field is missing from *item*.
            :raises IndexError: if a field holds an empty sequence.
            """
            line = 'title:{}\trate:{}\turl:{}\tis_new:{}'.format(item['title'][0], item['rate'][0],
                                                                  item['url'][0], item['new'][0])
            with open('data.txt', 'a', encoding='utf-8') as f:
                # Emit the line and its terminator in a single write call.
                f.write(line + '\n')

        @staticmethod
        def _report_failure(future, spider):
            """Log the exception, if any, that a finished ``save`` call raised."""
            if future.cancelled():
                return
            error = future.exception()
            if error is not None:
                spider.logger.error('failed to save item: %r', error)
    

    编写Middleware

    用scrapy shell xxxx调试时发现response返回403,说明网站有反爬措施,所以编写了一个为请求随机设置User-Agent的下载中间件。

    from dou.settings import USER_AGENT_LIST
    
    
    class UserAgentMiddleware(object):
        """Downloader middleware that gives requests a random User-Agent."""

        def process_request(self, request, spider):
            """Set a ``User-Agent`` header picked at random from ``USER_AGENT_LIST``.

            The header is only filled in when the request does not already
            carry one (``setdefault``). Nothing is returned.

            :param request: the outgoing request whose headers are updated.
            :param spider: the spider issuing the request (unused).
            """
            # With no configured agents there is nothing to choose from;
            # leave the request untouched instead of raising.
            if not USER_AGENT_LIST:
                return
            request.headers.setdefault("User-Agent", random.choice(USER_AGENT_LIST))
    

    编写settings

    # Pool of browser User-Agent strings; the project's UserAgentMiddleware
    # picks one of these at random for outgoing requests.
    USER_AGENT_LIST=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    # Do not consult robots.txt before issuing requests.
    ROBOTSTXT_OBEY = False
    # Downloader middleware configuration: a value of None switches a
    # middleware off, a number sets its position in the processing order.
    DOWNLOADER_MIDDLEWARES = {
        # Project-template middleware, left disabled:
        # 'dou.middlewares.DouDownloaderMiddleware': 543,
        # Turn off Scrapy's built-in User-Agent middleware ...
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        # ... and use the project's random User-Agent middleware instead.
        'dou.middlewares.UserAgentMiddleware': 200
    }
    

    相关文章

      网友评论

          本文标题:scrapy爬取豆瓣热门电影信息

          本文链接:https://www.haomeiwen.com/subject/brroaftx.html