对豆瓣热度电影进行简单爬取
豆瓣热门电影信息是动态加载的。通过浏览器开发者工具的 Network 面板,可以看到加载热门电影信息的 API 是 https://movie.douban.com/j/search_subjects?type=movie&tag=热门&sort=time&page_limit=20&page_start=xx,其中 page_start 为分页偏移量,每页返回 20 条。
爬取的信息为电影标题,url,评分,是否为新电影
编写items
class DouItem(scrapy.Item):
    """Container for one hot-movie entry scraped from Douban's JSON API."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()  # movie title from the API's 'title' key
    rate = scrapy.Field()   # rating string from the API's 'rate' key
    url = scrapy.Field()    # detail-page URL from the API's 'url' key
    new = scrapy.Field()    # API's 'is_new' flag (whether the movie is new)
编写spider
class MoviespiderSpider(scrapy.Spider):
    """Spider that pages through Douban's hot-movie JSON API, 20 entries per page."""
    name = 'moviespider'
    allowed_domains = ['movie.douban.com']
    # Pagination offset into the API results; advanced by 20 after each page.
    start = 0
    start_urls = ['https://movie.douban.com/j/search_subjects?type=movie&tag=热门&sort=time&page_limit=20&page_start=' + str(start)]

    def parse(self, response):
        # BUG FIX: json.loads(..., encoding=...) was deprecated and removed in
        # Python 3.9; response.text is already a decoded str, so just parse it.
        datas = json.loads(response.text)['subjects']
        # An empty 'subjects' list means we paged past the last result: close
        # the spider AND stop here. The original fell through and yielded one
        # more (pointless) request after asking the engine to close.
        if not datas:
            self.crawler.engine.close_spider(self, 'job done!')
            return
        for data in datas:
            itemloader = ItemLoader(item=DouItem(), response=response)
            itemloader.add_value('title', data.get('title'))
            itemloader.add_value('rate', data.get('rate'))
            itemloader.add_value('url', data.get('url'))
            itemloader.add_value('new', data.get('is_new'))
            yield itemloader.load_item()
        # Schedule the next page of 20 results.
        self.start += 20
        url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=热门&sort=time&page_limit=20&page_start=' + str(self.start)
        yield Request(url=url, callback=self.parse)
编写Pipeline
from concurrent.futures import ThreadPoolExecutor
class DouPipeline(object):
    """Pipeline that appends items to data.txt via a small background thread pool."""

    def __init__(self):
        # Three workers are plenty for line-at-a-time file appends.
        self.pool = ThreadPoolExecutor(max_workers=3)

    def process_item(self, item, spider):
        # Hand the blocking file write to the pool so the crawl isn't stalled.
        self.pool.submit(self.save, item)
        return item

    def close_spider(self, spider):
        # FIX: the original never shut the pool down, leaking worker threads
        # and risking queued writes being dropped at interpreter exit. Scrapy
        # calls this hook automatically when the spider closes.
        self.pool.shutdown(wait=True)

    def save(self, item):
        """Append one tab-separated record for *item* to data.txt.

        ItemLoader stores every field as a list, hence the [0] indexing.
        """
        line = 'title:{}\trate:{}\turl:{}\tis_new:{}\n'.format(
            item['title'][0], item['rate'][0], item['url'][0], item['new'][0])
        with open('data.txt', 'a', encoding='utf-8') as f:
            # Single write keeps the record atomic enough for concurrent appends.
            f.write(line)
编写Middleware
在用 scrapy shell 请求该接口调试时,发现 response 返回 403,说明网站有反爬措施,因此编写了随机切换 User-Agent 的下载中间件。
from dou.settings import USER_AGENT_LIST
class UserAgentMiddleware(object):
    """Downloader middleware that assigns a random User-Agent to each request.

    Douban answers 403 to Scrapy's default User-Agent, so every outgoing
    request gets one picked at random from USER_AGENT_LIST in settings.
    """

    def process_request(self, request, spider):
        # random.choice replaces the per-call closure + randint indexing of the
        # original; setdefault preserves any User-Agent a caller already set.
        request.headers.setdefault("User-Agent", random.choice(USER_AGENT_LIST))
setting
# Pool of desktop-browser User-Agent strings that UserAgentMiddleware picks
# from at random; rotating them helps avoid Douban's 403 anti-bot response.
USER_AGENT_LIST=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# The JSON API returned 403 under the default settings; disable robots.txt
# checking for this crawl.
ROBOTSTXT_OBEY = False
# Disable Scrapy's built-in user-agent middleware and install the custom
# random-UA one at priority 200.
DOWNLOADER_MIDDLEWARES = {
    # 'dou.middlewares.DouDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'dou.middlewares.UserAgentMiddleware': 200
}
网友评论