思路
还记得很久以前写的爬取猫眼电影 TOP100 的代码吗?那时是用 requests 和正则表达式实现的,这里改用 scrapy 重写了一遍。代码很简单,就直接贴出来了,主要是为了重新熟悉一下 scrapy。
代码
import scrapy
import re
from maoyan.items import maoyanItem
class maoyanCrawler(scrapy.Spider):
    """Spider that scrapes the Maoyan movie board pages.

    It requests the board page listed in ``start_urls`` plus the paginated
    pages with ``offset`` 10 through 90, and extracts one ``maoyanItem`` per
    ``<dd>`` entry matched by ``pattern``.
    """

    name = "maoyanCrawler"
    start_urls = ['http://maoyan.com/board/4']

    # One match per <dd> entry. Capture groups, in order:
    #   0: text inside the "board-index" element
    #   1: value of the <img data-src="..."> attribute
    #   2: link text inside the class="name" element
    #   3: text of the class="star" paragraph
    #   4: text of the class="releasetime" paragraph
    #   5: text of the "integer" part of the score
    #   6: text of the "fraction" part of the score
    # re.S lets ".*?" span the newlines between the tags.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(.*?)</i>'
        r'.*?<img data-src="(.*?)"'
        r'.*?class="name"><a.*?>(.*?)</a>'
        r'.*?class="star">(.*?)</p>'
        r'.*?class="releasetime">(.*?)</p>'
        r'.*?class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i>'
        r'.*?</dd>', re.S)

    def start_requests(self):
        """Yield one request per board page.

        The URLs in ``start_urls`` come first, followed by the paginated
        URLs for offsets 10, 20, ..., 90. Every response is handled by
        ``parse_movie_page``.
        """
        # Work on a copy: ``start_urls`` is a class-level list shared by all
        # instances, so appending to it in place would make the list grow
        # (and contain duplicates) every time this method is called.
        urls = list(self.start_urls)
        urls.extend(
            "http://maoyan.com/board/4?offset={0:d}".format(offset)
            for offset in range(10, 100, 10)
        )
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_movie_page)

    def parse_movie_page(self, response):
        """Extract movie entries from one board page.

        Runs ``pattern`` over ``response.text`` and yields a ``maoyanItem``
        for every match.
        """
        for fields in self.pattern.findall(response.text):
            index, image, title, star, release_time, integer, fraction = fields
            mitem = maoyanItem()
            mitem['movie_id'] = index
            mitem['movie_image'] = image
            mitem['movie_name'] = title
            # Drops the first three characters after trimming whitespace --
            # presumably a fixed label prefix on the page; confirm against
            # the live markup.
            mitem['movie_star'] = star.strip()[3:]
            # Drops the first five characters -- presumably a fixed label
            # prefix as well; confirm against the live markup.
            mitem['movie_release_time'] = release_time[5:]
            # NOTE(review): the key is spelled 'move_score' here; it has to
            # match the field declared on maoyanItem, so it is left as-is.
            # The value is the string concatenation of the two score parts.
            mitem['move_score'] = integer + fraction
            yield mitem
网友评论