import scrapy
import re
from ..items import BaiduspiderItem
class BaiduSpider(scrapy.Spider):
    """Spider that fetches the thumbnails referenced by a Baidu image-search page.

    ``parse`` scans the search page's source for ``"thumbURL":"..."`` pairs
    and schedules one request per thumbnail; ``parse_img`` wraps each
    downloaded image in a ``BaiduspiderItem`` for the item pipelines.
    """

    name = 'baidu'
    # allowed_domains = ['www.baidu.com']
    # NOTE(review): presumably left disabled because the thumbnails are served
    # from hosts other than www.baidu.com -- confirm before re-enabling.
    start_urls = ['http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%CD%BC%C6%AC&fr=ala&ala=1&alatpl=others&pos=0']

    # Captures the value of every `"thumbURL":"..."` pair in the page source.
    _THUMB_URL_RE = re.compile(r'"thumbURL":"(.*?)"')

    def parse(self, response):
        """Yield one image request per thumbnail URL found in the page.

        Args:
            response: The downloaded search-result page.

        Yields:
            scrapy.Request: A request for each usable thumbnail URL, carrying
            the URL's position among all matches in ``meta['index']`` and
            routed to ``parse_img``.
        """
        for index, raw_url in enumerate(self._THUMB_URL_RE.findall(response.text)):
            # The value is lifted out of embedded JSON, where "/" may be
            # written as "\/"; undo that so the URL keeps its scheme separator.
            url = raw_url.replace('\\/', '/')
            if not url:
                # scrapy.Request raises ValueError on a URL without a scheme,
                # which would abort this generator and drop every remaining
                # thumbnail -- skip the empty entry instead.
                self.logger.debug('Skipping empty thumbURL at index %d', index)
                continue
            # The callback is passed as a bound method: self.<method name>.
            yield scrapy.Request(url=url, meta={'index': index}, callback=self.parse_img)

    def parse_img(self, response):
        """Wrap a downloaded thumbnail in an item.

        Args:
            response: The image response; ``response.meta['index']`` is the
                position assigned by ``parse``.

        Yields:
            BaiduspiderItem: ``img_name`` holds the index and ``img_content``
            holds the raw response bytes.
        """
        item = BaiduspiderItem()
        item['img_name'] = response.meta['index']
        # response.body is the raw bytes; response.text would be the decoded string.
        item['img_content'] = response.body
        yield item  # handed on to the item pipelines
网友评论