#!/usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy
# import codecs
import os
from bingproxy import BingProxy
class ImagesSpider(scrapy.Spider):
    """Download every URL listed in a text file and save each response body.

    ``start_requests`` reads the URL list line by line and issues one request
    per non-blank line.  ``parse`` writes each response body into
    ``dir_path``, named after the last ``/``-separated segment of the
    response URL with ``.png`` appended.
    """

    name = "images"

    # Directory that receives the downloaded files.
    dir_path = "huaban_bingproxy_big_images"

    # Text file holding one URL per line.  Exposed as an attribute so that a
    # subclass (or a spider argument, ``-a url_list_file=...``) can point the
    # spider at another list without editing the code.
    url_list_file = "processing_threading_huaban_big_images_all_urls_part3.txt"

    # Create the output directory when the class is defined.  Attempting the
    # creation and tolerating "already exists" avoids the check-then-create
    # race of ``if not os.path.exists(...)`` when several processes start at
    # the same time.
    try:
        os.makedirs(dir_path)
    except OSError:
        # Re-raise unless the failure simply means the directory is there.
        if not os.path.isdir(dir_path):
            raise

    start_urls = []

    # Not referenced by the active code path.  A previously disabled request
    # variant fetched ``self.bingProxy.get_proxy_url(url)`` instead of the
    # raw URL.
    bingProxy = BingProxy()

    def start_requests(self):
        """Yield one GET request per non-blank line of ``url_list_file``.

        Surrounding whitespace is stripped from every line; lines that are
        empty afterwards are skipped.  Every response is handed to ``parse``.
        """
        with open(self.url_list_file) as url_list:
            for line in url_list:
                url = line.strip()
                if not url:
                    continue
                # GET is the default request method, so it is not spelled out.
                yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the raw response body to a file inside ``dir_path``.

        The file name is the text after the last ``/`` of ``response.url``
        (a query string, if any, is kept as-is) followed by ``.png``.  An
        existing file with the same name is overwritten.
        """
        file_name = response.url.split('/')[-1] + ".png"
        path = os.path.join(self.dir_path, file_name)
        with open(path, 'wb') as f:
            f.write(response.body)
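# NOTE(review): the stray text "网友评论" ("reader comments") stood here; it appears to be leftover web-page text rather than part of the spider.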