# For testing purposes only:
import os, sys
import time
import re
import json
import random
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.crawler import CrawlerRunner
from scrapy.spiders import BaseSpider
from scrapy.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.http.request.form import _get_form,_get_inputs
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from scrapy.spiders import Rule
from scrapy.selector import Selector
from scrapy.http.cookies import CookieJar
class SpiderSpider(InitSpider):
    """Spider that persists cookies to a JSON file between requests.

    NOTE(review): the request URLs below are placeholders ("xxxxxx") and
    must be filled in with real addresses before running.
    """
    name = 'spider'
    # allowed_domains = ['www.amazon.com']
    handle_httpstatus_list = [403]  # let 403 responses reach the callbacks instead of being dropped
    cookie_jar = CookieJar()
    # How cookies are persisted. 'file' is the only mode implemented below;
    # the original assigned 'sqlite3' first and immediately overwrote it
    # with 'file', so only the effective value is kept.
    cookie_save_type = 'file'
    cookie_save = "./cookie/"
    cookie_file = "cookies.json"
    try_login_max_time = 3
    start_urls = []

    def __init__(self, *args, **kwargs):
        # Forward crawler-supplied arguments to the scrapy base class;
        # the original omitted this call and skipped scrapy's own setup.
        super().__init__(*args, **kwargs)
        self.chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
        # NOTE(review): random.sample is not cryptographically secure; if
        # this token must be unguessable, use the `secrets` module instead.
        self.csrftoken = ''.join(random.sample(self.chars, 32))
        print(self.csrftoken)

    def init_request(self):
        """Entry request: fetch the starting page with any saved cookies."""
        url = "xxxxxx"  # usually the site home-page URL
        headers = {}
        cookies = self.get_cookie()
        yield Request(url=url, callback=self.parse, headers=headers, cookies=cookies)

    def parse(self, response):
        """Request the target page, sending cookies merged from *response*."""
        url = "xxxxxxxxxxxxxxx"  # URL of the page to request
        headers = {}
        cookies = self.merge_cookie(response)
        yield Request(url=url, callback=self.get_product_url, headers=headers, cookies=cookies)

    def get_product_url(self, response):
        # Final callback: just log the page body.
        self.log(response.text)

    def get_cookie(self):
        """Load previously saved cookies from ``self.cookie_file``.

        Returns a dict (possibly empty). Never raises on a missing or
        corrupt cookie file — both cases fall back to ``{}``.
        """
        try:
            cookies = {}
            if self.cookie_save_type == 'file':
                with open(self.cookie_file, 'r') as f:
                    data = f.read()
                if data:
                    cookies = json.loads(data)
            return cookies
        except (IOError, ValueError):
            # Missing file or corrupt JSON: start with no cookies.
            return {}

    def merge_cookie(self, response):
        """Merge cookies set by *response* into the persisted cookie dict.

        Extracts cookies through the shared CookieJar, merges them over the
        saved dict, writes the result back to disk, and returns it.
        """
        cookie_jar = self.cookie_jar
        cookie_jar.extract_cookies(response, response.request)
        cookies_final = self.get_cookie()
        # Read name/value directly from each Cookie object instead of
        # regex-parsing repr(<Cookie ...>); the old pattern mis-split
        # values containing "=" or " for ".
        for item in cookie_jar:
            cookies_final[item.name] = item.value
        if self.cookie_save_type == 'file':
            with open(self.cookie_file, 'w') as f:
                f.write(json.dumps(cookies_final))
        return cookies_final
if __name__ == "__main__":
script_path = os.path.split( os.path.realpath( sys.argv[0] ) )[0]
os.chdir(script_path)
start_at = time.time()
# parser = argparse.ArgumentParser(description='Arguments')
# parser.add_argument('--email', help='Email', required=True)
# parser.add_argument('--password', help='Password', required=True)
# parser.add_argument('--check_phone', help='Enter phone valide code by hand', required=False, default='n')
# args = vars(parser.parse_args())
# email = args['email']
# password = args['password']
# check_phone = args['check_phone']
# params = {'email':email,'password':password,'check_phone':check_phone}
params = {}
CrawlSettings = {
'BOT_NAME': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
'DOWNLOAD_TIMEOUT' : 60,
'DOWNLOAD_DELAY': 1,
'DEFAULT_REQUEST_HEADERS': {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
},
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}
process = CrawlerProcess(CrawlSettings)
process.crawl(SpiderSpider,**params)
process.start()
end_at = time.time()
print( end_at-start_at)
# Reader comments (from the original blog post) followed here.