from scrapy.spiders import Spider  # BaseSpider is gone in modern Scrapy; Spider is the current name
from scrapy.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy.selector import Selector
from scrapy.http.cookies import CookieJar
from urllib.parse import urlparse, urlunparse, parse_qs, parse_qsl
import re
import json
# import MySQLdb
import datetime
import sys
import traceback
# import deathbycaptcha
# import captcha_log
import urllib.request
import hashlib
sys.path.append("..")
# from common.db import DB
# from common.task import TASK
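
# This spider logs into Amazon's order-history page with InitSpider:
# init_request() replays cookies saved by a previous run, check_login()
# walks the multi-step login form, and after_login() hands control back
# to the normal crawl via self.initialized().
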
class LoginTestSpider(InitSpider):
    name = 'login_test'
    # allowed_domains = ['www.amazon.com']
    platform = 'amazon'
    # no_proxy = True
    cookie_jar = CookieJar()
    # login_page = order_page = "https://search.glamaslee.com"
    login_page = order_page = "https://www.amazon.com/gp/css/order-history/"
    email = 'xxxxxxxxxxxxx'
    password = 'lxxxxxxxxxxxxxx'
    try_login_max_time = 3
    #
    # db = DB()
    # db.connect()
    # cur = db.cursor()
    start_urls = []

    def init_request(self):
        """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.check_login, cookies=self.get_cookie())
        # return Request(url='https://ip.cn/', callback=self.parse_ip)

    def set_cookie(self, response):
        cookie_jar = self.cookie_jar
        cookie_jar.extract_cookies(response, response.request)
        with open('./cookies.txt', 'w') as f:
            # str(cookie) renders as '<Cookie name=value for domain/path>',
            # so this pattern pulls out the 'name=value' part.
            p = re.compile(r'<Cookie (.*?) for')
            cookies_final = {}
            for item in cookie_jar:
                # f.write(str(cookie) + '\n')
                cookies = re.findall(p, str(item))
                cookies = (cookie.split('=', 1) for cookie in cookies)
                cookies = dict(cookies)
                cookies_final.update(cookies)
            f.write(json.dumps(cookies_final))
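    # cookies.txt ends up holding a flat {name: value} JSON object, which
    # is a form Scrapy's Request(cookies=...) argument accepts directly.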

    def get_cookie(self):
        cookie_jar = self.cookie_jar
        try:
            cookies = {}
            with open('./cookies.txt', 'r+') as f:
                data = f.read()
                if data:
                    cookies = json.loads(data)
                # cookie_jar = f.read()
                # p = re.compile(r'<Cookie (.*?) for')
                # cookies = re.findall(p, cookie_jar)
                # cookies = (cookie.split('=', 1) for cookie in cookies)
                # cookies = dict(cookies)
            return cookies
        except IOError:
            return {}
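
    # check_login() inspects each response and classifies it into a step:
    #   1  - only the email field is present
    #   2  - email and password fields are present (sometimes with a captcha)
    #   3  - only a 'continue' button is present
    #   4  - a verification-code field is present
    #   99 - no login elements found, i.e. we are already logged in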
    def check_login(self, response):
        last_step = response.meta.get('step', 1)
        try_times = response.meta.get('try_times', 0)
        sel = Selector(response)
        step = 99
        email_input_ele = sel.xpath("//input[@id='ap_email']")
        password_input_ele = sel.xpath("//input[@id='ap_password']")
        continue_ele = sel.xpath("//input[@id='continue']")
        verify_code_ele = sel.xpath("//input[@name='code']")
        if email_input_ele and not password_input_ele:
            step = 1
        if email_input_ele and password_input_ele:
            step = 2
        if not email_input_ele and not password_input_ele and continue_ele:
            step = 3
        if not email_input_ele and not password_input_ele and not continue_ele and verify_code_ele:
            step = 4
        print(step)
        # Give up if the same step keeps coming back too many times.
        if last_step == step:
            try_times += 1
            if try_times > self.try_login_max_time:
                return None
        if step == 1:
            return FormRequest.from_response(response,
                                             formdata={'email': self.email},
                                             callback=self.check_login,
                                             meta={'step': step, 'try_times': try_times})
        elif step == 2:
            with open('./login.html', 'wb+') as f:
                f.write(response.body)
            formdata = {'email': self.email, 'password': self.password}
            captcha_img = sel.xpath("//img[contains(@src,'captcha')]/@src")
            if captcha_img:
                # Download the captcha image and ask the operator to solve it.
                captcha_filename = './captcha.jpg'
                # captcha_filename = './captcha_'+datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+".jpg"
                urllib.request.urlretrieve(captcha_img.extract_first(), captcha_filename)
                guess = input('enter captcha code : ')
                formdata['guess'] = guess
            return FormRequest.from_response(response,
                                             formdata=formdata,
                                             callback=self.check_login,
                                             meta={'step': step, 'try_times': try_times})
        elif step == 3:
            formdata = {}
            return FormRequest.from_response(response,
                                             formdata=formdata,
                                             callback=self.check_login,
                                             meta={'step': step, 'try_times': try_times})
        elif step == 4:
            code = input('enter verify code : ')
            formdata = {'code': code}
            return FormRequest.from_response(response,
                                             formdata=formdata,
                                             callback=self.check_login,
                                             meta={'step': step, 'try_times': try_times})
        elif step == 99:
            self.set_cookie(response)
            return self.after_login(response)

    def after_login(self, response):
        # with open('./login.html', 'w+') as f:
        #     f.write(response.body)
        url = "https://www.amazon.com/dp/B00OQVZDJM/"
        self.start_urls.append(url)
        # initialized() tells InitSpider that setup is done, so the queued
        # requests for start_urls are scheduled next.
        return self.initialized()

    def parse(self, response):
        with open('./login.html', 'wb+') as f:
            f.write(response.body)
        print(response.url)
        self.log('Start Crawl ...')
        return Request(url='http://ip4.me/', callback=self.parse_ip)

    def parse_ip(self, response):
        # jsonresponse = json.loads(response.body_as_unicode())
        # print(jsonresponse)
        sel = Selector(response)
        ip_str = sel.xpath('//td[@bgcolor="D0D0F0"]//text()').extract()
        self.log('\n'.join(ip_str))
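

# A minimal sketch for running this spider standalone, outside a Scrapy
# project (assumes a Scrapy version that still ships InitSpider):
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(LoginTestSpider)
    process.start()  # blocks until the crawl finishes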