A Scrapy login spider for douban.com with automatic CAPTCHA recognition

Author: 薛落花随泪绽放 | Published 2017-11-05 20:20

First, create the Scrapy project and generate a basic spider (run from a Windows command prompt):
d:
cd tmp
scrapy startproject douban
cd douban
scrapy genspider -t basic d1 douban.com
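The original post does not touch settings.py, but a spider that logs in to douban.com usually needs a couple of changes to the startproject defaults (Scrapy obeys robots.txt out of the box). The lines below are a minimal sketch and an assumption on my part, not something shown in the article:

# douban/settings.py -- only the lines relevant to this spider
ROBOTSTXT_OBEY = False    # douban.com's robots.txt would otherwise block the login request
COOKIES_ENABLED = True    # keep the session cookie between the login POST and later pages
DOWNLOAD_DELAY = 1        # slow down a little to stay polite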
d1.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request,FormRequest
import urllib.request
import os
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

class D1Spider(scrapy.Spider):
    name = 'd1'
    allowed_domains = ['douban.com']
    #start_urls = ['http://douban.com/']
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"}
    # start_requests() is called first by default to produce the initial requests
    def start_requests(self):
        # Fetch the login page once, then continue in the parse() callback
        return [Request("http://douban.com/", meta={"cookiejar": 1}, callback=self.parse)]

    def parse(self, response):
        # Check whether the login page shows a CAPTCHA image
        captcha = response.xpath("//img[@id='captcha_image']/@src").extract()
        if len(captcha) > 0:
            print("CAPTCHA found on the login page")
            # Save the CAPTCHA image locally (the yzm folder must already exist)
            localpath = "D:/tmp/douban/yzm/captcha.png"
            urllib.request.urlretrieve(captcha[0], filename=localpath)
            captcha_value = input("Open D:/tmp/douban/yzm/captcha.png and type the CAPTCHA text: ")
            # Alternative: automatic recognition via an external script
            # (a sketch of this hook is given after d1.py below)
            #cmd = "D:/Python27/python.exe D:/Python27/yzm/YDMPythonDemo.py"
            #r = os.popen(cmd)
            #captcha_value = r.read()
            #print("Automatic CAPTCHA recognition result: " + captcha_value)
            data = {
                "captcha-solution": captcha_value,
                "redir": "https://www.douban.com",
                "form_email": "your email address",
                "form_password": "your password",
            }
        else:
            # No CAPTCHA this time, so the POST data has no captcha field
            data = {
                "redir": "https://www.douban.com",
                "form_email": "your email address",
                "form_password": "your password",
            }

        print("登录中…")
        # 通过FormRequest.from_response()进行登陆
        return [FormRequest.from_response(response,
                                          # 设置cookie信息
                                          meta={"cookiejar": response.meta["cookiejar"]},
                                          # 设置headers信息模拟成浏览器
                                          headers=self.header,
                                          # 设置post表单中的数据
                                          formdata=data,
                                          # 设置回调函数,此时回调函数为next()
                                          callback=self.next,
                                          )]

    def next(self, response):
        # response is the page returned after the login attempt; save it so it can be inspected in a browser
        fh = open("C:/code/14douban/yzm/test.html", 'w', encoding='utf-8')
        fh.write(response.text)
        fh.close()
        title = response.xpath("/html/head/title/text()").extract()
        print(title)
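The lines commented out in parse() hint at the "automatic recognition" in the title: they shell out to an external recognizer script (YDMPythonDemo.py, a cloud CAPTCHA-solving demo) and read its output. Below is a minimal sketch of how that hook could replace the manual input() call; it assumes the external script reads the saved captcha.png itself and prints the recognized text to stdout, which is not verified from the original post, and the paths come straight from the commented-out code.

import os

def recognize_captcha(image_path):
    # Run the external recognizer referenced in the original commented-out code;
    # it is assumed to print the recognized CAPTCHA text to stdout.
    cmd = "D:/Python27/python.exe D:/Python27/yzm/YDMPythonDemo.py"
    pipe = os.popen(cmd)
    result = pipe.read().strip()
    pipe.close()
    if result:
        print("Automatic CAPTCHA recognition result: " + result)
        return result
    # Fall back to manual entry, exactly as the spider above does
    return input("Open " + image_path + " and type the CAPTCHA text: ")

With this helper, the manual line in parse() would become captcha_value = recognize_captcha(localpath). Run the spider from the project directory with scrapy crawl d1; if the login succeeds, the title printed by next() should be the douban.com homepage title rather than the login page's.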

Source: https://www.haomeiwen.com/subject/dxncmxtx.html