美文网首页编程地带
黑板课爬虫闯关 - 第五关

黑板课爬虫闯关 - 第五关

作者: MA木易YA | 来源:发表于2018-11-27 22:14 被阅读0次

    这里是黑板课爬虫闯关第五关

    image.png

    同样的需要登陆

    image.png

    在前面几关的基础上,这一关虽然没有第四关那么复杂,但增加了验证码环节。这个比较难受,当时安装相关 Python 包、配置环境就花了不少功夫。主要会用到的是 pytesseract 和 PIL 的 Image 这两个库,不太了解的可以先查阅它们的文档。对于验证码识别这一块我也不是太了解,只是简单调用了网上提供的方法,其余部分都和之前几关差不多。

    简化流程

    1. 登陆验证
    2. 验证码验证
    3. 密码登陆尝试
      重复后面两步就好了
      提供几个版本供大家参考

    one.py(原则上代码逻辑没问题,但一是速度奇慢,二是验证码识别不够准确,所以最后的结果输出有时会出问题,大家可以试试看,也帮我找找错)

    import re
    import requests
    from lxml import etree
    import pytesseract
    from PIL import Image,ImageEnhance
    
    def verification_Code(img_url):
        """Download a captcha image and return the text Tesseract reads from it.

        The raw download is written to ``1.jpg``; it is then converted to
        grayscale, contrast-boosted, saved back over ``1.jpg`` and passed to OCR.

        Args:
            img_url: Absolute URL of the captcha image.

        Returns:
            The recognized text with leading/trailing whitespace removed.
        """
        # Save the captcha to disk.
        imgs = requests.get(img_url).content
        with open('1.jpg', 'wb') as f:
            f.write(imgs)

        # Close the file handle as soon as the pixel data has been converted.
        with Image.open('1.jpg') as image:
            imgry = image.convert('L')  # grayscale
        sharp_img = ImageEnhance.Contrast(imgry).enhance(2.0)  # boost contrast
        sharp_img.save('1.jpg')

        # OCR the enhanced image. Previously the untouched download was passed
        # here, so the preprocessing above had no effect on recognition.
        return pytesseract.image_to_string(sharp_img).strip()
    
    
    
    def login():
        """Create a requests session and submit the heibanke login form.

        The login page is fetched first to obtain the ``csrftoken`` cookie,
        which is posted back together with the hard-coded credentials.

        Returns:
            The ``requests.Session`` used for the login.

        Note:
            The POST response is not inspected, so "登录成功" is printed
            whether or not the server accepted the credentials.
        """
        login_url = "http://www.heibanke.com/accounts/login"
        session = requests.Session()
        csrf_token = session.get(login_url).cookies['csrftoken']

        form = {'username': 'Koelre', 'password': 'lixue961314'}
        form['csrfmiddlewaretoken'] = csrf_token

        session.post(login_url, data=form)
        print("登录成功")
        return session
    
    def ex05(a=1,password=1):
        """Brute-force the level-five password, solving a captcha on every try.

        Each round fetches the challenge page, OCRs the captcha and submits
        the current password guess. A rejected captcha retries the same
        password; a rejected password moves on to ``password + 1``.

        The retries are a plain loop on a single logged-in session. The
        previous version recursed once per attempt and logged in again each
        time, which was slow and could exhaust the recursion limit.

        Args:
            a: Attempt counter; incremented whenever a captcha is rejected.
            password: First password value to try.

        Returns:
            None. Progress and the final result are printed.
        """
        url = "http://www.heibanke.com/lesson/crawler_ex04/"
        session = login()  # log in once and reuse the session for every attempt

        while True:
            html = session.get(url).text
            etr = etree.HTML(html)
            token = etr.xpath('/html/body/div/div/div[2]/form/input/@value')[0].strip()
            img_src = etr.xpath('/html/body/div/div/div[2]/form/div[3]/img/@src')[0].strip()
            # Absolute URL of the captcha image.
            img_url = 'http://www.heibanke.com' + str(img_src)
            # Hidden captcha key that must be posted back with the answer.
            pic_code = etr.xpath('//*[@id="id_captcha_0"]/@value')[0]
            # Strip stray whitespace/newlines that OCR output may carry.
            text = verification_Code(img_url).strip()

            data = {
                "csrfmiddlewaretoken": token,
                "username": "a",
                "password": password,
                "captcha_0": pic_code,
                "captcha_1": text
            }

            res = session.post(url, data).text
            h3 = re.findall('<h3>(.*?)</h3>', res)

            if "验证码输入错误" in res:
                # Captcha rejected: same password, fresh captcha.
                print(h3)
                print(text)
                print("重试")
                a += 1
                continue

            if '您输入的密码错误' in res:
                print(h3)
                print("密码:%s错误" %password)
                password += 1
                continue

            print("闯关成功,密码是:%s" %password)
            print(h3)
            return
    
    
    
    # Start the password search only when this file is executed as a script.
    if __name__ == '__main__':
        ex05()
    

    two.py

    """
    黑板客爬虫闯关第五关
    http://www.heibanke.com/lesson/crawler_ex04
    验证码处理
    answer is 22
    """
    
    import Image
    
    from PIL import Image
    from io import BytesIO
    import pytesseract
    import bs4
    from bs4 import BeautifulSoup
    import requests
    import os
    import re
    
    # Path of the Tesseract executable used by pytesseract.
    pytesseract.pytesseract.tesseract_cmd = "D:\\Program Files (x86)\\Tesseract-OCR\\tesseract"

    url = "http://www.heibanke.com/lesson/crawler_ex04/"
    login_url = "http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex04/"

    data={'username': 'medyg', 'password': '19931122bihu', 'csrfmiddlewaretoken': ''}

    # Open the login page to obtain the initial CSRF cookie.
    loginr = requests.get(login_url)
    if loginr.status_code != 200:
        print("get login_url failed")
        # Nothing below can work without the cookie; stop here instead of
        # crashing later with a NameError on the undefined `cookie`.
        raise SystemExit(1)
    cookie = loginr.cookies
    print("get login_url success, csrftoken is :" + cookie['csrftoken'])
    data['csrfmiddlewaretoken'] = cookie['csrftoken']

    # Log in. Redirects are not followed so the 302 response (and the
    # cookies it sets) can be inspected directly.
    signinr = requests.post(login_url, data = data, allow_redirects = False, cookies = cookie)
    if signinr.status_code != 302:
        print("post login_url failed, status_code is " + str(signinr.status_code))
        # Same reasoning as above: `cookie2` would be undefined.
        raise SystemExit(1)
    cookie2 = signinr.cookies
    print("post login_url success, csrftoken is :" + cookie2['csrftoken'])

    data['csrfmiddlewaretoken'] = cookie2['csrftoken']

    # Counters for captcha fetching and recognition (using Tesseract OCR):
    # `guesses` counts OCR attempts, `guess_success` counts captchas the
    # server accepted.
    guesses = 0
    guess_success = 0
    def get_captcha():
        """Fetch captchas until OCR yields a plausible four-letter answer.

        Each round loads the challenge page, stores the hidden captcha key in
        the module-level ``data`` dict, downloads the captcha image and runs
        Tesseract on it. A round is retried when the image download fails or
        when the OCR result is not exactly four ASCII letters.

        Retries are a loop rather than recursion, so a long streak of
        unreadable captchas cannot exhaust the recursion limit.

        Returns:
            A ``(captcha_0_value, captcha_1)`` tuple: the hidden captcha key
            and the recognized text.
        """
        global guesses
        while True:
            print("\n开始获取第%d次验证码" % guesses)
            captchar = requests.get(url, cookies = cookie2)
            soup = BeautifulSoup(captchar.text, "lxml")
            img_src = soup.find('img', class_='captcha').get('src')
            img_url = "http://www.heibanke.com" + img_src
            captcha_0_value = soup.find('input', id="id_captcha_0").get('value')
            data['captcha_0'] = captcha_0_value

            imgr = requests.get(img_url)
            if imgr.status_code != 200:
                print("验证码图片获取失败,重新获取")
                continue
            print("验证码图片获取成功")
            captcha_img = Image.open(BytesIO(imgr.content))  # content is bytes

            print("正在识别……")
            # Recognize the captcha with Tesseract, then drop whitespace.
            captcha_1 = pytesseract.image_to_string(captcha_img)
            captcha_1 = captcha_1.strip().replace(' ', '')
            guesses += 1

            # Exactly four ASCII letters. The old class `[A-Z | a-z]` also
            # let '|' (and spaces) through.
            if re.fullmatch(r'[A-Za-z]{4}', captcha_1):
                print("验证码识别成功:" + captcha_1)
                return captcha_0_value, captcha_1
            print("验证码识别失败:" + captcha_1)
    """ 猜密码 """
    # Try passwords 0, 1, 2, ... until the server accepts one. A rejected
    # captcha repeats the same password with a fresh captcha.
    pw = 0
    while True:
        captcha_0_value, captcha_1 = get_captcha()
        guess_data = {
            'username' : 'medyg',
            'password' : pw,
            'csrfmiddlewaretoken' : cookie2['csrftoken'],
            'captcha_0' : captcha_0_value,
            'captcha_1' : captcha_1
        }
        print(guess_data)
        guessr = requests.post(url, guess_data, cookies = cookie2)

        if guessr.status_code != 200:
            print("请求失败,重新请求%d" % guessr.status_code)
            continue

        soup = BeautifulSoup(guessr.text, 'lxml')
        h3 = soup.find('h3')

        if '验证码输入错误' in h3.text:
            print("验证码错误,重新输入验证码,验证码识别率为%f" % (float(guess_success) / guesses))
            continue

        # From here on the captcha was accepted, whatever the password result.
        guess_success += 1

        if '密码错误' in h3.text:
            print("密码错误,重新输入密码,验证码识别率为%f" % (float(guess_success) / guesses))
            pw += 1
            continue

        print(h3.text)
        print("密码是%d,验证码识别率为%f" % (pw,  (float(guess_success) / guesses)))
        break
    

    three.py

    import requests
    from bs4 import BeautifulSoup
    import urllib.request
    from PIL import Image
    import pytesseract
    import re
    import os
    
    URL = 'http://www.heibanke.com/lesson/crawler_ex04/'
    LOGIN_URL = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex04/'

    # Fetch the login page first to obtain the CSRF cookie.
    login_page = requests.get(LOGIN_URL)

    login_data = {
        'csrfmiddlewaretoken': login_page.cookies['csrftoken'],
        'username': 'fuyufjh',
        'password': '142857',
    }

    # Redirects are not followed so the cookies set by the login response
    # itself stay available on `login_res`.
    login_res = requests.post(LOGIN_URL, data=login_data, cookies=login_page.cookies, allow_redirects=False)

    # A successful login answers with a redirect. Anything else means the
    # session cookies are unusable, so stop before the guessing loop fails
    # on a page that has no captcha form.
    if login_res.status_code != 302:
        raise SystemExit('login failed, status code %d' % login_res.status_code)

    # Current password candidate.
    number = 0
    
    # Try passwords number, number + 1, ... until one is accepted. Each
    # iteration needs a fresh captcha, recognized with Tesseract.
    while True:
        prob_res = requests.get(URL, cookies=login_res.cookies)
        soup = BeautifulSoup(prob_res.text, 'lxml')
        captcha_id = soup.find(id='id_captcha_0')['value']
        captcha_image_url = 'http://www.heibanke.com' + soup.find(alt='captcha')['src']
        try:
            urllib.request.urlretrieve(captcha_image_url, 'captcha.png')
            # Close the image before the file is deleted below.
            with Image.open('captcha.png') as vcode_img:
                # OCR output may carry trailing whitespace/newlines; strip it
                # so it neither defeats the pattern check nor gets submitted.
                vcode = pytesseract.image_to_string(vcode_img, lang='eng').strip()
        finally:
            # The download may have failed before the file was created;
            # do not mask that error with a FileNotFoundError here.
            if os.path.exists('captcha.png'):
                os.remove('captcha.png')
        # Only submit answers that are exactly four uppercase letters.
        if not re.fullmatch(r'[A-Z]{4}', vcode):
            print('recognizing failed')
            continue
        data = {
            'username': 'fuyufjh',
            'password': number,
            'captcha_0': captcha_id,
            'captcha_1': vcode,
            'csrfmiddlewaretoken': prob_res.cookies['csrftoken']
        }

        print(data)
        guess_res = requests.post(URL, data=data, cookies=login_res.cookies)

        if '验证码输入错误' in guess_res.text:
            print('verify code error')
            continue
        elif '密码错误' in guess_res.text:
            print('Password is not %d' % number)
            number += 1
        else:
            print('Password is %d' % number)
            break
    
    • 更多代码详情参考我的Github

    相关文章

      网友评论

        本文标题:黑板课爬虫闯关 - 第五关

        本文链接:https://www.haomeiwen.com/subject/qltoqqtx.html