
Heibanke Crawler Challenge - Level 4

Author: MA木易YA | Published 2018-11-27 21:58
    1. Level 4 adds login verification and password retrieval.
    2. To reach the main page you first have to log in.
    3. Once logged in, try any password you like; it fails, of course, and the system then provides a password table.
    4. The password table reveals the password as (position, value) pairs, a few per page.
    5. Then comes the tedious decryption. The flow is much like the earlier levels: the same simulated login, except this level adds the password-retrieval step. I was quite lost at first; a hint from someone online revealed that the password is 100 characters long, and since the pages load extremely slowly, the whole process is a grind.
    6. The level mainly tests simulated login and multi-threaded crawling (threads make it faster), though plain brute force works too. A distilled sketch of the shared flow comes first, followed by three versions for reference.
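    All three versions below follow the same skeleton, sketched here as a distillation (collect_password is my name for it, not taken from any of the scripts, and it assumes a session that has already logged in):

    import re
    import requests

    PW_URL = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list/'

    def collect_password(session):
        # each pw_list page reveals a few random (position, value) pairs;
        # keep fetching until all 100 positions of the password are known
        fragments = {}
        while len(fragments) < 100:
            html = session.get(PW_URL, timeout=30).text
            poss = re.findall(r'password_pos.>(.*?)</td>', html)
            vals = re.findall(r'password_val.>(.*?)</td>', html)
            for pos, val in zip(poss, vals):
                pos = int(re.search(r'\d+', pos).group())  # drop stray characters
                fragments[pos] = val.strip()
        return ''.join(fragments[i] for i in range(1, 101))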
      one.py (single-threaded: keep fetching fragments until the 100-character password is assembled, then test it)
    import requests
    from lxml import etree
    import re
    
    
    se = requests.session()
    
    
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
    }
    
    class HBK():
        def __init__(self):
            self.login_url = "http://www.heibanke.com/accounts/login"
            self.username = "whaike"
            self.password = "12345654321"
            self.passwords = ['' for i in range(101)]  # index 0 unused; positions 1..100
            self.pwd = ''
    
        # fetch the CSRF token from the login page (needed before logging in)
        def getCsrf(self):
            res = se.get(url=self.login_url,headers=headers,timeout=30).text
            tree = etree.HTML(res)
            self.csrf = tree.xpath('/html/body/div/div/div[2]/form/input[@name="csrfmiddlewaretoken"]/@value')[0]
    
        # log in
        def login(self):
            self.getCsrf()
            data = {
                "csrfmiddlewaretoken":self.csrf,
                "username":self.username,
                "password":self.password
            }
            se.post(url=self.login_url,headers=headers,data=data,timeout=30)
            print('Login successful')
    
        # fetch the CSRF token issued after login, i.e. the one for the level-4 challenge page
        def getNCsrf(self):
            url = 'http://www.heibanke.com/lesson/crawler_ex03/'
            res = se.get(url,headers=headers,timeout=30).text
            tree = etree.HTML(res)
            csrf = tree.xpath('//input[1]/@value')[0]
            return csrf
    
        # test whether the assembled password is correct
        def guesspwd(self):
            url = 'http://www.heibanke.com/lesson/crawler_ex03/'
            csrf = self.getNCsrf()
            data = {
                "csrfmiddlewaretoken":csrf,
                "username":"whaike",
                "password":self.pwd
            }
            res = se.post(url,headers=headers,data=data,timeout=30)
            if int(res.status_code) == 200:
                self.h3 = re.findall('<h3>(.*?)</h3>',res.text)
                return True
            else:
                return False
    
        # keep scraping page 1 for random fragments; once the password is 100
        # characters long, start guessing: retry on failure, stop on success
        def getPasswords(self):
            print('Fetching page 1')
            url = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list/?page=1'
            res = se.get(url,headers=headers,timeout=30).text
            tree = etree.HTML(res)
            trs = tree.xpath('/html/body/div/div/div[2]/table/tr')[1:]
            for tr in trs:
                p1 = tr.xpath('td[1]/text()')[0]  # position
                p = int(re.findall(r'\d+', p1)[0])  # stray characters sometimes precede the digits, so keep only the number
                w = tr.xpath('td[2]/text()')[0]  # value
                self.passwords[p] = w
            self.pwd = ''.join(self.passwords)
            length = len(self.pwd)  # current password length
            print('Current password: %s, length %d' % (self.pwd, length))
            if length == 100:
                print('All 100 characters collected, guessing...')
                if self.guesspwd():
                    print('Guess succeeded; the password is: %s' % self.pwd)
                else:
                    print('Guess failed, continuing')
                    self.getPasswords()
            else:  # password not yet 100 characters; fetch page 1 again and reassemble
                self.getPasswords()  # recursion
    
    
    if __name__ == '__main__':
    
        print('Starting the challenge - Level 4')
        spider = HBK()
        spider.login()
        spider.getPasswords()
        print(spider.h3)
    

    two.py (multi-threaded version, adapted from someone online)

      #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @Date    : 2017-09-02 22:25:21
    # @Author  : bb (317716008@qq.com)
    # @Word    : python can change world!
    # @Version : python3.6
    import re
    import requests
    from bs4 import BeautifulSoup
    import threading
    from queue import Queue
    
    
    dict1 = {}     # position -> password fragment
    values = []
    web1 = "http://www.heibanke.com/accounts/login"
    web2 = "http://www.heibanke.com/lesson/crawler_ex03/pw_list/"
    web3 = "http://www.heibanke.com/lesson/crawler_ex03/"
    queuewz = Queue()    # queue of positions
    queuemm = Queue()    # queue of password fragments
    
    
    class mythreads(threading.Thread):
    
        def __init__(self):
            threading.Thread.__init__(self)
    
        def run(self):
            work()
            while not queuemm.empty():
                try:
                    dict1[str(queuewz.get())]=queuemm.get()
                    print(dict1)
                    print("字典长度为%s"%len(dict1))
                    if int(len(dict1)) ==100:
                        print("凑到100啦!")
                        for i in range(1,101):
                            vlauess.append(dict1[str(i)])
                        c=vlauess[:100]
                        zzmm=''.join(c)
                        print("密码为%s"%zzmm)
                        print("正在登录.......")
                        s = login_get()
                        # Django checks the CSRF token in the POST body as well
                        dataWebsite1 = {'username': 'user', 'password': zzmm,
                                        'csrfmiddlewaretoken': s.cookies['csrftoken']}
                        res = s.post(web3, data=dataWebsite1).text
                        if u'恭喜' in res:  # the success page contains 恭喜 ("congratulations")
                            title=re.findall("<title>(.*?)</title>",res)
                            word=re.findall("<h1>(.*?)</h1>",res)
                            word2=re.findall("<h3>(.*?)</h3>",res)
                            html=re.findall('<a href="(.*?)" class="btn btn-primary">下一关</a>',res)
                            print('\n'.join([title[0], word[0], word2[0], 'The next level is at', 'http://www.heibanke.com' + html[0]]))
                            break
                        else:
                            print("网页有问题哦!可以尝试手动将获得的正确密码登入进去哦!")
                            break
                    else:
                        work()  # fetch another page to refill the queues
                except IndexError:
                    print("例表空了,下一页!")
    
    
    def login_get():
        try:
            s = requests.Session()
            r = s.get(web1)     # visit the login page to get the csrftoken needed for login
            token1 = r.cookies['csrftoken']      # save the csrftoken
            # put the csrftoken into the csrfmiddlewaretoken field
            dataWebsite1 = {'username': 'user',
                            'password': 'password',
                            'csrfmiddlewaretoken': token1
                        }
            res=s.post(web1, data=dataWebsite1)
        except KeyError as e:
            pass
    
        return s
    
    def get_html(s):
        r=s.get(web2)
        res=r.text
        return res
    
    def get_dict(res):
    
        soup=BeautifulSoup(res,"html.parser")
        for a in soup.find_all('td',attrs={'title':'password_pos'}):
            wz=(a.string)
            queuewz.put(wz)
        for b in soup.find_all('td',attrs={'title':'password_val'}):
            mm=(b.string)
            queuemm.put(mm)
    
    def work():
        res=get_html(s)
        get_dict(res)
    
    
    def main():
        global s
        s=login_get()
        threads=[]
        threads_count=10
    
        for i in range(threads_count):
            threads.append(mythreads())
    
        for t in threads:
            t.start()
    
        for t in threads:
            t.join()
    
    
    if __name__ == '__main__':
        main()
    

    three.py (multi-threaded, two worker threads filling a shared dict)

    import re
    import requests
    from threading import Thread
    import time
    
    def print_run_time(func):
        """
        装饰器函数,输出运行时间
        """
        def wrapper(self, *args, **kw):
            local_time = time.time()
            # print args),kw
            func(self)
            print('run time is {:.2f}:'.format(time.time() - local_time))
        return wrapper
    
    class hbk_crawler(object):
        """黑板客爬虫闯关"""
        def __init__(self): pass
    
        def login(self):
            """登录函数 input:第几关"""
            self.url = 'http://www.heibanke.com/lesson/crawler_ex03'
            self.login_url = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex03'
            self.s = requests.session()
            print("正在登录第4关....")
            try:
                self.csrftoken = self.s.get(self.login_url).cookies['csrftoken']
            except Exception:
                print("Network connection error, please retry...")
                exit()
            self.payload = {'username': 'test', 'password': 'test123',
                            'csrfmiddlewaretoken': self.csrftoken}
            self.payload['csrfmiddlewaretoken'] = self.s.post(
                self.login_url, self.payload).cookies['csrftoken']
            print("登录成功....")
            return None
    
        def parseurl(self, url):
            """分析网页,查找密码位置和值"""
            while self.count < 100:
                response = self.s.get(url)
                if response.ok:
                    content = response.text
                    pos_pattern = r'_pos.>(.*)</td>'
                    val_pattern = r'_val.>(.*)</td>'
                    pos_list = re.findall(pos_pattern, content)
                    val_list = re.findall(val_pattern, content)
                    for pos, val in zip(pos_list, val_list):
                        if pos not in self.pw_dict:
                            self.pw_dict[pos] = val
                            self.count = self.count + 1
                    print(str(self.count) + '%' + self.count // 2 * '*')
    
        @print_run_time
        def ex04(self, *args, **kw):
            """Level 4: find the password. Adds login verification, CSRF protection, a 100-character password, and slower responses."""
            self.count = 0
            self.login()
            self.pw_dict = {}
            pw_url = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list'
            # thread count: the Heibanke server answers at most 2 requests
            # every 15 seconds and returns 404 beyond that
            n = 2
            threads = [Thread(target=self.parseurl, args=(pw_url,))
                       for i in range(n)]
            for t in threads:
                print(t.name, 'start...')
                t.start()
            for t in threads:
                t.join()
            self.pw_list = ['' for n in range(101)]
            for pos in self.pw_dict.keys():
                self.pw_list[int(pos)] = self.pw_dict[pos]
            password = ''.join(self.pw_list)  # keep it a string so a leading zero is not lost
            self.payload['password'] = password
            response = self.s.post(self.url, self.payload)
            pattern = r'<h3>(.*)</h3>'
            result = re.findall(pattern, response.text)
            result2 = re.findall('<a href="(.*?)" class="btn btn-primary">下一关</a>',response.text)
            print(result[0])
            print(result2)
    
    
    if __name__ == '__main__':
        Hbk_crawler = hbk_crawler()
        Hbk_crawler.ex04()
    

    To sum up: the three approaches share the same underlying idea; multi-threading mainly speeds up the data collection. I will cover threading in a later post; in the meantime the official docs or Liao Xuefeng's tutorial are worth a look. Having to simulate the login every time is a hassle, so take care not to duplicate that code. Error detection is much the same in all versions: scrape the message with re. If the `"text" in XXX` style of check does not fit, try building `result` from the scraped error message instead: an empty result means the guess was correct, a non-empty one means there was an error. Finally, the pages load slowly, so print some progress output or you will think the script has died. A sketch of both ideas follows.
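    For instance, the login boilerplate can live in a single helper, and the "empty result means correct" check can be a tiny function (a minimal sketch; heibanke_login and check_result are my names, and the credentials are placeholders):

    import re
    import requests

    def heibanke_login(username, password,
                       login_url='http://www.heibanke.com/accounts/login/'):
        # log in once and return a session that carries the auth cookies
        s = requests.Session()
        token = s.get(login_url, timeout=30).cookies['csrftoken']
        s.post(login_url, data={'username': username,
                                'password': password,
                                'csrfmiddlewaretoken': token}, timeout=30)
        return s

    def check_result(html):
        # scrape the error message from <h3>; empty means the guess was right
        result = re.findall(r'<h3>(.*?)</h3>', html)
        return result[0] if result else None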

    • For more code and details, see my Github
