美文网首页
四、原生数据存储 &ip代理&模拟登陆

四、原生数据存储 &ip代理&模拟登陆

作者: 铅笔与旧友 | 来源:发表于2018-11-01 10:35 被阅读0次

    一、数据存储

    #1)存入json
    import json
    def write_to_json(data):
        """Flatten the scraped pages and save them as one JSON array.

        Args:
            data: iterable of pages, where each page is an iterable of
                JSON-serialisable house records.

        Side effects:
            Creates or overwrites ``lianjian.json`` in the current working
            directory.
        """
        # Flatten the page -> house nesting into a single list that
        # json.dumps can serialise as one array.
        json_list = [house for houses in data for house in houses]

        # ensure_ascii=False writes Chinese text verbatim instead of \uXXXX
        # escapes, so the file needs an explicit UTF-8 encoding rather than
        # the platform default.
        with open('lianjian.json', 'w', encoding='utf-8') as fp:
            # indent=2 pretty-prints with two spaces per nesting level.
            # (The keyword is `indent`; `index` makes json.dumps raise
            # TypeError.)
            fp.write(json.dumps(json_list, indent=2, ensure_ascii=False))
    
    import csv
    #2) 存入csv
    def write_to_csv(data):
        """Flatten the scraped pages and save them as a CSV file.

        Args:
            data: iterable of pages, where each page is an iterable of dicts
                describing one house each.

        Side effects:
            Creates or overwrites ``lianjjia.csv`` in the current working
            directory, with a fixed header row followed by one row per house.

        Note:
            Each row is the dict's values in the dict's own order; the keys
            are not matched against the header. Presumably the scraper
            inserts keys in header order - verify against the caller.
        """
        # csv.writer expects a two-dimensional list: one inner list per row.
        csv_items = [
            list(house.values())
            for houses in data
            for house in houses
        ]

        # newline='' lets the csv module control line endings itself;
        # without it Windows output gets an extra blank line after every row.
        # The explicit encoding keeps Chinese text independent of the locale.
        with open('lianjjia.csv', 'w', newline='', encoding='utf-8') as fp:
            w = csv.writer(fp)
            w.writerow(['title','house','position','totalPrice','unitPrice','img'])
            # writerows() writes many rows at once from a 2-D list.
            w.writerows(csv_items)
            
    import pymysql
    #3) 存入数据库
    def write_to_mysql(data):
        """Insert every scraped house into the ``ershoufang`` MySQL table.

        Args:
            data: iterable of pages, where each page is an iterable of dicts
                with the keys ``title``, ``house``, ``position``,
                ``totalPrice``, ``unitPrice`` and ``img``.

        Raises:
            KeyError: if a house dict lacks one of the keys above (raised
                before any database connection is opened).
            Exception: any error raised by pymysql is re-raised after the
                transaction has been rolled back.

        Note:
            Assumes ``ershoufang`` has an auto-generated first column followed
            by six data columns in the order above - confirm against the
            actual schema.
        """
        # Placeholders are filled in by the driver, which escapes the values.
        # Building the statement with "%" string formatting broke on any
        # value containing a quote and allowed SQL injection. The former
        # trailing comma before ")" was also a SQL syntax error.
        sql = 'INSERT INTO ershoufang VALUES (NULL, %s, %s, %s, %s, %s, %s)'

        # 'unitPrice' matches the column name used in the CSV header; the
        # previous spelling 'nuitPrince' did not.
        rows = [
            (
                house['title'],
                house['house'],
                house['position'],
                house['totalPrice'],
                house['unitPrice'],
                house['img'],
            )
            for houses in data
            for house in houses
        ]

        # Open the connection to the local MySQL server.
        db = pymysql.connect(host='127.0.0.1',port=3306,user='root',password='123456',db = 'lianjia',charset='utf8')
        try:
            # The cursor executes SQL statements on this connection.
            cursor = db.cursor()
            try:
                if rows:
                    cursor.executemany(sql, rows)
                    # One commit for the whole batch: all rows or none.
                    db.commit()
            except Exception:
                db.rollback()
                raise
            finally:
                cursor.close()
        finally:
            # Always release the connection, even when an insert failed.
            db.close()
    
    

    二、ip代理

    from  urllib import request
    import requests
    # Baidu search for the keyword "ip". Presumably the result page echoes the
    # caller's public IP, which shows whether the proxy was used - verify.
    url = 'https://www.baidu.com/s?ie=utf-8&f=3&rsv_bp=1&tn=baidu&wd=ip&oq=ip&rsv_pq=b254df33000238fe&rsv_t=caee8Radj5kHT5OB1roVV9axqOakQtWZzVH9BYWRWLXkJtyQBHfRRlcDylg&rqlang=cn&rsv_enter=0&rsv_sug3=1&rsv_sug1=1&rsv_sug7=100&prefixsug=ip&rsp=0&rsv_sug4=1224'

    # Browser-like User-Agent sent with the request.
    headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }

    # ------------------------------------------------------------------
    # Alternative using urllib (kept for reference only):
    #
    # Build a request object
    # request_obj = request.Request(url=url,headers=headers)
    #
    # Configure the proxy
    # handler = request.ProxyHandler({"https":"113.200.56.13:8010"})
    #
    # Build an opener that carries the proxy handler
    # opener = request.build_opener(handler)
    # res = opener.open(request_obj)
    # ------------------------------------------------------------------

    # Send the HTTPS request through the proxy. The timeout keeps the script
    # from hanging forever when the proxy is dead or unreachable.
    res = requests.get(
        url=url,
        headers=headers,
        proxies={"https": "113.200.56.13:8010"},
        timeout=10,
    )

    # Save the raw response bytes; the with-block closes the file itself.
    with open('./ip.html','wb') as fp:
        fp.write(res.content)
    

    三、模拟登陆

    • 模拟古诗文网登录
      使用 Session 保持登录状态
    import requests
    from lxml import etree
    
    
    # Browser-like User-Agent sent with every request of the session.
    headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }

    # URL of the login page
    page_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'

    # The login form submits: user name, password, static form fields,
    # two tokens and a captcha code.
    # Per the author's analysis, the two tokens and the captcha are generated
    # dynamically when the login page is requested.

    # A Session keeps cookies across requests, so the login page, the captcha
    # and the login POST all belong to the same server-side session.
    # A plain requests.get(page_url) would not carry that state forward.
    s = requests.Session()
    # Attach the headers to the session so the captcha download and the login
    # POST send the same User-Agent as the first request.
    s.headers.update(headers)
    log_html = s.get(page_url, timeout=10)


    # Extract the two hidden tokens from the login page.
    login_tree = etree.HTML(log_html.text)

    # xpath() returns a list; take the first match so that a missing token
    # fails loudly here instead of being silently dropped from the form.
    token_a = login_tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
    token_b = login_tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]

    # Build the captcha image URL from the <img id="imgCode"> src attribute.
    code_url = 'https://so.gushiwen.org'+login_tree.xpath('//*[@id="imgCode"]/@src')[0]
    print(code_url)
    # Download the captcha through the same session.
    code_info = s.get(code_url, timeout=10)
    # The with-block closes the file; no explicit close() is needed.
    with open('./code.png','wb') as fp:
        fp.write(code_info.content)

    # A captcha can be solved by a third-party recognition API, by a
    # self-trained model, or by a human; here a human types it in.
    code = input('请输入你看到的验证码')

    # URL the login form is submitted to
    log_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'

    # Request body
    # NOTE(review): credentials are hard-coded; load them from the
    # environment or a prompt before reusing this code.
    data = {
    '__VIEWSTATE': token_a,
    '__VIEWSTATEGENERATOR': token_b,
    'from': 'http://so.gushiwen.org/user/collect.aspx',
    'email': 'fanjianbo666@163.com',
    'pwd': '12345678',
    'code': code,
    'denglu': '登录'
    }

    result = s.post(url=log_url, data=data, timeout=10)
    print(result.text)
    
    • 模拟登陆chinaunix网
    
    import requests
    from bs4 import BeautifulSoup
    
    # Browser-like User-Agent sent with both requests.
    headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }


    # URL of the login page
    login_page = 'http://bbs.chinaunix.net/member.php?mod=logging&action=login&logsubmit=yes'

    # Fetch the login page inside a Session so cookies set here are sent
    # again with the login POST.
    s = requests.Session()
    page_html = s.get(login_page, headers=headers, timeout=10)
    soup = BeautifulSoup(page_html.text,'lxml')

    # Both values are generated per page load and must be scraped:
    # the form's action (relative login endpoint) and the hidden formhash.
    action = soup.select_one('form.cl').get('action')
    formhash = soup.select_one('[name="formhash"]').get('value')


    # The login endpoint is built from the scraped action.
    login_url = 'http://bbs.chinaunix.net/'+action

    # NOTE(review): credentials are hard-coded; load them from the
    # environment or a prompt before reusing this code.
    data = {
        'formhash':formhash,
        # Host spelling fixed (was "chinanuix") to match the site's domain.
        'referer':'http://bbs.chinaunix.net/./',
        'username':'Mrfan666',
        'password':'f12345678',
        'loginsubmit':'true',
        # NOTE(review): 'returen_type' looks like a typo of 'return_type';
        # kept as-is - confirm against the real form's field names.
        'returen_type':''

    }

    r = s.post(url=login_url, headers=headers, data=data, timeout=10)
    print(r.text)
    

    相关文章

      网友评论

          本文标题:四、原生数据存储 &ip代理&模拟登陆

          本文链接:https://www.haomeiwen.com/subject/vuvtxqtx.html