美文网首页我爱编程
爬虫二:常用库

爬虫二:常用库

作者: 爱跑步的coder | 来源:发表于2018-05-27 16:50 被阅读0次

    requests库

    传递URL参数

    # Pass query-string parameters via ``params``; requests URL-encodes them
    # and appends them to the URL automatically.
    import requests

    payload = {'k1': 'v1', 'k2': 'v2'}  # a value may also be a list, e.g. 'k2': [1, 2, 3]
    url = r'http://httpbin.org/get'

    r = requests.get(url, params=payload)
    print(r.url)  # shows the final URL with the encoded query string
    

    处理二进制数据

    以图片为例

    # Download an image and open it straight from memory with Pillow.
    import requests
    from io import BytesIO
    from PIL import Image

    resp = requests.get(r'https://ss3.bdstatic.com/70cFv8Sh_Q1YnxGkpoWK1HF6hhy/it/u=3552799060,760640595&fm=27&gp=0.jpg')

    # .content is the raw bytes (unlike .text, which decodes to str) --
    # exactly what an in-memory byte stream needs
    img = Image.open(BytesIO(resp.content))
    img.save(r'd:\beauty.jpg')
    

    json数据处理

    # The events API returns a JSON body; .json() parses it into Python objects
    # (and raises ValueError if the body is not valid JSON).
    import requests
    r = requests.get('https://api.github.com/events')
    print(r.json())
    

    原始数据处理(此方式节约内存)

    # Stream the body in small chunks instead of loading it all into memory.
    # ``with`` on the response guarantees the underlying connection is released
    # (the original left the streamed connection open).
    import requests

    with requests.get('https://ss3.bdstatic.com/70cFv8Sh_Q1YnxGkpoWK1HF6hhy/it/u=3552799060,760640595&fm=27&gp=0.jpg', stream=True) as r:
        with open(r'd:\meinv2.jpg', 'wb') as fd:
            for chunk in r.iter_content(chunk_size=128):
                fd.write(chunk)
    

    Post请求(提交表单)

    # Two ways to POST: a dict is form-encoded; a pre-serialized string is
    # sent as the raw request body.
    import json

    import requests

    form = {'username': 'user', 'password': 'pass'}

    # data=dict -> Content-Type: application/x-www-form-urlencoded
    r = requests.post(r'http://httpbin.org/post', data=form)
    print(r.text)

    # data=str -> the string goes out verbatim as the body
    r = requests.post(r'http://httpbin.org/post', data=json.dumps(form))
    print(r.text)

    Cookies(获取和使用)

    获取Cookies

    # Cookies set by the server are exposed on the response object.
    import requests

    r = requests.get(r'https://www.baidu.com')
    for name, value in r.cookies.get_dict().items():
        print(name, value)
    

    使用Cookies

    # Send our own cookies with the request via the ``cookies`` keyword.
    import requests

    cookies = {'c1': 'v1', 'c2': 'v2'}
    r = requests.get(r'http://httpbin.org/cookies', cookies=cookies)
    print(r.text)
    

    重定向和重定向历史

    # HEAD does not follow redirects by default; opt in with allow_redirects.
    import requests

    r = requests.head('http://github.com', allow_redirects=True)
    print(r.url)          # final URL after all redirects
    print(r.status_code)  # status code of the final response
    print(r.history)      # intermediate redirect responses, oldest first

    代理

    # Route requests through HTTP/HTTPS proxies (fill in real proxy URLs).
    import requests

    proxies = {'http': '', 'https': ''}
    # BUG fix: the original call had no URL argument at all
    # (``requests.get(, proxies=proxies)`` is a syntax error).
    r = requests.get('http://httpbin.org/get', proxies=proxies)

    数据库

    sqlite

    import sqlite3

    # sqlite3 buffers writes in an implicit transaction: without commit()
    # the inserted rows are rolled back when the connection closes
    # (the original never committed, so the data was silently lost).
    conn = sqlite3.connect(r'd:\test2.db')
    try:
        create_sql = r'create table company(id int primary key not null, emp_name text not null)'
        conn.execute(create_sql)

        insert_sql = r'insert into company values (?, ?)'
        conn.execute(insert_sql, (100, 'LY'))
        conn.execute(insert_sql, (200, 'YH'))
        conn.commit()  # make the inserts durable

        cursor = conn.execute(r'select id, emp_name from company')
        for row in cursor:
            print(row[0], row[1])
    finally:
        conn.close()  # always release the database handle
    

    mysql的话,记着进行commit操作(使得结果生效)。

    豆瓣实例

    使用session来进行持久化操作

    import requests
    import html5lib
    import re
    from bs4 import BeautifulSoup


    # A Session persists cookies across requests, so the login survives
    # into later requests made through the same session.
    s = requests.Session()
    url_login = 'http://accounts.douban.com/login'

    formdata = {
        'redir': 'https://www.douban.com',
        'form_email': 't.t.panda@hotmail.com',
        'form_password': 'tp65536!',
        'login': u'登陆'
    }
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}

    # Send the login form with a browser-like User-Agent header.
    r = s.post(url_login, data=formdata, headers=headers)

    content = r.text
    soup = BeautifulSoup(content, 'html5lib')
    captcha = soup.find('img', id='captcha_image')
    if captcha:  # the server demanded a captcha: solve it and re-submit
        captcha_url = captcha['src']
        re_captcha_id = r'<input type="hidden" name="captcha-id" value="(.*?)"/'
        # BUG fix: re.findall() returns a LIST; the original submitted the
        # whole list as the 'captcha-id' form value. Use the first match.
        matches = re.findall(re_captcha_id, content)
        captcha_id = matches[0] if matches else ''
        print(captcha_id)
        print(captcha_url)
        captcha_text = input('Please input the captcha:')
        formdata['captcha-solution'] = captcha_text  # the solved captcha text
        formdata['captcha-id'] = captcha_id
        r = s.post(url_login, data=formdata, headers=headers)
    with open('contacts.txt', 'w+', encoding='utf-8') as f:
        f.write(r.text)
    

    利用cookie进行网站的登录
    问题(验证码在哪里提交呢?是否必须要通过post请求来解决验证码的问题?)

    import requests

    # Reuse a cookie string captured from a logged-in browser session:
    # sending it back makes the server treat this request as authenticated.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
    cookies = {'cookie': 'bid=a3MhK2YEpZw; ll="108296"; ps=y; ue="t.t.panda@hotmail.com"; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1482650884%2C%22https%3A%2F%2Fwww.so.com%2Fs%3Fie%3Dutf-8%26shb%3D1%26src%3Dhome_so.com%26q%3Dpython%2B%25E8%25B1%2586%25E7%2593%25A3%25E6%25BA%2590%22%5D; _gat_UA-7019765-1=1; ap=1; __utmt=1; _ga=GA1.2.1329310863.1477654711; dbcl2="2625855:/V89oXS4WD4"; ck=EePo; push_noty_num=0; push_doumail_num=0; _pk_id.100001.8cb4=40c3cee75022c8e1.1477654710.8.1482652441.1482639716.; _pk_ses.100001.8cb4=*; __utma=30149280.1329310863.1477654711.1482643456.1482650885.10; __utmb=30149280.19.10.1482650885; __utmc=30149280; __utmz=30149280.1482511651.7.6.utmcsr=blog.csdn.net|utmccn=(referral)|utmcmd=referral|utmcct=/alanzjl/article/details/50681289; __utmv=30149280.262; _vwo_uuid_v2=64E0E442544CB2FE2D322C59F01F1115|026be912d24071903cb0ed891ae9af65'}
    url = 'http://www.douban.com'

    resp = requests.get(url, cookies=cookies, headers=headers)

    # Write raw bytes ('wb+'), so no text decoding is involved.
    with open('douban_2.txt', 'wb+') as out:
        out.write(resp.content)
    
    
    import requests
    from lxml import etree

    # Scrape the douban Top 250 list: 25 movies per page, offsets 0..225.
    s = requests.Session()
    # BUG fix: range(0, 251, 25) issued an 11th request (start=250) for a
    # page that does not exist; 10 pages cover all 250 movies.
    for start in range(0, 250, 25):
        # BUG fix: the query separator must be '=', not '-'. The original
        # built '?start-N', so every iteration silently fetched page 1.
        url = 'https://movie.douban.com/top250/?start=' + str(start)
        r = s.get(url)
        r.encoding = 'utf-8'
        root = etree.HTML(r.content)
        items = root.xpath('//ol/li/div[@class="item"]')
        for item in items:
            title = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
            # round-trip through gb2312 drops characters a Windows console
            # cannot display (author's console-printing workaround)
            name = title[0].encode('gb2312', 'ignore').decode('gb2312')
            rating = item.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
            print(name, rating)
    
    

    相关文章

      网友评论

        本文标题:爬虫二:常用库

        本文链接:https://www.haomeiwen.com/subject/ubrvjftx.html