美文网首页
python爬虫工具-(1)urllib

python爬虫工具-(1)urllib

作者: anthonywj | 来源:发表于2017-09-28 14:54 被阅读0次
    • 简单方法
    importurllib2
    
    response = urllib2.urlopen("http://www.baidu.com")
    
    print response.read()
    
    • 构造request
    import urllib2
    
    request = urllib2.Request("http://www.baidu.com")
    response = urllib2.urlopen(request)
    print response.read()
    
    • get请求
    import urllib
    import urllib2
    
    values = {}
    values["username"] = "anthoy@gmail.com"
    values["passwd"] = "123456"
    #编码values字典,格式化
    data = urllib.urlencode(values)
    print data
    #构建get访问地址
    geturl = "http://www.baidu.com"+"?"+data
    print geturl
    request = urllib2.Request(geturl)
    respose = urllib2.urlopen(request)
    
    
    • post请求
    import urllib
    import urllib2
    
    values = {}
    values['username'] = "anthony@gmail.com"
    values['password'] = "123456"
    #编码values字典,格式化
    data = urllib.urlencode(values)
    print data
    url = "http://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn"
    #构建post请求
    request = urllib2.Request(url, data)
    print request
    response = urllib2.urlopen(request)
    print response.read()
    

    参考http://cuiqingcai.com/947.html

    • 代理与headers设置
    import urllib
    import urllib2
    
    enable_proxy = True
    #设置代理
    proxy_handler = urllib2.ProxyHandler({"http": 'http://119.23.161.182:3128'})
    null_proxy_handler = urllib2.ProxyHandler({})
    if enable_proxy:
        opener = urllib2.build_opener(proxy_handler)
    else:
        opener = urllib2.build_opener(null_proxy_handler)
    urllib2.install_opener(opener)
    #设置头部
    url = 'http://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn'
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    #对付反盗链
    referer = 'http://www.jianshu.com/p/q81RER'
    values = {'username': 'anthony', 'password': '123456'}
    headers = {'User-Agent': user_agent, 'Referer': referer}
    data = urllib.urlencode(values)
    request = urllib2.Request(url, data, headers)
    response = urllib2.urlopen(request)
    page = response.read()
    print page
    

    参考http://cuiqingcai.com/954.html

    • Cookielib

    保存cookie到临时变量

    import urllib2
    import cookielib
    
    # 保存cookie到变量
    cookie = cookielib.CookieJar()
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener = urllib2.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    for item in cookie:
        print 'Name = ' + item.name
        print 'Value = ' + item.value
    
    

    保存cookie到本地

    import cookielib
    import urllib2
    
    # 保存cookie到本地文件
    filename = 'cookie.txt'
    cookie = cookielib.MozillaCookieJar(filename)
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener = urllib2.build_opener(handler)
    response = opener.open("http://www.baidu.com")
    cookie.save(ignore_discard=True, ignore_expires=True)
    for item in cookie:
        print 'Name = ' + item.name
        print 'Value = ' + item.value
    
    

    读取本地cookie

    import cookielib
    import urllib2
    
    #读取本地cookie
    cookie = cookielib.MozillaCookieJar()
    cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    req = urllib2.Request("http://www.baidu.com")
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    response = opener.open(req)
    print response.read()
    for item in cookie:
        print 'Name = ' + item.name
        print 'Value = ' + item.value
    

    参考http://cuiqingcai.com/968.html

    相关文章

      网友评论

          本文标题:python爬虫工具-(1)urllib

          本文链接:https://www.haomeiwen.com/subject/yfyuextx.html