
Web crawlers === urllib2

Author: 清茶也醉人Q | Published 2018-04-12 22:03

    1. Getting domain registration information

      #-*- coding:utf-8 -*-
      import whois
      print(whois.whois('baidu.com'))
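
    The returned record behaves like a dict with attribute access, so individual fields can be read directly. A minimal sketch, assuming the python-whois package used above (which fields are present varies by TLD and registrar):

      #-*- coding:utf-8 -*-
      import whois
      # Look up the record once, then read individual fields (sketch)
      w = whois.whois('baidu.com')
      print(w.registrar)        # registrar name, if present in the record
      print(w.creation_date)    # registration date(s), if present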
    

    The urllib2 library

    #-*- coding:utf-8 -*-
    import urllib2

    # Download the content of the given URL
    def download(url,user_agent='Mozilla/5.0',num_retries=3,proxy=None):
        print('downloading!!!')
        content = None
        # Fake a browser request header
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'}
        request = urllib2.Request(url,headers=headers)
        try:
            # Enable a proxy IP if one was supplied
            if proxy:
                handler = urllib2.ProxyHandler(proxies={'http':'http://%(host)s:%(port)d' % proxy})
                opener = urllib2.build_opener(handler)
                urllib2.install_opener(opener)
            # Read the content at the given URL
            content = urllib2.urlopen(request).read()
        except Exception as e:
            print e
            # Only retry if attempts remain
            if num_retries > 0:
                # Only retry on 5xx server errors
                if hasattr(e,'code') and 500 <= e.code < 600:
                    # Recursive call with one fewer retry
                    return download(url,user_agent,num_retries-1,proxy)
        return content
    #***************** Test **********************
    # print download('https://www.baidu.com/')
    # print download('https://www.taobao.com/')
    # print download('https://www.vip.com/')
    print download('http://www.xicidaili.com/')
    

    Request headers serve several purposes; the key one here is disguising the crawler as a regular browser.
    Importance of proxy IPs: an HTTP proxy IP matters because, when the current IP address gets blocked, the crawler can switch to a new IP and keep running (see the usage sketch below).
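
    A minimal usage sketch for the proxy parameter of the download function above (the host and port below are placeholders, not a real proxy server); the dict keys must match the %(host)s:%(port)d format string:

      # Hypothetical proxy address, for illustration only
      proxy = {'host': '127.0.0.1', 'port': 8080}
      print download('http://httpbin.org/ip', proxy=proxy)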

    The robots protocol (robots.txt)

    To view a site's robots protocol, append robots.txt to the domain ===> https://www.taobao.com/robots.txt

      #-*- coding:utf-8 -*-
      import robotparser
      # Try the robots.txt of baidu or taobao
      rp = robotparser.RobotFileParser()
      rp.set_url('https://www.taobao.com/robots.txt')
      rp.read()
      print rp.can_fetch(useragent='Baiduspider',url='https://www.taobao.com/article')
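
    A common pattern is to check can_fetch before requesting a page. A minimal sketch reusing rp and the download function defined above (the user agent string is illustrative):

      # Only fetch the page if robots.txt allows this user agent (sketch)
      user_agent = 'Mozilla/5.0'
      url = 'https://www.taobao.com/article'
      if rp.can_fetch(user_agent, url):
          html = download(url, user_agent=user_agent)
      else:
          print 'Blocked by robots.txt:', url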
    

    Sitemaps
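
    A sitemap lists the page URLs a site wants crawlers to find. A minimal sketch of crawling one, reusing the download function above and assuming the site publishes a sitemap.xml whose URLs sit in <loc> tags (the sitemap URL below is illustrative):

      import re
      # Download the sitemap and extract every URL inside <loc> tags (sketch)
      sitemap = download('http://example.webscraping.com/sitemap.xml')
      if sitemap:
          links = re.findall('<loc>(.*?)</loc>', sitemap)
          for link in links:
              html = download(link)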

    Using requests

      import requests
      import re
      # Fetch the page as text
      text = requests.get('http://example.webscraping.com/places/default/view/Afghanistan-1').text
      # Find every readonly label; each match is a (for, id, text) tuple
      result = re.findall('<label class="readonly" for="(.*?)" id="(.*?)">(.*?)</label>',text)
      # Print the label text (third group) of the fourth match
      print result[3][2]
    

    Extraction methods:

    BeautifulSoup

    from bs4 import BeautifulSoup
    import requests
    
    text = requests.get('http://example.webscraping.com/places/default/view/Afghanistan-1').text
    bs = BeautifulSoup(text,'html.parser')
    bs.prettify()
    
    # tr = bs.find('tr',attrs={'id': 'places_phone__row'})
    # Extract the text of the matching cell
    # print tr.find('td',attrs={'class': 'w2p_fw'}).text
    
    # Extract the matching flag image
    tr = bs.find('tr',attrs={'id': 'places_national_flag__row'})
    # print tr.find('td',attrs={'class': 'w2p_fw'})
    s = tr.find('img').attrs['src']
    print s
    

    Extraction with lxml

      import requests
      import lxml.html

      text = requests.get('http://example.webscraping.com/places/default/view/Afghanistan-1').text
      doc = lxml.html.fromstring(text)
      print type(doc)          # <class 'lxml.html.HtmlElement'>
      text = lxml.html.tostring(doc,pretty_print=True)
      print type(text)         # <type 'str'>
      tree = lxml.html.fromstring(text)
      # Select the country row's value cell with a CSS selector
      t = tree.cssselect('tr#places_country__row > td.w2p_fw')[0].text_content()
      # t = doc.cssselect('tr#places_country__row > td.w2p_fw')[0].text_content()
      print t, tree.get('class')
    

    lxml -- xpath

      #-*- coding:utf-8 -*-
      from lxml import etree

      with open('hello.html','r') as f:
          html = f.read()

      # Parse the HTML string into an element tree
      html = etree.HTML(html)
      html = etree.tostring(html,pretty_print=True)
      # print html
      html = etree.HTML(html)
      # html = etree.parse('hello.html')
      # print type(html)

      # Get all <li> tags
      result = html.xpath('//li')
      # print result
      # print type(result)
      # print type(result[0])
      # print len(result)

      # Get the class attribute of every <li> tag
      # print html.xpath('//li/@class')

      # Get the <a> tags under <li> whose href is link1.html
      print html.xpath('//li//a[@href="link1.html"]')
      # Get the href of every <a> under <li>
      print html.xpath('//li//a/@href')
      # print html.xpath('//li//a/text()')
    
