Getting Started with Python Web Scraping

Author: 沧海一粟谦 | Published 2018-06-26 11:29

    URL: Uniform Resource Locator

    The general format of a URL is as follows (parts in square brackets [] are optional):
    
    protocol :// hostname[:port] / path / [;parameters][?query]#fragment
    
    A URL is made up of three main parts:
    
    (1) protocol: the first part is the protocol; Baidu, for example, uses https;
    
    (2) hostname[:port]: the second part is the host name, plus an optional port number (the default is 80 for http and 443 for https). Baidu's host name is www.baidu.com, and this identifies the server;
    
    (3) path: the third part is the location of the resource on the host, such as a directory and file name.
    
    A web crawler fetches page content based on such URLs.
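
    As a quick illustration, urllib.parse (introduced below) can split a URL into exactly these components. A minimal sketch; the sample URL is made up:

    from urllib.parse import urlparse
    
    parts = urlparse("https://www.baidu.com:443/path/page.html;params?q=python#top")
    print(parts.scheme)    # protocol: https
    print(parts.netloc)    # hostname[:port]: www.baidu.com:443
    print(parts.path)      # path: /path/page.html
    print(parts.query)     # query: q=python
    print(parts.fragment)  # fragment: top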
    

    The urllib library

    urllib is a high-level HTTP library in the Python standard library. It has four main modules:

    1. request sends client requests and reads server responses

    2. error handles the exceptions that request raises

    3. parse splits, joins, and encodes URLs

    4. robotparser parses a site's robots.txt file; it is rarely used (see the sketch below)
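
    Since robotparser is not shown anywhere else in this article, here is a minimal sketch (Baidu's robots.txt is just an example target):

    import urllib.robotparser
    
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("http://www.baidu.com/robots.txt")
    rp.read()  # fetch and parse the robots.txt file
    # may this user agent fetch the given URL?
    print(rp.can_fetch("*", "http://www.baidu.com/s?wd=python"))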

    Getting response information

    # Fetch a page's content
    import urllib.request
    response = urllib.request.urlopen('http://www.baidu.com/')
    html = response.read().decode("utf-8")
    print(html)
    
    # Get the response status code and headers
    print(response.status)
    print(response.getheaders())
    print(response.getheader("Server"))  # a single header, by name
    

    Setting a timeout

    import urllib.request
    # give the server at most 1 second to respond
    response = urllib.request.urlopen("http://2018.sina.com.cn/", timeout=1)
    html = response.read().decode("utf-8")
    print(html)
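
    If the server does not answer within the timeout, urlopen raises a URLError whose reason is a socket.timeout. A minimal sketch of catching it (the timeout here is deliberately tiny so it fires):

    import socket
    import urllib.request
    from urllib import error
    
    try:
        response = urllib.request.urlopen("http://2018.sina.com.cn/", timeout=0.01)
    except error.URLError as e:
        if isinstance(e.reason, socket.timeout):
            print("request timed out")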
    

    Setting request headers and parameters

    from urllib import request, parse
    
    url = "http://2018.sina.com.cn/"
    headers = {
      "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
      "Host": "2018.sina.com.cn",
    }
    params = {  # renamed from `dict` so the builtin is not shadowed
      "name": "Question"
    }
    data = bytes(parse.urlencode(params), encoding="utf8")
    # Note: `data` is sent as the request body, which really suits POST;
    # for GET, parameters normally belong in the URL's query string (see below).
    req = request.Request(url=url, data=data, headers=headers, method="GET")
    response = request.urlopen(req)
    print(response.read().decode("utf-8"))
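
    For a GET request, the idiomatic form appends the encoded parameters to the URL itself; a sketch (the parameter name is the same made-up one as above):

    from urllib import request, parse
    
    query = parse.urlencode({"name": "Question"})
    url = "http://2018.sina.com.cn/?" + query  # parameters live in the URL
    req = request.Request(url=url, method="GET")
    response = request.urlopen(req)
    print(response.status)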
    

    Exception handling

    from urllib import request, error
    
    try:
      response = request.urlopen("https://cuiqingcai.com/index.htm")
    except error.URLError as e:
      print(e.reason)
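
    error.HTTPError, a subclass of URLError, carries more detail when the server does respond but with an error status. A sketch that catches the more specific class first:

    from urllib import request, error
    
    try:
        response = request.urlopen("https://cuiqingcai.com/index.htm")
    except error.HTTPError as e:  # the server answered with an error status
        print(e.code, e.reason)
        print(e.headers)
    except error.URLError as e:   # no usable answer at all (DNS failure, refused connection, ...)
        print(e.reason)
    else:
        print("request succeeded")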
    

    Downloading images with a crawler

    The third-party requests library makes HTTP requests much simpler than urllib. Install it first:

    pip install requests

    import requests
    
    r = requests.get("http://www.baidu.com")
    print(r.status_code)  # HTTP status code
    print(r.text)         # response body, decoded as text
    print(r.cookies)      # cookies the server set
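
    For binary payloads such as images, use r.content (raw bytes) instead of r.text. A minimal sketch; the image URL is only a placeholder:

    import requests
    
    r = requests.get("http://www.baidu.com/img/bd_logo1.png")
    with open("logo.png", "wb") as f:  # write the raw bytes to disk
        f.write(r.content)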
    

    Example: scraping the Maoyan movie board

    import os
    import re
    
    import requests
    
    
    def get_one_page(offset):
        url = "http://maoyan.com/board/4?offset=%d" % offset
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    
    def parse_one_page(html):
        # Earlier iterations of the pattern, kept for reference:
        # ranking only
        # pattern = re.compile("<dd>.*?board-index.*?>(.*?)</i>", re.S)
        # ranking + poster URL
        # pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?<img.*?<img.*?src="(.*?)"', re.S)
        # ranking + poster + cast + release date
        # pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?<img.*?<img.*?src="(.*?)".*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>', re.S)
        # movie title
        # pattern = re.compile("movieId.*?>(.*?)</a>", re.S)
        # cast only
        # pattern = re.compile('<p class="star">(.*?)</p>', re.S)
    
        # Final pattern: just the poster image URL
        pattern = re.compile('movieId.*?>.*?<img.*?<img.*?src="(.*?)"', re.S)
        items = re.findall(pattern, html)
        return items
    
    def get_all_page():
        for i in range(10):  # the board has 10 pages of 10 movies each
            offset = i * 10
            html = get_one_page(offset)
            if html is None:  # request failed; skip this page
                continue
            items = parse_one_page(html)
            for item in items:
                write_img(item)
    
    
    # Poster URLs look like:
    # http://p0.meituan.net/movie/283292171619cdfd5b240c8fd093f1eb255670.jpg@160w_220h_1e_1c
    # The part after "@" is a resizing directive; strip it to get the full-size image.
    def write_img(url):
        url_result = url.split("@")[0]
        filename = "./images/%s" % url_result.split("/")[-1]
        print(filename)
    
        os.makedirs("./images", exist_ok=True)  # make sure the target directory exists
        r = requests.get(url_result)
        with open(filename, "wb") as f:
            f.write(r.content)
    
    if __name__ == "__main__":
        get_all_page()
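
    The patterns above lean heavily on .*?, the non-greedy form of .*. A quick demonstration of the difference on a made-up snippet:

    import re
    
    html = '<img src="a.jpg"><img src="b.jpg">'
    print(re.findall('src="(.*?)"', html))  # non-greedy: ['a.jpg', 'b.jpg']
    print(re.findall('src="(.*)"', html))   # greedy: ['a.jpg"><img src="b.jpg']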
    

    The Beautiful Soup parsing library

    Beautiful Soup is a powerful page-parsing library (not a crawler framework): instead of complex regular expressions, it navigates a page by its document structure and attributes.

    Installation

    pip install beautifulsoup4
    pip install lxml  # the examples below also use the lxml parser
    

    Importing BeautifulSoup

    from bs4 import BeautifulSoup
    

    Basic usage

    soup = BeautifulSoup(html, "lxml")  # build the soup using the lxml parser
    print(soup.prettify())   # the page, re-indented and nicely formatted
    print(soup.title.string) # the text of the page's <title>
    print(soup.head) 
    print(soup.p)
    
    # Get a node's name
    print(soup.title.name)
    # Get node attributes
    soup.img.attrs["src"]
    print(soup.p.attrs)
    print(soup.p.attrs["name"])
    print(soup.p["class"])
    # Get the text a node contains
    print(soup.p.string)
    

    Note that .string only works when a node contains plain text or a single child; with mixed content such as the fragment below, it returns None:

    <p class="c1"><span>asdf</span>asdfasdfasdfasdfadsfad</p>
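
    A minimal sketch of the difference, using the fragment above:

    from bs4 import BeautifulSoup
    
    html = '<p class="c1"><span>asdf</span>asdfasdfasdfasdfadsfad</p>'
    soup = BeautifulSoup(html, "lxml")
    print(soup.p.string)      # None: the <p> mixes a child tag with bare text
    print(soup.p.get_text())  # all the text, concatenated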

    Nested selection

    <head>
    <title>this is title</title>
    </head>

    # every node in the soup is a bs4.element.Tag, so selection can be chained
    print(soup.head.title.string)
    

    Relational selection

    Some elements have no distinguishing feature to target directly. In that case, select a node you can locate first, then move from it to its children, parent, or siblings:
    <p class="p1"></p>
    <p></p>
    <p></p>

    print(soup.p.contents) # list of the p node's direct children
    print(soup.p.descendants) # generator over all descendants; wrap in list() to view
    print(soup.a.parent) # the direct parent
    print(soup.a.parents) # generator over all ancestors
    print(soup.a.next_sibling) # the next node at the same level
    print(soup.a.previous_sibling) # the previous node at the same level
    print(soup.a.next_siblings) # generator over all following siblings
    print(soup.a.previous_siblings) # generator over all preceding siblings
    print(list(soup.a.parents)[0].attrs['class']) # class of the nearest ancestor
    

    Method selectors

    find_all() and friends search by node name, attributes, and text. Given markup like:
    <ul><li></li></ul>
    <ul><li></li>jjj<li></li></ul>

    print(soup.find_all(name="ul"))  # every <ul> node
    
    for ul in soup.find_all(name="ul"):
      print(ul.find_all(name="li"))    # find_all can be nested on a result
      for li in ul.find_all(name="li"):
        print(li.string)
    
    soup.find_all(attrs={"id": "list-1"})  # match by attribute instead of name
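
    Searching by text works too. In current bs4 the keyword is string (older versions called it text); a small self-contained sketch:

    import re
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup("<ul><li>Foo</li>jjj<li>Bar</li></ul>", "html.parser")
    print(soup.find_all(string=re.compile("jjj")))  # ['jjj']
    print(soup.find_all("li", string="Foo"))        # [<li>Foo</li>]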
    

    CSS selectors

    Passing a CSS selector straight to select() is all it takes.
    Anyone familiar with front-end work will know the syntax; it behaves exactly as in CSS:
    . selects by class, # selects by id
    tag1,tag2 finds all tag1 and all tag2 elements
    tag1 tag2 finds every tag2 nested inside a tag1
    [attr] finds all tags that carry the attribute attr
    [attr=value] e.g. [target=_blank] finds all tags with target=_blank

    html='''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.select('.panel .panel-heading'))  # .panel-heading inside .panel
    print(soup.select('ul li'))                  # every li inside a ul
    print(soup.select('#list-2 .element'))       # .element inside id list-2
    print(type(soup.select('ul')[0]))            # results are bs4.element.Tag
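
    Because select() returns Tag objects, attribute access and nested selection keep working. Continuing with the soup built above:

    for ul in soup.select('ul'):
        print(ul['id'], [li.string for li in ul.select('li')])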
    

    Example: scraping Douban's group explore page

    from bs4 import BeautifulSoup
    import requests
    
    def get_page():
        url = "http://www.douban.com/group/explore"
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    
    def parse_page(html):
        result_list = []
        # soup = BeautifulSoup(html, 'lxml')  # the lxml parser works here too
        soup = BeautifulSoup(html, 'html.parser')
        items = soup.select('.channel-item')  # one .channel-item per post
        for item in items:
            result_dict = {}
            result_dict["title"] = item.select('h3 a')[0].string  # post title
    
            like = item.select('.likes')[0].contents[0]  # the first child is the count
            result_dict['like'] = int(like)
    
            result_list.append(result_dict)
    
        print(result_list)
        return result_list
    
    def main():
        html = get_page()
        if html is not None:  # skip parsing if the request failed
            parse_page(html)
    
    if __name__ == "__main__":
        main()
    
