Python Scraping Basics | Notes on Web Scraping with Python

Author: JaeGwen | Published 2017-05-18 10:23

    Installing BeautifulSoup

    • Linux
    # Debian / Ubuntu
    $ sudo apt-get install python-pip
    # RedHat / CentOS
    $ sudo yum install python-pip
    $ pip install beautifulsoup4
    
    
    • Windows
    Install the Windows version of pip, then:
    > pip install beautifulsoup4
    

    Running BeautifulSoup

    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    
    html = urlopen("http://www.pythonscraping.com/pages/page1.html")
    bsObj = BeautifulSoup(html.read(), "html.parser")
    print(bsObj.h1)
    
    

    Reliable network connections

    from urllib.request import urlopen
    from urllib.error import HTTPError
    
    try:
        html = urlopen("http://www.pythonscraping.com/pages/page1.html")
    except HTTPError as e:
        print(e)
        # Return None, break out, or fall back to another plan here
    else:
        # The program continues. Note: if you return or break inside the
        # except block above, you don't need this else clause at all, and
        # this code will never run.
        pass
    
    

    Example code

    from urllib.request import urlopen
    from urllib.error import HTTPError
    from bs4 import BeautifulSoup
    
    def getTitle(url):
        try:
            html = urlopen(url)
        except HTTPError as e:
            return None
        try:
            bsObj = BeautifulSoup(html.read(), "html.parser")
            title = bsObj.body.h1
        except AttributeError as e:
            return None
        return title
    
    title = getTitle("http://www.pythonscraping.com/pages/page1.html")
    if title is None:
        print("Title could not be found")
    else:
        print(title)
    
    

    BeautifulSoup's find() and findAll()

    find() and findAll() are probably the two BeautifulSoup functions you will use most often. With them you can filter an HTML page by the different attributes of its tags and locate the groups of tags, or single tags, that you need.
    The BeautifulSoup documentation defines them like this:

    findAll(tag, attributes, recursive, text, limit, keywords)
    find(tag, attributes, recursive, text, keywords)
    # 95% of the time you only need the first two arguments: tag and attributes
    # findAll searches recursively by default (recursive defaults to True)
    # The text argument is a little different: it matches on the text content
    # of tags rather than on their attributes.
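    
    For example, a minimal sketch of both filtering styles. The sample page (warandpeace.html), its "green" span class, and the "the prince" text match are assumptions borrowed from the book's example site rather than from the notes above:
    
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    
    # Page, class name, and search text below are assumptions taken from the
    # book's sample site, not from this article
    html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
    bsObj = BeautifulSoup(html.read(), "html.parser")
    
    # tag + attributes: every <span class="green"> on the page
    for name in bsObj.findAll("span", {"class": "green"}):
        print(name.get_text())
    
    # text: match on the text content of tags instead of their attributes
    print(len(bsObj.findAll(text="the prince")))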
    

    Handling children and other descendants

    In the BeautifulSoup library there is an important difference between children and descendants: just as in a human family tree, a child tag sits exactly one level below its parent, while descendant tags are all of the tags at every level below a parent.
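    
    As a minimal sketch of that difference, using the same page3.html gift table that appears in the sibling example below: .children yields only the table's direct children (its rows and the whitespace between them), while .descendants also walks into every cell, image, and string nested inside those rows.
    
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
    bsObj = BeautifulSoup(html.read(), "html.parser")
    
    table = bsObj.find("table", {"id": "giftList"})
    
    # Direct children only: the rows of the table (plus stray whitespace nodes)
    for child in table.children:
        print(child)
    
    # All descendants: every tag and text node nested anywhere inside the table
    print(len(list(table.descendants)))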

    Handling siblings

    BeautifulSoup's sibling-navigation attributes make collecting table data a simple matter, especially for tables with a title row:
    next_siblings
    next_sibling
    previous_sibling
    previous_siblings
    The only difference between the singular and plural forms is that the singular ones return a single tag while the plural ones return a group of tags.
    
    
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
    bsObj = BeautifulSoup(html.read(), "html.parser")
    
    # Skip the title row itself and print every data row that follows it
    for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
        print(sibling)
    
    # Find a cell by its image, move up to the parent cell, then read the
    # text of the cell just before it (the item's price)
    print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
    

    Parsing JSON data

    import json
    from urllib.request import urlopen
    
    def getCountry(ipAddress):
        response = urlopen("http://freegeoip.net/json/" + ipAddress).read().decode('utf-8')
        responseJson = json.loads(response)
        return responseJson.get('country_code')
    
    print(getCountry("50.58.253.58"))
    
    
    import json
    
    jsonString = ('{"arrayOfNums":[{"number":0},{"number":1},{"number":2}],'
                  '"arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}')
    jsonObj = json.loads(jsonString)
    
    print(jsonObj.get("arrayOfNums"))
    print(jsonObj.get("arrayOfNums")[1])
    print(jsonObj.get("arrayOfNums")[1].get("number") + jsonObj.get("arrayOfNums")[2].get("number"))
    print(jsonObj.get("arrayOfFruits")[2].get("fruit"))
    

    Storing data

    Downloading small files

    # Method 1: using the urllib library (Python 2)
    # -*- coding:utf-8 -*-
    import urllib
    import time
    
    url = 'http://mvideo.spriteapp.cn/video/2017/0414' \
          '/697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
    print "downloading with urllib"
    start = time.time()
    urllib.urlretrieve(url, "video.mp4")
    end = time.time()
    print 'Finish in :', end - start
    
    
    # Method 2: using the urllib2 library (Python 2)
    # -*- coding:utf-8 -*-
    import urllib2
    import time
    
    url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
          '697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
    
    print "downloading with urllib2"
    start = time.time()
    data = urllib2.urlopen(url).read()
    with open('video.mp4', 'wb') as video:
        video.write(data)
    end = time.time()
    print 'Finish in :', end - start
    
    
    # Method 3: using the requests library
    # -*- coding:utf-8 -*-
    import requests
    import time
    
    url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
          '697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
    
    print "downloading with requests"
    start = time.time()
    r = requests.get(url)
    with open('video.mp4', 'wb') as video:
        video.write(r.content)
    end = time.time()
    print 'Finish in :', end - start
    
    

    Downloading large files

    # Method 1: using the urllib2 library (Python 2)
    # -*- coding:utf-8 -*-
    import urllib2
    import time
    
    url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
          '697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
    
    r = urllib2.Request(url)
    u = urllib2.urlopen(r)
    start = time.time()
    with open('video.mp4', 'wb') as f:
        while True:
            tmp = u.read(1024)
            if not tmp:
                break
            f.write(tmp)
    end = time.time()
    print 'Finish in :', end - start
    
    # Method 2: using the requests library
    # -*- coding:utf-8 -*-
    import requests
    import time
    url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
          '697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
    # With stream=False, get() starts downloading the whole file immediately
    # and holds it in memory; for a large file this can exhaust memory.
    
    # With stream=True, the download does not start right away; the body is
    # fetched only as you iterate over it with iter_content or iter_lines
    # (or access the content attribute).
    r = requests.get(url, stream=True)
    start = time.time()
    with open("file_path", "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                f.flush()
    # iter_content: iterate over the downloaded content one chunk at a time
    # iter_lines: iterate over the downloaded content one line at a time
    # Both keep memory use low when downloading large files, because only a
    # small piece of data is held at any one time
    end = time.time()
    print 'Finish in :', end - start
    

    Storing data to CSV

    import csv
    
    csvFile = open("../files/test.csv", 'w+')
    try:
        writer = csv.writer(csvFile)
        writer.writerow(('number', 'number plus 2', 'number times 2'))
        for i in range(10):
            writer.writerow((i, i+2, i*2))
    finally:
        csvFile.close()
    
    import csv
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    
    html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
    bsObj = BeautifulSoup(html.read(), "html.parser")
    # The first wikitable on the page is the main comparison table
    table = bsObj.findAll("table", {"class": "wikitable"})[0]
    rows = table.findAll("tr")
    
    csvFile = open("../files/editors.csv", 'wt', newline='', encoding='utf-8')
    writer = csv.writer(csvFile)
    try:
        for row in rows:
            csvRow = []
            for cell in row.findAll(['td', 'th']):
                csvRow.append(cell.get_text())
            writer.writerow(csvRow)   # write the row once all its cells are collected
    finally:
        csvFile.close()
    

    Reading CSV files

    There are three common ways to get at a remote CSV file:
    • Download the CSV file to your machine by hand, then point Python at the local copy
    • Write a Python program that downloads the file, reads it, and deletes the source file afterwards
    • Read the file from the web directly into a string, then wrap it in a StringIO object so it behaves like a file (as in the two examples below)
    
    from urllib.request import urlopen
    from io import StringIO
    import csv
    
    data = urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv").read().decode('ascii', 'ignore')
    dataFile = StringIO(data)
    csvReader = csv.reader(dataFile)
    
    for row in csvReader:
        print(row)
    
    from urllib.request import urlopen
    from io import StringIO
    import csv
    
    data = urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv").read().decode('ascii', 'ignore')
    dataFile = StringIO(data)
    dictReader = csv.DictReader(dataFile)
    
    # Print the header row (the field names)
    print(dictReader.fieldnames)
    
    for row in dictReader:
        print(row)
    
    
