Dashixiong's Python Study Notes (22): Web Scraping (3)


Author: superkmi | Published 2020-07-22 18:45

Previous: Dashixiong's Python Study Notes (21): Web Scraping (2)
Next: Dashixiong's Python Study Notes (23): Web Scraping (4)

4. Saving Data

  • After extraction, data can be saved either to files or to a database.
4.1 File Storage
4.1.1 Saving to a txt File
  • Saving to a txt file is the most basic way to persist data.
>>>from pyquery import PyQuery as pq
>>>import requests

>>>def sort_data(func):
>>>    def deco(*args,**kargs):
>>>        # parse the HTML and organize the extracted fields
>>>        data = func(*args,**kargs)
>>>        html_data = pq(data)
>>>        hd = html_data('.hd')
>>>        bd = html_data('.bd')

>>>        index = [x.text() for x in html_data.find('em').items()]
>>>        name = [x.text() for x in hd.find('.title:first-child').items()]
>>>        director_actor = [x.html().strip().split('<br/>')[0] for x in bd.children('.star').siblings('p:first-of-type').items()]
>>>        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
>>>        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
>>>        star = bd('.star')('span:nth-child(4)').text().split()
>>>        link = [x.attr.href for x in hd('a').items()]

>>>        result = zip(index,name,director,actor,star,link)
>>>        return result
>>>    return deco

>>>@sort_data
>>>def get_page(url):
>>>    # fetch the page content
>>>    headers = {
>>>        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
>>>    }
>>>    res = requests.get(url=url,headers=headers)
>>>    if res.status_code == 200:
>>>        return res.text # return the HTML page
>>>    else:
>>>        return None

>>>def save_to_txt(data,filename="douban_top10.txt"):
>>>    # save to a txt file
>>>    with open(filename,'w',encoding='utf-8') as file:
>>>        for i in range(10):
>>>            line = f'{" ".join(list(next(data)))}\n'
>>>            file.write(line)

>>>def main():
>>>    # entry point
>>>    url = 'https://movie.douban.com/top250'
>>>    page_data = get_page(url)
>>>    save_to_txt(page_data)

>>>if __name__ == '__main__':
>>>    main()
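To verify the result, you can read the file back line by line; a minimal sketch, assuming the douban_top10.txt written above:

>>>with open('douban_top10.txt','r',encoding='utf-8') as file:
>>>    for line in file:
>>>        print(line.rstrip('\n')) # one movie record per line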
4.1.2 Saving as a JSON File
>>>from pyquery import PyQuery as pq
>>>import requests,json

>>>def sort_data(func):
>>>    def deco(*args,**kargs):
>>>        # parse the HTML and organize the extracted fields
>>>        data = func(*args,**kargs)
>>>        html_data = pq(data)
>>>        hd = html_data('.hd')
>>>        bd = html_data('.bd')

>>>        index = [x.text() for x in html_data.find('em').items()]
>>>        name = [x.text() for x in hd.find('.title:first-child').items()]
>>>        director_actor = [x.html().strip().split('<br/>')[0] for x in bd.children('.star').siblings('p:first-of-type').items()]
>>>        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
>>>        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
>>>        star = bd('.star')('span:nth-child(4)').text().split()
>>>        link = [x.attr.href for x in hd('a').items()]

>>>        result = zip(index,name,director,actor,star,link)
>>>        return result
>>>    return deco

>>>@sort_data
>>>def get_page(url):
>>>    # fetch the page content
>>>    headers = {
>>>        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
>>>    }
>>>    res = requests.get(url=url,headers=headers)
>>>    if res.status_code == 200:
>>>        return res.text # return the HTML page
>>>    else:
>>>        return None

>>>def save_to_json(data,filename="douban_top10.json"):
>>>    # save to a JSON file
>>>    with open(filename,'w',encoding='utf-8') as file:
>>>        # dump the ten records as one JSON array; calling json.dump once per
>>>        # record would concatenate fragments into an invalid JSON document
>>>        records = [next(data) for i in range(10)]
>>>        json.dump(records,file,ensure_ascii=False)

>>>def main():
>>>    # entry point
>>>    url = 'https://movie.douban.com/top250'
>>>    page_data = get_page(url)
>>>    save_to_json(page_data)

>>>if __name__ == '__main__':
>>>    main()
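Since the records were dumped as a single JSON array, they can be restored with json.load; a minimal sketch, assuming the douban_top10.json written above:

>>>import json

>>>with open('douban_top10.json','r',encoding='utf-8') as file:
>>>    records = json.load(file) # a list of [index, name, director, actor, star, link] lists
>>>    print(records[0])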
4.1.3 Saving as a pickle File
  • pickle can serialize most Python data types directly, but the resulting files can only be read back from Python.
>>>from pyquery import PyQuery as pq
>>>import requests,pickle

>>>def sort_data(func):
>>>    def deco(*args,**kargs):
>>>        # parse the HTML and organize the extracted fields
>>>        data = func(*args,**kargs)
>>>        html_data = pq(data)
>>>        hd = html_data('.hd')
>>>        bd = html_data('.bd')

>>>        index = [x.text() for x in html_data.find('em').items()]
>>>        name = [x.text() for x in hd.find('.title:first-child').items()]
>>>        director_actor = [x.html().strip().split('<br/>')[0] for x in bd.children('.star').siblings('p:first-of-type').items()]
>>>        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
>>>        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
>>>        star = bd('.star')('span:nth-child(4)').text().split()
>>>        link = [x.attr.href for x in hd('a').items()]

>>>        result = zip(index,name,director,actor,star,link)
>>>        return result
>>>    return deco

>>>@sort_data
>>>def get_page(url):
>>>    # fetch the page content
>>>    headers = {
>>>        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
>>>    }
>>>    res = requests.get(url=url,headers=headers)
>>>    if res.status_code == 200:
>>>        return res.text # return the HTML page
>>>    else:
>>>        return None

>>>def save_to_pickle(data,filename="douban_top10.pk"):
>>>    # save to a pickle file
>>>    with open(filename,'wb') as file:
>>>        # a zip iterator cannot be pickled directly, so materialize it into a list first
>>>        pickle.dump(list(data),file)

>>>def main():
>>>    # entry point
>>>    url = 'https://movie.douban.com/top250'
>>>    page_data = get_page(url)
>>>    save_to_pickle(page_data)

>>>if __name__ == '__main__':
>>>    main()
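The saved list can be restored with pickle.load; a minimal sketch, assuming the douban_top10.pk written above:

>>>import pickle

>>>with open('douban_top10.pk','rb') as file:
>>>    records = pickle.load(file) # the list of tuples saved above
>>>    print(records[0])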
4.1.4 Saving as a shelve File
  • shelve supports most Python data types (anything pickle can serialize), but again only from Python.
  • You can think of shelve as a lightweight, temporary cache database.
  • It is operated much like a dictionary.
>>>from pyquery import PyQuery as pq
>>>import requests,shelve

>>>def sort_data(func):
>>>    def deco(*args,**kargs):
>>>        # parse the HTML and organize the extracted fields
>>>        data = func(*args,**kargs)
>>>        html_data = pq(data)
>>>        hd = html_data('.hd')
>>>        bd = html_data('.bd')

>>>        index = [x.text() for x in html_data.find('em').items()]
>>>        name = [x.text() for x in hd.find('.title:first-child').items()]
>>>        director_actor = [x.html().strip().split('<br/>')[0] for x in bd.children('.star').siblings('p:first-of-type').items()]
>>>        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
>>>        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
>>>        star = bd('.star')('span:nth-child(4)').text().split()
>>>        link = [x.attr.href for x in hd('a').items()]

>>>        result = zip(index,name,director,actor,star,link)
>>>        return result
>>>    return deco

>>>@sort_data
>>>def get_page(url):
>>>    # fetch the page content
>>>    headers = {
>>>        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
>>>    }
>>>    res = requests.get(url=url,headers=headers)
>>>    if res.status_code == 200:
>>>        return res.text # return the HTML page
>>>    else:
>>>        return None

>>>def save_to_shelve(data,filename="douban_top10.db"):
>>>    # save to a shelve file
>>>    with shelve.open(filename) as db:
>>>        for i in range(10):
>>>            db[f"{i+1}"] = next(data)

>>>def main():
>>>    # entry point
>>>    url = 'https://movie.douban.com/top250'
>>>    page_data = get_page(url)
>>>    save_to_shelve(page_data)

>>>if __name__ == '__main__':
>>>    main()
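Records can then be read back with ordinary dictionary syntax; a minimal sketch, assuming the douban_top10.db written above:

>>>import shelve

>>>with shelve.open('douban_top10.db') as db:
>>>    print(list(db.keys())) # '1' .. '10'
>>>    print(db['1'])         # look up a record by key, just like a dict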
4.1.5 Saving as a CSV File
  • A CSV file can be regarded as a plain-text counterpart of an xls spreadsheet.
>>>from pyquery import PyQuery as pq
>>>import requests,csv

>>>def sort_data(func):
>>>    def deco(*args,**kargs):
>>>        # parse the HTML and organize the extracted fields
>>>        data = func(*args,**kargs)
>>>        html_data = pq(data)
>>>        hd = html_data('.hd')
>>>        bd = html_data('.bd')

>>>        index = [x.text() for x in html_data.find('em').items()]
>>>        name = [x.text() for x in hd.find('.title:first-child').items()]
>>>        director_actor = [x.html().strip().split('<br/>')[0] for x in bd.children('.star').siblings('p:first-of-type').items()]
>>>        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
>>>        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
>>>        star = bd('.star')('span:nth-child(4)').text().split()
>>>        link = [x.attr.href for x in hd('a').items()]

>>>        result = zip(index,name,director,actor,star,link)
>>>        return result
>>>    return deco

>>>@sort_data
>>>def get_page(url):
>>>    # fetch the page content
>>>    headers = {
>>>        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
>>>    }
>>>    res = requests.get(url=url,headers=headers)
>>>    if res.status_code == 200:
>>>        return res.text # return the HTML page
>>>    else:
>>>        return None

>>>def save_to_csv(data,filename="douban_top10.csv"):
>>>    # save to a CSV file (newline='' keeps csv.writer from writing blank rows on Windows)
>>>    with open(filename,'w',encoding='utf-8',newline='') as file:
>>>        writer = csv.writer(file)
>>>        writer.writerows(data)

>>>def main():
>>>    # entry point
>>>    url = 'https://movie.douban.com/top250'
>>>    page_data = get_page(url)
>>>    save_to_csv(page_data)

>>>if __name__ == '__main__':
>>>    main()
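The file can be read back row by row with csv.reader; a minimal sketch, assuming the douban_top10.csv written above:

>>>import csv

>>>with open('douban_top10.csv','r',encoding='utf-8',newline='') as file:
>>>    for row in csv.reader(file):
>>>        print(row) # each row is a list of strings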
4.2 Database Storage
4.2.1 Saving to MySQL
  • MySQL is one of the most popular relational databases.
  • Data is stored in two-dimensional tables made up of rows and columns.
  • The example below uses a local MySQL server.
>>>from pyquery import PyQuery as pq
>>>import pymysql
>>>import requests

>>>def sort_data(func):
>>>    def deco(*args,**kargs):
>>>        # parse the HTML and organize the extracted fields
>>>        data = func(*args,**kargs)
>>>        html_data = pq(data)
>>>        hd = html_data('.hd')
>>>        bd = html_data('.bd')

>>>        index = [x.text() for x in html_data.find('em').items()]
>>>        name = [x.text() for x in hd.find('.title:first-child').items()]
>>>        director_actor = [x.html().strip().split('<br/>')[0] for x in bd.children('.star').siblings('p:first-of-type').items()]
>>>        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
>>>        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
>>>        star = bd('.star')('span:nth-child(4)').text().split()
>>>        link = [x.attr.href for x in hd('a').items()]

>>>        result = zip(index,name,director,actor,star,link)
>>>        return result
>>>    return deco

>>>@sort_data
>>>def get_page(url):
>>>    # fetch the page content
>>>    headers = {
>>>        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
>>>    }
>>>    res = requests.get(url=url,headers=headers)
>>>    if res.status_code == 200:
>>>        return res.text # return the HTML page
>>>    else:
>>>        return None

>>>def save_to_mysql_db(data,host='localhost',user='root',password='root',port=3306,db='note_sample',table='douban_top10'):
>>>    # connect to the MySQL server
>>>    db = pymysql.connect(host=host,user=user,password=password,port=port,db=db,charset='utf8')
>>>    with db:
>>>        cursor = db.cursor()

>>>        # create the table if it does not exist yet
>>>        sql_create_table = f'CREATE TABLE IF NOT EXISTS {str(table)}(ind_order VARCHAR(255) NOT NULL PRIMARY KEY, name VARCHAR(255) NOT NULL, director VARCHAR(255), actor VARCHAR(255), star VARCHAR(255), link_to VARCHAR(255))'
>>>        cursor.execute(sql_create_table)
>>>        db.commit()

>>>        for i in range(10):
>>>            next_data = next(data)
>>>            # use a parameterized query so quotes in field values cannot break the SQL
>>>            sql_insert_data = f'INSERT INTO {table}(ind_order,name,director,actor,star,link_to) VALUES(%s,%s,%s,%s,%s,%s)'

>>>            try:
>>>                cursor.execute(sql_insert_data,next_data)
>>>            except Exception as e:
>>>                print(e)
>>>                db.rollback()
>>>        db.commit()

>>>def main():
>>>    # entry point
>>>    url = 'https://movie.douban.com/top250'
>>>    page_data = get_page(url)
>>>    save_to_mysql_db(page_data)

>>>if __name__ == '__main__':
>>>    main()
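To confirm the rows landed, query the table back; a minimal sketch, assuming the same local server credentials and the note_sample.douban_top10 table created above:

>>>import pymysql

>>>db = pymysql.connect(host='localhost',user='root',password='root',port=3306,db='note_sample',charset='utf8')
>>>with db:
>>>    cursor = db.cursor()
>>>    cursor.execute('SELECT * FROM douban_top10')
>>>    for row in cursor.fetchall():
>>>        print(row) # one tuple per movie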
4.2.2 Saving to MongoDB
  • MongoDB is an open-source, non-relational database built on distributed file storage.
  • Documents are stored in a JSON-like format.
  • The example below uses a standalone MongoDB instance.
>>>from pyquery import PyQuery as pq
>>>import pymongo
>>>import requests

>>>def sort_data(func):
>>>    def deco(*args,**kargs):
>>>        # parse the HTML and organize the extracted fields
>>>        data = func(*args,**kargs)
>>>        html_data = pq(data)
>>>        hd = html_data('.hd')
>>>        bd = html_data('.bd')

>>>        index = [x.text() for x in html_data.find('em').items()]
>>>        name = [x.text() for x in hd.find('.title:first-child').items()]
>>>        director_actor = [x.html().strip().split('<br/>')[0] for x in bd.children('.star').siblings('p:first-of-type').items()]
>>>        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
>>>        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
>>>        star = bd('.star')('span:nth-child(4)').text().split()
>>>        link = [x.attr.href for x in hd('a').items()]

>>>        result = zip(index,name,director,actor,star,link)
>>>        return result
>>>    return deco

>>>@sort_data
>>>def get_page(url):
>>>    # fetch the page content
>>>    headers = {
>>>        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
>>>    }
>>>    res = requests.get(url=url,headers=headers)
>>>    if res.status_code == 200:
>>>        return res.text # return the HTML page
>>>    else:
>>>        return None

>>>def save_to_mongo_db(data,host='localhost',port=27017,db='note_sample',collection='douban_top10'):
>>>    # connect to MongoDB
>>>    client = pymongo.MongoClient(host=host, port=port)
>>>    db = client[db] # get the database (created lazily on first write)
>>>    collection = db[collection]  # get the collection

>>>    for i in range(10):
>>>        next_data = dict((title,value) for title,value in zip(['ind','name','director','actor','stars','link'],next(data)))
>>>        collection.insert_one(next_data)

>>>def main():
>>>    # entry point
>>>    url = 'https://movie.douban.com/top250'
>>>    page_data = get_page(url)
>>>    save_to_mongo_db(page_data)

>>>if __name__ == '__main__':
>>>    main()
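The inserted documents can be queried back with find(); a minimal sketch, assuming the same standalone instance and the note_sample.douban_top10 collection used above:

>>>import pymongo

>>>client = pymongo.MongoClient(host='localhost',port=27017)
>>>collection = client['note_sample']['douban_top10']
>>>for doc in collection.find().limit(3):
>>>    print(doc) # each document also carries an auto-generated _id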
4.2.3 Saving to Redis
  • Redis is an in-memory, non-relational database.
  • Data is stored as key-value pairs.
>>>from pyquery import PyQuery as pq
>>>from redis import StrictRedis
>>>import requests

>>>def sort_data(func):
>>>    def deco(*args,**kargs):
>>>        # parse the HTML and organize the extracted fields
>>>        data = func(*args,**kargs)
>>>        html_data = pq(data)
>>>        hd = html_data('.hd')
>>>        bd = html_data('.bd')

>>>        index = [x.text() for x in html_data.find('em').items()]
>>>        name = [x.text() for x in hd.find('.title:first-child').items()]
>>>        director_actor = [x.html().strip().split('<br/>')[0] for x in bd.children('.star').siblings('p:first-of-type').items()]
>>>        director = [x.split('\xa0\xa0\xa0')[0] for x in director_actor]
>>>        actor = [x.split('\xa0\xa0\xa0')[1] for x in director_actor]
>>>        star = bd('.star')('span:nth-child(4)').text().split()
>>>        link = [x.attr.href for x in hd('a').items()]

>>>        result = zip(index,name,director,actor,star,link)
>>>        return result
>>>    return deco

>>>@sort_data
>>>def get_page(url):
>>>    # fetch the page content
>>>    headers = {
>>>        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
>>>    }
>>>    res = requests.get(url=url,headers=headers)
>>>    if res.status_code == 200:
>>>        return res.text # return the HTML page
>>>    else:
>>>        return None

>>>def save_to_redis(data,host='localhost',port=6379,db=0,password=None):
>>>    # connect to Redis
>>>    redis = StrictRedis(host=host,port=port,db=db,password=password)

>>>    for i in range(10):
>>>        next_data = next(data)
>>>        redis.set(next_data[0]," ".join(next_data[1:]))

>>>def main():
>>>    # entry point
>>>    url = 'https://movie.douban.com/top250'
>>>    page_data = get_page(url)
>>>    save_to_redis(page_data)

>>>if __name__ == '__main__':
>>>    main()
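Values can be read back by key; note that redis-py returns bytes. A minimal sketch, assuming the same local instance and the keys '1' through '10' written above:

>>>from redis import StrictRedis

>>>redis = StrictRedis(host='localhost',port=6379,db=0)
>>>print(redis.keys())                   # the ranking numbers used as keys
>>>print(redis.get('1').decode('utf-8')) # values come back as bytes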
