美文网首页
爬虫:豆瓣电影图书音乐Top250

爬虫:豆瓣电影图书音乐Top250

作者: 居山羽 | 来源:发表于2019-02-25 09:44 被阅读0次

    爬取目标

    豆瓣电影 Top 250

    #!/usr/bin/env python
    # -*- encoding:utf-8 -*-
    """
    @file: DoubanMovie.py
    @desc: 豆瓣电影 Top 250
    """
    
    import requests
    import csv
    from lxml import etree
    
    # Scrape the Douban Movie Top 250 listing and save it as a CSV file.
    root = 10000          # base offset: every row index is root + running count
    PAGE_COUNT = 10       # the listing is split into 10 pages ...
    PAGE_SIZE = 25        # ... of 25 entries each (start=0, 25, ..., 225)
    REQUEST_TIMEOUT = 10  # seconds; requests never times out unless told to
    # Browser-like User-Agent.
    # NOTE(review): Douban reportedly rejects the default python-requests
    # User-Agent, which would leave the CSV with only its header row - confirm.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/120.0.0.0 Safari/537.36',
    }


    def first_or_default(nodes, default=''):
        """Return the first XPath result, or *default* when nothing matched."""
        return nodes[0] if nodes else default


    def parse_film_info(film_info):
        """Extract director, year, area and genre from a film's info text.

        film_info is the list of text nodes of the entry's info paragraph:
        the first node holds the '导演: ...' (director) line, the second one
        holds 'year / area / genre'.

        Returns a (director, year, area, genre) tuple of stripped strings;
        parts that are missing come back as '' instead of raising IndexError.
        """
        lines = [text.strip() for text in film_info]
        people_line = lines[0] if lines else ''
        detail = lines[1] if len(lines) > 1 else ''

        # Director and cast are separated by non-breaking spaces; the director
        # name is whatever follows the first ':' of the leading segment.
        director = people_line.split('\xa0')[0].partition(':')[2].strip()

        # Split from the right so that a year part which itself contains '/'
        # (several release years) stays in one piece and does not shift the
        # area and genre columns.
        fields = [part.strip() for part in detail.rsplit('/', 2)]
        fields += [''] * (3 - len(fields))
        year, area, genre = fields
        return director, year, area, genre


    def parse_film(film, index):
        """Build one CSV row from the <li> element of a single film."""
        title = first_or_default(film.xpath('./div/div[2]/div[1]/a/span[1]/text()'))
        director, year, area, genre = parse_film_info(
            film.xpath('./div/div[2]/div[2]/p/text()'))
        # The leading tab is kept from the original output format.
        score = '\t' + first_or_default(
            film.xpath('./div/div[2]/div[2]/div/span[2]/text()'))
        # The vote text looks like '123456人评价'; keep only the number.
        votes = first_or_default(
            film.xpath('./div/div[2]/div[2]/div/span[4]/text()')).split('人')[0]
        return [index, title, director, year, area, genre, score, votes]


    def fetch_films():
        """Download every listing page and return the rows for all films.

        Raises requests.HTTPError when a page answers with an error status,
        instead of silently producing an empty result.
        """
        rows = []
        for page in range(PAGE_COUNT):
            # start=0 is the first page, 25 the second one, and so on.
            url = 'https://movie.douban.com/top250?start=' + str(PAGE_SIZE * page)
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            html = etree.HTML(response.text)
            # Each <li> below the ordered list is one film entry.
            for film in html.xpath('//*[@id="content"]/div/div[1]/ol/li'):
                rows.append(parse_film(film, root + len(rows) + 1))
        return rows


    def main():
        """Scrape the listing, echo every row and write the CSV file."""
        filmsdata = fetch_films()
        # NOTE(review): '205' looks like a typo for '250'; the name is kept
        # unchanged so the output path does not move.
        csv_file = 'film_top205.csv'
        # utf-8-sig writes a BOM at the start of the file.
        with open(csv_file, 'w', encoding='utf-8-sig', newline='') as _file:
            writer = csv.writer(_file)
            header = ['序号', '电影名', '导演', '年份', '制片地区', '电影分类', '豆瓣评分', '评论人数']
            writer.writerow(header)
            for row in filmsdata:
                print(row)
                writer.writerow(row)


    if __name__ == '__main__':
        main()
    

    豆瓣图书 Top 250

    #!/usr/bin/env python
    # -*- encoding:utf-8 -*-
    """
    @file: BookSpider.py
    @desc: 豆瓣图书 Top 250
    """
    
    import requests
    import csv
    from lxml import etree
    
    # Scrape the Douban Book Top 250 listing and save it as a CSV file.
    root = 20000          # base offset: every row index is root + running count
    PAGE_COUNT = 10       # the listing is split into 10 pages ...
    PAGE_SIZE = 25        # ... of 25 entries each (start=0, 25, ..., 225)
    REQUEST_TIMEOUT = 10  # seconds; requests never times out unless told to
    # Browser-like User-Agent.
    # NOTE(review): Douban reportedly rejects the default python-requests
    # User-Agent, which would leave the CSV with only its header row - confirm.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/120.0.0.0 Safari/537.36',
    }


    def first_or_default(nodes, default=''):
        """Return the first XPath result, or *default* when nothing matched."""
        return nodes[0] if nodes else default


    def clean_author(raw):
        """Normalise the author field of a book entry.

        Drops the ' 口述' suffix and a leading tag closed by ']', ')' or '】'
        (for example '[美]'). As in the original script, spaces are removed
        from the name only when such a tag was stripped. An empty or tag-only
        value yields '' instead of raising IndexError.
        """
        author = raw.strip().replace(' 口述', '')
        if ']' in author:
            author = author.partition(']')[2].replace(' ', '')
        # A ')' only counts as a tag terminator when the value starts with '('.
        if author.startswith('(') and ')' in author:
            author = author.partition(')')[2].replace(' ', '')
        if '】' in author:
            author = author.partition('】')[2].replace(' ', '')
        return author


    def clean_price(raw):
        """Strip currency markers ('元', 'RMB', 'CNY') and spaces from a price."""
        for marker in ('元', 'RMB', 'CNY', ' '):
            raw = raw.replace(marker, '')
        return raw


    def clean_count(raw):
        """Reduce a '(\\n 123人评价 \\n)' style text to the bare number."""
        text = raw.replace('(', '').replace(')', '').replace('\n', '')
        return text.strip().split('人')[0]


    def parse_book_info(text):
        """Split the 'author / ... / price' line into (author, price).

        The author is the first '/'-separated field and the price the last one.
        """
        info = text.split('/')
        return clean_author(info[0]), clean_price(info[-1])


    def parse_book(book, index):
        """Build one CSV row from the <table> element of a single book."""
        name = first_or_default(book.xpath('./tr/td[2]/div/a/@title'))
        author, price = parse_book_info(
            first_or_default(book.xpath('./tr/td[2]/p[1]/text()')))
        # The leading tab is kept from the original output format.
        score = '\t' + first_or_default(
            book.xpath('./tr/td[2]/div[2]/span[2]/text()'))
        votes = clean_count(first_or_default(
            book.xpath('./tr/td[2]/div[2]/span[3]/text()')))
        # Not every entry has a one-line description; '(无)' marks its absence.
        content = first_or_default(book.xpath('./tr/td[2]/p[2]/span/text()'), '(无)')
        return [index, name, author, price, score, votes, content]


    def fetch_books():
        """Download every listing page and return the rows for all books.

        Raises requests.HTTPError when a page answers with an error status,
        instead of silently producing an empty result.
        """
        rows = []
        for page in range(PAGE_COUNT):
            url = 'https://book.douban.com/top250?start=' + str(PAGE_SIZE * page)
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            html = etree.HTML(response.text)
            # Each <table> in the listing container is one book entry.
            for book in html.xpath('//*[@id="content"]/div/div[1]/div/table'):
                rows.append(parse_book(book, root + len(rows) + 1))
        return rows


    def main():
        """Scrape the listing, echo every row and write the CSV file."""
        bookdata = fetch_books()
        # NOTE(review): '205' looks like a typo for '250'; the name is kept
        # unchanged so the output path does not move.
        csv_file = 'book_top205.csv'
        # utf-8-sig writes a BOM at the start of the file.
        with open(csv_file, 'w', encoding='utf-8-sig', newline='') as _file:
            writer = csv.writer(_file)
            header = ['序号', '书名', '作者', '售价', '豆瓣评分', '评论人数', '描述']
            writer.writerow(header)
            for row in bookdata:
                print(row)
                writer.writerow(row)


    if __name__ == '__main__':
        main()
    

    豆瓣音乐 Top 250

    #!/usr/bin/env python
    # -*- encoding:utf-8 -*-
    """
    @file: MusicSpider.py
    @desc: 豆瓣音乐 Top 250 爬虫
    """
    import requests
    import csv
    from lxml import etree
    
    # Scrape the Douban Music Top 250 listing and save it as a CSV file.
    # NOTE(review): the film and book scripts use 10000 and 20000 here, so 3000
    # may be a typo for 30000; kept unchanged to preserve the index values.
    root = 3000           # base offset: every row index is root + running count
    PAGE_COUNT = 10       # the listing is split into 10 pages ...
    PAGE_SIZE = 25        # ... of 25 entries each (start=0, 25, ..., 225)
    REQUEST_TIMEOUT = 10  # seconds; requests never times out unless told to
    UNKNOWN = '(未知)'    # placeholder written for fields an entry lacks
    # Browser-like User-Agent.
    # NOTE(review): Douban reportedly rejects the default python-requests
    # User-Agent, which would leave the CSV with only its header row - confirm.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/120.0.0.0 Safari/537.36',
    }


    def first_or_default(nodes, default=''):
        """Return the first XPath result, or *default* when nothing matched."""
        return nodes[0] if nodes else default


    def split_title(raw):
        """Split an 'author - title' attribute into a (title, author) tuple.

        The spaced separator ' - ' is tried first so that hyphens inside a
        name do not cut it apart; a bare '-' is the fallback. Without any
        separator the whole text is the title and the author is ''.
        """
        for separator in (' - ', '-'):
            author, found, title = raw.partition(separator)
            if found:
                return title.strip(), author.strip()
        return raw.strip(), ''


    def normalize_date(raw):
        """Turn '2003年7月31日' or '2003/7/31' style dates into '2003-7-31'."""
        for marker in ('年', '月', '日', '/'):
            raw = raw.replace(marker, '-')
        return raw.strip('-')


    def parse_music_info(text):
        """Split 'author / date / type / media / style' into its fields.

        Returns a (date, type, media, style) tuple. Fields an entry does not
        provide are reported as '(未知)', so a row can never pick up values
        left over from the previous entry.
        """
        fields = [part.strip() for part in text.split(' / ')]
        if len(fields) > 5:
            # Assumes surplus leading fields are additional authors, so the
            # last four are date/type/media/style - TODO confirm on real data.
            fields = fields[:1] + fields[-4:]
        fields += [UNKNOWN] * (5 - len(fields))
        _author, date, kind, media, style = fields
        return normalize_date(date), kind, media, style


    def clean_count(raw):
        """Reduce a '(\\n 123人评价 \\n)' style text to the bare number."""
        return raw.replace(')', '').replace('(', '').strip().split('人')[0]


    def parse_music(music, index):
        """Build one CSV row from the <table> element of a single album."""
        title, author = split_title(
            first_or_default(music.xpath('./tr/td[1]/a/@title')))
        date, kind, media, style = parse_music_info(
            first_or_default(music.xpath('./tr/td[2]/div/p[1]/text()')))
        # The leading tabs are kept from the original output format.
        score = '\t' + first_or_default(
            music.xpath('./tr/td[2]/div/div[1]/span[2]/text()'))
        votes = clean_count(first_or_default(
            music.xpath('./tr/td[2]/div/div[1]/span[3]/text()')))
        return [index, title, author, '\t' + date, kind, media, style, score, votes]


    def fetch_music():
        """Download every listing page and return the rows for all albums.

        Raises requests.HTTPError when a page answers with an error status,
        instead of silently producing an empty result.
        """
        rows = []
        for page in range(PAGE_COUNT):
            url = 'https://music.douban.com/top250?start=' + str(PAGE_SIZE * page)
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            html = etree.HTML(response.text)
            # Each <table> in the listing container is one album entry.
            for music in html.xpath('//*[@id="content"]/div/div[1]/div/table'):
                rows.append(parse_music(music, root + len(rows) + 1))
        return rows


    def main():
        """Scrape the listing, echo every row and write the CSV file."""
        musicdata = fetch_music()
        csv_file = 'music_top250.csv'
        # utf-8-sig writes a BOM at the start of the file.
        with open(csv_file, 'w', encoding='utf-8-sig', newline='') as _file:
            writer = csv.writer(_file)
            header = ['序号', '名称', '作者', '年份', '类型', '介质', '风格', '豆瓣评分', '评论人数']
            writer.writerow(header)
            for row in musicdata:
                print(row)
                writer.writerow(row)


    if __name__ == '__main__':
        main()
    

    结尾

    将上面三段脚本分别复制粘贴到 IDE 中,各自保存为独立的 .py 文件后运行即可(需先安装 requests 和 lxml 两个第三方库);默认会将爬取到的数据以 CSV 文件的形式存放至当前文件夹下。

    相关文章

      网友评论

          本文标题:爬虫:豆瓣电影图书音乐Top250

          本文链接:https://www.haomeiwen.com/subject/nlkwyqtx.html