美文网首页
Python爬取豆瓣电影TOP250

Python爬取豆瓣电影TOP250

作者: 3ni | 来源:发表于2018-08-02 16:49 被阅读0次

    部分代码引用于此:链接
    运行环境:macOS 10.13.6,Python 2.7
    代码:

    # -*- coding:utf-8 -*-
    import urllib
    import urllib2
    # useragent 存放着各个浏览器的User-Agent,自己写的模块,非系统库
    import useragent
    import BeautifulSoup
    import re
    import csv
    import sys
    
    # Address of the Douban Top 250 movie listing; main() passes it to start().
    url = 'https://movie.douban.com/top250'
    
    
    def _first_or_default(matches, default):
        # Return the first item of a findAll/findall result, or the placeholder
        # when the result is empty.
        return matches[0] if matches else default


    def get_movie_info(req):
        """Fetch one listing page and extract one record per movie entry.

        Args:
            req: Anything ``urllib2.urlopen`` accepts (a URL string or a
                ``urllib2.Request``).

        Returns:
            A list of ``[rank, name, director, starring, year, area, grade,
            quote]`` lists, one for each ``<li>`` inside the
            ``<ol class="grid_view">`` element of the page.

        Director and starring fall back to '佚名', the quote to '无', and the
        year and area to '未知' when the page text does not contain them, so a
        single unusual entry no longer aborts the whole page.
        """
        res = urllib2.urlopen(req)
        try:
            page = res.read()
        finally:
            # Release the connection even if reading the body fails.
            res.close()

        # NOTE(review): module-style import plus findAll/getText suggests the
        # BeautifulSoup 3 API -- confirm before porting to bs4.
        soup = BeautifulSoup.BeautifulSoup(page)
        data = soup.find('ol', {'class': 'grid_view'})
        record = []
        for item in data.findAll('li'):
            rank = item.find('em').getText()
            name = item.find('img')['alt']
            info = item.find('p').getText()
            # The director/starring patterns are byte strings (the source file
            # is UTF-8), so they are matched against the UTF-8 encoded text.
            info_utf8 = info.encode('utf-8')
            director = _first_or_default(
                re.findall('导演: (.*?)  &nbsp', info_utf8), '佚名')
            starring = _first_or_default(
                re.findall('主演: (.*?) /...', info_utf8), '佚名')
            # First run of four digits in the paragraph text.
            year_match = re.search(r'\d{4}', info)
            year = year_match.group() if year_match else '未知'
            area = _first_or_default(re.findall('/ (.*?) ', info), '未知')
            grade = item.findAll('span', {'class': 'rating_num'})[0].getText()
            # Not every entry carries a one-line quote.
            quote_tags = item.findAll('span', {'class': 'inq'})
            quote = quote_tags[0].getText() if quote_tags else '无'
            record.append(
                [rank, name, director, starring, year, area, grade, quote])
        return record
    
    
    def start(url):
        """Scrape every page of the listing and write it to doubantop250.csv.

        Args:
            url: Base URL of the listing, without a query string.

        Offsets 0, 25, ..., 225 are requested one after another. Each page is
        fetched with GET, the paging parameters being appended to ``url`` as a
        query string, and every record returned by ``get_movie_info`` becomes
        one CSV row below the header row. The requested URL is printed before
        each fetch.
        """
        head = ['排名', '名字', '导演', '主演', '年份', '地区', '评分', '简介']
        # The headers are identical for every page, so build them once.
        headers = {'User-Agent': useragent.osx_user_agent}
        with open('doubantop250.csv', mode='w') as f:
            fd = csv.writer(f)
            fd.writerow(head)
            for page in range(0, 250, 25):
                # A sequence of pairs keeps the parameter order stable, and the
                # empty string encodes as "filter=" instead of "filter=None".
                query = urllib.urlencode([('start', page), ('filter', '')])
                # Supplying data= to urllib2.Request would turn the request
                # into a POST; the offset has to travel in the URL instead.
                page_url = url + '?' + query
                request = urllib2.Request(url=page_url, headers=headers)
                print(page_url)
                fd.writerows(get_movie_info(request))
    
    def main():
        """Switch the default string encoding to UTF-8, then run the scraper."""
        # Python 2 only: sys.setdefaultencoding is removed from the sys module
        # at interpreter start-up, so sys is reloaded first to get it back.
        reload(sys)
        # Makes implicit str/unicode conversions use UTF-8 instead of ASCII.
        sys.setdefaultencoding('utf-8')
        start(url)
    
    
    # Run the scraper only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    
    

    相关文章

      网友评论

          本文标题:Python爬取豆瓣电影TOP250

          本文链接:https://www.haomeiwen.com/subject/hudsvftx.html