Python爬取豆瓣电影TOP250

作者: Treehl | 来源:发表于2017-11-14 18:52 被阅读187次

    闲来无事温习了一下BeautifulSoup和requests,之后又写了个简单的爬虫,目标网址是豆瓣电影TOP250
    先附上关键模块文档,对新手还是比较友好的!

    这次不总结详细的过程。有兴趣的可以看下我之前写的《Python爬取豆瓣电子小说》;想了解scrapy的,可以看下我的另一篇《Scrapy爬取豆瓣电影TOP250》。

    我们先来看下,requests和bs4怎么结合使用。

    import requests
    from bs4 import BeautifulSoup
    
    # Minimal requests + BeautifulSoup example: fetch the Top 250 page and
    # print its parsed markup.
    url = 'https://movie.douban.com/top250'
    # Send a browser-like User-Agent header with the request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    # Fetch the page with requests (reuse `url` instead of repeating the literal)
    r = requests.get(url, headers=headers)
    # Parse the fetched markup with bs4
    soup = BeautifulSoup(r.text, 'html.parser')
    # prettify() renders the parse tree as an indented, readable string
    print(soup.prettify())
    

    解析出来的部分HTML

    <!DOCTYPE html>
    <html class="ua-windows ua-webkit" lang="zh-cmn-Hans">
     <head>
      <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
      <meta content="webkit" name="renderer"/>
      <meta content="always" name="referrer"/>
      <title>
       豆瓣电影 Top 250
      </title>
      <meta content="cZdR4xxR7RxmM4zE" name="baidu-site-verification">
       <meta content="no-cache" http-equiv="Pragma"/>
       <meta content="Sun, 6 Mar 2005 01:00:00 GMT" http-equiv="Expires"/>
       <link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>
       <link href="https://img3.doubanio.com/f/shire/da18e98a294047dd92298000fe2080b38efda2ae/css/douban.css" rel="stylesheet" type="text/css"/>
       <link href="https://img3.doubanio.com/f/shire/ae3f5a3e3085968370b1fc63afcecb22d3284848/css/separation/_all.css" rel="stylesheet" type="text/css"/>
       <link href="https://img3.doubanio.com/f/movie/8864d3756094f5272d3c93e30ee2e324665855b0/css/movie/base/init.css" rel="stylesheet"/>
       <script type="text/javascript">
    

    分析元素

    image

    # Selector summary. `movie_li` stands for one <li> of the list; it is bound
    # by the `for movie_li in movie_list_soup.find_all('li')` loop in the
    # complete script further down, not in this fragment.
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'}) # the movie list (<ol class="grid_view">)
    movie_name = movie_li.find('span', attrs={'class': 'title'}).get_text() # movie title
    movie_info = movie_li.find('div', attrs={'class': 'bd'}).find('p').get_text() # movie info paragraph
    movie_star = movie_li.find('span', attrs={'class': 'rating_num'}).get_text() # movie rating

    先来看下电影列表

    # Iterate the <li> items of the list found above; the list was bound to
    # `movie_list_soup`, so that is the name to iterate (`movie_list` is undefined).
    for movie in movie_list_soup.find_all('li'):
        # Title tag of this movie: <span class="title">
        movie_name = movie.find('span', attrs={'class': 'title'})
        print(movie_name)
        
    <span class="title">肖申克的救赎</span>
    <span class="title">霸王别姬</span>
    <span class="title">这个杀手不太冷</span>
    <span class="title">阿甘正传</span>
    <span class="title">美丽人生</span>
    <span class="title">千与千寻</span>
    <span class="title">辛德勒的名单</span>
    <span class="title">泰坦尼克号</span>
    <span class="title">盗梦空间</span>
    <span class="title">机器人总动员</span>
    <span class="title">海上钢琴师</span>
    <span class="title">三傻大闹宝莱坞</span>
    <span class="title">忠犬八公的故事</span>
    <span class="title">放牛班的春天</span>
    <span class="title">大话西游之大圣娶亲</span>
    

    调用get_text()获取内容

    # Same loop as before, but print only the text inside each title tag.
    # The list was bound to `movie_list_soup` (`movie_list` is undefined).
    for movie in movie_list_soup.find_all('li'):
        movie_name = movie.find('span', attrs={'class': 'title'})
        # get_text() drops the surrounding <span> markup
        print(movie_name.get_text())
        
    肖申克的救赎
    霸王别姬
    这个杀手不太冷
    阿甘正传
    美丽人生
    千与千寻
    辛德勒的名单
    泰坦尼克号
    盗梦空间
    机器人总动员
    海上钢琴师
    三傻大闹宝莱坞
    忠犬八公的故事
    放牛班的春天
    大话西游之大圣娶亲
    

    电影信息,电影评分同理

    # Runs inside the same per-movie loop as the title example (`movie` is one <li>).
    # The info text is the <p> found inside <div class="bd">.
    movie_info = movie.find('div', attrs={'class': 'bd'}).find('p')
    # The printed text keeps the page's own leading whitespace and line breaks
    print(movie_info.get_text())
                                导演: 大卫·芬奇 David Fincher   主演: 爱德华·诺顿 Edward Norton / 布拉...
                                1999 / 美国 德国 / 剧情 动作 悬疑 惊悚
    
    

    附上完整代码。关于next_page的理解,可以看下我的《Python爬取豆瓣小说》。

    #_*_ coding=utf-8 _*_
    
    '''
    爬取豆瓣电影TOP250
    '''
    
    import codecs
    import requests
    from bs4 import BeautifulSoup
    
    # First page of the Top 250 list; parse_html() also prepends it to the
    # href of the "next" link to build the following page's URL.
    DOWNLOAD_URL = 'https://movie.douban.com/top250'
    
    def download_page(url, timeout=10):
        """Download *url* and return the raw response body.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout, requests may block indefinitely on a stalled
                connection.

        Returns:
            The response body as bytes (``Response.content``).

        Raises:
            requests.RequestException: On connection problems, on timeout, or
                (via ``raise_for_status``) when the server answers with a
                4xx/5xx status.
        """
        headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail here with a clear HTTP error instead of handing an error page
        # to the parser.
        response.raise_for_status()
        return response.content
    
    def parse_html(html):
        """Extract the movies of one list page and the URL of the next page.

        Args:
            html: Markup of a Top 250 list page.

        Returns:
            A tuple ``(movie_name_list, next_url)``. ``movie_name_list`` is a
            flat list that holds, for each movie in page order, three
            consecutive strings: title, info paragraph text and rating.
            ``next_url`` is ``DOWNLOAD_URL`` plus the href of the "next" link,
            or ``None`` when the page has no such link.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # The movie list: <ol class="grid_view">
        movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})

        movie_name_list = []

        for movie_li in movie_list_soup.find_all('li'):
            movie_name = movie_li.find('span', attrs={'class': 'title'}).get_text()
            movie_info = movie_li.find('div', attrs={'class': 'bd'}).find('p').get_text()
            movie_star = movie_li.find('span', attrs={'class': 'rating_num'}).get_text()

            movie_name_list.extend([movie_name, movie_info, movie_star])

        # Link to the next page. find() returns None when nothing matches, so
        # guard the span before looking for the <a> inside it; a page without
        # the pager is treated as having no next page.
        next_span = soup.find('span', attrs={'class': 'next'})
        next_page = next_span.find('a') if next_span is not None else None
        if next_page is None:
            return movie_name_list, None
        return movie_name_list, DOWNLOAD_URL + next_page['href']
    
    def main():
        """Crawl every page of the list and write the results to the file 'movies'.

        Starts at DOWNLOAD_URL and follows the next-page URL returned by
        parse_html() until it is None. Each page's strings are written
        newline-separated, UTF-8 encoded.
        """
        url = DOWNLOAD_URL
        with codecs.open('movies', 'wb', encoding='utf-8') as f:
            # parse_html() returns None as the next URL when there is no next
            # page; stop then instead of requesting None.
            while url:
                html = download_page(url)
                movies, url = parse_html(html)
                f.write(u'{movies}\n'.format(movies='\n'.join(movies)))

    if __name__ == '__main__':
        main()
    
    

    欢迎大家访问我的博客Treehl的博客
    GitHub
    简书

    相关文章

      网友评论

        本文标题:Python爬取豆瓣电影TOP250

        本文链接:https://www.haomeiwen.com/subject/nnwfvxtx.html