美文网首页
练习:豆瓣电影TOP250爬虫

练习:豆瓣电影TOP250爬虫

作者: code与有荣焉 | 来源:发表于2019-10-31 10:22 被阅读0次

    练习:豆瓣电影TOP250爬虫(优化)

    import requests
    from lxml import html
    import pandas as pd
    def _first_or_missing(nodes, convert=None, missing='找不到数据'):
        """Return the first XPath match, optionally converted, or a placeholder.

        Args:
            nodes: List returned by an ``xpath()`` query (may be empty).
            convert: Optional callable applied to the first match.
            missing: Value returned when ``nodes`` is empty.
        """
        if not nodes:
            return missing
        return nodes[0] if convert is None else convert(nodes[0])


    def _safe_filename(name):
        """Replace characters that are invalid in file names with ``_``."""
        import re

        # Covers the path separators plus the characters Windows rejects.
        return re.sub(r'[\\/:*?"<>|]', '_', str(name))


    def spider():
        """Crawl the Douban Top 250 list, save it as CSV and download the posters.

        Requests the list pages at start offsets 0, 25, ..., 225, extracts for
        every ``<li>`` entry the poster URL, rank, title, detail link, info line,
        rating, number of raters and quote, writes all rows to ``豆瓣top250.csv``
        and stores each poster as ``./douban_top250_img/<title>.jpg``.  A field
        whose XPath query matches nothing is stored as ``'找不到数据'``.

        Returns:
            None.  Results are written to disk and printed.

        Raises:
            requests.exceptions.RequestException: If a request fails or exceeds
                the timeout.
        """
        import os

        missing = '找不到数据'
        img_dir = './douban_top250_img'
        timeout = 10  # seconds; without a timeout a stalled connection never returns
        # Loop-invariant, and also needed by the poster downloads after the loop.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

        # XPath fragments shared by the per-movie queries below.
        item_xpath = '//div[@id="content"]/div[@class="grid-16-8 clearfix"]/div[@class="article"]/ol[@class="grid_view"]/li'
        pic = 'div[@class="item"]/div[@class="pic"]'
        hd = 'div[@class="item"]/div[@class="info"]/div[@class="hd"]'
        bd = 'div[@class="item"]/div[@class="info"]/div[@class="bd"]'

        movie_list = []
        for start in range(0, 226, 25):
            url = 'https://movie.douban.com/top250?start={}&filter='.format(start)

            # Request the page.
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code != 200:
                # An error response carries no list items (and an empty body
                # would make the HTML parser raise), so report it and move on.
                print('请求失败: {} (HTTP {})'.format(url, response.status_code))
                continue
            selector = html.fromstring(response.text)

            # Extract one record per list item.
            for li in selector.xpath(item_xpath):
                movie_list.append({
                    'movie_img': _first_or_missing(li.xpath(pic + '/a/img/@src')),
                    'movie_id': _first_or_missing(li.xpath(pic + '/em/text()'), int),
                    'movie_name': _first_or_missing(li.xpath(hd + '/a/span[1]/text()')),
                    'movie_link': _first_or_missing(li.xpath(hd + '/a/@href')),
                    'movie_info': _first_or_missing(
                        li.xpath(bd + '/p[1]/text()'),
                        lambda text: text.strip()),
                    'movie_rating_num': _first_or_missing(
                        li.xpath(bd + '/div[@class="star"]/span[@class="rating_num"]/text()'),
                        float),
                    'movie_rating_people': _first_or_missing(
                        li.xpath(bd + '/div[@class="star"]/span[4]/text()'),
                        lambda text: int(text.replace('人评价', ''))),
                    'movie_quote': _first_or_missing(
                        li.xpath(bd + '/p[@class="quote"]/span/text()')),
                })

        # Persist the table locally.
        pd.DataFrame(movie_list).to_csv('豆瓣top250.csv')

        # open() does not create missing directories, so make sure it exists.
        os.makedirs(img_dir, exist_ok=True)
        for movie in movie_list:
            print(movie)
            if movie['movie_img'] == missing:
                # The placeholder is not a URL; requesting it would raise.
                print('跳过海报下载: {}'.format(movie['movie_name']))
                continue
            # Download before opening the file so that a failed request does
            # not leave an empty .jpg behind.
            image = requests.get(movie['movie_img'], headers=headers, timeout=timeout).content
            path = os.path.join(img_dir, '{}.jpg'.format(_safe_filename(movie['movie_name'])))
            with open(path, 'wb') as f:
                f.write(image)
        print('共爬取了{}部电影'.format(len(movie_list)))
    # Start the crawl; this call runs as soon as the module is loaded.
    spider()
    
    豆瓣电影TOP250
    豆瓣电影TOP250

    相关文章

      网友评论

          本文标题:练习:豆瓣电影TOP250爬虫

          本文链接:https://www.haomeiwen.com/subject/yvmzvctx.html