爬取电影并存为 Excel

作者: 未知之眼 | 来源:发表于2018-03-13 22:56 被阅读6次

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import lxml

# Entry URL of the Douban Top 250 chart; the crawl starts from this page.
douban = 'https://movie.douban.com/top250'

# Module-level accumulators. parseHtml appends one item to each list per
# movie entry, so equal indexes refer to the same movie.
name = []   # movie titles (text of the "title" span)
quote = []  # one-line quotes (text of the "inq" span)
score = []  # ratings (text of the "rating_num" span)

def parseHtml(html):
    """Parse one chart page, record its movies, then follow the next page.

    Every ``<li>`` inside ``<ol class="grid_view">`` is treated as one movie.
    Its title, rating and one-line quote are printed and appended to the
    module-level lists ``name``, ``score`` and ``quote``. If the page has a
    next-page link, ``download`` is called on it (which in turn calls this
    function again); otherwise ``'all is ok'`` is printed.

    Args:
        html: Markup of the page, in any form BeautifulSoup accepts.

    Raises:
        ValueError: If the page contains no ``<ol class="grid_view">``
            element, i.e. it is not a chart page.
    """
    soup = BeautifulSoup(html, 'lxml')

    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list_soup is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError('no <ol class="grid_view"> movie list in the page')

    for movie_li in movie_list_soup.find_all('li'):
        movie_name = movie_li.find('span', attrs={'class': 'title'}).getText()
        movie_star = movie_li.find(
            'span', attrs={'class': 'rating_num'}).getText()

        # Not every entry carries a quote span. Use an empty string so the
        # crawl continues and the three lists stay the same length.
        quote_tag = movie_li.find('span', attrs={'class': 'inq'})
        movieQuote = quote_tag.getText() if quote_tag is not None else ''

        print('{0} {1} {2}'.format(movie_name, movie_star, movieQuote))
        name.append(movie_name)
        score.append(movie_star)
        quote.append(movieQuote)

    # The "next" span may be missing, or present without a link inside it;
    # both cases mean there is nothing more to fetch.
    next_span = soup.find('span', attrs={'class': 'next'})
    nextPage = next_span.find('a') if next_span is not None else None
    if nextPage is not None:
        # The link's href is appended to the base chart URL as-is.
        download(douban + '{0}'.format(nextPage['href']))
    else:
        print('all is ok')

def download(url):
    """Fetch ``url`` and hand the response body to ``parseHtml``.

    The URL is printed before the request is made.

    Args:
        url: Address of the chart page to fetch.

    Raises:
        requests.exceptions.RequestException: If the request times out, the
            connection fails, or the server answers with an HTTP error
            status.
    """
    print(url)
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=10)
    # Stop on 4xx/5xx instead of feeding an error page to the parser.
    response.raise_for_status()
    parseHtml(response.content)

# Crawl the chart starting from the first page; this fills the module-level
# lists name, score and quote.
download(douban)

# Column mapping for the spreadsheet: 'rate' holds the ratings and 'pingyu'
# (comment) holds the one-line quotes.
data = {'title': name, 'rate': score, 'pingyu': quote}
df = pd.DataFrame(data)
df.to_excel('foo.xlsx', sheet_name='Sheet1')

相关文章

网友评论

    本文标题:爬取电影并存为 Excel

    本文链接:https://www.haomeiwen.com/subject/ukfxqftx.html