目标网址:http://movie.douban.com/top250/
#!/usr/bin/env python
# encoding=utf-8
"""
爬取豆瓣电影TOP250 - 完整示例代码
"""
import codecs
import xlwings as xw
import requests
from bs4 import BeautifulSoup
# First page of the Douban Top 250 chart. parse_html() appends the relative
# "next page" href to this address to build the following page's URL.
DOWNLOAD_URL = 'http://movie.douban.com/top250/'

# Module-level accumulators: parse_html() appends to them and showExcel()
# reads them. Entries at the same index belong to the same movie.
#   movie_name_list    -- movie titles
#   director_name_list -- text of the info paragraph (director etc.)
#   score_list         -- rating strings
movie_name_list, director_name_list, score_list = [], [], []
def download_page(url, timeout=10):
    """Fetch *url* and return the raw response body.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The undecoded response body (``bytes``).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page
    # to the HTML parser.
    response.raise_for_status()
    return response.content
def parse_html(html):
    """Extract movie data from one listing page and locate the next page.

    For every ``<li>`` inside ``<ol class="grid_view">`` the title, the text
    of the first ``<p>`` in the ``bd`` block, and the rating are appended to
    the module-level lists ``movie_name_list``, ``director_name_list`` and
    ``score_list`` respectively.

    Args:
        html: Markup of a listing page (``str`` or ``bytes``).

    Returns:
        A tuple ``(movie_name_list, next_url)``. The first element is the
        shared, cumulative module-level list (not only this page's titles).
        ``next_url`` is ``DOWNLOAD_URL`` joined with the "next" link's href,
        or ``None`` when the page has no such link.

    Raises:
        ValueError: If the page contains no ``<ol class="grid_view">``.
    """
    # Name the parser explicitly; otherwise bs4 picks whichever parser
    # happens to be installed and emits a warning.
    soup = BeautifulSoup(html, 'html.parser')

    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list_soup is None:
        raise ValueError(
            "movie list (<ol class='grid_view'>) not found in page")

    # Walk every entry of the list.
    for movie_li in movie_list_soup.find_all('li'):
        title_block = movie_li.find('div', attrs={'class': 'hd'})
        info_block = movie_li.find('div', attrs={'class': 'bd'})
        rating_block = movie_li.find('div', attrs={'class': 'star'})

        movie_name = title_block.find('span', attrs={'class': 'title'}).getText()
        # Take the first <p> of the info block; no attribute filter is needed.
        director_name = info_block.find('p').getText()
        score = rating_block.find('span', attrs={'class': 'rating_num'}).getText()

        director_name_list.append(director_name)
        movie_name_list.append(movie_name)
        score_list.append(score)

    # The "next" span may be absent, or present without a link inside it;
    # either case means there is no further page.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_link = next_span.find('a') if next_span is not None else None
    if next_link is not None:
        return movie_name_list, DOWNLOAD_URL + next_link['href']
    return movie_name_list, None
def showExcel():
    """Write the collected movie data into an Excel workbook and save it.

    Opens ``D://Desktop/myexcel.xlsx`` in a visible Excel instance, writes a
    header row to ``sheet1`` followed by one row per movie taken from the
    module-level lists ``movie_name_list``, ``director_name_list`` and
    ``score_list``, then saves the workbook. The Excel window is left open.
    """
    app = xw.App(visible=True, add_book=False)
    app.display_alerts = False
    # Location of the workbook to fill in; it is opened, not created.
    filepath = r'D://Desktop/myexcel.xlsx'
    wb = app.books.open(filepath)
    sht = wb.sheets['sheet1']

    header = ["电影名称", "详细信息", "豆瓣评分"]
    # zip keeps the three columns aligned row by row.
    rows = [
        [name, detail, score]
        for name, detail, score in zip(
            movie_name_list, director_name_list, score_list)
    ]
    # Write the whole table in one range assignment (A1 is the top-left
    # corner) rather than one call per cell.
    sht.range('A1').value = [header] + rows

    # Persist the data; without this the written cells exist only in the
    # open Excel session.
    wb.save()
def main():
    """Crawl the chart page by page, starting at ``DOWNLOAD_URL``.

    Each page is downloaded and handed to ``parse_html``, which records the
    movies in the module-level lists and returns the URL of the next page.
    The loop ends when ``parse_html`` reports no next page.
    """
    next_url = DOWNLOAD_URL
    while next_url:
        # The returned title list is the shared module-level list, so only
        # the next-page URL is needed here.
        _, next_url = parse_html(download_page(next_url))
if __name__ == '__main__':
    # Crawl every page first so the module-level lists are complete,
    # then write them to the workbook.
    main()
    showExcel()
结果:


网友评论