作为练手项目,没想过要精简代码,重要的是在爬取的过程中找到思路。
- 全部代码
import requests
import re
import pandas as pd
import openpyxl
import xlwings
import time
from bs4 import BeautifulSoup
# Fetch the Douban movie home page and pull the title cells out of its
# billboard section.
print("正在爬取数据,请稍等...")
# A desktop-browser User-Agent is sent along with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
url = "https://movie.douban.com/"
# A timeout keeps the script from blocking forever on a stalled connection.
get = requests.get(url, headers=headers, timeout=10)
# Stop here on an HTTP error status instead of parsing an error page.
get.raise_for_status()
text = BeautifulSoup(get.text, "lxml")
# find_all is the current Beautiful Soup name for the legacy findAll alias;
# a string as the second argument selects elements by CSS class.
mod = text.find_all("div", "billboard-bd")
# Only the first matching billboard container is used.
title = mod[0].find_all("td", "title")
# Build the {movie name: subject URL} dictionary from the billboard cells.
# Both patterns are loop-invariant, so they are compiled once before the loop.
pattern_name = re.compile("mv_rk.*</a>")
# Raw string so the pattern contains no invalid escape sequence; the dots in
# the host name are escaped so they only match a literal ".".
pattern_url = re.compile(r"https://movie\.douban\.com/subject/[0-9]+/")
movie = {}
for i in title:
    # Work on the cell's HTML source as plain text.
    string = str(i)
    # Key: text from the "mv_rk" marker through the closing </a> tag.
    string_name = "".join(pattern_name.findall(string))
    # Value: the movie's subject URL.
    string_url = "".join(pattern_url.findall(string))
    # The slice drops the first 10 characters of the match and the trailing
    # "</a>" (4 characters).
    # NOTE(review): the 10-character prefix is presumably the "mv_rk" marker
    # plus the markup before the link text - verify against the live page.
    movie[string_name[10:-4]] = string_url
# Write the collected dictionary to an Excel workbook.
filepath = r"C:/Users/longxiaojiangi/Documents/JupyterNotebook/AutoSaveFiles/豆瓣电影一周口碑榜.xlsx"
# Wrapping the dict in a one-element list gives a single row whose columns are
# the dict keys; transposing turns that into one row per key.
df = pd.DataFrame([movie]).transpose()
df.to_excel(filepath)
# Re-open the saved workbook through Excel (via xlwings) to label the two
# header cells and record when this update ran.
app = xlwings.App(visible=False, add_book=False)
try:
    wb = app.books.open(filepath)
    sheet1 = wb.sheets[0]
    sheet1.range("A1").value = "电影"
    sheet1.range("B1").value = "URL"
    # The timestamp goes in a fixed row (14).
    # NOTE(review): this presumably sits below the exported rows - confirm the
    # list never grows long enough to reach row 14.
    sheet1.range("A14").value = "最后一次更新时间"
    sheet1.range("B14").value = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    wb.save()
    wb.close()
finally:
    # The Excel instance is invisible, so it must be quit explicitly; otherwise
    # every run (including failed ones) leaves an Excel process behind.
    app.quit()
print("数据已保存至:\n{}".format(filepath))
网友评论