用 Requests 库 + 正则表达式爬取猫眼 Top100 时,正则表达式的编写有时过于繁琐,而 BeautifulSoup 可以较好地解决这一麻烦。
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
# Module-level accumulator: get_one_page() appends one
# [rank, title, actors, time, score] row per film; main() writes it to Excel.
results = []
def get_one_page(offset):
    """Fetch one page of the Maoyan board and append its films to ``results``.

    Each ``<dd>`` inside ``<dl class="board-wrapper">`` is turned into a row
    ``[rank, title, actors, release_time, score]`` and appended to the
    module-level ``results`` list.

    Args:
        offset: Value of the ``offset`` query parameter; ``main`` passes
            0, 10, ..., 90.

    Raises:
        requests.RequestException: The request failed, timed out, or came
            back with an HTTP error status.
        RuntimeError: The response contains no ``board-wrapper`` list, so
            there is nothing to parse.
    """
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    # NOTE(review): sites commonly serve a verification page to the default
    # python-requests User-Agent; confirm whether Maoyan needs this header.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/120.0 Safari/537.36'
        ),
    }
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')
    board = soup.find(name='dl', class_="board-wrapper")
    if board is None:
        raise RuntimeError(
            'board-wrapper list not found in response for ' + url
        )

    for entry in board.find_all('dd'):
        rank = entry.find('i').get_text()
        title = entry.find('p', class_="name").get_text()
        # The slices drop a fixed-length label prefix from the text
        # (presumably "主演：" and "上映时间：" — confirm against the page).
        actors = entry.find('p', class_="star").get_text().strip()[3:]
        release_time = entry.find('p', class_='releasetime').get_text().strip()[5:]
        score = entry.find('p', class_="score").get_text()
        results.append([int(rank), title, actors, release_time, float(score)])
def main():
    """Scrape all ten board pages and write the rows to ``top100.xlsx``."""
    # Offsets 0, 10, ..., 90: one call per page.
    for page_offset in range(0, 100, 10):
        get_one_page(page_offset)

    column_names = ['rank', 'title', 'actors', 'time', 'score']
    frame = pd.DataFrame(results, columns=column_names)
    frame.to_excel('top100.xlsx', index=False)


main()
结果部分如图:
![](https://img.haomeiwen.com/i11486274/b9ce87347f8ae1d0.png)
网友评论