A crawler exercise: scraping the Maoyan Movies TOP100 board in several different ways.
The structure of the Maoyan TOP100 page is fairly simple: all of the movie entries live under a dl tag, and each dd tag holds one movie's information.
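Roughly, the markup that all three scrapers below target looks like the sketch here. It is reconstructed by hand from the selectors used in the code rather than copied from the live page, so the exact class names and nesting are assumptions:

<dl class="board-wrapper">
  <dd>
    <i class="board-index">1</i>
    <p class="name"><a href="/films/...">电影名</a></p>
    <p class="star">主演:...</p>
    <p class="releasetime">上映时间:...</p>
    <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
  </dd>
  <!-- one <dd> per movie, ten per page -->
</dl>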
- requests + regular-expression extraction
import requests
import re

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/67.0.3396.99 Safari/537.36'}

def get_one_page(url):
    # Fetch one page of the board; return an empty string on any request error
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print(e)
        return ''

def parse_one_page(html):
    # Each <dd> block holds one movie; the non-greedy groups capture, in order:
    # ranking, title, star list, release time, and the two halves of the score
    pattern = '<dd>.*?index.*?>(.*?)</i>.*?name.*?a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)' \
              '</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>'
    items = re.findall(pattern, html, re.S)
    for item in items:
        index = item[0]
        name = item[1]
        star = item[2].strip()[3:]   # drop the '主演:' prefix
        time = item[3].strip()[5:]   # drop the '上映时间:' prefix
        score = item[4] + item[5]    # integer part + fractional part
        print(index, name, star, time, score)

def main():
    # The board shows 10 movies per page, paginated via the offset parameter
    for i in range(10):
        url = "http://maoyan.com/board/4?offset=" + str(i * 10)
        html = get_one_page(url)
        parse_one_page(html)

if __name__ == '__main__':
    main()
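To see what the pattern captures, parse_one_page can be run against a hand-written <dd> fragment. The sample markup below is an assumption shaped to match the regex, not copied from the live page:

sample = '''<dd>
<i class="board-index">1</i>
<p class="name"><a href="/films/1203">霸王别姬</a></p>
<p class="star">
    主演:张国荣,张丰毅,巩俐
</p>
<p class="releasetime">上映时间:1993-01-01</p>
<p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
</dd>'''

parse_one_page(sample)
# expected output: 1 霸王别姬 张国荣,张丰毅,巩俐 1993-01-01 9.5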
- requests + BeautifulSoup extraction, saving the results to an Excel spreadsheet
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.styles import Alignment

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/67.0.3396.99 Safari/537.36'}

def get_one_page(url):
    # Fetch one page of the board; return an empty string on any request error
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print(e)
        return ''

def parse_one_page(html, sheet):
    soup = BeautifulSoup(html, 'lxml')
    # Each movie's info sits in a <dd> tag under <dl class='board-wrapper'>
    items = soup.find(class_='board-wrapper').find_all(name='dd')
    for item in items:
        # Walk each dd tag and append the extracted data to the end of the sheet
        index = item.find(name='i').string
        name = item.find(class_='name').string
        star = item.find(class_='star').string.replace('主演:', '').strip()
        time = item.find(class_='releasetime').string.replace('上映时间:', '')
        score = item.find(class_='integer').string + item.find(class_='fraction').string
        sheet.append([index, name, star, time, score])

def main():
    # Create a new workbook
    wb = Workbook()
    # Use the default worksheet
    ws = wb.active
    # Set the column widths
    ws.column_dimensions['A'].width = 8
    ws.column_dimensions['B'].width = 20
    ws.column_dimensions['C'].width = 50
    ws.column_dimensions['D'].width = 26
    ws.column_dimensions['E'].width = 8
    # Write the header row; in openpyxl rows and columns are numbered from 1
    ws.cell(1, 1).value = '排名'
    ws.cell(1, 2).value = '电影名'
    ws.cell(1, 3).value = '主演'
    ws.cell(1, 4).value = '上映时间'
    ws.cell(1, 5).value = '评分'
    for i in range(10):
        url = "http://maoyan.com/board/4?offset=" + str(i * 10)
        html = get_one_page(url)
        parse_one_page(html, ws)
    # Center-align every cell
    ag = Alignment(horizontal='center', vertical='center')
    for i in range(ws.max_row):
        for j in range(ws.max_column):
            ws.cell(i + 1, j + 1).alignment = ag
    wb.save('./info.xlsx')

if __name__ == '__main__':
    main()
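As a quick sanity check, the saved spreadsheet can be read back with openpyxl. A minimal sketch, assuming the script above has already written ./info.xlsx:

from openpyxl import load_workbook

wb = load_workbook('./info.xlsx')
ws = wb.active
# print every row, header included, as a tuple of cell values
for row in ws.iter_rows(values_only=True):
    print(row)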
- Automated scraping with Selenium, saving the results to a CSV file
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import csv

def get_one_page(driver, url, writer):
    try:
        driver.get(url)
        # Each movie's info sits in a <dd> tag under <dl class='board-wrapper'>
        data = driver.find_element(By.CLASS_NAME, "board-wrapper").find_elements(By.TAG_NAME, "dd")
        for item in data:
            # Walk each dd tag and append the extracted data to the end of the file
            index = item.find_element(By.TAG_NAME, "i").text
            name = item.find_element(By.CLASS_NAME, "name").text
            star = item.find_element(By.CLASS_NAME, "star").text.replace("主演:", "")
            time = item.find_element(By.CLASS_NAME, "releasetime").text.replace("上映时间:", "")
            score = (item.find_element(By.CLASS_NAME, "integer").text
                     + item.find_element(By.CLASS_NAME, "fraction").text)
            writer.writerow([index, name, star, time, score])
    except (TimeoutError, WebDriverException) as e:
        # multiple exception types must be grouped in a tuple
        print(e)

def main():
    # Use Chrome's headless mode
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    # newline='' prevents blank lines between rows in the CSV on Windows
    with open('info.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for i in range(10):
            url = "http://maoyan.com/board/4?offset=" + str(i * 10)
            get_one_page(driver, url, writer)
    driver.quit()

if __name__ == '__main__':
    main()
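On a slow connection the board may finish rendering after driver.get returns, so an explicit wait before reading the list makes the scrape more robust. A minimal sketch of an optional helper (not part of the original script), assuming the same board-wrapper structure:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_board(driver, url, timeout=10):
    # block until <dl class='board-wrapper'> appears in the DOM,
    # or raise TimeoutException after `timeout` seconds
    driver.get(url)
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'board-wrapper')))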