网页页面:
猫眼电影网页
源代码实现:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import time
import json
def get_one_page(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without it a stalled connection
            would block the whole script indefinitely.

    Returns:
        The response text when the server answers with HTTP 200. ``None``
        for any other status code or when the request raises a
        ``RequestException`` (connection error, timeout, ...).
    """
    # Present a browser User-Agent instead of the default requests one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    # Keep the try body limited to the call that can actually raise.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
# Label that precedes the cast list inside the ".star" element.
# NOTE(review): inferred from the original 3-character length guard on the
# actor field — confirm against the live page markup.
_ACTOR_LABEL = '主演：'


def _first_text(node, selector):
    """Return the text of the first element under node matching selector, or ''."""
    matches = node.select(selector)
    return matches[0].get_text() if matches else ''


def page_parser(html):
    """Yield one dict per ranked entry found in the page.

    Args:
        html: Page markup as a string. ``None`` or an empty string yields
            nothing.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time`` and ``score``. A field whose element is missing from the
        entry is reported as an empty string instead of raising.
    """
    if not html:
        return
    soup = BeautifulSoup(html, 'lxml')
    for dd in soup.select('dd'):
        # A <dd> without a rank marker is not treated as a board entry.
        if not dd.select('.board-index'):
            continue

        # Rank
        num = _first_text(dd, '.board-index')

        # Title: text of the first link inside the first ".name" element.
        name_tag = dd.find(attrs={'class': 'name'})
        link = name_tag.a if name_tag is not None else None
        name = link.get_text() if link is not None else ''

        # Poster image URL, stored in the "data-src" attribute.
        img_tag = dd.find(attrs={'class': 'board-img'})
        pic_src = img_tag.get('data-src', '') if img_tag is not None else ''

        # Cast: trim surrounding whitespace, then drop the leading label.
        # The original sliced five characters while guarding on a length
        # of three, which also cut into the first actor's name.
        star = _first_text(dd, '.star').strip()
        if star.startswith(_ACTOR_LABEL):
            star = star[len(_ACTOR_LABEL):]

        # Release time
        releasetime = _first_text(dd, '.releasetime')

        # Score is rendered as two adjacent pieces (integer and fraction).
        score = _first_text(dd, '.integer') + _first_text(dd, '.fraction')

        yield {
            'index': num,
            'image': pic_src,
            'title': name,
            'actor': star,
            # Values of five characters or fewer are reported as empty.
            'time': releasetime if len(releasetime) > 5 else '',
            'score': score,
        }
def write_to_file(content):
    """Append content to result.txt as a single JSON line.

    Args:
        content: JSON-serialisable object. Non-ASCII characters are written
            as-is (UTF-8) rather than as ``\\u`` escapes.
    """
    with open('result.txt', mode='a', encoding='utf-8') as out:
        serialized = json.dumps(content, ensure_ascii=False)
        out.write(serialized + '\n')
def main(offset):
    """Fetch the board page at the given offset, then print and save its entries.

    Args:
        offset: Value placed in the ``offset`` query parameter of the board
            URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    # get_one_page returns None on a failed request or a non-200 status;
    # report it and skip this page instead of passing None to the parser.
    if html is None:
        print('Failed to fetch ' + url)
        return
    for item in page_parser(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # Walk offsets 0, 10, ..., 90 — one main() call per offset.
    for page_offset in range(0, 100, 10):
        main(offset=page_offset)
实现效果:
截取的数据截图
网友评论