import json, requests, re
from datetime import time
def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The response text when the server answers with HTTP 200.
        None for any other status code, or when the request itself
        fails (the error is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
    }
    try:
        # A timeout keeps a stalled connection from blocking the script forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as err:
        # Only request failures are handled here; KeyboardInterrupt and
        # SystemExit are deliberately allowed to propagate.
        print(err)
        return None
    if response.status_code == 200:
        return response.text
    return None
# One movie entry: everything from a <dd> tag through the two score <i> tags.
# re.S lets '.' match newlines, because an entry spans several lines of HTML.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>'                          # rank
    r'.*?data-src="(.*?)"'                                      # image URL
    r'.*?<a.*?data-val=".*?">(.*?)</a>'                         # movie title
    r'.*?class="star">(.*?)</p>'                                # leading actors
    r'.*?class="releasetime">(.*?)</p>'                         # release time
    r'.*?class="integer">(.*?)</i><i.*?class=".*?">(.*?)</i>',  # score parts
    re.S,
)


def _strip_label(text):
    """Return *text* stripped of whitespace and of a leading 'label：' prefix."""
    text = text.strip()
    _, colon, value = text.partition('：')
    return value if colon else text


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Args:
        html: Page markup as a string. None or an empty string yields
            nothing instead of raising.

    Yields:
        Dicts with the keys 'index', 'image', 'title', 'actor', 'time'
        and 'score', all holding strings. 'score' is the concatenation
        of the two captured score fragments.

    NOTE(review): the 'actor' and 'time' texts are assumed to start with a
    label ending in a full-width colon (e.g. '主演：'). The previous
    fixed-width slices ([3:] and [4:]) left the colon in place for a
    five-character label such as '上映时间：'. Confirm against the live page.
    """
    if not html:
        return
    for rank, image, title, star, release, integer, fraction in (
            _MOVIE_PATTERN.findall(html)):
        yield {
            'index': rank,
            'image': image,
            'title': title,
            'actor': _strip_label(star),
            'time': _strip_label(release),
            'score': integer.strip() + fraction.strip()
        }
def write_json(data):
    """Append *data* to movie.json as one JSON document followed by a newline."""
    with open('movie.json', mode='a', encoding='utf-8') as out_file:
        json.dump(data, fp=out_file)
        # print() with no arguments emits just the line terminator.
        print(file=out_file)
def main(offset):
    """Download one board page and append every parsed movie to the JSON file.

    Args:
        offset: Value placed in the URL's ``offset`` query parameter.
    """
    url = 'Http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a non-200 status or a failed request;
        # there is nothing to parse for this page.
        return
    for movie in parse_one_page(html):
        write_json(movie)
if __name__ == '__main__':
    # The module-level name `time` is datetime.time (see the imports at the
    # top of the file), which has no sleep(); import the real sleep here.
    from time import sleep

    # Ten pages, with the offset advancing by 10 per page.
    for page in range(10):
        main(offset=page * 10)
        # Pause between requests.
        sleep(2)
# "Reader comments" -- stray heading, apparently left over from the web page this code was copied from.