import re
import urllib.request
def load_data(page=1):
    """Download one page of the Maoyan board and return its HTML as text.

    Args:
        page: 1-based page number. The request offset is ``(page - 1) * 10``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    offset = (page - 1) * 10
    url = "https://maoyan.com/board/4?offset=" + str(offset)
    # Use the response as a context manager so the connection is closed even
    # if reading or decoding raises; previously it was never closed.
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode("utf-8")
def load_ranking(html):
    """Extract the ranking text of every ``<dd>`` entry in *html*.

    For each ``<dd>``, the content of the first ``<i>`` element that follows
    it is captured.

    Args:
        html: Markup of one board page, as text.

    Returns:
        The captured strings in document order; an empty list when nothing
        matches.
    """
    # ``<i\b[^>]*>`` accepts only a real <i> tag. The looser ``<i.*?>`` also
    # matched tags that merely start with "i" (<img>, <input>, ...), which
    # made the capture swallow markup when such a tag came before the rank.
    pattern = r'<dd.*?<i\b[^>]*>(.*?)</i>'
    # re.S lets ``.`` cross newlines so a match may span several lines.
    return re.findall(pattern, html, re.S)
def load_name(html):
    """Extract the link text inside ``<p class="name">`` for every ``<dd>``.

    Args:
        html: Markup of one board page, as text.

    Returns:
        The captured link texts in document order; an empty list when
        nothing matches.
    """
    # Raw string: ``\s`` must reach the regex engine untouched. In a normal
    # string literal it is an invalid escape sequence and triggers a warning.
    pattern = r'<dd.*?<p\sclass="name".*?><a.*?>(.*?)</a>'
    # re.S lets ``.`` cross newlines so a match may span several lines.
    return re.findall(pattern, html, re.S)
def load_info(html):
    """Extract the ``<p class="star">`` text of every ``<dd>`` entry.

    Only ``<p class="star">`` elements that follow a
    ``<div class="board-item-main">`` inside a ``<dd>`` are considered.

    Args:
        html: Markup of one board page, as text.

    Returns:
        The captured texts with surrounding whitespace stripped, in document
        order; an empty list when nothing matches.
    """
    # Raw string: ``\s`` must reach the regex engine untouched. In a normal
    # string literal it is an invalid escape sequence and triggers a warning.
    pattern = r'<dd.*?<div\sclass="board-item-main".*?<p\sclass="star".*?>(.*?)</p>'
    # re.S lets ``.`` cross newlines so a match may span several lines.
    return [text.strip() for text in re.findall(pattern, html, re.S)]
def load_time(html):
    """Extract the ``<p class="releasetime">`` text of every ``<dd>`` entry.

    Only ``<p class="releasetime">`` elements that follow a
    ``<div class="board-item-main">`` inside a ``<dd>`` are considered.

    Args:
        html: Markup of one board page, as text.

    Returns:
        The captured texts with surrounding whitespace stripped, in document
        order; an empty list when nothing matches.
    """
    # Raw string: ``\s`` must reach the regex engine untouched. In a normal
    # string literal it is an invalid escape sequence and triggers a warning.
    pattern = r'<dd.*?<div\sclass="board-item-main".*?<p\sclass="releasetime".*?>(.*?)</p>'
    # re.S lets ``.`` cross newlines so a match may span several lines.
    return [text.strip() for text in re.findall(pattern, html, re.S)]
def load_score(html):
    """Build the score string of every ``<dd>`` entry in *html*.

    The score is split across two elements that come after the entry's
    ``<p class="releasetime">``: ``<i class="integer">`` and
    ``<i class="fraction">``. Each part is collected with its own pattern and
    the two lists are joined pairwise.

    Args:
        html: Markup of one board page, as text.

    Returns:
        A list of ``integer + fraction`` strings in document order. If the two
        patterns yield different numbers of matches, the result is truncated
        to the shorter one.
    """
    # Raw strings: ``\s`` must reach the regex engine untouched. In a normal
    # string literal it is an invalid escape sequence and triggers a warning.
    integer_pattern = r'<dd.*?<p\sclass="releasetime".*?<i\sclass="integer".*?>(.*?)</i>'
    fraction_pattern = r'<dd.*?<p\sclass="releasetime".*?<i\sclass="fraction".*?>(.*?)</i>'
    # re.S lets ``.`` cross newlines so a match may span several lines.
    integers = re.findall(integer_pattern, html, re.S)
    fractions = re.findall(fraction_pattern, html, re.S)
    # findall already returns str for a single group, so plain concatenation
    # is enough.
    return [whole + frac for whole, frac in zip(integers, fractions)]
def save_file(html, path="top.txt"):
    """Parse one board page and append its records to a text file.

    Each record is written as one tab-separated line holding ranking, name,
    info, release time and score, and is also echoed to stdout.

    Args:
        html: Markup of one board page, as text.
        path: File to append to. Defaults to ``"top.txt"``, the location that
            was previously hard-coded.

    Note:
        The five field lists are combined with ``zip``, so the number of
        records equals the length of the shortest list.
    """
    records = zip(
        load_ranking(html),
        load_name(html),
        load_info(html),
        load_time(html),
        load_score(html),
    )
    # Explicit UTF-8: the page text is decoded from UTF-8 and may contain
    # characters the platform's default encoding cannot represent.
    with open(path, "a", encoding="utf-8") as f:
        for record in records:
            line = "\t".join(record)
            print(line)
            # Write the record followed by a line break.
            f.write(line + "\n")
# Fetch board pages 1 through 10 in order and append each page's records.
for page_number in range(1, 11):
    save_file(load_data(page_number))
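# Stray page text ("网友评论", i.e. "Reader comments") left over from the web page this script was copied from; not part of the program.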