今天终于静下心来学习 Python 爬虫了,写了一个爬取猫眼 Top100 榜单电影的小爬虫,效率不高,下次一定加油。
一张效果图(1~100)
都是上学期学过的,现在复习复习
上代码
import requests
from bs4 import BeautifulSoup
import urllib
def download(url, num_retries=2):
    """Fetch *url* with a browser User-Agent, retrying on HTTP 5xx errors.

    Args:
        url: Page URL to download.
        num_retries: How many times to retry when the server answers 5xx.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        itself failed (connection error, DNS failure, timeout, ...).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36'}
    try:
        html = requests.get(url, headers=headers)
    except Exception as e:
        # Most exceptions have no ``.reason`` attribute (the original
        # ``e.reason`` raised AttributeError inside the handler); print the
        # exception itself. Return None immediately: the original fell
        # through and crashed on ``None.status_code``.
        print('Download error:', e)
        return None
    # Retry only on server-side (5xx) errors; client errors won't improve.
    if num_retries > 0 and 500 <= html.status_code < 600:
        return download(url, num_retries - 1)
    return html
def tiqu(html):
    """Extract the board caption and movie rows from a Maoyan board page.

    Args:
        html: Raw HTML text of one board page.

    Returns:
        ``(caption, contents)`` where *caption* is the board description
        string and *contents* is a list of rows, each
        ``[rank, name, star, releasetime, score(float)]``.
    """
    contents = []
    soup = BeautifulSoup(html, "lxml")
    caption = soup.findAll('p', {"class": 'board-content'})[0].string
    for item in soup.findAll('dl', {'class': 'board-wrapper'})[0]:
        # Skip the bare newline text nodes between the <dd> entries.
        if len(item) == 1:
            continue
        content = [item.find('i').string]  # ranking number from <i class="board-index">
        paragraphs = item.findAll('p')
        for p in paragraphs:
            # The score <p> has child <i> tags, so its .string is None.
            # Check for None directly instead of comparing against the
            # text 'None' (the original would also drop a film literally
            # titled "None").
            if p.string is None:
                continue
            content.append(str(p.string).replace('\n', '').strip())
        # 4th <p> holds the score split across <i class="integer"> and
        # <i class="fraction"> children; join the pieces back together.
        score = ''.join(piece.string for piece in paragraphs[3])
        content.append(float(score.strip()))
        contents.append(content)
    return caption, contents
def getUrl(url, html):
    """Build the ten page URLs (offset 0, 10, ..., 90) of the board.

    Args:
        url: The board's base URL (used to resolve relative links).
        html: HTML text of the first board page.

    Returns:
        A list of absolute page URLs covering the whole Top-100 list.
    """
    soup = BeautifulSoup(html, 'lxml')
    # The page-2 link looks like '?offset=10'; drop the trailing 2-digit
    # offset so we can substitute our own.
    prefix = soup.findAll('a', {"class": 'page_2'})[0].attrs['href'][:-2]
    pages = []
    for offset in range(0, 100, 10):
        pages.append(urllib.parse.urljoin(url, prefix + str(offset)))
    return pages
def main():
    """Crawl every page of the Maoyan Top-100 board and print each movie row."""
    start_url = "http://maoyan.com/board/4"
    first_page = download(start_url)
    contents = []
    # The URL list includes offset=0, so the first page is fetched again here.
    for page_url in getUrl(start_url, first_page.text):
        page = download(page_url)
        caption, rows = tiqu(page.text)
        contents.extend(rows)
    for row in contents:
        print(row)


if __name__ == '__main__':
    main()
看起来效果不好,还是上传文件吧
链接:https://pan.baidu.com/s/1VBFH4RUagRMgIINT30xP7g 密码:tydw
网友评论