美文网首页
爬取猫眼top100榜单电影

爬取猫眼top100榜单电影

作者: 两分与桥 | 来源:发表于2018-03-08 21:19 被阅读4次

    今天终于静下心来学习 Python 爬虫了,写了一个爬取猫眼 TOP100 榜单电影的小爬虫,效率不高,下次一定加油。

    一张效果图(1~100)

    都是上学期学过的,现在复习复习

    上代码

    import requests

    from bs4 import BeautifulSoup

    import urllib

    def download(url, num_retries=2):
        """Fetch ``url`` and return the ``requests`` response.

        Args:
            url: Address of the page to request.
            num_retries: How many more times the request is repeated when the
                server answers with a 5xx status code.

        Returns:
            The ``requests.Response`` for ``url``, or ``None`` when the request
            itself raised (connection failure, invalid URL, ...). When every
            retry still yields a 5xx answer, that last response is returned.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36'}
        try:
            html = requests.get(url, headers=headers)
        except requests.exceptions.RequestException as e:
            # requests exceptions carry no ``.reason`` attribute, so report the
            # exception itself instead of failing inside the handler.
            print('Download error:', e)
            # No response object exists here, so there is no status to inspect.
            return None

        # requests.get() does not raise on HTTP error statuses; server-side
        # (5xx) failures are detected from the status code and retried.
        if num_retries > 0 and 500 <= html.status_code < 600:
            return download(url, num_retries - 1)
        return html

    def tiqu(html):
        """Extract the board caption and the movie rows from one board page.

        Args:
            html: HTML text of a board page.

        Returns:
            A ``(caption, contents)`` tuple. ``caption`` is the string of the
            first ``<p class="board-content">``. ``contents`` holds one list per
            entry of the first ``<dl class="board-wrapper">``: the string of the
            entry's first ``<i>``, then the stripped text of every ``<p>`` that
            contains a single string, then the score as a ``float``.

        Raises:
            IndexError: If the caption, the board list, or an entry's fourth
                ``<p>`` is missing.
        """
        bsobject = BeautifulSoup(html, "lxml")
        caption = bsobject.find_all('p', {"class": 'board-content'})[0].string
        board = bsobject.find_all('dl', {'class': 'board-wrapper'})[0]

        contents = []
        for entry in board:
            # Iterating a tag also yields the text nodes between its child tags.
            # Those are str subclasses in bs4, so skip them explicitly rather
            # than relying on the whitespace being exactly one character long.
            # The single-child check is kept so previously skipped tags still are.
            if isinstance(entry, str) or len(entry) == 1:
                continue

            content = [entry.find('i').string]
            paragraphs = entry.find_all('p')
            for paragraph in paragraphs:
                text = paragraph.string
                # ``.string`` is None for a <p> with several children; test for
                # None itself so a paragraph whose text is "None" is not dropped.
                if text is None:
                    continue
                content.append(str(text).replace('\n', '').strip())

            # The fourth <p> is read as the score: the strings of its child
            # nodes are joined and converted to a number.
            score = ''.join(part.string for part in paragraphs[3])
            content.append(float(score.strip()))
            contents.append(content)
        return caption, contents

    def getUrl(url, html):
        """Build the absolute URLs of the board pages for offsets 0, 10, ..., 90.

        Args:
            url: Base URL that the relative page links are resolved against.
            html: HTML text of a board page containing the pager links.

        Returns:
            A list of ten absolute URLs, one per offset.

        Raises:
            IndexError: If the page has no ``<a class="page_2">`` link.
            KeyError: If that link has no ``href`` attribute.
        """
        # Imported here because a bare ``import urllib`` does not load the
        # ``parse`` submodule; ``urllib.parse`` would only resolve if some other
        # import happened to load it first.
        from urllib.parse import urljoin

        listobject = BeautifulSoup(html, 'lxml')
        href = listobject.find_all('a', {"class": 'page_2'})[0].attrs['href']
        # Strip the trailing offset number from the pager link to get the
        # prefix, however many digits that number has.
        urlone = href.rstrip('0123456789')
        return [urljoin(url, urlone + str(num)) for num in range(0, 100, 10)]

    def main():
        """Download every page of the Maoyan board and print each movie row.

        The index page is fetched first to discover the page URLs; each page is
        then downloaded and parsed, and all collected rows are printed once the
        last page has been processed.
        """
        index_url = "http://maoyan.com/board/4"
        index_page = download(index_url)
        # download() returns None when the request fails; without the index
        # page there are no page links to follow.
        if index_page is None:
            print('Could not download the board index:', index_url)
            return

        contents = []
        for page_url in getUrl(index_url, index_page.text):
            page = download(page_url)
            if page is None:
                # Keep the rows from the other pages instead of aborting.
                print('Skipping page that could not be downloaded:', page_url)
                continue
            # The caption is the same board title on every page; only the rows
            # are collected.
            _, rows = tiqu(page.text)
            contents.extend(rows)

        for con in contents:
            print(con)

    # Start the scraper only when this file is executed as a script.
    if __name__ == "__main__":
        main()

    代码贴到页面上之后缩进丢失,看起来效果不好,还是上传文件吧。

    链接:https://pan.baidu.com/s/1VBFH4RUagRMgIINT30xP7g 密码:tydw

    相关文章

      网友评论

          本文标题:爬取猫眼top100榜单电影

          本文链接:https://www.haomeiwen.com/subject/zfhufftx.html