A Douban Movie Scraper Based on bs4 + requests

Author: 潇洒坤 | Published 2018-07-18 16:43

    1. Parsing the detail pages of the Douban Top 250 movies

    The detail pages of the Douban Top 250 movies have been saved as 250 .html files. Download link for the archive: https://pan.baidu.com/s/1_zlZJQJtl9pPEJUGYVMYaw  password: ehrq
    After extracting, name the folder doubanSourcePages, copy the code below into a .py file, and put that file in the same directory as the doubanSourcePages folder.

    from bs4 import BeautifulSoup as bs
    import re
    import pandas as pd

    def cssFind(movie, cssSelector, nth=1):
        # Return the stripped text of the nth match of cssSelector, or '' if there are fewer matches.
        found = movie.select(cssSelector)
        if len(found) >= nth:
            return found[nth-1].text.strip()
        else:
            return ''

    def reFind(pattern, sourceStr, nth=1):
        # Return the nth regex match in sourceStr, or '' if there are fewer matches.
        found = re.findall(pattern, sourceStr)
        if len(found) >= nth:
            return found[nth-1]
        else:
            return ''

    if __name__ == "__main__":
        movie_list = []
        for i in range(1, 251):
            print("Parsing the detail page of movie ranked %d" % i)
            fileName = "doubanSourcePages/%03d.html" % i
            try:
                with open(fileName, encoding='utf8') as file:
                    soup = bs(file, 'lxml')
                movie = {}
                movie['得分'] = cssFind(soup, 'strong[class="ll rating_num"]')    # rating
                movie['片名'] = cssFind(soup, 'span[property="v:itemreviewed"]')  # title
                # The info block lists director, writer, genre, etc., one "key: value" pair per line.
                info = cssFind(soup, "div[id='info']")
                for item in info.split('\n')[:9]:
                    if ':' not in item:
                        continue
                    # Split on the first colon only, so values that contain colons (e.g. URLs) stay intact.
                    key, value = item.split(':', 1)
                    movie[key.strip()] = value.strip()
                movie['标签'] = ','.join([k.text for k in soup.select("div.tags a")])  # user tags
                movie['图片链接'] = soup.select('a.nbgnbg img')[0]['src']              # poster image URL
                movie_list.append(movie)
            except Exception:
                print("Failed to parse the detail page of movie ranked %d" % i)
                movie_list.append({})
        df = pd.DataFrame(movie_list, columns=movie_list[0].keys())
        df.to_excel("豆瓣电影详情信息.xlsx")
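
    As a quick sanity check, cssFind can be exercised on a tiny HTML fragment. This is a minimal sketch, assuming the cssFind definition above is in scope; the fragment and values are made up for illustration:

    from bs4 import BeautifulSoup as bs

    # Hypothetical fragment mimicking the rating markup on a Douban detail page.
    html = '<div><strong class="ll rating_num">9.7</strong></div>'
    soup = bs(html, 'lxml')
    print(cssFind(soup, 'strong[class="ll rating_num"]'))    # -> '9.7'
    print(cssFind(soup, 'span[property="v:itemreviewed"]'))  # -> '' (no match, no exception)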
    

    2. Saving the detail pages to disk

    The code is as follows:

    from bs4 import BeautifulSoup as bs
    import requests
    from time import sleep

    # Douban rejects the default python-requests User-Agent, so send a browser-like one.
    headers = {'User-Agent': 'Mozilla/5.0'}

    def save_webPage(url, fileName):
        # Download url, persist it to fileName, and return the page text for further parsing.
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        with open(fileName, 'w', encoding='utf-8') as file:
            file.write(response.text)
        return response.text

    if __name__ == "__main__":
        # Save the 10 list pages of the Top 250, then every movie's detail page, as local files.
        url_before = "https://movie.douban.com/top250?start={}"
        count = 0
        for i in range(0, 250, 25):
            url = url_before.format(i)
            fileName = "{}-{}.html".format(i+1, i+25)
            # Save the list page and reuse its text instead of requesting the same URL twice.
            pageText = save_webPage(url, fileName)
            soup = bs(pageText, 'lxml')
            movie_list = soup.select("ol.grid_view li")
            for movie in movie_list:
                nextUrl = movie.select("div.hd a")[0]['href']
                count += 1
                fileName = "%03d.html" % count
                print("Saving the detail page of movie ranked %d to disk" % count)
                save_webPage(nextUrl, fileName)
            sleep(3)  # be polite: pause between list pages
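
    Note that this script writes all .html files to the working directory, while the parser in section 1 reads them from a doubanSourcePages folder. One way to keep the two in sync is to create that folder up front and save into it; a minimal sketch using the standard os module (the folder name comes from section 1):

    import os

    folder = "doubanSourcePages"        # folder name expected by the parsing script in section 1
    os.makedirs(folder, exist_ok=True)  # create it if it does not exist yet
    fileName = os.path.join(folder, "%03d.html" % 1)  # e.g. doubanSourcePages/001.html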
    

    3. Randomly splitting people into groups

    import random

    def getGroup(lt, n):
        # Randomly split the items of lt into n groups whose sizes differ by at most 1.
        # Note: lt is emptied in the process.
        lt_len = len(lt)
        left = lt_len % n    # how many groups receive one extra member
        m = lt_len // n      # base group size
        group_number_list = [m] * (n-left) + [m+1] * left
        random.shuffle(group_number_list)  # randomize which groups get the extra member
        group_list = []
        print(group_number_list)
        for group_number in group_number_list:
            group = random.sample(lt, group_number)  # draw members without replacement
            print(group)
            for i in group:
                lt.remove(i)
            group_list.append(group)
        return group_list

    if __name__ == "__main__":
        name_str = "陶宇,王燕琪,雷杰,韦民童,余鹏,李波,雷坤,"\
            "石月,丁松,郑志杰,陶雨,程韶曦,葛振刚,王雪虎,李响,仲雯,王海宾"
        name_list = name_str.split(',')
        getGroup(name_list, 4)
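
    An equivalent, non-destructive variant shuffles a copy once and slices it, which avoids the repeated lt.remove calls. A minimal sketch; getGroup2 is a hypothetical name, not part of the original code:

    import random

    def getGroup2(lt, n):
        # Split lt into n near-equal random groups without mutating the input.
        shuffled = random.sample(lt, len(lt))  # a shuffled copy of lt
        m, left = divmod(len(lt), n)           # base size and number of groups with one extra member
        group_list, start = [], 0
        for k in range(n):
            size = m + (1 if k < left else 0)  # the first `left` groups take the extra member
            group_list.append(shuffled[start:start+size])
            start += size
        return group_list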
    
