美文网首页
spider - 猫眼电影top100

spider - 猫眼电影top100

作者: 憧憬001 | 来源:发表于2019-02-18 17:37 被阅读0次

    一、获取网页

    import requests
    
    # 获取网页
    # Fetch the Top-100 board page and return its HTML.
    def get_page():
        """Download https://maoyan.com/board/4 and return the decoded HTML.

        Returns the UTF-8 page text on HTTP 200, otherwise None.
        """
        # Some anti-scraping checks inspect the User-Agent header.
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
        }
        url = 'https://maoyan.com/board/4'
        resp = requests.get(url, headers=headers)
        if resp.status_code != 200:
            return None
        return resp.content.decode('utf-8')
    
    def main():
        """Entry point: download the board page and dump the raw HTML."""
        print(get_page())


    if __name__ == '__main__':
        main()
    

    解析网页

    • 在main()上面再定义一个parse_page()
    import requests
    import re
    
    # 获取网页
    # Fetch the Top-100 board page and return its HTML.
    def get_page():
        """Download https://maoyan.com/board/4 and return the decoded HTML.

        Returns:
            The UTF-8 page text on HTTP 200, otherwise None.
        """
        # Some anti-scraping checks inspect the User-Agent header.
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
        }
        url = 'https://maoyan.com/board/4'
        # BUG FIX: headers was built but never sent with the request,
        # so the anti-bot User-Agent had no effect.
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.content.decode('utf-8')
        return None
    
    # 解析网页
    def parse_page(html):
        # 片名
        pattern = re.compile('movieId.*?>.*?<img.*?<img.*?alt="(.*?)" class.*?', re.S)
        movie_names = re.findall(pattern, html)
        print('片名:',movie_names)
    
        # 主演
        pattern = re.compile('<p class="star">(.*?)</p>', re.S)
        actors = re.findall(pattern,html)
        # 去字符串前后空格
        actors = [actor.strip() for actor in actors]
        print('主演:',actors)
    
        # 上映时间
        pattern = re.compile('<p class="releasetime">(.*?)</p>', re.S)
        time = re.findall(pattern, html)
        time = [i.strip() for i in time]
        print('上映时间:',time)
    
        # 封面图片
        pattern = re.compile('movieId.*?>.*?<img.*?<img.*?data-src="(.*?)" alt.*?', re.S)
        img = re.findall(pattern, html)
        print('封面图片:',img)
    
        # 排名
        pattern = re.compile('<i class="board-index board-index-(.*?)">.*?</i>', re.S)
        rank = re.findall(pattern, html)
        print('排名:',rank)
    
        # 评分
        pattern = re.compile('<p class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></p>', re.S)
        score = re.findall(pattern, html)
        score = [''.join(i) for i in score]
        print('评分:', score)
    
    
        # 链接
        pattern = re.compile('<div class="movie-item-info">.*?<p class="name"><a href="(.*?)" title', re.S)
        detail = re.findall(pattern, html)
        detail = [i.strip() for i in detail]
        print('链接:', detail)
    
    
    def main():
        """Entry point: download the board page and print the parsed fields."""
        parse_page(get_page())


    if __name__ == '__main__':
        main()
    
    # 打印的内容
    >>>>
    片名: ['霸王别姬', '肖申克的救赎', '罗马假日', '这个杀手不太冷', '泰坦尼克号', '教父', '唐伯虎点秋香', '千与千寻', '魂断蓝桥', '乱世佳人']
    主演: ['主演:张国荣,张丰毅,巩俐', '主演:蒂姆·罗宾斯,摩根·弗里曼,鲍勃·冈顿', '主演:格利高里·派克,奥黛丽·赫本,埃迪·艾伯特', '主演:让·雷诺,加里·奥德曼,娜塔莉·波特曼', '主演:莱昂纳多·迪卡普里奥,凯特·温丝莱特,比利·赞恩', '主演:马龙·白兰度,阿尔·帕西诺,詹姆斯·肯恩', '主演:周星驰,巩俐,郑佩佩', '主演:柊瑠美,入野自由,夏木真理', '主演:费雯·丽,罗伯特·泰勒,露塞尔·沃特森', '主演:费雯·丽,克拉克·盖博,奥利维娅·德哈维兰']
    上映时间: ['上映时间:1993-01-01', '上映时间:1994-10-14(美国)', '上映时间:1953-09-02(美国)', '上映时间:1994-09-14(法国)', '上映时间:1998-04-03', '上映时间:1972-03-24(美国)', '上映时间:1993-07-01(中国香港)', '上映时间:2001-07-20(日本)', '上映时间:1940-05-17(美国)', '上映时间:1939-12-15(美国)']
    封面图片: ['https://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/283292171619cdfd5b240c8fd093f1eb255670.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/54617769d96807e4d81804284ffe2a27239007.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/e55ec5d18ccc83ba7db68caae54f165f95924.jpg@160w_220h_1e_1c', 'https://p1.meituan.net/movie/0699ac97c82cf01638aa5023562d6134351277.jpg@160w_220h_1e_1c', 'https://p1.meituan.net/movie/f5a924f362f050881f2b8f82e852747c118515.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/da64660f82b98cdc1b8a3804e69609e041108.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/b076ce63e9860ecf1ee9839badee5228329384.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/46c29a8b8d8424bdda7715e6fd779c66235684.jpg@160w_220h_1e_1c', 'https://p0.meituan.net/movie/230e71d398e0c54730d58dc4bb6e4cca51662.jpg@160w_220h_1e_1c']
    排名: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    评分: ['9.6', '9.5', '9.1', '9.5', '9.6', '9.3', '9.2', '9.3', '9.2', '9.1']
    链接: ['/films/1203', '/films/1297', '/films/2641', '/films/4055', '/films/267', '/films/1247', '/films/837', '/films/1212', '/films/2760', '/films/7431']
    

    下面附完整代码(将爬取到的内容存到本地)

    import json
    import os
    import re
    
    import requests
    
    
    # 获取网页
    # Fetch one page of the Top-100 board.
    def get_page(page):
        """Download one board page.

        Args:
            page: ranking offset (0, 10, 20, ..., 90).

        Returns:
            The UTF-8 page text on HTTP 200, otherwise None.
        """
        # Some anti-scraping checks inspect the User-Agent header.
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
        }
        url = 'https://maoyan.com/board/4?offset=' + str(page)
        # BUG FIX: headers was built but never sent with the request,
        # so the anti-bot User-Agent had no effect.
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.content.decode('utf-8')
        return None
    
    # 获取所有 网页
    def get_all_pages():
        result = []
        for i in range(10):
            page = i*10
            html = get_page(page)
            result_list = parse_page(html)
            result.append(result_list)
    
        return result
    
    # 写入图片
    def save_img(cover_url):
        response = requests.get(cover_url)
        filename = cover_url.split('/')[-1].split('@')[0]
    
        with open('./images/%s' % filename,'wb')as f:
            f.write(response.content)
    
    # 解析网页
    def parse_page(html):
        # 片名
        pattern = re.compile('movieId.*?>.*?<img.*?<img.*?alt="(.*?)" class.*?', re.S)
        movie_names = re.findall(pattern, html)
        # print('片名:',movie_names)
    
        # 主演
        pattern = re.compile('<p class="star">(.*?)</p>', re.S)
        actors = re.findall(pattern,html)
        # 去字符串前后空格
        actors = [actor.strip() for actor in actors]
        # print('主演:',actors)
    
        # 上映时间
        pattern = re.compile('<p class="releasetime">(.*?)</p>', re.S)
        time = re.findall(pattern, html)
        time = [i.strip() for i in time]
        # print('上映时间:',time)
    
        # 封面图片
        pattern = re.compile('movieId.*?>.*?<img.*?<img.*?data-src="(.*?)" alt.*?', re.S)
        img = re.findall(pattern, html)
    
        # print('封面图片:',img)
    
        # 排名
        pattern = re.compile('<i class="board-index board-index-(.*?)">.*?</i>', re.S)
        rank = re.findall(pattern, html)
        # print('排名:',rank)
    
        # 评分
        # '<p class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></p>'
        pattern = re.compile('<p class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></p>', re.S)
        score = re.findall(pattern, html)
        score = [''.join(i) for i in score]
        # print('评分:', score)
    
    
        # 链接
        # '<div class="movie-item-info">.*?<p class="name"><a href="(.*?)" title'
        pattern = re.compile('<div class="movie-item-info">.*?<p class="name"><a href="(.*?)" title', re.S)
        detail = re.findall(pattern, html)
        detail = [i.strip() for i in detail]
        # print('链接:', detail)
    
        print('spider...')
    
        # 组装json
        result_list = []
    
        for i in range(len(movie_names)):
            result_dict = {}
            result_dict['movie_name'] = movie_names[i]
            result_dict['actor'] = actors[i]
            result_dict['time'] = time[i]
            result_dict['img'] = img[i]
            # 保存图片到本地
            save_img(result_dict['img'])
            result_dict['rank'] = rank[i]
            result_dict['score'] = score[i]
            result_dict['detail'] = detail[i]
            result_list.append(result_dict)
    
        return result_list
    
    
    # Dump the crawl result to maoyan.json in the working directory.
    def save_json_file(result):
        """Serialize *result* to ./maoyan.json without ASCII-escaping CJK."""
        with open('maoyan.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(result, ensure_ascii=False))
    
    
    def main():
        """Crawl all pages, print the combined result and save it as JSON."""
        result = get_all_pages()
        print(result)
        save_json_file(result)


    if __name__ == '__main__':
        main()
    
    
    
    
    
    
    

    相关文章

      网友评论

          本文标题:spider - 猫眼电影top100

          本文链接:https://www.haomeiwen.com/subject/enczeqtx.html