
Python Crawler - Scraping with Regular Expressions

Author: 莫名ypc | Published 2019-01-03 10:26
    
    import requests
    import re
    import json
    import os
    
    
    # Fetch a single page and return its HTML text
    def get_page(url):
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
        }
        response = requests.get(url=url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    
    
    # Fetch a binary resource (e.g. an image) and return its raw bytes
    def get_resource(url):
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
        }
        response = requests.get(url=url, headers=headers)
        if response.status_code == 200:
            return response.content
        return None
    
    
    # Download an image and save it under ./images/
    def save_pic(url):
        img_content = get_resource(url)
        if img_content is None:  # skip images that failed to download
            return
        # drop the "@..." size suffix Maoyan appends to poster URLs
        file_name = url.split('/')[-1].split("@")[0]
        os.makedirs('./images', exist_ok=True)  # make sure the target directory exists
        with open('./images/%s' % file_name, 'wb') as f:
            f.write(img_content)
    
    
    # Fetch and parse every page of the board (offset-based paging, 10 movies per page)
    def get_all_pages():
        result_all_list = []
        for i in range(10):
            page = i * 10
            url = 'http://maoyan.com/board/4?offset=%d' % page
            html = get_page(url)
            if html is None:  # skip pages that failed to download
                continue
            result_list = parse_page(html)
            result_all_list.extend(result_list)
        return result_all_list
    
    
    # Strip leading/trailing whitespace from every string in a list
    def strips(l):
        result_list = []
        for item in l:
            result_list.append(item.strip())
        return result_list
    
    
    # Extract the (integer, fraction) parts of a score from a <p class="score"> fragment
    def parse_score(score_html):
        pattern = re.compile('<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>', re.S)
        score = re.findall(pattern, score_html)
        return score
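
    # Example of what parse_score returns (hand-written snippet, for illustration only):
    #   parse_score('<i class="integer">9.</i><i class="fraction">5</i>')
    #   -> [('9.', '5')]; ''.join(('9.', '5')) then gives '9.5'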
    
    
    # Save the result list to maoyan.json as JSON
    def save_json(result_list):
        result_json_str = json.dumps(result_list, ensure_ascii=False)
        with open('maoyan.json', 'w', encoding='utf-8') as f:
            f.write(result_json_str)
    
    
    # Parse one page of board HTML with regular expressions
    def parse_page(html):
        result_list = []
    
        pattern = re.compile('<p class="star">(.*?)</p>', re.S)
        actors = re.findall(pattern, html)
        actors = strips(actors)
    
        # Movie title, taken from the alt attribute of the poster <img>
        pattern = re.compile('movieId.*?>.*?<img.*?<img.*?alt="(.*?)" class.*?', re.S)
        movie_names = re.findall(pattern, html)
        movie_names = strips(movie_names)
    
        pattern = re.compile('<p class="releasetime">(.*?)</p>', re.S)
        release_time = re.findall(pattern, html)
        release_time = strips(release_time)
    
        # Ranking index on the board
        pattern = re.compile('<i class="board-index board-index-.*?">(.*?)</i>', re.S)
        index = re.findall(pattern, html)
        index = strips(index)
    
        # Detail-page link
        pattern = re.compile('<p class="name">.*?<a href="(.*?)"', re.S)
        link = re.findall(pattern, html)
        link = strips(link)
    
        # Poster image links (each one is also downloaded by save_pic)
        pattern = re.compile('movieId.*?>.*?<img.*?<img data-src="(.*?)"', re.S)
        img_link = re.findall(pattern, html)
        for pic_url in img_link:
            save_pic(pic_url)
    
        pattern = re.compile('<p class="score">(.*?)</p>', re.S)
        score = re.findall(pattern, html)
        score = strips(score)
        score_list = []
        for score_html in score:
            scores = parse_score(score_html)
            score = ''.join(scores[0])
            score_list.append(score)
    
        # Assemble the result list
        for i in range(len(movie_names)):
            item = {}
            item['title'] = movie_names[i]
            item['actor'] = actors[i]
            item['release_time'] = release_time[i]
            item['index'] = index[i]
            item['link'] = link[i]
            item['img_link'] = img_link[i]
            item['score_list'] = score_list[i]
            result_list.append(item)
    
        return result_list
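
    # Each dict in result_list looks roughly like this (field values are illustrative):
    #   {'title': '...', 'actor': '主演：...', 'release_time': '上映时间：...',
    #    'index': '1', 'link': '/films/...', 'img_link': 'http://.../...jpg',
    #    'score_list': '9.5'}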
    
    
    def main():
        # url = 'http://maoyan.com/board/4'
        # # fetch and parse just a single page
        # html = get_page(url)
        # result_list = parse_page(html)
        result_list = get_all_pages()
        save_json(result_list)
    
    
    if __name__ == '__main__':
        main()
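
All of the patterns above rely on the same two ideas: non-greedy .*? capture groups, and the re.S flag, which lets . match newlines so one pattern can span several lines of HTML. A minimal, self-contained sketch of that idea, run against a hand-written fragment (not a real Maoyan response), might look like this:

    import re

    # Hand-written fragment shaped like one board entry (illustrative only)
    html = '''
    <dd>
        <i class="board-index board-index-1">1</i>
        <p class="name"><a href="/films/1203">霸王别姬</a></p>
        <p class="star">
            主演：张国荣,张丰毅,巩俐
        </p>
        <p class="releasetime">上映时间：1993-01-01</p>
    </dd>
    '''

    # Without re.S, '.' would stop at the newline inside <p class="star"> and the
    # second pattern would find nothing.
    name = re.findall(re.compile('<p class="name"><a href=".*?">(.*?)</a>', re.S), html)
    star = re.findall(re.compile('<p class="star">(.*?)</p>', re.S), html)
    print(name)                         # ['霸王别姬']
    print([s.strip() for s in star])    # ['主演：张国荣,张丰毅,巩俐']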
    
