美文网首页
抓取猫眼电影 Code

抓取猫眼电影 Code

作者: 其徐如林 | 来源:发表于2018-09-01 22:56 被阅读0次
    import json, requests, re
    from datetime import time
    
    def get_one_page(url):
        """Download ``url`` and return its body as text.

        The request is sent with a fixed, browser-like ``User-Agent`` header.

        Args:
            url: Address of the page to fetch.

        Returns:
            The response text when the server replies with HTTP 200;
            ``None`` for any other status code or when the request fails.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
        }
        try:
            # Without a timeout a stalled connection would block the script forever.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as err:
            # Only request-level failures are reported and swallowed here;
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            print(err)
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    def parse_one_page(html):
        """Yield one dict per movie entry found in a board page.

        The page is scanned with a single regular expression that is built from
        per-field fragments; each ``<dd>`` entry that matches produces a dict.

        Args:
            html: Page markup as a string. ``None`` or an empty string (for
                example after a failed download) yields nothing.

        Yields:
            dict with the keys ``index``, ``image``, ``title``, ``actor``,
            ``time`` and ``score``; all values are strings.
        """
        if not html:
            # Nothing to parse; re.findall would raise TypeError on None.
            return

        # Rank text inside the board-index <i> node.
        rank = r'<dd>.*?board-index.*?>(.*?)</i>'
        # Poster image URL.
        img = r'.*?data-src="(.*?)"'
        # Movie title.
        name = r'.*?<a.*?data-val=".*?">(.*?)</a>'
        # Leading actors.
        act = r'.*?class="star">(.*?)</p>'
        # Release time.
        release = r'.*?class="releasetime">(.*?)</p>'
        # Score: integer part and fraction part live in two separate <i> nodes.
        grade = r'.*?class="integer">(.*?)</i><i.*?class=".*?">(.*?)</i>'

        # re.S lets '.' cross newlines, since one entry spans several lines.
        pattern = re.compile(rank + img + name + act + release + grade, re.S)

        for index, image, title, actor, released, whole, fraction in pattern.findall(html):
            yield {
                'index': index,
                'image': image,
                'title': title,
                # Drops the first 3 characters after stripping whitespace --
                # presumably a label prefix; TODO confirm against live markup.
                'actor': actor.strip()[3:],
                # NOTE(review): drops the first 4 characters; verify this matches
                # the full length of the label that precedes the date.
                'time': released.strip()[4:],
                'score': whole.strip() + fraction.strip()
            }
    
    def write_json(data):
        """Append ``data`` to ``movie.json`` as one JSON document per line.

        Args:
            data: Any JSON-serialisable object.
        """
        with open('movie.json', 'a', encoding='utf-8') as w:
            # The file is UTF-8, so keep non-ASCII text readable instead of
            # emitting \uXXXX escapes; the parsed value is the same either way.
            json.dump(data, w, ensure_ascii=False)
            w.write('\n')
    
    def main(offset):
        """Fetch one board page and append every parsed movie to the output file.

        Args:
            offset: Value placed in the ``offset`` query parameter of the board URL.
        """
        url = 'Http://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        if html is None:
            # The download failed or returned a non-200 status; skip this page
            # rather than passing None into the parser.
            return
        for movie in parse_one_page(html):
            write_json(movie)
    if __name__ == '__main__':
        # The module-level name ``time`` is bound to datetime.time, which has no
        # sleep(); import the real sleep function locally instead.
        from time import sleep

        # Walk ten pages, advancing the offset by 10 each time, and pause
        # two seconds after each page.
        for i in range(10):
            main(offset=i * 10)
            sleep(2)
    
    

    相关文章

      网友评论

          本文标题:抓取猫眼电影 Code

          本文链接:https://www.haomeiwen.com/subject/wdkpwftx.html