美文网首页
网络爬虫:urllib模块应用8--猫眼

网络爬虫:urllib模块应用8--猫眼

作者: 牛耀 | 来源:发表于2018-12-23 14:47 被阅读0次
    # 1: 分析网站,找到目标url,判断是否是静态页面
    # https://maoyan.com/board/4?offset=0
    # https://maoyan.com/board/4?offset=10
    from urllib import parse,request
    import re,pymysql
    
    def maoyanSpider(url):
        """
        Crawl the Maoyan top-100 board page by page.

        Fetches each paginated url, extracts the movie records, prints them,
        and advances the ``offset`` query parameter by 10 until a page
        yields no movies (or a request fails).

        :param url: first paging url, e.g. https://maoyan.com/board/4?offset=0
        :return: None
        """
        # raw strings: '\d' in a plain string is an invalid escape
        # (SyntaxWarning on py3.12+); compiled once, outside the loop
        offset_pattern = re.compile(r'.*?offset=(\d+)')
        sub_pattern = re.compile(r'offset=\d+')
        # iterate instead of recursing: one stack frame per page would
        # eventually hit the recursion limit on a long board
        while True:
            page = load_page_data(url)
            if page is None:
                # non-200 response: stop instead of crashing on unpacking None
                print('结束')
                return
            html, current_url = page
            movies = parse_page_data(html)
            if not movies:
                print('结束')
                return
            for movie in movies:
                movie_data = {}
                # board ranking
                movie_data['rank'] = int(movie[0])
                # cover image url
                movie_data['coverImage'] = movie[1]
                # movie title
                movie_data['name'] = movie[2]
                # cast; strip the layout whitespace the page embeds
                movie_data['actor'] = movie[3].replace('\n', '').replace(' ', '')
                # release date; drop the "上映时间" ("release time") label
                movie_data['publishTime'] = movie[4].replace('上映时间', '')
                # score arrives split as integer part ("9.") + fraction ("6")
                movie_data['scorenum'] = float(movie[5] + movie[6])
                # save_data_to_db(movie_data)
                print(movie_data)
            # build the next page url by bumping the current offset by 10
            current_offset = int(offset_pattern.findall(current_url)[0])
            url = sub_pattern.sub('offset=' + str(current_offset + 10), current_url)
    
    def load_page_data(url):
        """
        Fetch ``url`` and return its decoded body plus the final url.

        :param url: page url to request
        :return: tuple ``(html, final_url)`` on HTTP 200, ``None`` otherwise
        """
        req_header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
            # NOTE(review): this Referer points at lagou.com, not maoyan.com —
            # looks copied from another spider; confirm it is intentional
            'Referer': 'https://www.lagou.com/jobs/list_c%2B%2B?labelWords=&fromSearch=true&suginput=',
        }
        # build a Request object so custom headers can be attached
        req = request.Request(url, headers=req_header)
        # context manager closes the socket even if read/decode raises
        with request.urlopen(req) as response:
            if response.status == 200:
                # decode the raw bytes; maoyan serves utf-8
                return response.read().decode('utf-8'), response.url
        # non-200 status: make the implicit None return explicit for callers
        return None
    
    def parse_page_data(html):
        """
        Pull the movie fields out of one board page's html.

        :param html: page source of a Maoyan board page
        :return: list of 7-tuples: (rank, cover image url, title, actors,
                 release time, score integer part, score fraction part)
        """
        # one <dd> block per movie; re.S lets .*? cross newlines
        movie_pattern = re.compile(
            '<dd>.*?<i.*?>(.*?)</i>'
            '.*?<img.*?data-src="(.*?)"'
            '.*?<p.*?>.*?<a.*?>(.*?)</a>'
            '.*?<p.*?>(.*?)</p>'
            '.*?<p.*?>(.*?)</p>'
            '.*?<i.*?>(.*?)</i>'
            '.*?<i.*?>(.*?)</i>.*?</dd>',
            re.S,
        )
        return movie_pattern.findall(html)
    
    def save_data_to_db(movieInfo):
        """
        Persist one movie record to the database.

        Not implemented yet: the call site in maoyanSpider is commented out,
        so this is currently a no-op placeholder.

        :param movieInfo: dict of fields extracted for a single movie
        :return: None
        """
        pass
    
    if __name__ == '__main__':
        # NOTE(review): credentials are hard-coded in source — move them to
        # a config file or environment variables before sharing this script.
        # Keyword arguments: positional parameter order for pymysql's
        # connect() has varied across versions, so name each one explicitly.
        mysql_client = pymysql.Connect(
            host='127.0.0.1',
            user='root',
            password='18603503110',
            database='1712B',
            port=3306,
            charset='utf8',
        )
        # cursor for executing SQL statements
        cursor = mysql_client.cursor()
        try:
            start_url = 'https://maoyan.com/board/4?offset=0'
            maoyanSpider(start_url)
        finally:
            # always release the DB resources, even if the spider raises
            cursor.close()
            mysql_client.close()
    

    相关文章

      网友评论

          本文标题:网络爬虫:urllib模块应用8--猫眼

          本文链接:https://www.haomeiwen.com/subject/silnkqtx.html