Web Crawler: urllib Module Application 7 -- Lagou

Author: 牛耀 | Published 2018-12-23 14:44
    # Target URL: https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false
    # Form data to submit with the POST request
    from urllib import request, parse
    import json, pymysql, time
    
    
    
    def lagouspider(url, formdata):
        # Send the request and get the response body
        response_data = load_page_data(url, formdata)
        # The response body is JSON text; parse it into a Python dict
        data = json.loads(response_data)
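        # A successful response is roughly shaped like this (sketch, trimmed):
        # {'success': True, 'content': {'pageNo': 1, 'pageSize': 15,
        #   'positionResult': {'totalCount': ..., 'result': [...]}}}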
        if data['success']:
            print('Request succeeded')
            # Extract the job listings
            positionJobs = data['content']['positionResult']['result']
            for jobinfo in positionJobs:
                jobdata = {}
                jobdata['positionName'] = jobinfo['positionName']
                jobdata['publishTime'] = jobinfo['formatCreateTime']
                jobdata['companyname'] = jobinfo['companyShortName']
                jobdata['salary'] = jobinfo['salary']
                jobdata['workYear'] = jobinfo['workYear']
                jobdata['education'] = jobinfo['education']
                jobdata['industry'] = jobinfo['industryField']
                jobdata['stage'] = jobinfo['financeStage']
                jobdata['companySize'] = jobinfo['companySize']
                jobdata['fuli'] = ','.join(jobinfo['companyLabelList'])  # benefits
                # positionAdvantage is already a plain string in the response,
                # so joining it would split it into single characters
                jobdata['positionAdvantage'] = jobinfo['positionAdvantage']
                # Save the record to the database
                save_data_to_db(jobdata)

            # Decide whether another request is needed
            # Current page number
            cur_page = int(data['content']['pageNo'])
            # Results per page
            page_size = int(data['content']['pageSize'])
            # Total number of positions
            totalcount = int(data['content']['positionResult']['totalCount'])
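            # e.g. with pageNo=3, pageSize=15 and totalCount=450:
            # 3 * 15 = 45 < 450, so page 4 still needs to be fetched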
            if cur_page * page_size < totalcount:
                next_page = cur_page + 1
                print('Requesting page ' + str(next_page))
                formdata['pn'] = next_page
                lagouspider(url, formdata)
        else:
            print('Request failed; retrying in 10 seconds')
            time.sleep(10)
            print('Retrying page ' + str(formdata['pn']))
            lagouspider(url, formdata)
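            # Note: paging and retrying through recursion is simple, but a very
            # long crawl can hit Python's default recursion limit; a while loop
            # would be the more robust choice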


    def load_page_data(url, formdata):
        """
        发起请求
        :param url:
        :param formdata:
        :return:
        """
        # Encode the form data into URL-encoded bytes the web server can parse
        form_data = parse.urlencode(formdata).encode('utf-8')
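        # e.g. parse.urlencode({'first': 'true', 'pn': 1, 'kd': 'c++'})
        # returns 'first=true&pn=1&kd=c%2B%2B'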
        req_header = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
            'Referer':'https://www.lagou.com/jobs/list_c%2B%2B?labelWords=&fromSearch=true&suginput=',
        }
        # Build a Request object; pass the encoded bytes, not the raw dict
        req = request.Request(url, headers=req_header, data=form_data)
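        # Supplying data= makes urllib send a POST instead of a GET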
    
        # Send the request built above
        response = request.urlopen(req)
        if response.status == 200:
            return response.read().decode('utf-8')
    
    def save_data_to_db(jobdata):
        """
        存储数据
        :param jobdata:
        :return:
        """
        sql = """
        INSERT INTO lagou(%s)
        VALUE (%s)
        """%(','.join(jobdata.keys()),','.join(['%s']*len(jobdata)))
        try:
            cursor.execute(sql,list(jobdata.values()))
            mysql_client.commit()
        except Exception as err:
            print(err)
            mysql_client.rollback()
    if __name__ == '__main__':
    
        # Create the database connection
        """
         host=None, user=None, password="",
                     database=None, port=0, unix_socket=None,
                     charset=''
        """
        mysql_client = pymysql.Connect('127.0.0.1','root','18603503110','1712B',3306,charset='utf8')
        # Create a cursor (used to execute SQL statements)
        cursor = mysql_client.cursor()
        # Target URL
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    
        # Form parameters to submit with the POST
        formdata = {
            'first':'true',
            'pn':1,
            'kd':'c++',
        }
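        # 'pn' is the page number and 'kd' the search keyword;
        # 'first' marks the initial request of a search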
        lagouspider(url, formdata)
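The INSERT in save_data_to_db assumes a lagou table already exists whose columns match the keys of jobdata. Here is a minimal one-off setup sketch, assuming VARCHAR columns and the same 1712B database as above (the column types are my guess, not part of the original code):

    import pymysql

    # Hypothetical setup script; column names mirror the jobdata keys,
    # column types are assumptions
    client = pymysql.Connect(host='127.0.0.1', user='root',
                             password='18603503110', database='1712B',
                             port=3306, charset='utf8')
    columns = ['positionName', 'publishTime', 'companyname', 'salary',
               'workYear', 'education', 'industry', 'stage',
               'companySize', 'fuli', 'positionAdvantage']
    ddl = 'CREATE TABLE IF NOT EXISTS lagou (id INT PRIMARY KEY AUTO_INCREMENT, %s)' % (
        ', '.join('%s VARCHAR(255)' % col for col in columns))
    with client.cursor() as cursor:
        cursor.execute(ddl)
    client.commit()
    client.close()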