Web Crawler: urllib Module Application 6 -- Zhilian

Author: 牛耀 | Published 2018-12-23 14:43
    from urllib import request
    import json, pymysql, re
    def zhilianSpider(url):
        # Fetch one page of search results from the Zhilian API and save
        # every job posting it contains.
        response_data = load_page_data(url)
        data = json.loads(response_data)
        if data['code'] == 200:
            print('request succeeded')
            positionJobs = data['data']['results']

            for job in positionJobs:
                zhilian_job = {}
                zhilian_job['jobName'] = job['jobName']
                zhilian_job['salary'] = job['salary']
                zhilian_job['city'] = job['city']['display']
                zhilian_job['workingExp'] = job['workingExp']['name']
                zhilian_job['eduLevel'] = job['eduLevel']['name']
                zhilian_job['welfare'] = ' '.join(job['welfare'])
                zhilian_job['company'] = job['company']['name']
                zhilian_job['companyType'] = job['company']['type']['name']
                zhilian_job['people'] = job['company']['size']['name']
                save_data_to_db(zhilian_job)
                print('row inserted')
                # Unfinished in the original: scrape each company's detail page.
                # company_url = job['company']['url']
                # html = load_page_data(company_url)
                # company_page_data(html)
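
zhilianSpider only fetches the single page it is given. A minimal pagination sketch, assuming the fe-api endpoint accepts a start offset alongside pageSize (an assumption; the original only ever requests one page):

    def zhilianSpiderPaged(base_url, pages=3, page_size=90):
        # 'start' as the offset parameter is a guess; adjust it if the
        # real API pages differently.
        for page in range(pages):
            zhilianSpider('%s&start=%d' % (base_url, page * page_size))
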
    
    def company_page_data(html):
        # Extract company fields (a URL plus six span texts) from the company
        # detail page with one regex; re.S lets '.' match across newlines.
        pattern = re.compile(
            '<div.*?class="mian-company">'
            '.*?<div.*?url.*?"(.*?)"'
            '.*?<span.*?>(.*?)</span>'
            '.*?<span.*?>(.*?)</span>'
            '.*?<span.*?>(.*?)</span>'
            '.*?<span.*?>(.*?)</span>'
            '.*?<span.*?>(.*?)</span>'
            '.*?<p.*?style="text-indent.*?>'
            '.*?<span.*?>(.*?)</span>',
            re.S
        )
        result = pattern.findall(html)
        print(result)
        return result
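
pattern.findall returns one 7-tuple per company block: the URL plus six span texts. A sketch that labels those positions, with hypothetical field names since the original never says which span holds what:

    def company_tuple_to_dict(match):
        # Key names are placeholders, not taken from the original post.
        keys = ('url', 'span1', 'span2', 'span3', 'span4', 'span5', 'intro')
        return dict(zip(keys, match))
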
    def save_data_to_db(zhilian_job):
        # Column names come from the dict keys; the values are passed as
        # query parameters so pymysql escapes them.
        sql = """
        INSERT INTO zhilian(%s)
        VALUES (%s)
        """ % (','.join(zhilian_job.keys()), ','.join(['%s'] * len(zhilian_job)))
        try:
            cursor.execute(sql, list(zhilian_job.values()))
            mysql_client.commit()
        except Exception as err:
            print(err)
            mysql_client.rollback()
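
save_data_to_db assumes a zhilian table whose columns match the keys built in zhilianSpider. A possible schema, reconstructed from those keys (the original post never shows the DDL, so the column types are assumptions):

    def create_table(cursor):
        # Column names mirror zhilian_job's keys; VARCHAR sizes are guesses.
        cursor.execute("""
        CREATE TABLE IF NOT EXISTS zhilian(
            jobName VARCHAR(100),
            salary VARCHAR(50),
            city VARCHAR(50),
            workingExp VARCHAR(50),
            eduLevel VARCHAR(50),
            welfare VARCHAR(255),
            company VARCHAR(100),
            companyType VARCHAR(50),
            people VARCHAR(50)
        ) DEFAULT CHARSET=utf8
        """)
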
    def load_page_data(url):
        # Download a URL with a browser User-Agent and return the body text.
        req_header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
        }
        req = request.Request(url, headers=req_header)
        response = request.urlopen(req)
        if response.status == 200:
            return response.read().decode('utf-8')
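
load_page_data returns None on a non-200 status and raises on network errors. A hedged wrapper with a simple retry, using urllib's standard error classes (not part of the original):

    from urllib import error
    import time

    def load_page_data_safe(url, retries=3):
        # Retry transient failures, then give up and return None.
        for _ in range(retries):
            try:
                return load_page_data(url)
            except (error.HTTPError, error.URLError) as err:
                print(err)
                time.sleep(1)
        return None
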
    
    
    if __name__ == '__main__':
        # pymysql 1.0+ requires keyword arguments for Connect.
        mysql_client = pymysql.Connect(host='127.0.0.1', user='root', password='18603503110',
                                       database='1712B', port=3306, charset='utf8')
        # Create the cursor used to execute SQL statements.
        cursor = mysql_client.cursor()
        url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=489&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=%E6%8A%80%E6%9C%AF&kt=3&_v=0.41792226&x-zp-page-request-id=99f4ba4b537c448e831a297ae4de73f9-1545304025814-219164'
        zhilianSpider(url)
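        # Added: close the cursor and connection when the crawl finishes
        # (the original never releases them).
        cursor.close()
        mysql_client.close()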
    
