美文网首页
网络爬虫:urllib模块应用6--智联

网络爬虫:urllib模块应用6--智联

作者: 牛耀 | 来源:发表于2018-12-23 14:43 被阅读0次
from urllib import request,parse
import json,pymysql,re
def zhilianSpider(url):
    """Fetch one page of Zhilian (zhaopin.com) job-search results and save each job.

    Args:
        url: A fe-api.zhaopin.com search-API URL that returns JSON.

    Relies on the module-level helpers ``load_page_data`` (HTTP GET) and
    ``save_data_to_db`` (MySQL insert). Prints progress messages; returns None.
    """
    response_data = load_page_data(url)
    if not response_data:
        # load_page_data returns None on a non-200 response; previously this
        # fell straight into json.loads(None) and raised TypeError.
        return
    data = json.loads(response_data)
    if data['code'] == 200:
        print('请求成功')
        for job in data['data']['results']:
            zhilian_job = {
                'jobName': job['jobName'],
                'salary': job['salary'],
                'city': job['city']['display'],
                'workingExp': job['workingExp']['name'],
                # NOTE: 'sduLevel' looks like a typo for 'eduLevel', but it is
                # also the MySQL column name, so it is kept for compatibility.
                'sduLevel': job['eduLevel']['name'],
                'welfare': ' '.join(job['welfare']),
                'company': job['company']['name'],
                'companyType': job['company']['type']['name'],
                'people': job['company']['size']['name'],
            }
            save_data_to_db(zhilian_job)
            print('添加成功')

def company_page_data(html):
    """Extract company detail fields from a company-page HTML string.

    Args:
        html: Raw HTML of a Zhilian company page.

    Returns:
        A list of 7-tuples — (url, span1, span2, span3, span4, span5, intro) —
        one per matching company block; empty list if nothing matches.

    Fix: the original computed ``result`` but never returned it, so callers
    always received None; the match list is now returned (the debug print is
    kept to preserve the original console output).
    """
    pattern = re.compile(
        '<div.*?class="mian-company">'+
        '.*?<div.*?url.*?"(.*?)"'+
        '.*?<span.*?>(.*?)</span>'+
        '.*?<span.*?>(.*?)</span>'+
        '.*?<span.*?>(.*?)</span>'+
        '.*?<span.*?>(.*?)</span>'+
        '.*?<span.*?>(.*?)</span>'+
        '.*?<p.*?style="text-indent.*?>'+
        '.*?<span.*?>(.*?)</span>'
        ,re.S
    )
    result = pattern.findall(html)
    print(result)
    return result
def save_data_to_db(zhilian_job):
    """Insert one job record into the ``zhilian`` MySQL table.

    Args:
        zhilian_job: Mapping of column name -> value; keys become the INSERT
            column list, values are bound as parameters.

    Uses the module-level ``cursor`` and ``mysql_client``. Commits on success;
    on any error prints it and rolls the transaction back.
    """
    columns = ','.join(zhilian_job.keys())
    placeholders = ','.join(['%s'] * len(zhilian_job))
    # Column names are interpolated (identifiers cannot be bound parameters);
    # the values themselves go through the driver as bound parameters.
    sql = (
        "\n"
        "    INSERT INTO zhilian(" + columns + ")\n"
        "    VALUE (" + placeholders + ")\n"
        "    "
    )
    try:
        cursor.execute(sql, list(zhilian_job.values()))
        mysql_client.commit()
    except Exception as err:
        print(err)
        mysql_client.rollback()
def load_page_data(url):
    """GET *url* and return the response body decoded as UTF-8.

    Args:
        url: The URL to fetch (spoofs a desktop Firefox User-Agent).

    Returns:
        The decoded body on HTTP 200, otherwise None (implicit).

    Fix: the response is now closed via a context manager; the original
    leaked the open HTTP connection.
    """
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
    }
    req = request.Request(url, headers=req_header)
    with request.urlopen(req) as response:
        body = response.read().decode('utf-8')
        if response.status == 200:
            return body


if __name__ == '__main__':
    # NOTE(review): credentials are hard-coded in source — move them to an
    # environment variable or config file before sharing/deploying.
    # Keyword arguments replace the fragile positional form (same values:
    # host, user, password, database, port as in the original call).
    mysql_client = pymysql.Connect(
        host='127.0.0.1',
        user='root',
        password='18603503110',
        database='1712B',
        port=3306,
        charset='utf8',
    )
    # Cursor used by save_data_to_db to execute the INSERT statements.
    cursor = mysql_client.cursor()
    url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=489&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=%E6%8A%80%E6%9C%AF&kt=3&_v=0.41792226&x-zp-page-request-id=99f4ba4b537c448e831a297ae4de73f9-1545304025814-219164'
    zhilianSpider(url)

相关文章

网友评论

      本文标题:网络爬虫:urllib模块应用6--智联

      本文链接:https://www.haomeiwen.com/subject/xplnkqtx.html