
Scraping Job Data from 智联招聘 (Zhaopin)

Author: 幻想无极 | Published 2018-07-17 23:32

    Platform: Mac
    Python version: Python 3.7
    IDE: Sublime Text
    Other tools: Chrome browser
    Date: 2018.7.17

    1. Analyzing the request URL

    Search page URL

    https://sou.zhaopin.com/jobs/searchresult.ashx?jl=成都&kw=iOS&sm=0&p=1
    

    Parameters

    jl=成都  # work location
    kw=iOS  # job keyword
    sm=0  # district
    p=1  # page number
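
    The query string is easy to rebuild from these parameters with `urllib.parse.urlencode`, which also percent-encodes the Chinese city name. A minimal sketch:

    from urllib.parse import urlencode

    params = {'jl': '成都', 'kw': 'iOS', 'sm': 0, 'p': 1}
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(params)
    # -> ...searchresult.ashx?jl=%E6%88%90%E9%83%BD&kw=iOS&sm=0&p=1
    print(url)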
    

    2. Simulating the request to fetch the page

    
    def get_one_page(city, keyword, region, page):
        paras = {
            'jl': city,         # city to search
            'kw': keyword,      # search keyword
            'isadv': 0,         # whether to enable the advanced search options
            'isfilter': 1,      # whether to filter the results
            'p': page,          # page number
            # 're': region     # short for "region": district code, 2005 = Haidian
        }

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Host': 'sou.zhaopin.com',
            'Referer': 'https://www.zhaopin.com/',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }

        url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras)
        try:
            # fetch the page and return its HTML
            response = requests.get(url, headers=headers)
            # use the status code to decide whether the request succeeded
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
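
    A quick sanity check of the function (note the original sets no timeout; passing `timeout=` to `requests.get` would be a sensible hardening, but the call below sticks to the code as written):

    html = get_one_page('成都', 'iOS', 2005, 1)
    if html:
        print(len(html))  # a non-empty page came back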
    
    

    3. Analyzing the page markup and writing the regex

    pattern = re.compile('<td class="zwmc".*?href="(.*?)" target="_blank">(.*?)</a>.*?'
                         '<td class="gsmc"><a href="(.*?)".*?target="_blank">(.*?)</a>.*?'
                         '<td class="zwyx">(.*?)</td>.*?'
                         '<td class="gxsj"><span>(.*?)</span>.*?'
                         '<li class="newlist_deatil_two"><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span>', re.S)
    # match every listing in the results table; "deatil" mirrors the misspelled class name in the site's own markup
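
    To see how the capture groups line up, the pattern can be exercised against a toy fragment (hypothetical markup shaped like a real listing row, not copied from the site):

    import re

    pattern = re.compile('<td class="zwmc".*?href="(.*?)" target="_blank">(.*?)</a>.*?'
                         '<td class="gsmc"><a href="(.*?)".*?target="_blank">(.*?)</a>.*?'
                         '<td class="zwyx">(.*?)</td>.*?'
                         '<td class="gxsj"><span>(.*?)</span>.*?'
                         '<li class="newlist_deatil_two"><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span>', re.S)

    # hypothetical fragment for illustration only
    sample = ('<td class="zwmc"><a href="http://job.example/1" target="_blank">iOS工程师</a></td>'
              '<td class="gsmc"><a href="http://co.example" target="_blank">某公司</a></td>'
              '<td class="zwyx">10001-15000</td>'
              '<td class="gxsj"><span>07-17</span></td>'
              '<li class="newlist_deatil_two"><span>地点:成都</span><span>公司性质:民营</span>'
              '<span>公司规模:100-499人</span><span>经验:3-5年</span><span>学历:本科</span>')

    for m in re.findall(pattern, sample):
        print(m[1], m[4], m[6])  # iOS工程师 10001-15000 地点:成都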
    

    4. Extracting the data and writing it to CSV

    def main(city, keyword, region, pages):
        filename = '智联招聘测试_' + city + '_' + keyword + '.csv'
        headers = ['网站', '岗位', '公司网站', '公司', '薪水', '首发日', '地点', '公司性质', '规模', '经验']
        write_csv_headers(filename, headers)

        for i in tqdm(range(pages)):
            jobs = []
            html = get_one_page(city, keyword, region, i + 1)  # result pages are 1-based
            if html is None:  # skip pages that failed to download
                continue
            for item in parse_one_page(html):
                jobs.append(item)
            write_csv_rows(filename, headers, jobs)
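
    Since `parse_one_page` is a generator yielding one dict per listing, the inner loop can be collapsed to a `list()` call; a short pause between pages is also friendlier to the server. A sketch of the loop body with both tweaks (the `time.sleep` is my addition, not in the original):

    import time

    for i in tqdm(range(pages)):
        html = get_one_page(city, keyword, region, i + 1)
        if html:
            write_csv_rows(filename, headers, list(parse_one_page(html)))
        time.sleep(1)  # my addition: pause between requests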
    
    

    5. Full source

    #-*- coding: utf-8 -*-
    import requests
    import re
    import csv
    from tqdm import tqdm
    from urllib.parse import urlencode
    from requests.exceptions import RequestException


    def write_csv_file(path, headers, rows):
        '''
        Write the header row and data rows to a CSV file.
        (Not used by main, which writes the header once and then appends rows per page.)
        '''
        # an explicit GB18030 encoding avoids errors when writing Chinese text
        # newline='' keeps csv from inserting a blank line after each row
        with open(path, 'a', encoding='gb18030', newline='') as f:
            f_csv = csv.DictWriter(f, headers)
            f_csv.writeheader()
            f_csv.writerows(rows)

    def write_csv_headers(path, headers):
        '''
        Write the header row.
        '''
        with open(path, 'a', encoding='gb18030', newline='') as f:
            f_csv = csv.DictWriter(f, headers)
            f_csv.writeheader()

    def write_csv_rows(path, headers, rows):
        '''
        Write data rows.
        '''
        with open(path, 'a', encoding='gb18030', newline='') as f:
            f_csv = csv.DictWriter(f, headers)
            f_csv.writerows(rows)
    
    
    def get_one_page(city, keyword, region, page):
        paras = {
            'jl': city,         # city to search
            'kw': keyword,      # search keyword
            'isadv': 0,         # whether to enable the advanced search options
            'isfilter': 1,      # whether to filter the results
            'p': page,          # page number
            # 're': region     # short for "region": district code, 2005 = Haidian
        }

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Host': 'sou.zhaopin.com',
            'Referer': 'https://www.zhaopin.com/',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }

        url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras)
        try:
            # fetch the page and return its HTML
            response = requests.get(url, headers=headers)
            # use the status code to decide whether the request succeeded
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
    
    def parse_one_page(html):
        # capture: listing URL, job title, company URL, company name, salary,
        # posting date, then five detail <span>s (location, type, size, experience, degree)
        pattern = re.compile('<td class="zwmc".*?href="(.*?)" target="_blank">(.*?)</a>.*?'
                             '<td class="gsmc"><a href="(.*?)".*?target="_blank">(.*?)</a>.*?'
                             '<td class="zwyx">(.*?)</td>.*?'
                             '<td class="gxsj"><span>(.*?)</span>.*?'
                             '<li class="newlist_deatil_two"><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span>', re.S)
        # match every qualifying listing
        items = re.findall(pattern, html)

        for item in items:
            # strip the <b> highlighting around the search keyword in the title
            job_name = item[1].replace('<b>', '')
            job_name = job_name.replace('</b>', '')
            city = item[6].replace('地点:', '')           # strip the "location:" label
            nature = item[7].replace('公司性质:', '')     # strip the "company type:" label
            size = item[8].replace('公司规模:', '')       # strip the "company size:" label
            experience = item[9].replace('经验:', '')     # strip the "experience:" label
            experience = experience.replace('学历:', '')
            # some listings omit a field, so the "学历:" (degree) label lands
            # in the size slot; move it over and blank out the size
            if '学历:' in size:
                experience = size.replace('学历:', '')
                size = ''
            if len(experience) > 10:  # discard values too long to be an experience entry
                experience = ''
            yield {
                '网站': item[0],       # listing URL
                '岗位': job_name,      # job title
                '公司网站': item[2],   # company URL
                '公司': item[3],       # company name
                '薪水': item[4],       # salary
                '首发日': item[5],     # posting date
                '地点': city,          # location
                '公司性质': nature,    # company type
                '规模': size,          # company size
                '经验': experience,    # experience required
            }
    
    def main(city, keyword, region, pages):
        filename = '智联招聘测试_' + city + '_' + keyword + '.csv'
        # CSV columns: listing URL, job title, company URL, company, salary,
        # posting date, location, company type, size, experience
        headers = ['网站', '岗位', '公司网站', '公司', '薪水', '首发日', '地点', '公司性质', '规模', '经验']
        write_csv_headers(filename, headers)

        for i in tqdm(range(pages)):
            jobs = []
            html = get_one_page(city, keyword, region, i + 1)  # result pages are 1-based
            if html is None:  # skip pages that failed to download
                continue
            for item in parse_one_page(html):
                jobs.append(item)
            write_csv_rows(filename, headers, jobs)

    if __name__ == '__main__':
        # region 2005 (Haidian) is unused while 're' stays commented out above
        main('成都', 'ios', 2005, 10)
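
    After a run, a quick stdlib check confirms what landed in the CSV (the filename below follows the pattern main builds for the 成都/ios run):

    import csv

    with open('智联招聘测试_成都_ios.csv', encoding='gb18030') as f:
        rows = list(csv.DictReader(f))
    print(len(rows), 'listings scraped')
    if rows:
        print(rows[0]['岗位'], rows[0]['薪水'])  # first job title and salary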
    
    
    

    6. Results

    (Screenshot of the scraped results.)
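
    For a quick look at the scraped data, pandas (an extra dependency, not used in the script itself) renders the CSV neatly:

    import pandas as pd

    df = pd.read_csv('智联招聘测试_成都_ios.csv', encoding='gb18030')
    print(df.head())  # first few listings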
