从很早以前开始,各种招聘网站就一直是爬虫爱好者的战场,我们今天就来爬一爬智联招聘吧。整个爬取过程并不是很顺畅,边写边查,一整天下来才勉勉强强写好,期间也学到了许多实用的小知识,后续也会陆陆续续地在这里分享出来。
使用工具:requests, datetime, re, openpyxl
- 搭建python开发环境
- 在cmd下运行下面的命令
pip install requests
pip install openpyxl
import requests
import openpyxl
import re
import datetime
def txt(start_str, end, html):
    """Return the text found between two markers in a string.

    Finds the first occurrence of ``start_str`` in ``html``, then the first
    occurrence of ``end`` after it, and returns what lies between the two
    with surrounding whitespace stripped.

    Args:
        start_str: Opening marker.
        end: Closing marker; only searched for after the opening marker.
        html: String to search.

    Returns:
        The stripped text between the markers, or ``None`` when either
        marker cannot be found.
    """
    start = html.find(start_str)
    if start < 0:
        return None
    start += len(start_str)
    # Use a separate name for the index so the ``end`` marker is not clobbered.
    stop = html.find(end, start)
    if stop < 0:
        return None
    return html[start:stop].strip()
def return_data_sheet(file_list):
    """Extract job-posting rows from saved HTML pages using regular expressions.

    Each file is read as UTF-8 and searched with seven patterns, one per
    column: position name, recruitment type, company, city, head count,
    publish time and industry keywords. Matches are combined positionally
    into rows.

    Args:
        file_list: Iterable of paths to the saved HTML pages.

    Returns:
        A list of 7-item lists, one per posting, in file order. A field is
        ``None`` when ``txt`` cannot find its delimiters in the fragment.
    """
    position_open = '<div class="fn-left position">'
    # Lazy quantifier (.*?) so every match stops at the first closing tag.
    # Compiled once here instead of once per file.
    position_re = re.compile('<div class="fn-left position">.*?</div>')
    other_res = [
        re.compile(pattern)
        for pattern in (
            '<span class="fn-left.*?</span>',           # recruitment type
            '<div class="fn-right company">.*?</div>',  # company
            '<span class="city fn-left">.*?</span>',    # city
            '<span class="num fn-left">.*?</span>',     # head count
            '<span class="time fn-left">.*?</span>',    # publish time
            '<span class="industry fn-right">.*?</span>',  # industry keywords
        )
    ]

    data_sheet_all = []
    for file in file_list:
        with open(file, "r", encoding='utf-8') as f:
            string = f.read()

        positions = position_re.findall(string)
        other_columns = [regex.findall(string) for regex in other_res]

        # NOTE(review): the original loop ran over range(0, len(a1) - 1), i.e.
        # it skipped the last position match. That is kept here; confirm
        # whether the final match on a page really is not a posting.
        # zip() stops at the shortest column, so a page with unequal match
        # counts yields fewer rows instead of raising IndexError.
        for position, *fragments in zip(positions[:-1], *other_columns):
            row = [txt(position_open, '</div>', position)]
            row.extend(txt('>', '<', fragment) for fragment in fragments)
            data_sheet_all.append(row)
    return data_sheet_all
def get_code(kw, pg, cookie=None):
    """Download search-result pages and save each one to a local HTML file.

    Args:
        kw: Search keyword inserted into the request URL.
        pg: Page range as ``"first-last"`` (inclusive), e.g. ``"1-4"``. A
            single number such as ``"3"`` fetches just that page.
        cookie: Optional cookie string sent as the ``Cookie`` header. It has
            to be captured from a logged-in browser session; with it, pages
            that are only visible after login can be fetched.

    Returns:
        The list of file names written (``index_zlzp-<page>.html``), in page
        order.
    """
    # partition() handles a missing dash, where find() returned -1 and the
    # slices silently produced the wrong bounds.
    first, dash, last = pg.partition("-")
    if not dash:
        last = first

    header = {}
    if cookie:
        header["Cookie"] = cookie

    file_list = []
    for i in range(int(first), int(last) + 1):
        url = "https://xiaoyuan.zhaopin.com/search/jn=2&kw={}&pg={}".format(kw, i)
        r = requests.get(url, headers=header)
        file_name = 'index_zlzp-{}.html'.format(i)
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(r.text)
        # Record the file so the caller can parse it afterwards.
        file_list.append(file_name)
    return file_list
def write_in_excel(data_sheet_all, filename=None):
    """Write the scraped rows to an Excel workbook and save it to disk.

    A header row is written first, followed by one worksheet row per entry
    of ``data_sheet_all``.

    Args:
        data_sheet_all: List of rows, each a list of cell values.
        filename: Path of the ``.xlsx`` file to write. Defaults to
            ``智联招聘-<today's date>.xlsx`` in the current directory.
    """
    wb = openpyxl.Workbook()
    # index=0 puts the new sheet first in the workbook.
    ws = wb.create_sheet('智联招聘', index=0)

    title = ['岗位名称', '招聘方式', '招聘公司', '城市', '招聘人数', '发布时间', '关键词']
    ws.append(title)
    # append() fills the next free row from column A, so no column letters
    # need to be computed by hand.
    for row in data_sheet_all:
        ws.append(row)

    if filename is None:
        t = datetime.datetime.now().date()
        filename = '智联招聘-{}.xlsx'.format(t)
    # Without save() the workbook exists only in memory and is lost.
    wb.save(filename)
def convertToTitle(n):
    """Convert a 1-based column number to its Excel column title.

    Example: ``1 -> 'A'``, ``26 -> 'Z'``, ``27 -> 'AA'``.

    :type n: int
    :rtype: str
    :raises ValueError: if ``n`` is negative.

    ``0`` yields an empty string.
    """
    if n < 0:
        # The previous loop never terminated for negative input.
        raise ValueError("column number must not be negative: {}".format(n))
    letters = []
    while n > 0:
        # Column titles have no zero digit, so shift to 0-based before
        # splitting off the lowest letter.
        n, remainder = divmod(n - 1, 26)
        letters.append(chr(ord('A') + remainder))
    # Letters were collected from least to most significant.
    return "".join(reversed(letters))
if __name__ == "__main__":
    # Pass in the search keyword and the range of pages to crawl.
    file_list = get_code('python', '1-4')
    data_sheet_all = return_data_sheet(file_list)
    # NOTE(review): in the lines visible here the parsed rows are not passed
    # on to write_in_excel — confirm whether that call follows.