I came across a set of 100 beginner web-scraping examples on Zhihu, but most of them no longer run once you pull them down. So I decided to treat these 100 examples as exercises and write each one by hand myself, to keep my skills from lagging behind my ambitions. Now, on to the main topic.
Choosing an approach
- Option 1: the approach the example gives: use the Network panel in the browser's developer tools to find the API endpoint we need, then pull the information we want out of its response structure. The steps:
1. Open Lagou in the browser: https://www.lagou.com
2. From the browser's top-right menu: Settings --> More tools --> Developer tools
(Screenshot: opening the developer tools)
3. In the developer tools, select the Network tab, then the XHR filter.
(Screenshot: the XHR filter under Network in the developer tools)
4. Type Python into the search box and click the search button; a pile of requests appears on the right. Sifting through them, we land on https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false. The response of this URL contains exactly what we need, and its parameters ("pn", "kd", "sid") are the page number, the search keyword, and an id that can be copied over as-is.
5. Simulate the browser's request from Python:
```python
import requests
import pymysql.cursors


def get_position_info(pageNo, keyWords):
    """Return the JSON payload for one page of search results."""
    url = "https://www.lagou.com/jobs/positionAjax.json"
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7",
        "Connection": "keep-alive",
        # Content-Length is left out on purpose: requests computes it itself,
        # and the value copied from the browser ("63") would be wrong here.
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        # Session cookies copied from the browser; they expire quickly,
        # so paste in fresh ones before running.
        "Cookie": "JSESSIONID=ABAAAECABGFABFF2355DDD810AA4B7200888D1AD6748FCE; user_trace_token=20200114140751-a42b5275-8421-48d3-b9e7-68dfd54a7268; WEBTJ-ID=20200114140806-16fa2aa95b2747-088a3c550e67ef-1136685a-1764000-16fa2aa95b37cb; _ga=GA1.2.1962027842.1578982086; _gid=GA1.2.552871320.1578982086; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1578982086; LGUID=20200114140752-33060498-3694-11ea-b2bf-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; X_MIDDLE_TOKEN=52a24466865c3ceb89c3746da8f4044e; lagou_utm_source=A; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216fa2aa96bf214-0b00a41712c003-1136685a-1764000-16fa2aa96c0e6c%22%2C%22%24device_id%22%3A%2216fa2aa96bf214-0b00a41712c003-1136685a-1764000-16fa2aa96c0e6c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; X_HTTP_TOKEN=c2100305a94fa1bd24375097511424ef3d7cabe162; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1579057355; LGSID=20200115110222-735d7e62-3743-11ea-b2de-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Futrack%2FtrackMid.html%3Ff%3Dhttps%253A%252F%252Fwww.lagou.com%252Fjobs%252Flist%255Fpython%253FlabelWords%253D%2526fromSearch%253Dtrue%2526suginput%253D%26t%3D1579057335%26_ti%3D1; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; LGRID=20200115110222-735d8073-3743-11ea-b2de-525400f775ce; SEARCH_ID=72019fd8d0824c75bf82cd2e145a7d13",
        "Host": "www.lagou.com",
        "Origin": "https://www.lagou.com",
        "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Mobile Safari/537.36",
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest"
    }
    postData = {"needAddtionalResult": "false", "pn": pageNo, "kd": keyWords,
                "sid": "098c18fdd37945f7a5781f7d465e0bd8"}
    response = requests.post(url, data=postData, headers=headers)
    return response.json()
```
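In practice the request above often comes back with an anti-crawler message rather than data, because Lagou tends to reject positionAjax.json calls whose cookies it didn't just issue. A common workaround (not part of the original post; `get_position_info_fresh` is my name) is to visit the search page first inside a `requests.Session` so the cookies are fresh:

```python
import requests


def get_position_info_fresh(pageNo, keyWords):
    """Variant of get_position_info that picks up fresh cookies first (a sketch)."""
    list_url = ("https://www.lagou.com/jobs/list_python"
                "?labelWords=&fromSearch=true&suginput=")
    ajax_url = "https://www.lagou.com/jobs/positionAjax.json"
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/69.0.3497.92 Mobile Safari/537.36",
        "Referer": list_url,
    }
    session = requests.Session()
    session.get(list_url, headers=headers, timeout=5)  # server issues fresh cookies
    post_data = {"needAddtionalResult": "false", "pn": pageNo, "kd": keyWords}
    return session.post(ajax_url, data=post_data, headers=headers, timeout=5).json()
```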
6. Pretty-print the JSON in the response and look for the fields we need.
(Screenshot: the formatted data)
(PS: the fields we need are positionName, companyFullName, companySize, financeStage, industryField, city, salary, workYear.)
7. Flatten each position into a list of its fields and collect all positions into one list (each list is converted to a tuple later, when it is inserted into the database):
```python
def get_positions_list(positions_dic):
    """Extract the fields we care about from one page of results."""
    list_positions = positions_dic['content']['positionResult']['result']
    result = []
    for position in list_positions:
        info = [
            position["positionName"],
            position["companyFullName"],
            position["companySize"],
            position["financeStage"],
            position["industryField"],
            position["city"],
            position["salary"],
            position["workYear"],
        ]
        result.append(info)
    return result


positions = get_positions_list(get_position_info(1, "python"))
print(positions)
```
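Note that get_positions_list indexes straight into ['content']; when Lagou flags a request, the JSON carries a status/msg pair instead of results and the lookup raises a KeyError. A small guard, my addition (the exact shape of the blocked response may vary):

```python
data = get_position_info(1, "python")
if "content" not in data:
    # A blocked response looks roughly like {"status": false, "msg": "操作太频繁...", ...}
    raise RuntimeError("blocked by Lagou: {}".format(data))
print(get_positions_list(data))
```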
8. Store the scraped data in a database. (Install MySQL first, and create the database pythondb and the table position_info_detail.) The code below connects to the database and inserts rows into pythondb.position_info_detail:
```python
def get_conn():
    """Open a database connection (replace user/password with your own)."""
    conn = pymysql.connect(host='localhost',
                           user='username',
                           password='password',
                           db='pythondb',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn


def insert(conn, info):
    """Write one position record to the database."""
    with conn.cursor() as cursor:
        sql = ("INSERT INTO `position_info_detail` (`positionName`, `companyFullName`, "
               "`companySize`, `financeStage`, `industryField`, `city`, `salary`, `workYear`) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        cursor.execute(sql, info)
    conn.commit()
```
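The post assumes the position_info_detail table already exists but never shows its definition. A minimal schema matching the INSERT above might look like this sketch (the column types and sizes are my guesses):

```python
def create_table(conn):
    """Create position_info_detail if it doesn't exist yet (hypothetical DDL)."""
    ddl = """
        CREATE TABLE IF NOT EXISTS `position_info_detail` (
            `id`              INT AUTO_INCREMENT PRIMARY KEY,
            `positionName`    VARCHAR(255),
            `companyFullName` VARCHAR(255),
            `companySize`     VARCHAR(64),
            `financeStage`    VARCHAR(64),
            `industryField`   VARCHAR(255),
            `city`            VARCHAR(64),
            `salary`          VARCHAR(64),
            `workYear`        VARCHAR(64)
        ) CHARACTER SET utf8mb4
    """
    with conn.cursor() as cursor:
        cursor.execute(ddl)
    conn.commit()
```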
9. Complete code:
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import pymysql.cursors


def get_position_info(pageNo, keyWords):
    """Return the JSON payload for one page of search results."""
    url = "https://www.lagou.com/jobs/positionAjax.json"
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7",
        "Connection": "keep-alive",
        # Content-Length omitted on purpose: requests computes it itself.
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        # Session cookies copied from the browser; refresh them before running.
        "Cookie": "JSESSIONID=ABAAAECABGFABFF2355DDD810AA4B7200888D1AD6748FCE; user_trace_token=20200114140751-a42b5275-8421-48d3-b9e7-68dfd54a7268; WEBTJ-ID=20200114140806-16fa2aa95b2747-088a3c550e67ef-1136685a-1764000-16fa2aa95b37cb; _ga=GA1.2.1962027842.1578982086; _gid=GA1.2.552871320.1578982086; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1578982086; LGUID=20200114140752-33060498-3694-11ea-b2bf-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; X_MIDDLE_TOKEN=52a24466865c3ceb89c3746da8f4044e; lagou_utm_source=A; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216fa2aa96bf214-0b00a41712c003-1136685a-1764000-16fa2aa96c0e6c%22%2C%22%24device_id%22%3A%2216fa2aa96bf214-0b00a41712c003-1136685a-1764000-16fa2aa96c0e6c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; X_HTTP_TOKEN=c2100305a94fa1bd24375097511424ef3d7cabe162; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1579057355; LGSID=20200115110222-735d7e62-3743-11ea-b2de-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Futrack%2FtrackMid.html%3Ff%3Dhttps%253A%252F%252Fwww.lagou.com%252Fjobs%252Flist%255Fpython%253FlabelWords%253D%2526fromSearch%253Dtrue%2526suginput%253D%26t%3D1579057335%26_ti%3D1; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; LGRID=20200115110222-735d8073-3743-11ea-b2de-525400f775ce; SEARCH_ID=72019fd8d0824c75bf82cd2e145a7d13",
        "Host": "www.lagou.com",
        "Origin": "https://www.lagou.com",
        "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Mobile Safari/537.36",
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest"
    }
    postData = {"needAddtionalResult": "false", "pn": pageNo, "kd": keyWords,
                "sid": "098c18fdd37945f7a5781f7d465e0bd8"}
    response = requests.post(url, data=postData, headers=headers)
    return response.json()


def get_positions_list(positions_dic):
    """Extract the fields we care about from one page of results."""
    list_positions = positions_dic['content']['positionResult']['result']
    result = []
    for position in list_positions:
        info = [
            position["positionName"],
            position["companyFullName"],
            position["companySize"],
            position["financeStage"],
            position["industryField"],
            position["city"],
            position["salary"],
            position["workYear"],
        ]
        result.append(info)
    return result


def get_conn():
    """Open a database connection (replace user/password with your own)."""
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='password',
                           db='pythondb',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn


def insert(conn, info):
    """Write one position record to the database."""
    with conn.cursor() as cursor:
        sql = ("INSERT INTO `position_info_detail` (`positionName`, `companyFullName`, "
               "`companySize`, `financeStage`, `industryField`, `city`, `salary`, `workYear`) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        cursor.execute(sql, info)
    conn.commit()


def main():
    try:
        conn = get_conn()  # open the DB connection; comment this out if you don't want to store results
        for i in range(2, 12):  # pages 2 through 11
            positions_dic = get_position_info(i, "python")
            print(positions_dic)
            array = get_positions_list(positions_dic)
            for position in array:
                insert(conn, tuple(position))
        conn.close()  # close the DB connection; comment this out as well if not storing
    except Exception as e:
        print(e)


if __name__ == '__main__':
    main()
```
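One caveat about main(): it fires ten POSTs back to back, and Lagou's rate limiter tends to flag exactly that pattern. A politer page loop might look like this sketch (the random delay is my addition, and crawl_pages is my name):

```python
import random
import time


def crawl_pages(first_page, last_page, keyword="python"):
    """Yield the parsed rows one page at a time, pausing between requests.
    Relies on get_position_info / get_positions_list defined above."""
    for page in range(first_page, last_page + 1):
        yield get_positions_list(get_position_info(page, keyword))
        time.sleep(random.uniform(2, 5))  # soften the request pattern
```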
- Option 2: scrape the data with Selenium
1. First install selenium; see the selenium website for the details. (You will also need a ChromeDriver matching your Chrome version on your PATH.)
```bash
# The short version: run this in the terminal
pip3 install selenium
```
2. Next, use selenium to fetch the HTML pages we need:
```python
def get_page_content():
    """Drive Chrome through a search and return the HTML of each result page.
    (The find_element_by_* helpers are Selenium 3 API; under Selenium 4 use
    driver.find_element(By.ID, ...) instead.)"""
    driver = webdriver.Chrome()
    driver.get("https://www.lagou.com/")
    htmls = []
    try:
        # Dismiss the city-selection popup.
        cboxCloseBtn = driver.find_element_by_id("cboxClose")
        cboxCloseBtn.click()
        time.sleep(2)
        # Type the keyword and run the search.
        search_input = driver.find_element_by_id("search_input")
        search_input.clear()
        search_input.send_keys("python")
        searchBtn = driver.find_element_by_id("search_button")
        searchBtn.click()
        time.sleep(2)
        # Close the ad banner, then capture the first result page.
        closeADBtn = driver.find_element_by_class_name("body-btn")
        closeADBtn.click()
        htmls.append(driver.page_source)
        try:
            i = 1
            while i < 2:  # raise this bound to walk more pages
                nextPageBtn = driver.find_element_by_class_name("pager_next")
                ActionChains(driver).move_to_element(nextPageBtn).perform()
                nextPageBtn.click()
                time.sleep(2)
                htmls.append(driver.page_source)
                i += 1
        except Exception as e:
            print(e)
            print("Already on the last page; the next-page button is not clickable")
        return htmls
    except Exception as e:
        print("----------error-----------")
        print(e)
        print("--------error end---------")
        return htmls
    finally:
        driver.quit()  # the original called driver.close() after return, which never ran
```
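The fixed time.sleep(2) calls are brittle: they waste time when the page is fast and fire too early when it is slow. A sketch of the same waits done with Selenium's explicit WebDriverWait (my substitution, not what the original code uses):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_clickable(driver, locator, timeout=10):
    """Block until the located element is clickable, or raise TimeoutException."""
    return WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(locator))

# Usage inside get_page_content(), e.g. instead of time.sleep(2) before searching:
# wait_clickable(driver, (By.ID, "search_button")).click()
```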
3. This step needs BeautifulSoup; if you haven't used it, take a look at the official docs. We use it to turn the HTML captured in the previous step into the position info we need. The code:
```python
def get_position_info(htmlContents):
    """Pull company, salary and position name out of each result page.
    Lagou stores these in data-* attributes on each <li> card."""
    res = []
    for html in htmlContents:
        soup = BeautifulSoup(html, 'html.parser')
        positions = soup.find_all("li", class_="con_list_item default_list")
        for position in positions:
            posi = [
                position.get("data-company"),
                position.get("data-salary"),
                position.get("data-positionname"),
            ]
            res.append(posi)
    return res
```
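A quick way to sanity-check the parsing without launching a browser is to feed it a stub `<li>` in the same shape (the attribute names mirror Lagou's markup used above; the values are made-up placeholders):

```python
stub = '''
<ul>
  <li class="con_list_item default_list"
      data-company="ExampleCo" data-salary="15k-25k"
      data-positionname="Python Engineer"></li>
</ul>
'''
print(get_position_info([stub]))
# -> [['ExampleCo', '15k-25k', 'Python Engineer']]
```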
4. Finally, store the scraped data in the database and in an Excel sheet:
```python
def main():
    info = get_position_info(get_page_content())
    lang_name = 'python'
    wb = Workbook()  # create the Excel workbook
    ws1 = wb.active
    ws1.title = lang_name
    conn = get_conn()
    for row in info:
        ws1.append(row)  # one worksheet row per position
        insert(conn, tuple(row))
    conn.close()  # close the DB connection when done
    wb.save('{}职位信息.xlsx'.format(lang_name))
```
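A small refinement worth considering (not in the original): pull the Excel writing into its own helper and start the sheet with a header row so the columns are self-describing. The header names here are mine:

```python
from openpyxl import Workbook


def save_to_excel(rows, lang_name='python'):
    """Write the scraped rows to <lang_name>职位信息.xlsx with a header row."""
    wb = Workbook()
    ws = wb.active
    ws.title = lang_name
    ws.append(['company_name', 'salary', 'position_name'])  # header (my naming)
    for row in rows:
        ws.append(row)
    wb.save('{}职位信息.xlsx'.format(lang_name))
```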
Complete code:
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time

import pymysql.cursors
from bs4 import BeautifulSoup
from openpyxl import Workbook
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains  # mouse-action helper


def get_conn():
    """Open a database connection (replace user/password with your own)."""
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='password',
                           db='pythondb',
                           charset='utf8',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn


def insert(conn, info):
    """Write one position record to the database."""
    print(info)
    with conn.cursor() as cursor:
        sql = ("INSERT INTO `position_info` (`company_name`, `salary`, `position_name`) "
               "VALUES (%s, %s, %s)")
        cursor.execute(sql, info)
    conn.commit()


def get_page_content():
    """Drive Chrome through a search and return the HTML of each result page."""
    driver = webdriver.Chrome()
    driver.get("https://www.lagou.com/")
    htmls = []
    try:
        # Dismiss the city-selection popup.
        cboxCloseBtn = driver.find_element_by_id("cboxClose")
        cboxCloseBtn.click()
        time.sleep(2)
        # Type the keyword and run the search.
        search_input = driver.find_element_by_id("search_input")
        search_input.clear()
        search_input.send_keys("python")
        searchBtn = driver.find_element_by_id("search_button")
        searchBtn.click()
        time.sleep(2)
        # Close the ad banner, then capture the first result page.
        closeADBtn = driver.find_element_by_class_name("body-btn")
        closeADBtn.click()
        htmls.append(driver.page_source)
        try:
            i = 1
            while i < 2:  # raise this bound to walk more pages
                nextPageBtn = driver.find_element_by_class_name("pager_next")
                ActionChains(driver).move_to_element(nextPageBtn).perform()
                nextPageBtn.click()
                time.sleep(2)
                htmls.append(driver.page_source)
                i += 1
        except Exception as e:
            print(e)
            print("Already on the last page; the next-page button is not clickable")
        return htmls
    except Exception as e:
        print("----------error-----------")
        print(e)
        print("--------error end---------")
        return htmls
    finally:
        driver.quit()


def get_position_info(htmlContents):
    """Pull company, salary and position name out of each result page."""
    res = []
    for html in htmlContents:
        soup = BeautifulSoup(html, 'html.parser')
        positions = soup.find_all("li", class_="con_list_item default_list")
        for position in positions:
            res.append([position.get("data-company"),
                        position.get("data-salary"),
                        position.get("data-positionname")])
    return res


def main():
    info = get_position_info(get_page_content())
    lang_name = 'python'
    wb = Workbook()  # create the Excel workbook
    ws1 = wb.active
    ws1.title = lang_name
    conn = get_conn()
    for row in info:
        ws1.append(row)
        insert(conn, tuple(row))
    conn.close()  # close the DB connection when done
    wb.save('{}职位信息.xlsx'.format(lang_name))


if __name__ == '__main__':
    main()
```
- GitHub repo: https://github.com/jiangongzheng/spider