Environment: Python 3.6
I'm a beginner who has only just started writing crawlers; this one uses requests, lxml, and XPath. The code is fairly messy, so go easy on me.
I chose to crawl the job postings returned by searching 51job for "big data" (大数据) and store the scraped data in a MySQL database.
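The script inserts into two MySQL tables, enterprise_info and job_info, but the post never shows their definitions. Here is a minimal sketch of schemas that would accept the INSERT statements below; the column names, types, and charset are my guesses, not the original DDL (credentials match the ones used in the script):

import pymysql

# Hypothetical table definitions inferred from the INSERT statements;
# the real schema used by the author may differ.
DDL = [
    """CREATE TABLE IF NOT EXISTS enterprise_info (
        id INT PRIMARY KEY,
        enterprise_name VARCHAR(255),
        enterprise_form VARCHAR(64),
        enterprise_scale VARCHAR(64),
        enterprise_kind VARCHAR(128),
        enterprise_intro TEXT
    ) DEFAULT CHARSET=utf8mb4""",
    """CREATE TABLE IF NOT EXISTS job_info (
        id INT PRIMARY KEY,
        job_name VARCHAR(255),
        enterprise_name VARCHAR(255),
        salary VARCHAR(64),
        job_site VARCHAR(128),
        job_intro VARCHAR(255),
        job_des TEXT,
        link VARCHAR(512)
    ) DEFAULT CHARSET=utf8mb4""",
]

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='123456', db='db', charset='utf8mb4')
with conn.cursor() as cur:
    for stmt in DDL:
        cur.execute(stmt)
conn.commit()
conn.close()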
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : 2017/10/31 14:32
# @Author : mj
# @File : 51job.py
import requests
from lxml import etree
import pymysql
def get_info(link):
    global id_1
    global id_2
    print(link)
    see = requests.session()
    see.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
    }
    html = see.get(link)
    html.encoding = "GBK"
    selector = etree.HTML(html.text)
"""enterprise_info"""
# 1.enterprise_name 企业名称
enterprise_name = selector.xpath("//p/a/@title")
if len(enterprise_name)<=0:
return
print(enterprise_name[0])
e_name = enterprise_name[0]
f_test = selector.xpath("//p[@class='msg ltype']/text()")
F = str(f_test[0]).split('|')
if len(F)<3:
return
# 2.enterprise_form 企业形式
enterprise_form = F[0].strip()
print(enterprise_form)
# 3.enterprise_scale 企业规模
enterprise_scale = F[1].strip()
print(enterprise_scale)
# 4.enterprise_kind
enterprise_kind = F[2].strip()
print(enterprise_kind)
# 5.enterprise_intro
e_intro = selector.xpath("//div/div[@class='tmsg inbox']/text()")
enterprise_intro = ""
for e in e_intro:
enterprise_intro = enterprise_intro + e.strip()
print(enterprise_intro)
values_entreprise_info = [int(id_1),e_name,enterprise_form,enterprise_scale,enterprise_kind,enterprise_intro]
print(values_entreprise_info)
"""job_info"""
#1.job_name 职位名称
job_name = selector.xpath("//div/h1/@title")
print(job_name[0])
#2.enterprise_name 企业名称
#3.salary 薪资
salary = selector.xpath("//div[@class='cn']/strong/text()")
if len(salary)<1:
return
print(salary[0])
#4.job_site 工作地点
job_site = selector.xpath("//div/span[@class='lname']/text()")
if len(job_site)<0:
return
print(job_site[0])
#5.job_intro职位信息
job_intro = selector.xpath("//div[@class='t1']/span[@class='sp4']/text()")
print(job_intro)
#6.job_des 职位描述
des = selector.xpath("//div[@class='bmsg job_msg inbox']/text()")
job_des = ""
for i in des:
job_des = job_des+ i.strip()
print(job_des)
#7.link 职位连接
values_job_info = [int(id_2),job_name,e_name,str(salary[0]),job_site,str(job_intro),job_des,link]
print(values_job_info)
    conn1 = pymysql.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='db')
    # get a cursor via cursor()
    cur1 = conn1.cursor()
    try:
        cur1.execute("insert into enterprise_info VALUES(%s,%s,%s,%s,%s,%s);", values_enterprise_info)
        conn1.commit()
    except Exception as e:
        print(e)
        conn1.rollback()
    finally:
        id_1 = id_1 + 1
        conn1.close()
    conn2 = pymysql.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='db')
    # get a cursor via cursor()
    cur2 = conn2.cursor()
    try:
        cur2.execute("insert into job_info VALUES(%s,%s,%s,%s,%s,%s,%s,%s);", values_job_info)
        conn2.commit()
    except Exception as e:
        print(e)
        conn2.rollback()
    finally:
        id_2 = id_2 + 1
        conn2.close()

see = requests.session()
see.headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
}
id_1 = 1
id_2 = 1
pages = set()
j = 0
# search-result list pages; the page number goes between base_url and url_tail
# (the original URL contained "°reefrom", a mangled "&degreefrom")
base_url = "http://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,"
url_tail = ".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
for i in range(1, 9999):
    # build the URL for result page i up front; the original rebuilt it inside
    # the inner loop, so every page was fetched one outer iteration late
    url = base_url + str(i) + url_tail
    print(url)
    html = see.get(url)
    html.encoding = "GBK"
    selector = etree.HTML(html.text)
    links = selector.xpath("//span/a[@onmousedown='']/@href")
    for link in links:
        if link not in pages:
            pages.add(link)
            get_info(link)
            print(link)
            j = j + 1
            # j = running count of records crawled
            print(j)
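One easy refinement: get_info() opens and closes two separate MySQL connections for every single job page. Below is a sketch of the same two inserts reusing one shared connection (my restructuring, not the original code; credentials as above):

import pymysql

# One module-level connection instead of two fresh ones per job page.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='123456', db='db', charset='utf8mb4')

def save_record(values_enterprise_info, values_job_info):
    try:
        with conn.cursor() as cur:
            cur.execute("insert into enterprise_info VALUES(%s,%s,%s,%s,%s,%s);",
                        values_enterprise_info)
            cur.execute("insert into job_info VALUES(%s,%s,%s,%s,%s,%s,%s,%s);",
                        values_job_info)
        conn.commit()  # both rows commit in one transaction
    except Exception as e:
        print(e)
        conn.rollback()

Committing both rows together also keeps enterprise_info and job_info in step: if the job insert fails, the matching company row is rolled back too.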
Partial results (screenshots): 图片1.png 图片2.png