1. Create a new project named 51job in a Python 3 environment.

In biz/urlbiz.py:
#coding=utf-8
from dao import urldao, infodao

def getDataByUrl(url):
    # Returns 0 on a database error, 1 if the url is not yet stored,
    # and 2 if it already exists in the urls table.
    rs = urldao.getDataUrl(url)
    if not rs:
        return 0
    elif rs[0] == 0:
        return 1
    else:
        return 2

def insertData(url, job, company, companyWeb, recruitmentInfo, salary, welfare, workAddr, companyMap, jobInfo):
    # Insert into the urls table first
    logId = urldao.insertUrl(url=url)
    # Then insert the detail record into info, linked by logId
    if logId:
        infodao.insertInfo(logId, job, company, companyWeb, recruitmentInfo, salary, welfare, workAddr, companyMap, jobInfo)
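For reference, the caller contract of getDataByUrl looks like this (a minimal standalone sketch; the example url is made up):

#coding=utf-8
from biz import urlbiz

# Hypothetical detail-page url, for illustration only.
rs = urlbiz.getDataByUrl("https://jobs.51job.com/shenzhen/00000000.html")
if rs == 0:
    print("database error - stop crawling")
elif rs == 1:
    print("new url - crawl the detail page, then insertData()")
else:  # rs == 2
    print("url already stored - skip it")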
dao/urldao.py
#coding=utf-8
from utils import util, dbmysql

def insertUrl(url):
    try:
        logId = util.getUUID()
        sql = "insert into urls(logId,insertTime,url) VALUES ('%s',now(),'%s');" % (logId, url)
        dbmysql.query(sql)
        return logId
    except Exception as ex:
        util.logger.error(ex)
        return None

def getDataUrl(url):
    # Returns a one-column row holding count(1), or False/None on error.
    rs = None
    try:
        sql = "select count(1) from urls where url='%s';" % url
        rs = dbmysql.first(sql)
    except Exception as ex:
        util.logger.error(ex)
    return rs
dao/infodao.py
#coding=utf-8
from utils import util, dbmysql

def insertInfo(logId, job, company, companyWeb, recruitmentInfo, salary, welfare, workAddr, companyMap, jobInfo):
    try:
        sql = "insert into info(logId,insertTime,job,company,companyWeb,recruitmentInfo,salary,welfare,workAddr,companyMap,jobInfo) VALUES ('%s',now(),'%s','%s','%s','%s','%s','%s','%s','%s','%s');" % (
            logId, job, company, companyWeb, recruitmentInfo, salary, welfare, workAddr, companyMap, jobInfo)
        dbmysql.query(sql)
    except Exception as ex:
        util.logger.error(ex)
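One caveat: both DAO modules build SQL with % string formatting, so any scraped value containing a single quote will break the statement (the commented-out replaceTs helper in utils/util.py works around exactly this), and it is also an injection risk. A safer sketch using SQLAlchemy bound parameters, reusing the engine from utils/dbmysql.py (the insert_url_safe name and shortened column list are mine, not part of the project):

# coding=utf-8
from sqlalchemy.sql import text
from utils import dbmysql, util

def insert_url_safe(url):
    # Bound parameters let the driver escape quotes in scraped text.
    logId = util.getUUID()
    sql = text("insert into urls(logId, insertTime, url) "
               "values (:logId, now(), :url)")
    # engine.begin() commits on success and rolls back on error.
    with dbmysql.engine.begin() as conn:
        conn.execute(sql, {"logId": logId, "url": url})
    return logId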
utils/__init__.py
# coding=utf-8
# Under Python 3 the default string encoding is already UTF-8, so the old
# Python 2 reload(sys) / sys.setdefaultencoding('utf8') workaround is a
# no-op here; this package initializer can be left empty.
utils/dbmysql.py
# coding=utf-8
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy.sql import text
import config
from utils import util

# Read the connection string from the config file
DB_URI = config.MySql.DB_URI_MYSQL
engine = create_engine(DB_URI, echo=False, pool_size=10, pool_recycle=60)
# Build the DBSession class once at module level; each call opens its own session
DB_Session = sessionmaker(bind=engine)

# Insert, update, and delete operations
def query(sql):
    # Create a session object
    DB = DB_Session()
    try:
        # Execute the SQL statement
        DB.execute(text(sql))
        DB.commit()
        return True
    except Exception as ex:
        util.logger.error("exec sql got error:%s" % (ex))
        DB.rollback()
        return False
    finally:
        DB.close()

# Fetch the first row of a query
def first(sql):
    DB = DB_Session()
    try:
        # .first() returns the first row of the result set
        rs = DB.execute(text(sql)).first()
        DB.commit()
        return rs
    except Exception as ex:
        util.logger.error("exec sql got error:%s" % (ex))
        DB.rollback()
        return False
    finally:
        DB.close()

# Fetch all rows of a query
def fetchall(sql):
    DB = DB_Session()
    try:
        # .fetchall() returns every row of the result set
        rs = DB.execute(text(sql)).fetchall()
        DB.commit()
        return rs
    except Exception as ex:
        util.logger.error("exec sql got error:%s" % (ex))
        DB.rollback()
        return False
    finally:
        DB.close()
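A quick usage example of these helpers (the SELECTs assume the urls table defined at the end of this post):

# coding=utf-8
from utils import dbmysql

# first() returns a single row (or False on error), fetchall() a list of rows.
row = dbmysql.first("select count(1) from urls;")
if row:
    print("urls stored so far:", row[0])
# The "or []" guards against the False returned on error.
for r in dbmysql.fetchall("select url from urls limit 5;") or []:
    print(r[0])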
utils/util.py
# coding=utf-8
import requests
import logging, uuid

try:
    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s - %(filename)s -%(funcName)s %(levelname)s]:%(message)s')
    logger = logging.getLogger()
except Exception as ex:
    print(ex)

def get(url, params=None, headers=None, proxies=None, verify=None, cookie=None):
    s = requests.session()
    try:
        if params:
            s.params = params
        if headers:
            s.headers = headers
        if proxies:
            s.proxies = proxies
        if verify is not None:  # "is not None" so verify=False is honored
            s.verify = verify
        if cookie:
            s.cookies = cookie
        r = s.get(url=url, timeout=20)  # timeout belongs on the get() call itself
        return r.content
    except Exception as ex:
        print(ex)
    finally:
        if s:
            s.close()

def post(url, data, params=None, headers=None, proxies=None, verify=None, cookie=None):
    s = requests.session()
    try:
        if params:
            s.params = params
        if headers:
            s.headers = headers
        if proxies:
            s.proxies = proxies
        if verify is not None:
            s.verify = verify
        if cookie:
            s.cookies = cookie
        r = s.post(url=url, data=data, timeout=20)  # timeout belongs on the post() call itself
        return r.content
    except Exception as ex:
        print(ex)
    finally:
        if s:
            s.close()

def getNoHtml(html):
    # Strip HTML tags, replacing each tag with a space
    import re
    dr = re.compile(r'<[^>]+>', re.S)
    dd = dr.sub(' ', html)
    return dd

def getUUID():
    return str(uuid.uuid4())

# def replaceTs(name):
#     s = name.replace("'", '"')
#     return s
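getNoHtml simply replaces every tag with a space, which is why the stored welfare and jobInfo text can contain runs of spaces; for example:

# coding=utf-8
from utils import util

html = '<p class="t2"><span>五险一金</span> <span>年终奖金</span></p>'
# Tags become spaces, leaving roughly " 五险一金   年终奖金 "
print(util.getNoHtml(html))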
xiangMu/__init__.py
# -*- coding: utf-8 -*-
# Same as utils/__init__.py: the Python 2 default-encoding shim is
# unnecessary under Python 3, so this initializer is effectively empty.
xiangMu/base.py
# coding=utf-8
from biz import urlbiz

class Base():
    def __init__(self):
        self.url = None
        self.job = None
        self.company = None
        self.companyWeb = None
        self.recruitmentInfo = None
        self.salary = None
        self.welfare = None
        self.workAddr = None
        self.companyMap = None
        self.jobInfo = None

    def getDataByUrl(self):
        rs = urlbiz.getDataByUrl(self.url)
        return rs

    def insertData(self):
        urlbiz.insertData(url=self.url, job=self.job, company=self.company, companyWeb=self.companyWeb,
                          recruitmentInfo=self.recruitmentInfo, salary=self.salary, welfare=self.welfare,
                          workAddr=self.workAddr, companyMap=self.companyMap, jobInfo=self.jobInfo)
        self.cleardata()

    def cleardata(self):
        # Reset every field so the next detail page starts clean
        self.url = None
        self.job = None
        self.company = None
        self.companyWeb = None
        self.recruitmentInfo = None
        self.salary = None
        self.welfare = None
        self.workAddr = None
        self.companyMap = None
        self.jobInfo = None
xiangMu/wuYiJob.py
#coding=utf-8
from utils import util
import re, time
from xiangMu import base

head = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        # "Cookie": "bdshare_firstime=1524211930835;"
        #           " username=tangjes; _identity-frontend=af16ccf063c411fcab235bb149f5a5c2d3274609c5ef9f1bf4f02b0e96ebde32a%3A2%3A%7Bi%3A0%3Bs%3A18%3A%22_identity-frontend%22%3Bi%3A1%3Bs%3A20%3A%22%5B2946954%2C%22%22%2C2592000%5D%22%3B%7D;"
        #           " Hm_lvt_ac168fe809d6bd1c9e16493d086e741e=1524217555,1524275270,1524291622,1524300800;"
        #           " Hm_lpvt_ac168fe809d6bd1c9e16493d086e741e=1524300800",
        "Host": "search.51job.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/65.0.3325.181 Safari/537.36"}
class jobInfomations(base.Base):
    def __init__(self):
        base.Base.__init__(self)
        # self.login()

    def getXingxi(self):
        # Walk the first 100 result pages of the python keyword search
        for i in range(1, 101):
            url = ("https://search.51job.com/list/040000,000000,0000,00,9,99,python,2,%s.html?" % i)
            r = util.get(url, headers=head)
            if not r:  # util.get returns None on failure, so check before decoding
                continue
            r = r.decode('gbk').replace('\n', '').replace('\r', '').replace('\t', '')
            info = re.findall(
                '<input class="checkbox" type="checkbox".*?href="(.*?)"',
                r)
            if len(info) > 0:
                for item in info:
                    self.url = item
                    time.sleep(3)
                    rs = self.getDataByUrl()
                    # rs says whether this url is already in the database:
                    # 0 = database error, stop; 2 = already stored, skip;
                    # otherwise crawl the detail page.
                    if rs == 0:
                        return
                    elif rs == 2:
                        continue
                    self.__getDetail()
    def __getDetail(self):
        ret = util.get(self.url)
        if not ret:  # skip this url if the request failed
            return
        ret = ret.decode('gbk').replace('\n', '').replace('\r', '').replace('\t', '')
        zhiwei = re.findall('<h1 title="(.*?)">', ret)
        if len(zhiwei) > 0:  # job title
            self.job = zhiwei[0]
        gongsi = re.findall('target="_blank" title="(.*?)"', ret)
        if len(gongsi) > 0:  # company name
            self.company = gongsi[0]
        gongsiWeb = re.findall('<a track-type="jobsButtonClick" event-type="2" class="i_house" href="(.*?)" target="_blank">', ret)
        if len(gongsiWeb) > 0:  # company page
            self.companyWeb = gongsiWeb[0]
        zhaopinInfo = re.findall('<div class="t1">.*?</em>(.*?)</span>.*?</em>(.*?)'
                                 '</span>.*?</em>(.*?)</span>.*?</em>(.*?)</span><div', ret)
        if len(zhaopinInfo) > 0:  # recruitment details
            self.recruitmentInfo = ','.join(zhaopinInfo[0])
        xinZi = re.findall('</span><strong>(.*?)</strong>', ret)
        if len(xinZi) > 0:  # monthly salary
            self.salary = xinZi[0]
        fuLi = re.findall('<p class="t2">(.*?)</p>', ret)
        if len(fuLi) > 0:  # benefits
            self.welfare = util.getNoHtml(fuLi[0])
        diZhi = re.findall('<div class="bmsg inbox">.*?</span>(.*?)</p>', ret)
        if len(diZhi) > 0:  # work address
            self.workAddr = diZhi[0]
        luXian = re.findall('<div class="tBorderTop_box">.*? onclick="showMapIframe(.*?);return false;">', ret)
        if len(luXian) > 0:  # company map link
            self.companyMap = luXian[0].split(',')[0][2:-1]
        jobXinxi = re.findall('<div class="bmsg job_msg inbox">(.*?)<div', ret)
        if len(jobXinxi) > 0:  # job responsibilities
            self.jobInfo = util.getNoHtml(jobXinxi[0])
        # comInfo = re.findall('<div class="tmsg inbox">(.*?)</div>', ret)
        # if len(comInfo) > 0:
        #     self.companyInfo = util.getNoHtml(comInfo[0])
        # Write the record to the database
        self.insertData()
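These regexes are tied to 51job's page markup at the time of writing and will silently match nothing if the layout changes; a quick way to sanity-check one of them (the HTML fragment below is illustrative, not captured from the live site):

# coding=utf-8
import re

# Illustrative fragment mimicking the detail-page title markup.
sample = '<h1 title="Python开发工程师">Python开发工程师</h1>'
zhiwei = re.findall('<h1 title="(.*?)">', sample)
print(zhiwei[0] if zhiwei else "layout changed - regex matched nothing")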
config.py
class MySql():
    # Connection string; the database name matches the 51job schema below
    DB_URI_MYSQL = "mysql+pymysql://root:root@localhost:3306/51job?charset=utf8"
run.py
# -*- coding: utf-8 -*-
from xiangMu import wuYiJob
import time

# Re-crawl every 60 seconds, forever
while 1:
    xs = wuYiJob.jobInfomations()
    xs.getXingxi()
    time.sleep(60)
test.py
# A scratch script verifying the split logic used for companyMap in __getDetail
s = "('https://search.51job.com/jobsearch/bmap/map.php?jobid=101545728','龙珠四路金谷创业园B座5层509')"
s2 = s.split(',')
print(s2[0][2:-1])
One database is created, named 51job, with two tables under it.
1. The first table, urls:
CREATE TABLE `urls` (
`logId` varchar(100) DEFAULT NULL,
`insertTime` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
`url` varchar(200) DEFAULT NULL,
KEY `logId` (`logId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
2. The second table, info:
CREATE TABLE `info` (
`logId` varchar(100) DEFAULT NULL,
`insertTime` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
`job` varchar(100) DEFAULT NULL,
`company` varchar(100) DEFAULT NULL,
`companyWeb` varchar(200) DEFAULT NULL,
`recruitmentInfo` longtext,
`salary` varchar(100) DEFAULT NULL,
`welfare` longtext,
`workAddr` varchar(200) DEFAULT NULL,
`companyMap` varchar(255) DEFAULT NULL,
`jobInfo` longtext,
KEY `logId` (`logId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
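Deduplication currently relies only on the count(1) lookup in urldao.getDataUrl; if you want the database to enforce it as well, a unique index on urls.url is an optional hardening (not part of the original schema):

-- Optional: enforce url uniqueness at the database level.
ALTER TABLE `urls` ADD UNIQUE KEY `uk_url` (`url`);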
Run the run.py file directly to start the crawler.

