51job Scraping Project

Author: 楚糖的糖 | Published 2018-11-14 20:41

1. Create a new project named 51job in a Python 3 environment.

Project structure screenshot: 51job (2).png

In biz/urlbiz.py:

#coding=utf-8
from dao import urldao, infodao

def getDataByUrl(url):
    # return codes, consumed by the crawler loop in xiangMu/wuYiJob.py:
    # 0 = DB query failed, 1 = url not stored yet, 2 = url already exists
    rs = urldao.getDataUrl(url)
    if rs == False:
        return 0
    elif rs[0] == 0:
        return 1
    else:
        return 2

def insertData(url,job,company,companyWeb,recruitmentInfo,salary,welfare,workAddr,companyMap,jobInfo):
    # insert the url row first to get its logId
    logId = urldao.insertUrl(url=url)
    # then insert the detail row into info, keyed by the same logId
    infodao.insertInfo(logId,job,company,companyWeb,recruitmentInfo,salary,welfare,workAddr,companyMap,jobInfo)

dao/urldao.py

#coding=utf-8
from utils import util, dbmysql


def insertUrl(url):
    try:
        logId = util.getUUID()
        sql = "insert into urls(logId,insertTime,url) VALUES ('%s',now(),'%s');" % (logId, url)
        dbmysql.query(sql)
        return logId
    except Exception as ex:
        util.logger.error(ex)
        return None


def getDataUrl(url):
    rs = None
    try:
        # count(1) tells us whether this url has been crawled before
        sql = "select count(1) from urls where url='%s';" % url
        rs = dbmysql.first(sql)
    except Exception as ex:
        util.logger.error(ex)
    return rs

dao/infodao.py

#coding=utf-8
from utils import util, dbmysql

def insertInfo(logId,job,company,companyWeb,recruitmentInfo,salary,welfare,workAddr,companyMap,jobInfo):
    try:
        sql = "insert into info(logId,insertTime,job,company,companyWeb,recruitmentInfo,salary,welfare,workAddr,companyMap,jobInfo) VALUES ('%s',now(),'%s','%s','%s','%s','%s','%s','%s','%s','%s');" % (
            logId,job,company,companyWeb,recruitmentInfo,salary,welfare,workAddr,companyMap,jobInfo)
        dbmysql.query(sql)
    except Exception as ex:
        util.logger.error(ex)
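
Both DAOs splice values into the SQL with %-interpolation, so a stray quote in a scraped field (a job description, say) can break the statement or inject SQL — the commented-out replaceTs helper in utils/util.py below hints at this very problem. A minimal sketch of the safer bind-parameter form (insert_url_safe is hypothetical, not part of the project; it assumes the same connection string as config.py):

# sketch: parameterized insert via SQLAlchemy bind parameters
from sqlalchemy import create_engine
from sqlalchemy.sql import text

engine = create_engine("mysql+pymysql://root:root@localhost:3306/51Job?charset=utf8")

def insert_url_safe(logId, url):
    # :logId and :url are escaped by the driver, so quotes inside the
    # values cannot terminate the string literal or inject extra SQL
    sql = text("insert into urls(logId, insertTime, url) VALUES (:logId, now(), :url)")
    with engine.begin() as conn:  # commits on success, rolls back on error
        conn.execute(sql, {"logId": logId, "url": url})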

utils/__init__.py

import sys

# Python 2 era encoding shim: in Python 3 the default encoding is already
# 'utf-8' and sys.setdefaultencoding() no longer exists, so this package
# __init__ has nothing to do beyond asserting the expected encoding.
assert sys.getdefaultencoding() == 'utf-8'

utils/dbmysql.py

# coding=utf-8
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy.sql import text
import config
from utils import util

# read the connection string from the config file
DB_URI = config.MySql.DB_URI_MYSQL
engine = create_engine(DB_URI, echo=False, pool_size=10, pool_recycle=60)

# insert / update / delete
def query(sql):
    # build the Session class bound to our engine
    DB_Session = sessionmaker(bind=engine)
    # create a session object
    DB = DB_Session()
    try:
        # execute the sql statement
        DB.execute(text(sql))
        DB.commit()
        return True
    except Exception as ex:
        util.logger.error("exec sql got error:%s" % (ex))
        DB.rollback()
        return False
    finally:
        DB.close()


# fetch the first row
def first(sql):
    DB_Session = sessionmaker(bind=engine)
    DB = DB_Session()
    try:
        # execute the sql; .first() returns the first row of the result
        rs = DB.execute(text(sql)).first()
        DB.commit()
        return rs
    except Exception as ex:
        util.logger.error("exec sql got error:%s" % (ex))
        DB.rollback()
        return False
    finally:
        DB.close()


# fetch all rows
def fetchall(sql):
    DB_Session = sessionmaker(bind=engine)
    DB = DB_Session()
    try:
        # execute the sql; .fetchall() returns every row of the result
        rs = DB.execute(text(sql)).fetchall()
        DB.commit()
        return rs
    except Exception as ex:
        util.logger.error("exec sql got error:%s" % (ex))
        DB.rollback()
        return False
    finally:
        DB.close()
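
One detail worth noting: sessionmaker builds a session factory and only needs to run once per engine; the three functions above rebuild it on every call, which works but is wasted effort. A possible variant, reusing the imports and engine already defined in this module (a sketch; behavior is otherwise unchanged):

# variant sketch: build the session factory once at import time
DB_Session = sessionmaker(bind=engine)

def query(sql):
    DB = DB_Session()  # sessions stay per-call; only the factory is shared
    try:
        DB.execute(text(sql))
        DB.commit()
        return True
    except Exception as ex:
        util.logger.error("exec sql got error:%s" % (ex))
        DB.rollback()
        return False
    finally:
        DB.close()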

utils/util.py

# coding=utf-8
import requests
import logging, re, uuid
try:
    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s - %(filename)s -%(funcName)s %(levelname)s]:%(message)s')
    logger = logging.getLogger()
except Exception as ex:
    print(ex)


def get(url, params=None, headers=None, proxies=None, verify=None, cookie=None):
    s = requests.Session()
    try:
        if params:
            s.params = params
        if headers:
            s.headers = headers
        if proxies:
            s.proxies = proxies
        if verify is not None:  # `is not None` so verify=False can disable SSL checks
            s.verify = verify
        if cookie:
            s.cookies = cookie
        r = s.get(url=url, timeout=20)  # timeout belongs on the request call itself
        return r.content
    except Exception as ex:
        print(ex)
    finally:
        if s:
            s.close()


def post(url, data, params=None, headers=None, proxies=None, verify=None, cookie=None):
    s = requests.Session()
    try:
        if params:
            s.params = params
        if headers:
            s.headers = headers
        if proxies:
            s.proxies = proxies
        if verify is not None:  # `is not None` so verify=False can disable SSL checks
            s.verify = verify
        if cookie:
            s.cookies = cookie
        r = s.post(url=url, data=data, timeout=20)  # timeout belongs on the request call itself
        return r.content
    except Exception as ex:
        print(ex)
    finally:
        if s:
            s.close()


def getNoHtml(html):
    # strip HTML tags, replacing each tag with a single space
    dr = re.compile(r'<[^>]+>', re.S)
    dd = dr.sub(' ', html)
    return dd

def getUUID():
    return str(uuid.uuid4())

# def replaceTs(name):
#     s = name.replace("'", '"')
#     return s
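
A quick hand-run check of the two helpers above (expected output shown in comments; the UUID is random on every call):

# sanity check for getNoHtml / getUUID, run from inside utils/util.py
print(getNoHtml('<p class="t2">五险一金 <span>年终奖</span></p>'))
# -> ' 五险一金  年终奖 '  (every tag collapses to a single space)
print(getUUID())
# -> e.g. '0d3b0cbe-8d4f-4a8e-9c1a-2f6d5a7b1c3d'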

xiangMu/__init__.py

# -*- coding: utf-8 -*-
import sys

# Same Python 2 era encoding shim as utils/__init__.py; under Python 3 the
# default encoding is already utf-8, so this only asserts that fact.
assert sys.getdefaultencoding() == 'utf-8'

xiangMu/base.py

# coding=utf-8
from biz import urlbiz

class Base():
    def __init__(self):
        self.url = None
        self.job = None  # insertData()/cleardata() expect self.job, not position
        self.company = None
        self.companyWeb = None
        self.recruitmentInfo = None
        self.salary = None
        self.welfare = None
        self.workAddr = None
        self.companyMap = None
        self.jobInfo = None

    def getDataByUrl(self):
        rs = urlbiz.getDataByUrl(self.url)
        return rs

    def insertData(self):
        urlbiz.insertData(url=self.url, job=self.job, company=self.company, companyWeb=self.companyWeb,
                          recruitmentInfo=self.recruitmentInfo, salary=self.salary, welfare=self.welfare,
                          workAddr=self.workAddr, companyMap=self.companyMap, jobInfo=self.jobInfo)
        self.cleardata()

    def cleardata(self):
        self.url = None
        self.job = None
        self.company = None
        self.companyWeb = None
        self.recruitmentInfo = None
        self.salary = None
        self.welfare = None
        self.workAddr = None
        self.companyMap = None
        self.jobInfo = None

bse = Base()
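
In miniature, this is how a scraper is meant to drive Base (a sketch with a made-up URL; the real flow is in xiangMu/wuYiJob.py below):

b = Base()
b.url = "https://jobs.51job.com/shenzhen/12345678.html"  # hypothetical detail url
rs = b.getDataByUrl()  # 0 = DB error, 1 = new url, 2 = already stored
if rs == 1:
    b.job = "python"   # ...fill the remaining fields from the parsed page...
    b.insertData()     # writes urls + info, then cleardata() resets the fields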

xiangMu/wuYiJob.py

#coding=utf-8
from utils import util
import re,time
from xiangMu import base

head = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9",
                "Cache - Control": "max - age = 0",
                "Connection": "keep-alive",
                # "Cookie": "Cookie:bdshare_firstime=1524211930835;"
                #           " username=tangjes; _identity-frontend=af16ccf063c411fcab235bb149f5a5c2d3274609c5ef9f1bf4f02b0e96ebde32a%3A2%3A%7Bi%3A0%3Bs%3A18%3A%22_identity-frontend%22%3Bi%3A1%3Bs%3A20%3A%22%5B2946954%2C%22%22%2C2592000%5D%22%3B%7D;"
                #           " Hm_lvt_ac168fe809d6bd1c9e16493d086e741e=1524217555,1524275270,1524291622,1524300800;"
                #           " Hm_lpvt_ac168fe809d6bd1c9e16493d086e741e=1524300800",
                "Host": "search.51job.com",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                              "AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/65.0.3325.181 Safari/537.36"}

class jobInfomations(base.Base):
    def __init__(self):
        base.Base.__init__(self)
        # self.login()
    def getXingxi(self):
        for i in range(1, 101):
            url = ("https://search.51job.com/list/040000,000000,0000,00,9,99,python,2,%s.html?" % i)
            r = util.get(url, headers=head)
            if r:
                # 51job serves GBK; flatten whitespace so the regexes can span lines
                r = r.decode('gbk').replace('\n', '').replace('\r', '').replace('\t', '')
                info = re.findall(
                    '<input class="checkbox" type="checkbox".*?href="(.*?)"',
                    r)
                if len(info) > 0:
                    for item in info:
                        self.url = item
                        time.sleep(3)
                        rs = self.getDataByUrl()
                        # rs says whether this url is already in the database:
                        # 0 = DB error, stop; 2 = already stored, skip;
                        # 1 = new url, go scrape the detail page
                        if rs == 0:
                            return
                        elif rs == 2:
                            continue
                        self.__getDetail()

    def __getDetail(self):
        ret = util.get(self.url)
        if not ret:
            return
        ret = ret.decode('gbk').replace('\n', '').replace('\r', '').replace('\t', '')
        zhiwei = re.findall('<h1 title="(.*?)">', ret)
        if len(zhiwei) > 0:                 # job title
            self.job = zhiwei[0]
        gongsi = re.findall('target="_blank" title="(.*?)"', ret)
        if len(gongsi) > 0:                 # company name
            self.company = gongsi[0]
        gongsiWeb = re.findall('<a track-type="jobsButtonClick" event-type="2" class="i_house" href="(.*?)" target="_blank">', ret)
        if len(gongsiWeb) > 0:              # company page
            self.companyWeb = gongsiWeb[0]
        zhaopinInfo = re.findall('<div class="t1">.*?</em>(.*?)</span>.*?</em>(.*?)'
                                 '</span>.*?</em>(.*?)</span>.*?</em>(.*?)</span><div', ret)
        if len(zhaopinInfo) > 0:            # posting details (location, experience, ...)
            self.recruitmentInfo = ','.join(zhaopinInfo[0])
        xinZi = re.findall('</span><strong>(.*?)</strong>', ret)
        if len(xinZi) > 0:                  # monthly salary
            self.salary = xinZi[0]
        fuLi = re.findall('<p class="t2">(.*?)</p>', ret)
        if len(fuLi) > 0:                   # benefits
            self.welfare = util.getNoHtml(fuLi[0])
        diZhi = re.findall('<div class="bmsg inbox">.*?</span>(.*?)</p>', ret)
        if len(diZhi) > 0:                  # work address
            self.workAddr = diZhi[0]
        luXian = re.findall('<div class="tBorderTop_box">.*? onclick="showMapIframe(.*?);return false;">', ret)
        if len(luXian) > 0:                 # company map url
            self.companyMap = luXian[0].split(',')[0][2:-1]
        jobXinxi = re.findall('<div class="bmsg job_msg inbox">(.*?)<div', ret)
        if len(jobXinxi) > 0:               # job description / responsibilities
            self.jobInfo = util.getNoHtml(jobXinxi[0])

        # comInfo = re.findall('<div class="tmsg inbox">(.*?)</div>', ret)
        # if len(comInfo) > 0:
        #     self.companyInfo = util.getNoHtml(comInfo[0])

        # save the scraped fields to the database
        self.insertData()
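
The detail-page regexes are brittle by nature, so it pays to sanity-check them against a saved fragment before a full run. A quick sketch with a made-up snippet:

import re

# hypothetical fragment of a 51job detail page
sample = '<h1 title="Python开发工程师">Python开发工程师</h1></span><strong>8千-1.5万/月</strong>'
print(re.findall('<h1 title="(.*?)">', sample))             # ['Python开发工程师']
print(re.findall('</span><strong>(.*?)</strong>', sample))  # ['8千-1.5万/月']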

config.py

class MySql():
    DB_URI_MYSQL = "mysql+pymysql://root:root@localhost:3306/51Job?charset=utf8"

run.py

# -*- coding: utf-8 -*-

from xiangMu import wuYiJob
import time

while True:
    xs = wuYiJob.jobInfomations()
    xs.getXingxi()
    time.sleep(60)  # wait a minute before crawling again

test.py

s="('https://search.51job.com/jobsearch/bmap/map.php?jobid=101545728','龙珠四路金谷创业园B座5层509')"
s2=s.split(',')
print(s2[0][2:-1])

One database is created, named 51job, and it contains two tables.

1. The first table, urls:

CREATE TABLE `urls` (
  `logId` varchar(100) DEFAULT NULL,
  `insertTime` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  `url` varchar(200) DEFAULT NULL,
  KEY `logId` (`logId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

2. The second table, info:

CREATE TABLE `info` (
  `logId` varchar(100) DEFAULT NULL,
  `insertTime` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  `job` varchar(100) DEFAULT NULL,
  `company` varchar(100) DEFAULT NULL,
  `companyWeb` varchar(200) DEFAULT NULL,
  `recruitmentInfo` longtext,
  `salary` varchar(100) DEFAULT NULL,
  `welfare` longtext,
  `workAddr` varchar(200) DEFAULT NULL,
  `companyMap` varchar(255) DEFAULT NULL,
  `jobInfo` longtext,
  KEY `logId` (`logId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

To start the crawler, just run run.py.
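
Once a few pages have been crawled, the project's own fetchall helper is an easy way to spot-check what landed in the tables (a sketch):

# spot-check a few scraped rows
from utils import dbmysql

for row in dbmysql.fetchall("select job, company, salary from info limit 5;"):
    print(row)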

Screenshots of the populated tables: urls.png, info.png
