Automatic CAPTCHA recognition on the 招聘狗 (zhaopingou) recruiting site with Selenium and Python 3.6

Author: Python树苗 | Published 2018-05-20 14:27

First, register an account. 招聘狗 (zhaopingou) is aimed at corporate HR, so it normally asks for enterprise verification; here we simply skip that step. The full implementation follows, with detailed comments:

import json
import os
import random
import re
import sys
import traceback
import time

from PIL import Image
from lxml import html as lxml_html

import selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains

import requests
import base64
from requests.exceptions import ConnectionError
import http.cookiejar
import logging

from dama2_API import Dama2API

# Third-party library that returns a random User-Agent string
from fake_useragent import UserAgent

ua = UserAgent()

class RTC_zhaopingou(object):

    def __init__(self, account: dict, debug=False, visible=-1, last_try=False):
        assert account['user_id']
        assert account['password']
        self.account = account  # used later by login()

        logging.info('Change webdriver to FireFox')

        # Create a requests session; it is used to fetch the list and detail pages
        self.session = requests.Session()
        self.session.headers = {
            'Host': "qiye.zhaopingou.com",
            "Origin": "http://qiye.zhaopingou.com",
            "Referer": "http://qiye.zhaopingou.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
        }

        # Requires a registered dama2 (打码兔) account; download their client code from the platform
        self.dama2 = Dama2API()

    def login(self):
        l = logging
        l.info("Processing Login...")

        self.driver = webdriver.Firefox()
        self.driver.set_window_size(1920, 1080)
        self.driver.implicitly_wait(10)
        driver = self.driver

        # login_url = 'http://qiye.zhaopingou.com/zhaopingou_interface/security_login?timestamp=' + str(int(time.time() * 1000))
        login_url = 'http://qiye.zhaopingou.com/'
        driver.get(login_url)

        # A city-selection dialog appears when the page opens; dismiss it
        driver.find_element_by_xpath('//div[@class="city-now citys"]').click()

        # Locate the username and password fields and type character by character, mimicking a human
        for i in self.account['username']:
            driver.find_element_by_xpath('//input[@placeholder="请输入手机号/邮箱/狗狗号"]').send_keys(i)
            time.sleep(random.uniform(0.2, 0.8))
        for j in self.account['password']:
            driver.find_element_by_xpath('//input[@placeholder="请输入密码"]').send_keys(j)
            time.sleep(random.uniform(0.2, 0.8))

        # Get the button that pops up the CAPTCHA. Pitfall: the button lives inside an iframe and
        # cannot be located directly; first switch into the first iframe with
        # driver.find_element_by_tag_name("iframe"), then locate the button by XPath.
        # iframe = driver.find_element_by_id('captcha_widget_aiwaylekc')
        driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))
        # driver.switch_to.frame('captcha_widget_aiwaylekc')
        driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').click()

        # Wait 5 seconds in case the CAPTCHA has not loaded yet, then switch from the iframe
        # back to the main document with driver.switch_to.default_content()
        time.sleep(5)
        driver.switch_to.default_content()

        # Clicking the button creates a second, sibling iframe; switch into that one
        driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])

        # CAPTCHA area
        captcha_xpath = '//div[@class="lc-panel"]'
        # captcha_xpath = '#l-captcha-float_aiwaylekc'
        captcha_ok = self._login_process_captcha(captcha_xpath)

        if captcha_ok:
            # CAPTCHA solved: submit the login form
            driver.switch_to.default_content()
            driver.find_element_by_id('form_login').click()
            time.sleep(3)
            current_url = driver.current_url

            # Check whether the post-login URL is the expected one
            expect_url = 'http://qiye.zhaopingou.com/'
            if current_url == expect_url:
                l.info('login success!!!')

                # Copy the cookies into the requests session so the list and detail pages can be crawled with it
                cookie = dict()
                print(driver.get_cookies())
                for item in driver.get_cookies():
                    # cookie += "; {}={}".format(item['name'], item["value"])
                    cookie[item['name']] = item['value']
                    if item['name'] == 'hrkeepToken':
                        self.token = item['value']

                # Store the cookies in the session
                self.session.cookies = requests.utils.cookiejar_from_dict(cookie)
                l.info("get cookie: {}".format(cookie))

                # Login succeeded; the webdriver is no longer needed
                self.driver.quit()
                return True
        else:
            l.info('login failed due to CAPTCHA')
            return False

    def _login_process_captcha(self, captcha_xpath):
        l = logging
        driver = self.driver
        captcha_element = driver.find_element_by_xpath(captcha_xpath)

        # CAPTCHA position and size
        offset = captcha_element.location
        print('offset:', offset)
        size = captcha_element.size

        # dama2 decoding API
        dama2 = self.dama2

        # Directory where the CAPTCHA screenshot is saved
        shm_dir = r'/tmp/zhaopingou/'
        if not os.path.exists(shm_dir):
            os.makedirs(shm_dir)
        captcha_img_path = os.path.join(shm_dir, 'captcha_img_{user_id}.png'.format(user_id=self.account['user_id']))

        maximum = 20
        attempt = 0
        while attempt <= maximum:
            l.info(f'Trying to decode CAPTCHA: {attempt}/{maximum}')

            # CAPTCHA element
            captcha_element = driver.find_element_by_xpath(captcha_xpath)
            # Screenshot the CAPTCHA and save it to captcha_img_path
            captcha_element.screenshot(captcha_img_path)

            try:
                # Call the dama2 API with the CAPTCHA type and image file; it returns a list of coordinates
                captcha_id, coordinate_list = dama2.decode_captcha(captcha_type=6137, file_path=captcha_img_path)
                l.info(f'coordinate_list:{coordinate_list}')
            except Exception as err:
                err_str = str(err)
                tb = traceback.format_exc()
                msg = f'Exception occurred when decode CAPTCHA, err: {err_str}, tb:\n{tb}'
                l.warning(msg)
                attempt += 1
                # On error, retry with a fresh screenshot
                continue

            # Move the mouse to each returned coordinate and click
            for xy in coordinate_list:
                action = ActionChains(driver)
                action.move_to_element_with_offset(captcha_element, xy[0], xy[1]).click()
                action.perform()
                time.sleep(random.uniform(0.5, 2))

            # Switch back to the main document, then into the first iframe, and read the text of the
            # pop-up button to see whether it now reports success
            driver.switch_to.default_content()
            driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[0])
            text = driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').text
            if text.find('验证成功') != -1:
                l.info('CAPTCHA verified successfully!')
                time.sleep(random.uniform(1, 2))
                return True
            else:
                # Failed: switch back into the second iframe and fetch a new CAPTCHA
                driver.switch_to.default_content()
                driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])
                l.info('fail, trying again')
                attempt += 1
                time.sleep(2)
                continue
        return False

    # Fetch a list page for a search keyword, positioned at a given page number
    def search(self, keyword, page_to_go):
        '''Search resumes; the resulting list page is JSON data.'''
        l = logging
        assert keyword
        self.keyword = keyword

        # POST parameters captured with the Firefox network inspector
        params = {
            "pageSize": page_to_go,
            "pageNo": "25",
            "keyStr": keyword,
            "companyName": "",
            "schoolName": "",
            "keyStrPostion": "",
            "postionStr": "",
            "startDegrees": "-1",
            "endDegress": "-1",
            "startAge": "0",
            "endAge": "0",
            "gender": "-1",
            "region": "",
            "timeType": "-1",
            "startWorkYear": "-1",
            "endWorkYear": "-1",
            "beginTime": "",
            "endTime": "",
            "isMember": "-1",
            "hopeAdressStr": "",
            "cityId": "-1",
            "updateTime": "",
            "tradeId": "",
            "clientNo": "",
            "userToken": self.token,
            "clientType": "2"
        }

        retry = 0
        while True:
            # Real request URL captured from the network traffic; the suffix is a millisecond timestamp
            search_url = "http://qiye.zhaopingou.com/zhaopingou_interface/find_warehouse_by_position_new?timestamp=" + str(int(time.time() * 1000))
            l.info('search_url:{}'.format(search_url))
            self.current_url = search_url
            l.debug(f'Open search page. url,params,keyword,userToken: {search_url},{params},{keyword},{self.token}')

            retry += 1
            if retry == 11:
                return ''
            try:
                # Request through the logged-in session
                res = self.session.post(search_url, data=params)
            except ConnectionError:
                l.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                self.current_url = search_url
                continue
            else:
                l.info('current url is:{}'.format(res.url))
                if res.url != search_url:
                    # Redirected away: the cookie has expired, so log in again
                    login_result = self.login()
                    if login_result:
                        continue
                    else:
                        l.warning("Login failed!")
                        sys.exit('login failed')
                elif not res.text:
                    l.info("Service is busy. Wait 5 minutes and retry...")
                    time.sleep(300)
                    l.info('Continue Searching...')
                    continue
                # Abnormal response: the body is very short
                elif len(str(res.text)) < 2000:
                    # If the response says '请您登录后查看简历' (log in to view resumes), log in again and retry
                    if '请您登录后查看简历' in str(res.text):
                        self.login()
                        continue
                    result = str(res.text)
                    # Rotate the User-Agent
                    self.session.headers['User-Agent'] = ua.firefox
                    l.info(f'errorcode msg:{result}')
                    l.info('Too frequent operation, please try again in a minute')
                    time.sleep(random.randint(61, 100))
                    continue
                else:
                    try:
                        # Normal response: parse the JSON, record the requested page number,
                        # then re-serialize it with json.dumps()
                        resume_list = json.loads(res.text)
                        resume_list["current_page"] = page_to_go
                        res = json.dumps(resume_list, ensure_ascii=False)
                        l.info(f'search_resume_list_info:{res}')
                        return res
                    except:
                        l.warning(res.text)
                        l.warning("something wrong! sleep 5 minutes and retry...")
                        time.sleep(300)
                        continue

    def open_resume(self, url):
        '''
        Open a resume and return the detail page.
        The url can be built from the base64-encoded user id.
        '''
        l = logging
        l.debug(f'Open a resume: request_url: {url}')
        resumeHtmlId = (url.split("="))[1]

        # Set the referer
        # self.session.headers['Referer'] = "http://qiye.zhaopingou.com/resume?key=" + self.keyword

        # Request parameters for the detail page, captured from the network traffic
        open_resume_data = {
            "resumeHtmlId": resumeHtmlId,
            "keyStr": "",
            "keyPositionName": "",
            "tradeId": "",
            "postionStr": "",
            "jobId": "0",
            "companyName": "",
            "schoolName": "",
            "clientNo": "",
            "userToken": self.token,
            "clientType": "2"
        }

        retry = 0
        while True:
            # Real detail-page URL captured from the network traffic
            openresumeurl = "http://qiye.zhaopingou.com/zhaopingou_interface/zpg_find_resume_html_details?timestamp=" + str(int(time.time() * 1000))
            l.info('resume_url:{}'.format(openresumeurl))

            retry += 1
            if retry == 11:
                return ''
            try:
                res = self.session.post(url=openresumeurl, data=open_resume_data)
            except ConnectionError:
                l.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                continue
            else:
                # Returned page
                l.info('current url is:{}'.format(res.url))
                if res.url != openresumeurl:
                    l.info("cookie is invalid. Login with webdriver")
                    login_result = self.login()
                    if login_result:
                        continue
                    else:
                        l.warning("Login failed!")
                        sys.exit('login failed')
                if not res.text:
                    l.info("Service is busy. Wait 5 minutes and retry...")
                    time.sleep(300)
                    continue
                elif len(str(res.text)) < 2000:
                    print('errorcode:', res.text)
                    result = str(res.text)
                    l.info(f'errorcode msg:{result}')
                    l.info('Too frequent operation, please try again in a minute')
                    time.sleep(random.randint(61, 100))
                    continue
                else:
                    try:
                        page_len = len(res.text)
                        self.current_url = openresumeurl
                        l.info(f'Downloaded a resume, len: {page_len:,d}, current_url: {url}')
                        resp_json = json.loads(res.text)
                        res_utf = json.dumps(resp_json, ensure_ascii=False)
                        return res_utf
                    except:
                        l.warning(res.text)
                        l.warning("something wrong! sleep 5 minutes and retry...")
                        time.sleep(300)
                        continue

if __name__ == '__main__':
    # The account and password below are fake; fill in your own
    rtc_zhaopingou = RTC_zhaopingou(account={'user_id': '-701', 'username': '13419696888', 'password': '123'},
                                    debug=False,
                                    visible=1, last_try=False)
    rtc_zhaopingou.login()

    keyword_list = ['python', '大数据', '人工智能', 'java']
    for kw in keyword_list:
        for i in range(1, 200):
            search_result = rtc_zhaopingou.search(kw, i)
            print('****************************************************************')

    res = rtc_zhaopingou.open_resume('http://qiye.zhaopingou.com/resume/detail?resumeId=5761920')
    print(res)

The dama2 (打码兔) client code has to be downloaded from their platform yourself; put it in the same directory and the script should run.
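The downloaded client is not shown in this article. As a placeholder, here is a minimal hypothetical stub of the interface the script above assumes: a Dama2API class whose decode_captcha(captcha_type, file_path) returns (captcha_id, coordinate_list). The class and method names match the calls in the code; everything else is an assumption, and the real client from the platform handles authentication and image upload.

# dama2_API.py -- hypothetical stub of the interface assumed above; replace it with the
# client downloaded from the dama2 (打码兔) platform.
class Dama2API(object):
    def decode_captcha(self, captcha_type: int, file_path: str):
        """Submit a CAPTCHA image and return (captcha_id, coordinate_list).

        For the click CAPTCHA used above (captcha_type=6137), coordinate_list is assumed to be
        a list of (x, y) offsets inside the screenshot, e.g. [(35, 60), (120, 48)].
        """
        raise NotImplementedError('download the real client from the dama2 platform')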

I hope this helps!

Feel free to follow my blog: https://home.cnblogs.com/u/Python1234/

You are also welcome to join our Q&A group: 125240963
