目前遇到base64验证码的识别问题,使用代码无法解决。
最后的解决方案是:
1.拿到验证码图片的base64编码
2.使用在线解码网站解码
3.使用selenium进行截图
4.将截图用tesseract进行识别
具体代码如下:
# Script: for every MongoDB record selected below, search the residential area
# on newhouse.cnnbfdc.com, read the number that the site serves as a base64
# image, and write it back to the record as total_house.
#
# The base64 image is not decoded locally. Instead the browser pastes it into
# an online base64-to-image tool, a screenshot of the rendered result is taken,
# and Tesseract OCR is run on the cropped screenshot.
#
# Relies on names defined outside this snippet: browser (a Selenium WebDriver),
# col_old (a MongoDB collection), time, Image (PIL) and pytesseract.

browser.implicitly_wait(60)  # implicit wait: up to 60 seconds per element lookup

# Crop box (left, top, right, bottom) of the rendered image inside the
# screenshot of the tool page. The values were found by trial and error.
# Deriving the box from the result element
# (//div[@class='JsTxtCo bor-a1s h200 ptb10 plr10 pr'], via .location / .size)
# was sometimes inaccurate; the cause has not been investigated yet.
CODE_CROP_BOX = (34, 320, 100, 345)

# Fetch the records to process from MongoDB.
cursor = col_old.find({"total_house": 716}).batch_size(30).skip(100)
for item in cursor:
    name = item["residential_area"]
    url = "https://newhouse.cnnbfdc.com/project/page_1?q=%s" % name
    browser.get(url)
    residential_number = int(
        browser.find_element_by_xpath(
            "//div[@class='listbody__head__result']/strong"
        ).text
    )
    # Only searches that report exactly one result are processed.
    if residential_number != 1:
        continue

    # Drive the browser through the decoding steps, clicking in page order.
    try:
        browser.find_element_by_xpath("//div[@class='group-left']/a/img").click()
        img_base64 = browser.find_element_by_xpath(
            "//div[@class='latest-news-list']/div[1]/div[@class='latest-news-list__item__right']/ul/li[3]/img"
        ).get_attribute("src")

        # Paste the base64 string into the online tool and render it.
        browser.get("http://tool.chinaz.com/tools/imgtobase/")
        base_input = browser.find_element_by_id("basestr")
        base_input.clear()
        base_input.send_keys(img_base64)
        browser.find_element_by_id("basetoimg").click()
        time.sleep(2)  # give the tool page time to render the decoded image

        # Screenshot the page, then keep only the region showing the image.
        # The crop overwrites code.png so the OCR input can be inspected.
        browser.save_screenshot('code.png')
        page_snap_obj = Image.open('code.png')
        image_obj = page_snap_obj.crop(CODE_CROP_BOX)
        image_obj.save("code.png")

        # Recognise the cropped image with Tesseract, treating it as a single
        # text line (page segmentation mode 7).
        # NOTE(review): Tesseract 4+ spells this flag "--psm 7"; "-psm" is the
        # Tesseract 3.x spelling. Confirm against the installed version.
        text = pytesseract.image_to_string(image_obj, lang='eng', config="-psm 7")
        print(text)
        total_house = int(text)

        # Write the recognised value back to MongoDB.
        # NOTE(review): Collection.update is deprecated since PyMongo 3 and
        # removed in PyMongo 4; update_one is the replacement if the installed
        # driver supports it.
        col_old.update(
            {"residential_area": name},
            {"$set": {"total_house": total_house}},
            upsert=True,
        )
        print("更新成功")
    except Exception as exc:
        # Best effort per record: report the failure and move on to the next
        # one. Catching Exception (not a bare except) keeps Ctrl+C and
        # SystemExit working, and the cause is printed instead of discarded.
        print("出错: %r" % (exc,))
网友评论