import os
import base64
import urllib
import requests
import re
import pandas as pd
# Baidu image-recognition (OCR) API credentials.
# Set the BAIDU_API_KEY / BAIDU_SECRET_KEY environment variables, or replace
# the placeholders below with the keys issued for your own application.
# Reading from the environment first keeps real secrets out of the source file.
API_KEY = os.environ.get("BAIDU_API_KEY", "XXXXXXXXX")
SECRET_KEY = os.environ.get("BAIDU_SECRET_KEY", "XXXXXXXXX")
def extract_word(path):
    """Post the image at ``path`` to Baidu's ``ocr/v1/accurate`` endpoint.

    The file is read, base64-encoded and URL-encoded, then sent as the
    ``image`` form field. An access token is requested on every call and
    appended to the endpoint URL.

    :param path: path of the image file to recognise
    :return: the ``requests.Response`` from the OCR endpoint (callers decode
        it with ``.json()``)
    :raises requests.exceptions.Timeout: if the service does not answer within
        the timeout
    """
    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate?access_token=" + get_access_token()
    image = get_file_content_as_base64(path, urlencoded=True)
    payload = "image=" + image
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "application/json",
    }
    # (connect, read) timeout: without one, a stalled connection would block
    # the whole batch forever.
    return requests.post(url, headers=headers, data=payload, timeout=(10, 60))
def get_file_content_as_base64(path, urlencoded=False):
    """Return the content of a file as base64 text.

    :param path: path of the file to read (opened in binary mode)
    :param urlencoded: when true, additionally URL-encode the base64 text so
        it can be placed in a form-encoded request body
    :return: the base64 string, URL-encoded if requested
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that the
    # ``urllib.parse`` submodule has been loaded.
    from urllib.parse import quote_plus

    with open(path, "rb") as f:
        content = base64.b64encode(f.read()).decode("utf8")
    return quote_plus(content) if urlencoded else content
def get_access_token():
    """Request an access token from Baidu using API_KEY and SECRET_KEY.

    :return: the access token as a string. The value is always passed through
        ``str()``, so when the response carries no ``access_token`` field the
        literal string ``"None"`` is returned rather than the ``None`` object.
    :raises requests.exceptions.Timeout: if the token endpoint does not answer
        within the timeout
    """
    url = "https://aip.baidubce.com/oauth/2.0/token"
    params = {"grant_type": "client_credentials", "client_id": API_KEY, "client_secret": SECRET_KEY}
    # Bound the wait so an unreachable token endpoint cannot hang the script.
    response = requests.post(url, params=params, timeout=10)
    return str(response.json().get("access_token"))
def transform_word(response):
    """Turn an OCR response into a one-row DataFrame of labelled fields.

    Each entry of ``response["words_result"]`` is a recognised text line. A
    line starting with one of the known labels opens that field; following
    lines that start with no label are treated as a continuation and appended
    to the most recently opened field. Lines seen before any label are
    ignored. The input dictionary is not modified.

    :param response: decoded OCR response containing a ``"words_result"``
        list of dicts, each with a ``"words"`` string
    :return: ``pandas.DataFrame`` with a single row and one column per label
        found (no columns when no label was found)
    :raises KeyError: if ``"words_result"`` or a line's ``"words"`` is missing
    """
    field_labels = ('经度', '纬度', '地址', '时间', '海拔', '天气', '备注')
    result_data = {}
    current_key = None
    for item in response["words_result"]:
        words = item['words']
        label = next((name for name in field_labels if words.startswith(name)), None)
        if label is not None:
            current_key = label
            value = words[len(label):]
            # Skip the separator (colon or similar punctuation) after the
            # label, but only when one is actually there: if the OCR dropped
            # it, the first character already belongs to the value.
            if value and not value[0].isalnum():
                value = value[1:]
            result_data[current_key] = value
        elif current_key is not None:
            # Unlabelled line: continuation of the previous field.
            result_data[current_key] += words
    return pd.DataFrame([result_data])
def get_file_extension(file_path):
    """Return the lower-cased extension of ``file_path``, leading dot included.

    An empty string is returned when the path has no extension.
    """
    return os.path.splitext(file_path)[1].lower()
def main():
    """Run OCR on every image below the script's folder and export a CSV.

    Walks the directory that contains this file, sends each image to the OCR
    service, keeps results that yielded more than two labelled fields, and
    writes them to ``picture_information.csv`` in the current working
    directory. A failure on one image is reported and does not stop the run.
    """
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp'}
    # Walk every file below the folder that contains this script.
    folder_path = os.path.dirname(os.path.abspath(__file__))
    frames = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            if get_file_extension(file_path) not in image_extensions:
                continue
            print('正在提取' + file_path + '...')
            try:
                res = extract_word(file_path).json()
                info_pic_temp = transform_word(res)
            except Exception as e:
                # Best effort: keep processing the remaining images, but say
                # what went wrong instead of silently dropping the file.
                print(f'提取失败: {file_path} ({e!r})')
                continue
            # Only keep images where more than two fields were recognised.
            if info_pic_temp.shape[1] > 2:
                info_pic_temp.insert(0, '文件路径', file_path)
                frames.append(info_pic_temp)
    # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
    # pd.concat rejects an empty list, so fall back to an empty frame.
    info_pic = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    info_pic.to_csv('picture_information.csv', index=False, encoding='utf_8_sig')
    print('提取完成')


if __name__ == '__main__':
    main()
# 网友评论 (a "reader comments" heading left over from the web page; not part of the script)