美文网首页
元道经纬相机照片文字提取2023-07-27

元道经纬相机照片文字提取2023-07-27

作者: 手握镰刀和锤子的打工人 | 来源:发表于2023-07-26 19:58 被阅读0次
    import os
    import base64
    import urllib
    import requests
    import re
    import pandas as pd
    
    # Baidu image-recognition (OCR) API credentials: fill in the API_KEY and
    # SECRET_KEY you applied for. Both are read by get_access_token().
    API_KEY = "XXXXXXXXX"
    SECRET_KEY = "XXXXXXXXX"
    
    def extract_word(path):
        """
        Send one image file to Baidu's "accurate" OCR endpoint.

        The file is base64-encoded and URL-encoded, then posted as the
        ``image`` form field. An access token is requested for every call.

        :param path: path of the image file to recognise
        :return: the ``requests.Response`` of the OCR call; callers decode
            it with ``.json()``
        :raises OSError: if the image file cannot be read
        :raises requests.exceptions.RequestException: on network failure
            or when the request times out
        """
        url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate?access_token=" + get_access_token()

        image = get_file_content_as_base64(path, urlencoded=True)
        payload = 'image=' + image

        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': 'application/json'
        }

        # requests has no default timeout: without one, a single stalled
        # connection would block the whole batch forever.
        # (connect timeout, read timeout) in seconds.
        response = requests.post(url, headers=headers, data=payload, timeout=(10, 60))

        return response
    
    def get_file_content_as_base64(path, urlencoded=False):
        """
        Read a file and return its content base64-encoded.

        :param path: path of the file to read (opened in binary mode)
        :param urlencoded: when true, additionally percent-encode the
            base64 text so it can be placed in a form-urlencoded body
        :return: the base64 text, optionally URL-encoded
        :raises OSError: if the file cannot be opened or read
        """
        # Imported here on purpose: a bare ``import urllib`` does not load
        # the ``urllib.parse`` submodule, so ``urllib.parse.quote_plus`` only
        # worked when some other package had already imported it.
        from urllib.parse import quote_plus

        with open(path, "rb") as f:
            raw = f.read()

        content = base64.b64encode(raw).decode("utf8")
        if urlencoded:
            # base64 output may contain '+', '/' and '=', all of which must
            # be escaped inside a form-urlencoded payload.
            content = quote_plus(content)
        return content
    
    def get_access_token():
        """
        Request an access token from Baidu's OAuth endpoint using the
        module-level API_KEY and SECRET_KEY (client-credentials grant).

        :return: the access token as a string. If the response carries no
            ``access_token`` field, the result is the literal string
            ``"None"`` (the value is passed through ``str()``), not ``None``.
        :raises requests.exceptions.RequestException: on network failure
            or when the request times out
        """
        url = "https://aip.baidubce.com/oauth/2.0/token"
        params = {"grant_type": "client_credentials", "client_id": API_KEY, "client_secret": SECRET_KEY}

        # requests has no default timeout; bound the wait so an unreachable
        # auth server cannot hang the script.
        response = requests.post(url, params=params, timeout=10)
        return str(response.json().get("access_token"))
    
    
    def transform_word(response):
        """
        Convert a decoded OCR response into a one-row DataFrame of fields.

        Every entry of ``response["words_result"]`` is one recognised text
        line. A line that starts with a known field name opens that field and
        supplies its value; a line that starts with no field name is appended
        to the most recently opened field (a wrapped value). Lines seen before
        any field has been opened are ignored.

        :param response: decoded JSON of the OCR call; it must contain
            ``"words_result"``, a list of dicts each holding a ``"words"``
            string. The argument is not modified.
        :return: a pandas DataFrame with one row and one column per field
            that was found, in order of first appearance
        :raises KeyError: if ``"words_result"`` or an entry's ``"words"`` is
            missing
        """
        field_names = ('经度', '纬度', '地址', '时间', '海拔', '天气', '备注')
        # Characters accepted as the single separator between name and value.
        separators = (':', ':', ' ')

        result_data = {}
        current_key = None

        for item in response["words_result"]:
            words = item['words']
            matched = next((name for name in field_names if words.startswith(name)), None)

            if matched is not None:
                current_key = matched
                value = words[len(matched):]
                # Drop the separator only when it is really there; blindly
                # skipping one character would eat the first character of the
                # value whenever OCR misses the colon.
                if value.startswith(separators):
                    value = value[1:]
                result_data[current_key] = value
            elif current_key is not None:
                # Continuation of a value that wrapped onto the next line.
                result_data[current_key] += words

        return pd.DataFrame([result_data])
    
    def get_file_extension(file_path):
        """
        Return the extension of ``file_path`` in lower case.

        :param file_path: file name or path
        :return: the extension including its leading dot, or an empty
            string when the name has none
        """
        suffix = os.path.splitext(file_path)[1]
        return suffix.lower()
    
    
    if __name__ == '__main__':
        # Script entry point: OCR every image found under this script's
        # directory and write the recognised fields to one CSV file.

        # Extensions (lower case, with dot) that are treated as images.
        image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp']

        # Walk the directory containing this script, including sub-directories.
        folder_path = os.path.dirname(os.path.abspath(__file__))

        # One single-row DataFrame per picture that was parsed successfully.
        frames = []

        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)

                if get_file_extension(file_path) not in image_extensions:
                    continue

                print('正在提取' + file_path + '...')
                try:
                    res = extract_word(file_path).json()
                    info_pic_temp = transform_word(res)
                except Exception as e:
                    # Best effort: one unreadable picture or failed request
                    # must not abort the batch, but the failure is reported
                    # instead of being silently discarded.
                    print('提取失败' + file_path + ': ' + repr(e))
                    continue

                # Keep only pictures where more than two fields were found.
                if info_pic_temp.shape[1] > 2:
                    info_pic_temp.insert(0, '文件路径', file_path)
                    frames.append(info_pic_temp)

        # DataFrame.append was removed in pandas 2.0, so the rows are combined
        # with pd.concat. pd.concat rejects an empty list, hence the fallback
        # to an empty frame when nothing was extracted.
        info_pic = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        # utf_8_sig writes a BOM so that Excel detects the encoding.
        info_pic.to_csv('picture_information.csv', index=False, encoding='utf_8_sig')

        print('提取完成')
    

    相关文章

      网友评论

          本文标题:元道经纬相机照片文字提取2023-07-27

          本文链接:https://www.haomeiwen.com/subject/jhnkpdtx.html