美文网首页
利用华为云OCR批量处理数据

利用华为云OCR批量处理数据

作者: yousa_ | 来源:发表于2019-09-26 13:42 被阅读0次
# -*- coding:utf-8 -*-
# Copyright 2018 Huawei Technologies Co.,Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use
# this file except in compliance with the License.  You may obtain a copy of the
# License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations under the License.

from cloud.HWOcrClientAKSK import HWOcrClientAKSK
from cloud.HWOcrClientToken import HWOcrClientToken
import os

def getOCR(pic_dir, pic_content_dir, number_of_pics, start_num=1353):
    '''
    Send a numbered series of JPEG pictures to the general-text OCR endpoint
    and store the recognized text.

    Pictures are looked up as ``<num>.jpg`` inside ``pic_dir``. For every index
    from ``start_num`` up to (but not including) ``number_of_pics`` the ``words``
    entries of the response are concatenated and appended as one tab-separated
    ``<num> <text>`` line to ``content.txt`` in ``pic_content_dir``. When anything
    fails for a picture, the error is printed, appended to ``false_content.txt``
    in the same directory, and processing continues with the next picture.

    :param pic_dir: directory that holds the pictures
    :param pic_content_dir: directory that receives content.txt and
        false_content.txt (files are opened in append mode)
    :param number_of_pics: exclusive upper bound of the picture index
    :param start_num: first picture index to process; the default (1353) keeps
        the resume point that used to be hard-coded
    :return: None
    '''
    import json  # local import: only needed to decode the service response

    AK = "此处隐藏"  # AK from authentication
    SK = "此处隐藏"  # SK from authentication
    endpoint = "ocr.cn-north-4.myhuaweicloud.com"  # http endpoint information
    # initialize ocr_client from ak, sk and endpoint information
    ocr_client = HWOcrClientAKSK(AK, SK, endpoint)
    option = {}

    content_path = os.path.join(pic_content_dir, 'content.txt')
    failure_path = os.path.join(pic_content_dir, 'false_content.txt')

    # A for-loop advances the index exactly once per picture, whether the
    # iteration succeeds or fails.
    for num in range(start_num, number_of_pics):
        try:
            # call OCR interface to recognize picture
            response = ocr_client.request_ocr_service_base64(
                "/v1.0/ocr/general-text",
                os.path.join(pic_dir, str(num) + '.jpg'),
                option)
            # NOTE(review): this used to be eval(response.text). The body comes
            # from the network, so it is decoded as JSON instead of being
            # evaluated as Python code (eval also breaks on true/false/null).
            response_dir = json.loads(response.text)
            content = ''.join(
                block['words']
                for block in response_dir['result']['words_block_list'])
            with open(content_path, 'a', encoding='utf-8') as f1:
                f1.write(str(num) + '\t' + content + '\n')
            # Report the picture that was just processed.
            print('generate ' + str(num) + '.jpg')
        except Exception as e:
            # Deliberate best-effort: one bad picture must not stop the batch,
            # so the failure is recorded and the loop moves on.
            print('OOPS~ in ' + str(num) + 'location.')
            print(e)
            with open(failure_path, 'a', encoding='utf-8') as f1:
                f1.write(str(num) + '\t' + str(e) + '\n')

if __name__ == '__main__':
    # AK SK demo code: the picture and output folders are resolved relative to
    # the parent of the current working directory.
    pwd = os.getcwd()
    parent_dir = os.path.abspath(os.path.dirname(pwd) + os.path.sep + ".")
    number_of_pics = 4122
    pic_dir = parent_dir + '/精/pic/'
    pic_content_dir = parent_dir + '/精/pic_content/'
    getOCR(pic_dir, pic_content_dir, number_of_pics)




相关文章

网友评论

      本文标题:利用华为云OCR批量处理数据

      本文链接:https://www.haomeiwen.com/subject/pzlpyctx.html