美文网首页
利用华为云OCR批量处理数据

利用华为云OCR批量处理数据

作者: yousa_ | 来源:发表于2019-09-26 13:42 被阅读0次
    # -*- coding:utf-8 -*-
    # Copyright 2018 Huawei Technologies Co.,Ltd.
    #
    # Licensed under the Apache License, Version 2.0 (the "License"); you may not use
    # this file except in compliance with the License.  You may obtain a copy of the
    # License at
    #
    #     http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software distributed
    # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
    # CONDITIONS OF ANY KIND, either express or implied.  See the License for the
    # specific language governing permissions and limitations under the License.
    
    from cloud.HWOcrClientAKSK import HWOcrClientAKSK
    from cloud.HWOcrClientToken import HWOcrClientToken
    import os
    
    def getOCR(pic_dir, pic_content_dir, number_of_pics, start=1353):
        '''
        Run general-text OCR over numbered JPEG files and log the recognized text.

        Pictures are read from ``<pic_dir>/<num>.jpg`` for every ``num`` in
        ``[start, number_of_pics)``. For each picture the ``words`` of every
        entry in ``result.words_block_list`` of the response are concatenated
        and appended as ``<num>\\t<text>`` to ``<pic_content_dir>/content.txt``.
        Any failure for a picture is printed and appended as ``<num>\\t<error>``
        to ``<pic_content_dir>/false_content.txt``; processing then continues
        with the next picture.

        :param pic_dir: directory that holds the numbered ``.jpg`` pictures
        :param pic_content_dir: directory that receives ``content.txt`` and
            ``false_content.txt``
        :param number_of_pics: exclusive upper bound of the picture numbers
        :param start: first picture number to process (resume point); the
            default keeps the value that used to be hard-coded
        :return: None
        '''
        # Imported locally so the function stays self-contained.
        import json

        AK="此处隐藏"  # AK from authentication
        SK="此处隐藏"   # SK from authentication
        endpoint = "ocr.cn-north-4.myhuaweicloud.com"  # http endpoint information
        # Initialize ocr_client from ak, sk and endpoint information.
        ocr_client = HWOcrClientAKSK(AK, SK, endpoint)
        option = {}

        # A for-loop advances exactly once per picture on both the success and
        # the failure path, so no picture can be skipped or repeated.
        for num in range(start, number_of_pics):
            try:
                # Call the OCR interface to recognize the picture.
                response = ocr_client.request_ocr_service_base64(
                    "/v1.0/ocr/general-text",
                    (pic_dir + '/' + str(num) + '.jpg'), option)
                # The body is parsed as JSON instead of being eval()'d: eval
                # executes whatever the remote side returns and also fails on
                # the JSON literals true/false/null.
                # NOTE(review): assumes response.text is a JSON document.
                response_dir = json.loads(response.text)
                content = ''.join(
                    block['words']
                    for block in response_dir['result']['words_block_list'])
                with open(pic_content_dir + '/content.txt', 'a', encoding='utf-8') as f1:
                    f1.write(str(num) + '\t' + content + '\n')
                # Report the picture that was just processed, not the next one.
                print('generate ' + str(num) + '.jpg')
            except Exception as e:
                # Best-effort batch job: record the failure and keep going.
                print('OOPS~ in ' + str(num) + 'location.')
                print(e)
                with open(pic_content_dir + '/false_content.txt', 'a', encoding='utf-8') as f1:
                    f1.write(str(num) + '\t' + str(e) + '\n')
    
    if __name__ == '__main__':
        # Script entry point for the AK/SK based OCR batch run.
        # Both data directories are resolved relative to the parent of the
        # current working directory.
        parent_dir = os.path.abspath(
            os.path.dirname(os.getcwd()) + os.path.sep + "."
        )
        picture_root = parent_dir + '/精/pic/'
        content_root = parent_dir + '/精/pic_content/'
        # Exclusive upper bound of the picture numbers handed to getOCR.
        total_pictures = 4122
        getOCR(picture_root, content_root, total_pictures)
    
    
    
    
    

    相关文章

      网友评论

          本文标题:利用华为云OCR批量处理数据

          本文链接:https://www.haomeiwen.com/subject/pzlpyctx.html