美文网首页Swartz动物园
Abbyy 入门(一)

Abbyy 入门(一)

作者: dzjx | 来源:发表于2017-02-05 16:37 被阅读539次

Abbyy 图片(PDF) 转word 实践

1. 简介

Abbyy 是一个俄罗斯软件,官网:https://www.abbyy.com (中文: https://www.abbyy.com )。作为OCR软件,它拥有自己的客户端,同时提供SDK供开发使用,功能强大。这里是一个简单的OCR图片文字识别Demo。

2. 干货

1. 安装

(1)从http://www.abbyychina.com/xiazai.html 下载ABBYY FineReader。

(2)安装该引擎。

(3)激活序列号:在License Manager中激活你的序列号(该序列号会出现在后文的代码中)。如图:

激活.png

到此为止准备工作就OK了。

2.实践(单个转换)

执行流程:

流程.png

(1)自定义一个配置类,用于设置文件转换路径、用户序列号等

    /// <summary>
    /// Hard-coded settings for the demo: the developer serial number, the folder
    /// that contains the engine DLLs, and the working folder for input/output files.
    /// </summary>
    public class FreConfig
    {
        // Size in bytes of a native pointer inside a 64-bit process.
        private const int PointerBytesOn64Bit = 8;

        // Engine binaries folder returned when the process is 64-bit.
        private const string Dll64Folder = "C:\\Program Files\\ABBYY SDK\\11\\FineReader Engine\\Bin64";

        // Placeholder text returned for 32-bit processes; it is not a usable path.
        private const string Dll32Folder = "32位dll地址";

        /// <summary>
        /// True when the current process uses 8-byte native pointers.
        /// </summary>
        private static bool Is64Bit
        {
            get
            {
                return IntPtr.Size == PointerBytesOn64Bit;
            }
        }

        /// <summary>
        /// Developer serial number passed to the engine loader.
        /// </summary>
        public static string DeveloperSN
        {
            get
            {
                return "xxxxxxxxx";
            }
        }

        /// <summary>
        /// Folder that holds the engine DLLs, chosen by the bitness of the process.
        /// </summary>
        public static string DllFolder
        {
            get
            {
                return Is64Bit ? Dll64Folder : Dll32Folder;
            }
        }

        /// <summary>
        /// Folder where the demo reads its source files and writes its results.
        /// </summary>
        public static string FileFolder
        {
            get
            {
                return @"C:\Users\帝子降兮\Desktop\AbbyyImage";
            }
        }
    }

==注意: 1. DeveloperSN 就是之前在License Manager中激活的序列号 2. 这里以64位机器为例进行实践(32位机器需自行填写dll路径)==

(2) 加载引擎

 /// <summary>
    /// Loads FREngine.dll from <c>FreConfig.DllFolder</c>, obtains the engine object
    /// through the DLL's exported <c>GetEngineObject</c> function, and deinitializes
    /// and unloads the DLL again on <see cref="Dispose"/>.
    /// </summary>
    public class EngineLoader : IDisposable
    {
        // Unicode entry point so that install paths containing non-ASCII characters
        // are passed to the loader unchanged.
        [DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
        private static extern IntPtr LoadLibraryEx(string dllToLoad, IntPtr reserved, uint flags);
        private const uint LOAD_WITH_ALTERED_SEARCH_PATH = 0x00000008;
        // Export names are passed with the default (ANSI) string marshaling.
        [DllImport("kernel32.dll")]
        private static extern IntPtr GetProcAddress(IntPtr hModule, string procedureName);
        [DllImport("kernel32.dll")]
        private static extern bool FreeLibrary(IntPtr hModule);


        // Managed signatures of the three functions resolved from FREngine.dll.
        [UnmanagedFunctionPointer(CallingConvention.StdCall, CharSet = CharSet.Unicode)]
        private delegate int GetEngineObject(string devSN, ref IEngine engine);
        [UnmanagedFunctionPointer(CallingConvention.StdCall)]
        private delegate int DeinitializeEngine();
        [UnmanagedFunctionPointer(CallingConvention.StdCall)]
        private delegate int DllCanUnloadNow();

        private IEngine engine = null;
        private IntPtr dllHandle = IntPtr.Zero;
        private GetEngineObject getEngineObject = null;
        private DeinitializeEngine deinitializeEngine = null;
        private DllCanUnloadNow dllCanUnloadNow = null;

        /// <summary>
        /// The loaded engine object; null after <see cref="Dispose"/> has run.
        /// </summary>
        public IEngine Engine
        {
            get
            {
                return engine;
            }
        }

        /// <summary>
        /// Loads the engine DLL and requests the engine object.
        /// </summary>
        /// <param name="developerSN">Developer serial number handed to GetEngineObject.</param>
        /// <exception cref="DllNotFoundException">FREngine.dll could not be loaded.</exception>
        /// <exception cref="EntryPointNotFoundException">A required export is missing from the DLL.</exception>
        /// <remarks>
        /// A failing HRESULT from GetEngineObject is rethrown via
        /// <c>Marshal.ThrowExceptionForHR</c>. On any failure the partially loaded
        /// state is released before the exception propagates, so a constructed
        /// instance always has a usable engine.
        /// </remarks>
        public EngineLoader(string developerSN)
        {
            string enginePath = Path.Combine(FreConfig.DllFolder, "FREngine.dll");

            try
            {
                dllHandle = LoadLibraryEx(enginePath, IntPtr.Zero, LOAD_WITH_ALTERED_SEARCH_PATH);
                if (dllHandle == IntPtr.Zero)
                {
                    throw new DllNotFoundException("无法加载" + enginePath);
                }

                // Resolve each export and wrap it in its delegate type.
                getEngineObject = (GetEngineObject)ResolveExport("GetEngineObject", typeof(GetEngineObject));
                deinitializeEngine = (DeinitializeEngine)ResolveExport("DeinitializeEngine", typeof(DeinitializeEngine));
                dllCanUnloadNow = (DllCanUnloadNow)ResolveExport("DllCanUnloadNow", typeof(DllCanUnloadNow));

                // Ask the DLL for the engine object.
                int hresult = getEngineObject(developerSN, ref engine);
                Marshal.ThrowExceptionForHR(hresult);
            }
            catch
            {
                engine = null;
                // Presumably forces managed wrappers around engine objects to be
                // finalized before the DLL is unloaded — TODO confirm.
                GC.Collect();
                GC.WaitForPendingFinalizers();
                GC.Collect();
                if (dllHandle != IntPtr.Zero)
                {
                    FreeLibrary(dllHandle);
                }
                ResetNativeState();

                // Surface the failure instead of handing back a loader whose Engine is null.
                throw;
            }
        }

        /// <summary>
        /// Deinitializes the engine and unloads the DLL. Does nothing when no engine is loaded.
        /// </summary>
        /// <remarks>
        /// Failing HRESULTs from DeinitializeEngine or DllCanUnloadNow are thrown
        /// only after all fields have been reset.
        /// </remarks>
        public void Dispose()
        {
            if (engine == null)
            {
                return;
            }
            engine = null;
            int deinitializeResult = deinitializeEngine();

            // Presumably forces managed wrappers around engine objects to be
            // finalized before the DLL is unloaded — TODO confirm.
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();

            // A zero result is treated as permission to unload the DLL.
            int canUnloadResult = dllCanUnloadNow();
            if (canUnloadResult == 0)
            {
                FreeLibrary(dllHandle);
            }
            ResetNativeState();

            // Report failures only after cleanup; the deinitialize result used to be discarded.
            Marshal.ThrowExceptionForHR(deinitializeResult);
            Marshal.ThrowExceptionForHR(canUnloadResult);
        }

        // Looks up an export of the loaded DLL and converts it to the given delegate type.
        private Delegate ResolveExport(string exportName, Type delegateType)
        {
            IntPtr address = GetProcAddress(dllHandle, exportName);
            if (address == IntPtr.Zero)
            {
                throw new EntryPointNotFoundException("无法找到 " + exportName + " 函数");
            }
            return Marshal.GetDelegateForFunctionPointer(address, delegateType);
        }

        // Clears the module handle and the delegates bound to it.
        private void ResetNativeState()
        {
            dllHandle = IntPtr.Zero;
            getEngineObject = null;
            deinitializeEngine = null;
            dllCanUnloadNow = null;
        }

    }

(3) 执行逻辑

 /// <summary>
        /// Converts one image from <c>FreConfig.FileFolder</c> to Demo.docx in the same folder.
        /// Failures are written to standard error; the method itself does not throw.
        /// </summary>
        public void Start()
        {
            try
            {
                using (EngineLoader loade = new EngineLoader(FreConfig.DeveloperSN))
                {
                    var engine = loade.Engine;
                    if (engine == null)
                    {
                        // Fail with a clear message instead of a NullReferenceException below.
                        throw new InvalidOperationException("引擎加载失败");
                    }

                    // Load a predefined profile; other profiles are listed in the SDK help file.
                    engine.LoadPredefinedProfile("DocumentConversion_Accuracy");

                    string imagePath = Path.Combine(FreConfig.FileFolder, @"20161121103603.JPG");

                    // Language of the engine's own messages.
                    engine.MessagesLanguage = MessagesLanguageEnum.ML_ChinesePRC;

                    FRDocument document = engine.CreateFRDocument();
                    try
                    {
                        // Treat the input image as a photograph.
                        var prepareMode = engine.CreatePrepareImageMode();
                        prepareMode.PhotoProcessingMode = PhotoProcessingModeEnum.PPM_TreatAsPhoto;

                        // Processing parameters for the document.
                        DocumentProcessingParams processingParams = engine.CreateDocumentProcessingParams();

                        // Recognition languages: mixed Simplified Chinese and English.
                        RecognizerParams recognizerParams = processingParams.PageProcessingParams.RecognizerParams;
                        recognizerParams.SetPredefinedTextLanguage("ChinesePRC,English");

                        // Add the image file to the document.
                        document.AddImageFile(imagePath, prepareMode, null);

                        // Run processing with the parameters above.
                        document.Process(processingParams);

                        // Export in the requested format.
                        document.Export(Path.Combine(FreConfig.FileFolder, @"Demo.docx"), FileExportFormatEnum.FEF_DOCX, null);
                    }
                    finally
                    {
                        // Always close the document; errors propagate to the outer handler.
                        document.Close();
                    }
                }

            }
            catch (Exception ex)
            {
                // Previously swallowed silently; report the failure so it can be diagnosed.
                Console.Error.WriteLine("图片转换失败: " + ex);
            }
        }

直接调用Start() 方法,一个简单的示例就完成了。
注意: 在测试的时候一定要注意SetPredefinedTextLanguage,默认是英文,如果你的图片或pdf里面是中文的话,==会出现乱码==,同时abbyy 支持设置导出后文档的格式、识别图片文字时旋转,背景色控制等等功能,详见帮助文档

3. 实践(批量转换)

执行流程与单个转换的执行流程大致相似,只是我们创建的不是FRDocument 而是BatchProcessor,同时批量执行时需要实现IImageSource接口(以及配套的IFileAdapter接口),完成图片(PDF)文件的遍历。

/// <summary>
/// Minimal <c>IFileAdapter</c> implementation that describes one source file by its name.
/// </summary>
public class FileAdapterImpl : IFileAdapter
{
    // File name supplied at construction and returned unchanged by GetFileName.
    private readonly string sourceFile;

    /// <summary>
    /// Creates an adapter for the given file.
    /// </summary>
    /// <param name="fileName">Name of the file this adapter represents.</param>
    public FileAdapterImpl(string fileName)
    {
        sourceFile = fileName;
    }

    /// <summary>
    /// Returns the file name passed to the constructor.
    /// </summary>
    public string GetFileName()
    {
        return sourceFile;
    }

    /// <summary>
    /// Always returns null.
    /// NOTE(review): presumably null means "process every page" — confirm against the SDK help.
    /// </summary>
    public IntsCollection GetPagesToProcess()
    {
        return null;
    }

    /// <summary>
    /// Always returns an empty password.
    /// </summary>
    public string GetPassword()
    {
        return string.Empty;
    }
}

    /// <summary>
    /// <c>IImageSource</c> implementation that queues every file in a folder whose
    /// extension is in the supported list, and hands them out one at a time.
    /// </summary>
    public class ImageSourceImpl : IImageSource
    {
        // Accepted extensions, matched as whole values and case-insensitively.
        // The previous substring test against a "bmp|dcx|..." mask also accepted
        // fragments such as ".p" or ".if", and threw on files with no extension.
        private static readonly HashSet<string> SupportedExtensions =
            new HashSet<string>(StringComparer.OrdinalIgnoreCase)
            {
                ".bmp", ".dcx", ".pcx", ".png", ".jpg", ".jpeg", ".jp2", ".jpc",
                ".jfif", ".pdf", ".tif", ".tiff", ".gif", ".djvu", ".djv", ".jb2"
            };

        // Files still waiting to be handed out, in the order Directory.GetFiles returned them.
        private readonly Queue<string> imagesNames = new Queue<string>();

        /// <summary>
        /// Scans <paramref name="sourceDir"/> (top level only) and queues the supported files.
        /// </summary>
        /// <param name="sourceDir">Folder to scan for image and PDF files.</param>
        public ImageSourceImpl(string sourceDir)
        {
            foreach (string fileName in Directory.GetFiles(sourceDir, "*.*"))
            {
                // GetExtension yields "" for names without an extension, which is simply not in the set.
                if (SupportedExtensions.Contains(Path.GetExtension(fileName)))
                {
                    imagesNames.Enqueue(fileName);
                }
            }
        }

        /// <summary>
        /// Removes the next queued file and returns an adapter for it.
        /// </summary>
        /// <returns>An adapter for the next file, or null when the queue is empty.</returns>
        public IFileAdapter GetNextImageFile()
        {
            if (imagesNames.Count == 0)
            {
                return null;
            }
            return new FileAdapterImpl(imagesNames.Dequeue());
        }

        /// <summary>
        /// Reports whether no files remain in the queue.
        /// </summary>
        public bool IsEmpty()
        {
            return imagesNames.Count == 0;
        }
    }

执行逻辑:

 /// <summary>
        /// Converts every supported file in <c>sourceFolder</c> to a .docx file in
        /// <c>resultFolder</c>, creating the result folder when it does not exist.
        /// Failures are written to standard error; the method itself does not throw.
        /// </summary>
        public void Start()
        {
            try
            {
                using (EngineLoader loade = new EngineLoader(FreConfig.DeveloperSN))
                {
                    var engine = loade.Engine;
                    if (engine == null)
                    {
                        // Fail with a clear message instead of a NullReferenceException below.
                        throw new InvalidOperationException("引擎加载失败");
                    }

                    engine.LoadPredefinedProfile("DocumentConversion_Accuracy");

                    if (!Directory.Exists(sourceFolder))
                    {
                        throw new DirectoryNotFoundException(sourceFolder + "不存在");
                    }

                    if (!Directory.Exists(resultFolder))
                    {
                        DirectoryInfo newDir = Directory.CreateDirectory(resultFolder);
                        if (!newDir.Exists)
                        {
                            throw new IOException("无法创建" + resultFolder);
                        }
                    }

                    ImageSourceImpl imageSource = new ImageSourceImpl(sourceFolder);
                    if (imageSource.IsEmpty())
                    {
                        throw new FileNotFoundException("转换文件夹中没有文件");
                    }

                    // Create the batch processor.
                    BatchProcessor batchProcessor = engine.CreateBatchProcessor();

                    // Recognition languages: mixed Simplified Chinese and English.
                    PageProcessingParams processingParams = engine.CreatePageProcessingParams();
                    processingParams.RecognizerParams.SetPredefinedTextLanguage("ChinesePRC,English");

                    // Start processing, then pull pages until the processor returns null.
                    batchProcessor.Start(imageSource, null, null, processingParams, null);
                    for (FRPage page = batchProcessor.GetNextProcessedPage();
                         page != null;
                         page = batchProcessor.GetNextProcessedPage())
                    {
                        // Run synthesis on the page before exporting it.
                        page.Synthesize(null);

                        // Output name is the source file name with ".docx" appended.
                        // NOTE(review): pages that share a source file map to the same
                        // output path, so a multi-page input presumably overwrites its
                        // earlier pages — confirm and add a page suffix if so.
                        string resultFilePath = Path.Combine(resultFolder, Path.GetFileName(page.SourceImagePath) + ".docx");
                        page.Export(resultFilePath, FileExportFormatEnum.FEF_DOCX, null);
                    }
                }

            }
            catch (Exception ex)
            {
                // Previously swallowed silently; report the failure so it can be diagnosed.
                Console.Error.WriteLine("批量转换失败: " + ex);
            }
        }

一个简单的demo就完成了,更多高级的功能可以查看帮助文档(其他地方貌似也查不到。。。)

帮助文档路径:C:\Program Files\ABBYY SDK\11\FineReader Engine\Help\FREngine11.chm,我是默认装在C盘的,你可以根据自己的安装路径查找。

相关文章

网友评论

  • 三月白与:您好!请问怎样可以用java实现中文解析,我现在可以解析英文,但是解析中文乱码,请指教!谢谢!
    dzjx:@三月白与 不好意思、才看到你的回复,这篇文章是很久之前写的一个小demo、用的是.net

本文标题:Abbyy 入门(一)

本文链接:https://www.haomeiwen.com/subject/teyeittx.html