美文网首页
OCR 竟然还要收费?

OCR 竟然还要收费?

作者: 老瓦在霸都 | 来源:发表于2023-06-02 08:58 被阅读0次

    我曾经是 evernote(印象笔记)的重度用户,现在用得少了,主要原因是我建了自己的网站,有了自己的 wordpress, 还有自己用 moinmoin 搭的 wiki, 没有什么需求非要用印象笔记,尤其它还要收不少钱,连 OCR 都得收钱,还有次数限制,对一个程序员来说,这钱花得不值。

    当然,我常常需要把图片上的文本转成文字,例如在网上浏览 ppt, pdf, png 时,发现有不错的内容,我会写个小程序来提取图片中的文本, 特别是其中的超文本链接。

    1. 先安装相关的依赖
    sudo apt install tesseract-ocr
    sudo apt install libtesseract-dev
    pip install pytesseract
    
    2. 相关的 Python 代码很简单
    #!/usr/bin/env python3
    
    from PIL import Image
    import pytesseract
    import cv2
    import re
    import os
    import sys
    
    # Case-insensitive regex for URLs embedded in free text. A match starts with
    # "http://" / "https://", a "www." style prefix (optionally "www" plus up to
    # three digits), or a "domain.tld/" form, may contain parenthesized parts, and
    # must not end in whitespace or common trailing punctuation.
    # The pattern has five capturing groups: group 1 spans the whole URL, while
    # groups 2-5 sit inside the parenthesis-handling alternatives and capture
    # only fragments of it.
    URL_PATTERN = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    
    def extract_text(image_file):
        """Run Tesseract OCR on an image file and return the recognized text.

        The image is loaded with OpenCV, converted to grayscale and passed to
        pytesseract as an in-memory PIL image. No intermediate file is written,
        so the function also works when the image lives in a read-only
        directory, cannot leave a stray file behind when OCR fails, and cannot
        clash with another call processing the same image in the same process.

        Args:
            image_file: Path of the image to read.

        Returns:
            The string returned by ``pytesseract.image_to_string``.

        Raises:
            ValueError: If OpenCV cannot load ``image_file``.
        """
        img = cv2.imread(image_file)
        if img is None:
            # cv2.imread reports a missing or undecodable file by returning None
            # instead of raising; fail here with a message naming the file.
            raise ValueError("cannot read image file: {}".format(image_file))

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        print("begin to extract urls from {}".format(image_file))
        # Wrap the grayscale array as a PIL image so OCR runs on the in-memory data.
        return pytesseract.image_to_string(Image.fromarray(gray))
    
    def extract_urls(image_file):
        """Return the URLs found in the text recognized from an image.

        Each URL is also printed to stdout, one per line.

        Args:
            image_file: Path of the image to run OCR on.

        Returns:
            A list of URL strings in the order they appear in the text.
        """
        text = extract_text(image_file)

        links = []
        for groups in re.findall(URL_PATTERN, text):
            # URL_PATTERN has several capturing groups, so findall yields one
            # tuple per match. Only the first group is the complete URL; the
            # remaining groups hold parenthesized fragments and would corrupt
            # the result if concatenated onto it.
            link = groups[0]
            print(link)
            links.append(link)
        return links
    
    
    if __name__ == '__main__':
        # Take the image path from the first command-line argument when one is
        # given; otherwise fall back to "test.png".
        image_file = sys.argv[1] if len(sys.argv) > 1 else "test.png"

        extract_urls(image_file)
    
    
    3. Web 版本的 OCR

    基于这个小程序,我又做了一些扩展,通过 Web 页面来抽取文本和超链接

    • views.py
    from flask import render_template, redirect, url_for, flash, request, current_app
    from portal.test import test_module
    from portal.test.forms import OcrForm
    from portal.test.ocr import *
    
    from portal import logger
    from portal import db
    import os
    
    
    # Directory that contains this module (symlinks resolved by realpath).
    dir_path = os.path.dirname(os.path.realpath(__file__))

    # Lower-case file extensions that allowed_file() accepts.
    ALLOWED_EXTENSIONS = {'gif', 'jpg', 'png', 'bmp'}


    def allowed_file(filename):
        """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

        The extension is whatever follows the last dot and is compared
        case-insensitively; a name without any dot is rejected.
        """
        _, dot, extension = filename.rpartition('.')
        return dot == '.' and extension.lower() in ALLOWED_EXTENSIONS
    
    
    @test_module.route('/test', methods=['GET', 'POST'])
    def index():
        """Render the OCR test page, running the selected OCR command on a valid submit.

        On a valid submission the bundled reference image is processed: command 1
        extracts URLs (shown in the output field, one per line, and passed to the
        template as ``links``), command 2 extracts the plain text. Otherwise the
        page is rendered with an empty ``links`` list.
        """
        form = OcrForm()
        found_links = []

        if not form.validate_on_submit():
            # Only a form that was actually submitted can carry validation errors
            # worth logging.
            if form.is_submitted():
                logger.error(form.errors)
            return render_template('test.html', form=form, links=found_links)

        command = form.ocr_command.data
        logger.info("submit: {}".format(command))

        # OCR always runs on the reference image under the static directory.
        image_file = dir_path + "/../static/image/reference.png"
        if command == 1:
            found_links = extract_urls(image_file)
            form.output_content.data = "\n".join(found_links)
        elif command == 2:
            form.output_content.data = extract_text(image_file)

        return render_template('test.html', form=form, links=found_links)
    
    
    
    • forms.py
    from flask_wtf import FlaskForm
    from flask_wtf.file import FileAllowed
    from wtforms import StringField, SubmitField, TextAreaField, PasswordField
    from wtforms import BooleanField, SelectField, FileField
    from wtforms import HiddenField
    from wtforms.validators import DataRequired, Length, Optional
    import datetime
    
    
    
    class OcrForm(FlaskForm):
        """Form for the OCR page: a command selector, text areas and a submit button."""

        # Optional multi-line text area labelled 'input'.
        input_content = TextAreaField(
            'input',
            validators=[Optional()],
            render_kw={"class": "form-control", "rows": 5},
        )

        # Optional multi-line text area labelled 'output'.
        output_content = TextAreaField(
            'output',
            validators=[Optional()],
            render_kw={"class": "form-control", "rows": 5, "cols": 60},
        )

        # Command selector; the submitted value is coerced to int (1 or 2).
        ocr_command = SelectField(
            'test_command',
            choices=[(1, 'Extract URLs'), (2, 'Extract Text')],
            render_kw={"class": "form-control"},
            coerce=int,
        )

        # Optional single-line parameter field.
        ocr_params = StringField(
            'test_params',
            validators=[Optional()],
            render_kw={"class": "form-control"},
        )

        submit_button = SubmitField(
            'Submit',
            render_kw={"class": "btn btn-primary"},
        )
    
    • ocr.py
    #!/usr/bin/env python3
    
    from PIL import Image
    import pytesseract
    import cv2
    import re
    import os
    import sys
    
    
    
    # Case-insensitive regex for URLs embedded in free text. A match starts with
    # "http://" / "https://", a "www." style prefix (optionally "www" plus up to
    # three digits), or a "domain.tld/" form, may contain parenthesized parts, and
    # must not end in whitespace or common trailing punctuation.
    # The pattern has five capturing groups: group 1 spans the whole URL, while
    # groups 2-5 sit inside the parenthesis-handling alternatives and capture
    # only fragments of it.
    URL_PATTERN = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    
    
    def extract_text(image_file):
        """Run Tesseract OCR on an image file and return the recognized text.

        The image is loaded with OpenCV, converted to grayscale and passed to
        pytesseract as an in-memory PIL image. No intermediate file is written,
        so the function also works when the image lives in a read-only
        directory, cannot leave a stray file behind when OCR fails, and cannot
        clash with another call processing the same image in the same process.

        Args:
            image_file: Path of the image to read.

        Returns:
            The string returned by ``pytesseract.image_to_string``.

        Raises:
            ValueError: If OpenCV cannot load ``image_file``.
        """
        img = cv2.imread(image_file)
        if img is None:
            # cv2.imread reports a missing or undecodable file by returning None
            # instead of raising; fail here with a message naming the file.
            raise ValueError("cannot read image file: {}".format(image_file))

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        print("begin to extract urls from {}".format(image_file))
        # Wrap the grayscale array as a PIL image so OCR runs on the in-memory data.
        return pytesseract.image_to_string(Image.fromarray(gray))
    
    def extract_urls(image_file):
        """Return the URLs found in the text recognized from an image.

        Each URL is also printed to stdout, one per line.

        Args:
            image_file: Path of the image to run OCR on.

        Returns:
            A list of URL strings in the order they appear in the text.
        """
        text = extract_text(image_file)

        links = []
        for groups in re.findall(URL_PATTERN, text):
            # URL_PATTERN has several capturing groups, so findall yields one
            # tuple per match. Only the first group is the complete URL; the
            # remaining groups hold parenthesized fragments and would corrupt
            # the result if concatenated onto it.
            link = groups[0]
            print(link)
            links.append(link)
        return links
    
    
    if __name__ == '__main__':
        # Take the image path from the first command-line argument when one is
        # given; otherwise fall back to "test.png".
        image_file = sys.argv[1] if len(sys.argv) > 1 else "test.png"

        extract_urls(image_file)
    

    完整代码参见 https://github.com/walterfan/webocr/tree/master

    相关文章

      网友评论

          本文标题:OCR 竟然还要收费?

          本文链接:https://www.haomeiwen.com/subject/rfzzqdtx.html