美文网首页读书
ocr识别,纯文本阅读

ocr识别,纯文本阅读

作者: 2016晓 | 来源:发表于2023-11-06 10:42 被阅读0次

有些读者阅读书籍时,需要将扫描版的书籍转换为纯文本,以便在阅读器上获得更好的显示效果。
大部分免费的识别工具,识别质量并不高;真正专业的识别技术又常常作为商业机密,不向大众提供。这里推荐一个在线体验页面:https://duguang.aliyun.com/experience?type=universal&subtype=general#intro
该页面可以免费体验,但如果想识别整本书,还需要编写程序。以下是作者自己的思路:
1、使用抓包软件,将页面的main.js文件请求、识别提交请求转发到自行开发的本地web服务

image.png
image.png

2、编写代码,将main.js文件中的示例图片url,在每次请求时,依次替换为待识别的图片url
3、获取识别请求的返回内容,解析、打印内容

package main

import (
    "bytes"
    "compress/gzip"
    "encoding/json"
    "fmt"
    "io/ioutil"
    "log"
    "net/http"
    "net/http/httputil"
    "os"
    "os/signal"
    "strconv"
    "sync"
    "syscall"

    "gitlab.p1staff.com/common/slf"

    "github.com/gin-gonic/gin"
    "gitlab.p1staff.com/backend/tantan-backend-common/http/server"
    "gitlab.p1staff.com/intl/intl-live-campaign/app/config"
    "gitlab.p1staff.com/intl/intl-live-campaign/app/util"
)

// wg is the gate main waits on (wg.Wait) before stopping the HTTP server;
// handleSignal releases it with wg.Done.
var wg sync.WaitGroup

// signals carries the OS signals registered by handleSignal. It is buffered
// with room for one pending signal.
var signals = make(chan os.Signal, 1)

// main builds an HTTP server listening on :26668 that uses myRouter for its
// routes, starts it, hands control to handleSignal, and stops the server once
// wg has been released.
func main() {
    cfg := server.HttpConfig{
        ServiceName: config.ServiceNameIntlLiveRestapi,
        Listen:      ":26668",
        Router:      &myRouter{},
    }

    srv, err := server.NewHttpServer(cfg)
    if err != nil {
        log.Fatalf("error:%+v", err)
    }

    // One outstanding unit on wg; handleSignal is the only place that calls Done.
    wg.Add(1)

    // NOTE(review): Start is presumably non-blocking, otherwise handleSignal
    // would never run — confirm against the server package.
    util.Silently(srv.Start())

    handleSignal()
    wg.Wait()
    util.Silently(srv.Stop())
}

// handleSignal registers for SIGPIPE, SIGINT, SIGTERM, SIGQUIT and SIGABRT and
// blocks until one of them asks the process to shut down.
//
//   - SIGPIPE is logged and otherwise ignored; the function keeps waiting.
//   - SIGINT logs a message and exits the process immediately with status 1.
//   - Any other registered signal releases wg and returns, so that the caller
//     can continue with its shutdown sequence.
//
// Returning after wg.Done matters: without it the loop would keep running, the
// caller's wg.Wait would never be reached, and a second termination signal
// would call wg.Done again and panic on a negative WaitGroup counter.
func handleSignal() {
    signal.Notify(signals, syscall.SIGPIPE, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGABRT)

    // signals is never closed, so this loop only ends through return or os.Exit.
    for sig := range signals {
        slf.Infof("received signal: %s", sig)
        switch sig {
        case syscall.SIGPIPE:
            // Deliberately ignored; keep waiting for the next signal.
        case syscall.SIGINT:
            slf.Infoln("Failure exit for systemd restarting")
            os.Exit(1)
        default:
            wg.Done()
            return
        }
    }
}

// myRouter is the stateless router handed to the HTTP server; its Route method
// registers the proxy endpoints.
type myRouter struct{}

// Route registers the two endpoints that are redirected to this server:
//
//   - POST /ocrdemo/ocrDemoSecondService.json is proxied to duguang.aliyun.com
//     and its response is passed through rewriteBody.
//   - GET /sail-web/duguang-public/1.3.4/js/main.js is proxied to g.alicdn.com
//     and its response is passed through rewriteBody2.
func (r *myRouter) Route(g *gin.Engine) {
    // forward returns a handler that relays the incoming request to host over
    // HTTPS and runs modify on the upstream response. A fresh ReverseProxy is
    // built for every request.
    forward := func(host string, modify func(*http.Response) error) func(*gin.Context) {
        return func(c *gin.Context) {
            proxy := &httputil.ReverseProxy{
                Director: func(req *http.Request) {
                    req.URL.Scheme = "https"
                    req.URL.Host = host
                    req.Host = host
                },
                ModifyResponse: modify,
            }
            proxy.ServeHTTP(c.Writer, c.Request)
        }
    }

    g.POST("/ocrdemo/ocrDemoSecondService.json", forward("duguang.aliyun.com", rewriteBody))
    g.GET("/sail-web/duguang-public/1.3.4/js/main.js", forward("g.alicdn.com", rewriteBody2))
}

//https://img.alicdn.com/tfs/TB1GHXrXzDpK1RjSZFrXXa78VXa-912-1128.jpg
var (
    // idx is the index into pics of the page currently being processed.
    // rewriteBody2 reads it to pick the image to inject, and rewriteBody
    // advances it after each recognition response.
    idx = 0

    // pics holds the file names of the page images to recognise, in order:
    // "image-00001.jpg" through "image-00680.jpg". The names are generated
    // rather than listed by hand; change pageCount to match the number of
    // scanned pages.
    pics = func() []string {
        const pageCount = 680
        names := make([]string, 0, pageCount)
        for n := 1; n <= pageCount; n++ {
            names = append(names, fmt.Sprintf("image-%05d.jpg", n))
        }
        return names
    }()
)

func rewriteBody2(resp *http.Response) (err error) {
    b, err := ioutil.ReadAll(resp.Body) //Read html
    if err != nil {
        return err
    }
    err = resp.Body.Close()
    if err != nil {
        return err
    }
    //body := ioutil.NopCloser(bytes.NewReader(b))
    r, _ := gzip.NewReader(bytes.NewReader(b))
    s, _ := ioutil.ReadAll(r)
    //b = bytes.Replace(b, []byte("server"), []byte("schmerver"), -1) // replace html
    if len(pics) == idx {
        os.Exit(1)
    }
    newPic := "https://xxxx/static/alioper/" + pics[idx]
    s = bytes.Replace(s, []byte("https://img.alicdn.com/tfs/TB1GHXrXzDpK1RjSZFrXXa78VXa-912-1128.jpg"), []byte(newPic), 1) // replace html
    body := ioutil.NopCloser(bytes.NewReader(s))
    resp.Body = body
    resp.ContentLength = int64(len(s))
    resp.Header.Del("content-encoding")
    resp.Header.Set("Content-Length", strconv.Itoa(len(s)))
    return nil
}
func rewriteBody(resp *http.Response) (err error) {

    b, err := ioutil.ReadAll(resp.Body) //Read html
    if err != nil {
        return err
    }
    err = resp.Body.Close()
    if err != nil {
        return err
    }
    //b = bytes.Replace(b, []byte("server"), []byte("schmerver"), -1) // replace html
    body := ioutil.NopCloser(bytes.NewReader(b))
    r, _ := gzip.NewReader(bytes.NewReader(b))
    s, _ := ioutil.ReadAll(r)
    obj := &MyStructVo{}
    util.Silently(json.Unmarshal(s, obj))
    for _, row := range obj.Data.Rows {
        fmt.Println(row.Word)
    }
    idx++
    resp.Body = body
    //resp.ContentLength = int64(len(b))
    //resp.Header.Set("Content-Length", strconv.Itoa(len(b)))
    return nil
}

// JSON model of the OCR response; only the fields that rewriteBody reads are
// declared.
type (
    // MyStructVo is the top-level response object.
    MyStructVo struct {
        Data ResultsVo `json:"data"`
    }

    // ResultsVo is the "data" object; Rows comes from its "prism_rowsInfo" array.
    ResultsVo struct {
        Rows []RowVo `json:"prism_rowsInfo"`
    }

    // RowVo is one element of "prism_rowsInfo"; Word is its "word" string.
    RowVo struct {
        Word string `json:"word"`
    }
)

4、编写chrome插件,用于刷新页面,完成换页

// Content script: reload the page once, 10 seconds after this script runs.
setTimeout(function () {
  location.reload();
}, 10000);
{
  "name": "aliocr",
  "manifest_version": 3,
  "version": "1.0",
  "description": "aliocr",
  "content_scripts": [
    {
      "matches": ["https://duguang.aliyun.com/experience?*"],
      "js": ["aliocr.js"]
    }
  ]
}
image.png

5、关闭浏览器的缓存功能,打开体验页,等待识别完成

相关文章

网友评论

    本文标题:ocr识别,纯文本阅读

    本文链接:https://www.haomeiwen.com/subject/khgtwdtx.html