美文网首页
加拼音(新格式)-代码

加拼音(新格式)-代码

作者: 姜附 | 来源:发表于2020-07-16 09:54 被阅读0次

    Word 文档全篇加拼音(批量注音)+ 批量修改读音

    加拼音(新格式)-代码

    package main
    
    import (
        "archive/zip"
        "runtime"
    
        "bufio"
        "bytes"
        "errors"
        "fmt"
        "io"
        "io/ioutil"
        "os"
        "path"
        _ "regexp"
        "strconv"
        "strings"
        "time"
        "unicode"
    
        "golang.org/x/text/encoding/simplifiedchinese"
        "golang.org/x/text/transform"
    
        // "github.com/aurelien-rainone/assertgo"
        "github.com/etree"
        "github.com/mozillazg/go-pinyin"
    )
    
    // assert panics when cond is false. The panic value is a string made of the
    // caller's file, line number and function name, followed by two spaces and
    // failPrompt. When cond is true it does nothing.
    func assert(cond bool, failPrompt string) {
        if cond {
            return
        }

        // Skip one frame so the reported location is that of the caller.
        pc, file, line, _ := runtime.Caller(1)
        location := file + ":" + strconv.Itoa(line) + ":" + runtime.FuncForPC(pc).Name()
        panic(location + "  " + failPrompt)
    }
    
    // Pinyin rendering settings. These are the defaults; parseConfigLine
    // overwrites them from the configuration file.
    var (
        // Font applied to the pinyin annotation text.
        g_configPinyinFont string = "微软雅黑"

        // Pinyin font size. -1 means "not configured"; a configured value is the
        // size as typed by the user and is doubled where it is used.
        g_configPinyinFontSize int = -1

        // Maps the alignment names accepted in the configuration file to the
        // values written into w:rubyAlign.
        g_configPinyinAlignmentMap map[string]string = map[string]string{
            "居中":    "center",
            "0-1-0": "distributeLetter",
            "1-2-1": "distributeSpace",
            "左对齐":   "left",
            "右对齐":   "right",
        }

        // Alignment of the pinyin above the base text; centered by default.
        g_configPinyinAlignment = "center"

        // Extra pinyin offset. -1 means "not configured".
        g_configPinyinRaise int = -1
    )
    
    // parseConfigLine applies a single "key=value" configuration line to the
    // package-level pinyin settings.
    //
    // Recognised keys are 拼音字体 (font), 拼音字号 (font size, integer >= 1),
    // 拼音对齐 (alignment, a key of g_configPinyinAlignmentMap) and 拼音偏移
    // (offset, integer >= 0). For every key an empty value means "keep the
    // default" and leaves the setting untouched.
    //
    // Invalid input (not exactly one "=", unknown key, unparsable or
    // out-of-range value) is reported through assert, i.e. by panicking. The
    // returned error is always nil.
    func parseConfigLine(line string) error {
        vals := strings.Split(strings.TrimSpace(line), "=")
        assert(2 == len(vals), "无效配置:"+line)

        key, value := vals[0], vals[1]

        switch key {
        case "拼音字体":
            if "" != value {
                g_configPinyinFont = value
            }
        case "拼音字号":
            if "" != value {
                v, err := strconv.Atoi(value)
                assert((nil == err) && (1 <= v), "无效配置:"+line)
                g_configPinyinFontSize = v
            }
        case "拼音对齐":
            if "" != value {
                v := g_configPinyinAlignmentMap[value]
                assert("" != v, "无效配置:"+line)
                g_configPinyinAlignment = v
            }
        case "拼音偏移":
            // An empty value must be accepted like for the other keys: the
            // generated template contains "拼音偏移=" with nothing after it.
            if "" != value {
                v, err := strconv.Atoi(value)
                assert((nil == err) && (0 <= v), "无效配置:"+line)
                g_configPinyinRaise = v
            }
        default:
            assert(false, "无效配置:"+line)
        }

        return nil
    }
    
    func parseConfig() error {
        fileName := "拼音配置.txt"
        fileinfo, err := os.Stat(fileName)
        if nil != err {
            // 文件不存在,创建一个新的,并写入注释说明
            newFile, err := os.Create(fileName)
            if err != nil {
                return err
            }
            defer newFile.Close()
            newFile.WriteString(
                fmt.Sprintf(
                    // 默认不填写具体值
                    `# 说明:只支持docx格式的word文档;
    # 拼音字号默认动态变化,指定字号后全篇拼音使用指定字号;
    拼音字体=
    拼音字号=
    拼音对齐=
    拼音偏移=
    `,
                    // g_configPinyinFont,
                    // g_configPinyinFontSize/2,
                    // g_configPinyinOffset,
                ),
                // fmt.Sprintf("# 说明:只支持docx格式的word文档\r\n拼音字体=%s",
                //  g_configPinyinFont,
                // ),
            )
        } else if fileinfo.IsDir() {
            return errors.New("无法创建配置文件:" + fileName)
        } else {
    
            f, err := os.Open(fileName)
            if err != nil {
                return nil
            }
    
            br := bufio.NewReader(f)
            for {
                line, _, err := br.ReadLine()
                if err == io.EOF {
                    break
                }
    
                // line = strings.TrimSpace(line)
                strLine := strings.ReplaceAll(string(line), " ", "") // 去掉所有空格
    
                if "" == strLine {
                    continue
                }
    
                // strLine := strings.TrimLeft(string(line), " ")
                if strings.HasPrefix(strLine, "#") {
                    continue
                }
    
                if err = parseConfigLine(strLine); nil != err {
                    return err
                }
            }
    
        }
        return nil
    }
    
    // func createWrPrNode(oldWrPrNode *etree.Element) *etree.Element {
    //  newWrPr := oldWrPrNode.Copy()
    
    //  wrFonts := newWrPr.FindElement("w:rFonts")
    //  if nil == wrFonts {
    //      // panic(fileLine() + "no w:rFonts")
    //      // wrFonts = newWrPr.CreateElement("w:rFonts")
    //  } else {
    //      whint := wrFonts.SelectAttr("w:hint")
    //      if nil != whint {
    //          whint.Value = "default"
    //      } else {
    //          wrFonts.CreateAttr("w:hint", "default")
    //      }
    //  }
    
    //  return newWrPr
    // }
    
    // func createWrBegin(wrPrNode *etree.Element) *etree.Element {
    
    //  wr := etree.NewElement("w:r")
    //  if nil != wrPrNode {
    //      wr.AddChild(createWrPrNode(wrPrNode))
    //  }
    
    //  wfldChar := etree.NewElement("w:fldChar")
    //  wfldChar.CreateAttr("w:fldCharType", "begin")
    //  wr.AddChild(wfldChar)
    
    //  return wr
    // }
    
    // func createWrinstrText(text string, wrPrNode *etree.Element) *etree.Element {
    //  if nil == wrPrNode {
    //      // panic(fileLine() + "nil == wrPrNode")
    //  }
    
    //  wr := etree.NewElement("w:r")
    
    //  if nil != wrPrNode {
    //      wr.AddChild(createWrPrNode(wrPrNode))
    //  }
    
    //  winstrText := etree.NewElement("w:instrText")
    //  winstrText.CreateAttr("xml:space", "preserve")
    //  winstrText.SetText(text)
    //  // fmt.Println(wfldChar.Text())
    //  wr.AddChild(winstrText)
    
    //  return wr
    // }
    
    // func createWrEnd(wrPrNode *etree.Element) *etree.Element {
    //  wr := etree.NewElement("w:r")
    
    //  if nil != wrPrNode {
    //      wr.AddChild(createWrPrNode(wrPrNode))
    //  }
    
    //  wfldChar := etree.NewElement("w:fldChar")
    //  wfldChar.CreateAttr("w:fldCharType", "end")
    //  wr.AddChild(wfldChar)
    
    //  return wr
    // }
    
    // create__w_r__w_t returns a copy of oldWrNode whose w:t element holds the
    // text w. A w:t element is added to the copy when it has none. oldWrNode
    // itself is not modified.
    func create__w_r__w_t(w string, oldWrNode *etree.Element) *etree.Element {
        clone := oldWrNode.Copy()

        textNode := clone.FindElement("w:t")
        if nil == textNode {
            textNode = clone.CreateElement("w:t")
        }
        textNode.SetText(w)

        return clone
    }
    
    // procYI chooses the reading of the character "一" from what follows it.
    //
    // Rule stated by the original author: alone or at the end of a word or
    // sentence it takes the first tone; before a fourth-tone syllable it takes
    // the second tone; before a first-, second- or third-tone syllable it takes
    // the fourth tone.
    //
    // nextHan is the character following "一" ("" when there is none) and
    // nextPinyin is that character's pinyin with tone marks. The result is:
    //   - "yī" when nextHan is empty or its first rune is not a Han character;
    //   - "yí" when nextPinyin contains a fourth-tone vowel;
    //   - "yì" in every other case.
    //
    // It asserts (panics) when a Han character follows but nextPinyin is empty.
    func procYI(nextHan string, nextPinyin string) string {
        atEnd := "" == nextHan
        if !atEnd {
            atEnd = !unicode.Is(unicode.Han, []rune(nextHan)[0])
        }
        if atEnd {
            return "yī"
        }

        assert("" != nextPinyin, "nextPinyin为空")

        switch {
        case strings.ContainsAny(nextPinyin, "àòèìùǜ"):
            return "yí"
        default:
            return "yì"
        }
    }
    
    // procBU chooses the reading of the character "不" from what follows it.
    //
    // Rule stated by the original author: before a fourth-tone syllable "不"
    // changes to the second tone.
    //
    // nextHan is the character following "不" ("" when there is none) and
    // nextPinyin is that character's pinyin with tone marks. The result is "bú"
    // when nextHan is non-empty and nextPinyin contains a fourth-tone vowel, and
    // "bù" otherwise.
    func procBU(nextHan string, nextPinyin string) string {
        // A match from ContainsAny already implies nextPinyin is non-empty, so
        // no separate emptiness check is needed.
        if "" != nextHan && strings.ContainsAny(nextPinyin, "àòèìùǜ") {
            return "bú"
        }

        return "bù"
    }
    
    // createNodeAttr builds a detached element named tag that carries a single
    // attribute attrName set to attrValue.
    func createNodeAttr(tag, attrName, attrValue string) *etree.Element {
        node := etree.NewElement(tag)
        node.CreateAttr(attrName, attrValue)

        return node
    }
    
    // create__w_rubyPr builds the w:rubyPr element of a ruby annotation.
    //
    // fontSize is the base text size and pinyinFontSize the annotation size, both
    // as they are written to the XML. The element receives, in this order,
    // w:rubyAlign (the configured alignment), w:hps (pinyinFontSize), w:hpsRaise,
    // w:hpsBaseText (fontSize) and w:lid ("zh-CN"). The raise is fontSize-2, plus
    // twice the configured offset when one is set.
    func create__w_rubyPr(fontSize int, pinyinFontSize int) *etree.Element {
        raise := fontSize - 2
        if -1 != g_configPinyinRaise {
            raise += g_configPinyinRaise * 2
        }

        settings := [][2]string{
            {"w:rubyAlign", g_configPinyinAlignment},
            {"w:hps", strconv.Itoa(pinyinFontSize)},
            {"w:hpsRaise", strconv.Itoa(raise)},
            {"w:hpsBaseText", strconv.Itoa(fontSize)},
            {"w:lid", "zh-CN"},
        }

        rubyPr := etree.NewElement("w:rubyPr")
        for _, setting := range settings {
            rubyPr.AddChild(createNodeAttr(setting[0], "w:val", setting[1]))
        }

        return rubyPr
    }
    
    // set_attr sets attribute attr_name of node to value, updating the attribute
    // in place when it already exists and creating it otherwise.
    func set_attr(node *etree.Element, attr_name string, value string) {
        existing := node.SelectAttr(attr_name)
        if nil == existing {
            node.CreateAttr(attr_name, value)
            return
        }

        existing.Value = value
    }
    
    // create__w_rt builds the w:rt element (the annotation run) holding the
    // pinyin text.
    //
    // The run properties start as a copy of w_rPr, or as an empty w:rPr when
    // w_rPr is nil. Their w:rFonts gets the configured pinyin font for w:ascii,
    // w:eastAsia and w:hAnsi, and their w:sz is set to pinyinFontSize. w:sz,
    // w:szCs and w:shd are moved to the end of the properties, in that order.
    // fontSize is accepted but not used here.
    func create__w_rt(w_rPr *etree.Element, fontSize int,
        pinyinFontSize int, pinyin string) *etree.Element {
        rt := etree.NewElement("w:rt")
        run := rt.CreateElement("w:r")

        var runProps *etree.Element
        if nil != w_rPr {
            runProps = w_rPr.Copy()
            run.AddChild(runProps)
        } else {
            runProps = run.CreateElement("w:rPr")
        }

        fonts := runProps.FindElement("w:rFonts")
        if nil == fonts {
            fonts = runProps.CreateElement("w:rFonts")
        }
        for _, fontAttr := range []string{"w:ascii", "w:eastAsia", "w:hAnsi"} {
            set_attr(fonts, fontAttr, g_configPinyinFont)
        }

        size := runProps.FindElement("w:sz")
        if nil == size {
            size = runProps.CreateElement("w:sz")
        }

        // Re-append so that the order of these children is w:sz, w:szCs, w:shd.
        runProps.RemoveChild(size)
        runProps.AddChild(size)
        remove_then_add_child(runProps, "w:szCs")
        remove_then_add_child(runProps, "w:shd")

        set_attr(size, "w:val", strconv.Itoa(pinyinFontSize))

        run.CreateElement("w:t").SetText(pinyin)

        return rt
    }
    
    // create__w_rubyBase builds the w:rubyBase element: a w:r run containing a
    // copy of w_rPr (when it is not nil) followed by a w:t holding han.
    func create__w_rubyBase(w_rPr *etree.Element, han string) *etree.Element {
        base := etree.NewElement("w:rubyBase")

        run := base.CreateElement("w:r")
        if nil != w_rPr {
            run.AddChild(w_rPr.Copy())
        }
        run.CreateElement("w:t").SetText(han)

        return base
    }
    
    // create__w_ruby builds a w:ruby element for one character: its properties,
    // the annotation (pinyin) and the base text (han), added in that order.
    //
    // The pinyin size is twice the configured size when one is set, otherwise
    // half of fontSize (integer division).
    func create__w_ruby(w_rPr *etree.Element, fontSize int,
        pinyin string, han string) *etree.Element {

        pinyinFontSize := fontSize / 2
        if -1 != g_configPinyinFontSize {
            pinyinFontSize = g_configPinyinFontSize * 2
        }

        properties := create__w_rubyPr(fontSize, pinyinFontSize)
        annotation := create__w_rt(w_rPr, fontSize, pinyinFontSize, pinyin)
        baseText := create__w_rubyBase(w_rPr, han)

        ruby := etree.NewElement("w:ruby")
        ruby.AddChild(properties)
        ruby.AddChild(annotation)
        ruby.AddChild(baseText)

        return ruby
    }
    
    // create__w_r builds a w:r run that wraps the ruby annotation of one
    // character. When w_rPr is not nil a copy of it is placed in front of the
    // w:ruby element.
    func create__w_r(w_rPr *etree.Element, fontSize int,
        pinyin string, han string) *etree.Element {

        run := etree.NewElement("w:r")

        if nil != w_rPr {
            run.AddChild(w_rPr.Copy())
        }

        ruby := create__w_ruby(w_rPr, fontSize, pinyin, han)
        run.AddChild(ruby)

        return run
    }
    
    // remove_then_add_attr removes attribute name from node and creates it again
    // with the same value. The original author uses this to adjust the order of
    // the attributes. Nothing happens when the attribute is absent.
    func remove_then_add_attr(node *etree.Element, name string) {
        removed := node.RemoveAttr(name)
        if nil == removed {
            return
        }

        node.CreateAttr(name, removed.Value)
    }
    
    // remove_child removes from node the element that FindElement returns for
    // child_name. Nothing happens when no such element is found.
    func remove_child(node *etree.Element, child_name string) {
        target := node.FindElement(child_name)
        if nil == target {
            return
        }

        node.RemoveChild(target)
    }
    
    // remove_then_add_child removes from node the element that FindElement
    // returns for child_name and adds it back with AddChild. Nothing happens when
    // no such element is found.
    func remove_then_add_child(node *etree.Element, child_name string) {
        target := node.FindElement(child_name)
        if nil == target {
            return
        }

        node.RemoveChild(target)
        node.AddChild(target)
    }
    
    // addPinyin takes the bytes of a word/document.xml part and returns the XML
    // with every Han character of the top-level paragraphs (w:body/w:p) wrapped
    // in a ruby annotation that carries its pinyin.
    //
    // If buf cannot be parsed as XML it is decoded as GBK and then as HZ-GB2312
    // and parsed again; the error of the last attempt is returned when all fail.
    //
    // For each paragraph the text of all direct w:r runs is concatenated and
    // converted to pinyin in one call. Each run that has a w:t is then replaced
    // by a sequence of new runs: one ruby run per Han character and one plain
    // run (a copy of the original run) per stretch of other characters. The
    // readings of "一" and "不" are chosen by procYI and procBU from the
    // following character.
    //
    // An error is returned when the document has no w:document root or when the
    // pinyin list is shorter than the Han characters encountered. An internal
    // mismatch between a run's text and the concatenated paragraph text is
    // reported through assert (panic).
    func addPinyin(buf []byte) (string, error) {

        pinyinArg := pinyin.NewArgs()
        pinyinArg.Style = pinyin.Tone // include tone marks

        doc := etree.NewDocument()
        err := doc.ReadFromBytes(buf)
        if nil != err {
            fmt.Println(err)

            transformers := []transform.Transformer{
                simplifiedchinese.GBK.NewDecoder(),
                simplifiedchinese.HZGB2312.NewDecoder(),
            }

            fmt.Println("尝试转码")
            for _, t := range transformers {
                var d []byte
                d, err = ioutil.ReadAll(transform.NewReader(bytes.NewReader(buf), t))
                if nil != err {
                    continue
                }

                err = doc.ReadFromBytes(d)
                if nil == err {
                    fmt.Println("转码成功")
                    break
                }
            }

            if nil != err {
                fmt.Println("转码失败")
                return "", err
            }
        }

        wdocument := doc.SelectElement("w:document")
        if nil == wdocument {
            // Without this check the FindElements call below would be made on a
            // nil element.
            return "", errors.New("没有 w:document 根节点")
        }

        // A w:p is one paragraph; paragraphs are processed one at a time.
        for _, w_p := range wdocument.FindElements("w:body/w:p") {

            if w_p__w_pPr__w_rPr := w_p.FindElement("w:pPr/w:rPr"); nil != w_p__w_pPr__w_rPr {
                if w_rFonts := w_p__w_pPr__w_rPr.FindElement("w:rFonts"); nil != w_rFonts {
                    // Put the font attributes into a fixed order.
                    remove_then_add_attr(w_rFonts, "w:ascii")
                    remove_then_add_attr(w_rFonts, "w:eastAsia")
                    remove_then_add_attr(w_rFonts, "w:hAnsi")
                    remove_then_add_attr(w_rFonts, "w:cs")
                    w_rFonts.RemoveAttr("w:hint")
                }

                remove_child(w_p__w_pPr__w_rPr, "w:lang")
            }

            // The run list is taken once, before any run is replaced.
            runs := w_p.FindElements("w:r")

            // All text of the paragraph, in run order.
            var paragraphText strings.Builder
            for _, wr := range runs {
                if wt := wr.FindElement("w:t"); nil != wt {
                    paragraphText.WriteString(wt.Text())
                }
            }
            allStrOfWp := paragraphText.String()

            // NOTE(review): the indexing below assumes pinyin.Pinyin yields
            // exactly one entry per Han character, in order — confirm against
            // the go-pinyin version in use. The length guards turn a shortfall
            // into an error instead of an index-out-of-range panic.
            pinyins := pinyin.Pinyin(allStrOfWp, pinyinArg)
            pinyinIndex := 0
            allHansArr := strings.Split(allStrOfWp, "")
            hanIndex := 0

            for _, w_r := range runs {

                // Default size. Per the original author, Word shows the default
                // size as 10.5 and the XML stores twice the visible value.
                fontSize := 21

                w_r__w_rPr := w_r.FindElement("w:rPr")
                if nil != w_r__w_rPr {

                    if w_rFonts := w_r__w_rPr.FindElement("w:rFonts"); nil != w_rFonts {
                        w_rFonts.RemoveAttr("w:hint")
                        if 0 == len(w_rFonts.Attr) {
                            w_r__w_rPr.RemoveChild(w_rFonts)
                        } else {
                            // Put the font attributes into a fixed order.
                            remove_then_add_attr(w_rFonts, "w:ascii")
                            remove_then_add_attr(w_rFonts, "w:eastAsia")
                            remove_then_add_attr(w_rFonts, "w:hAnsi")
                            remove_then_add_attr(w_rFonts, "w:cs")
                        }
                    }

                    if w_sz := w_r__w_rPr.FindElement("w:sz"); nil != w_sz {
                        if w_val := w_sz.SelectAttr("w:val"); nil != w_val {
                            if v, err := strconv.Atoi(w_val.Value); nil == err {
                                fontSize = v
                            }
                        }
                    }

                    // Drop run properties that ended up empty.
                    if 0 == len(w_r__w_rPr.Child) {
                        w_r__w_rPr.Parent().RemoveChild(w_r__w_rPr)
                        w_r__w_rPr = nil
                    }
                }

                w_t := w_r.FindElement("w:t")
                if nil == w_t {
                    continue
                }

                // Pending non-Han characters not yet written out as a run.
                lastStr := ""

                for _, w := range w_t.Text() {
                    han := string(w)
                    assert(han == allHansArr[hanIndex], "出错啦")

                    if unicode.Is(unicode.Han, w) {
                        if "" != lastStr {
                            w_p.InsertChild(w_r, create__w_r__w_t(lastStr, w_r))
                            lastStr = ""
                        }

                        nextHan := ""
                        nextPinyin := ""
                        if hanIndex+1 < len(allHansArr) {
                            nextHan = allHansArr[hanIndex+1]
                            if unicode.Is(unicode.Han, []rune(nextHan)[0]) {
                                if pinyinIndex+1 >= len(pinyins) {
                                    return "", errors.New("拼音数量与汉字数量不一致")
                                }
                                nextPinyin = pinyins[pinyinIndex+1][0]
                            }
                        }

                        var reading string
                        switch han {
                        case "一":
                            reading = procYI(nextHan, nextPinyin)
                        case "不":
                            reading = procBU(nextHan, nextPinyin)
                        default:
                            if pinyinIndex >= len(pinyins) {
                                return "", errors.New("拼音数量与汉字数量不一致")
                            }
                            reading = pinyins[pinyinIndex][0]
                        }

                        w_r.Parent().InsertChild(w_r, create__w_r(w_r__w_rPr, fontSize, reading, han))

                        pinyinIndex++

                    } else {
                        lastStr += han
                    }

                    hanIndex++
                }

                if "" != lastStr {
                    w_p.InsertChild(w_r, create__w_r__w_t(lastStr, w_r))
                }

                // The original run has been replaced by the runs inserted
                // before it.
                w_r.Parent().RemoveChild(w_r)
            }
        }

        return doc.WriteToString()
    }
    
    func procOneDocxFile(fromPath string, toPath string) error {
        zipReader, err := zip.OpenReader(fromPath)
        if err != nil {
            fmt.Print(err)
            return err
        }
        defer zipReader.Close()
    
        newZipFile, err := os.Create(toPath)
        if err != nil {
            fmt.Println(err)
            return err
        }
        defer newZipFile.Close()
    
        zipWriter := zip.NewWriter(newZipFile)
        defer zipWriter.Close()
    
        var f *zip.File
        for _, file := range zipReader.File {
    
            rc, err := file.Open()
            if nil != err {
                return err
            }
    
            buf := make([]byte, file.UncompressedSize)
    
            // zipfile文件一次可能不能读完,循环读完为止
            readLen := 0
            for file.UncompressedSize != uint32(readLen) {
                n, err := rc.Read(buf[readLen:])
                if nil != err && (0 != strings.Compare("EOF", err.Error())) {
                    fmt.Println(err)
                    return err
                }
                if 0 == n {
                    return errors.New("读取zip出错")
                }
                readLen += n
            }
    
            var newBuf []byte
            if "word/document.xml" == file.Name {
                f = file
    
                assert(file.UncompressedSize == uint32(readLen), "读取错误")
    
                newXmlStr, err := addPinyin(buf)
                if nil != err {
                    return err
                }
    
                newBuf = []byte(newXmlStr)
    
            } else {
                newBuf = buf
    
            }
    
            newFile, err := zipWriter.Create(file.Name)
            if err != nil {
                return err
            }
    
            _, err = newFile.Write(newBuf)
            if err != nil {
                return err
            }
        }
    
        if nil == f {
            err = errors.New(fromPath + ": 没有 word/document.xml")
            return err
        }
    
        return nil
    }
    
    // initDir makes sure every entry of paths is a directory. An entry that
    // cannot be stat'ed is created with os.Mkdir; an entry that exists but is
    // not a directory yields an error. Processing stops at the first failure.
    func initDir(paths []string) error {

        for _, dir := range paths {
            info, statErr := os.Stat(dir)

            switch {
            case nil != statErr:
                if mkErr := os.Mkdir(dir, os.ModePerm); nil != mkErr {
                    fmt.Println(mkErr)
                    return mkErr
                }
            case !info.IsDir():
                return errors.New("无法创建目录:" + dir)
            }
        }

        return nil
    }
    
    // main prepares the input and output directories, loads the configuration
    // and converts every .docx file found in the input directory, writing each
    // result to the output directory under a timestamped name.
    //
    // Directories, files whose name starts with "~$" and files whose extension
    // is not ".docx" (compared case-insensitively) are skipped. The run stops at
    // the first error. Before returning, a recovered panic and/or the last error
    // are printed together with the elapsed time, and the program waits for
    // input on stdin.
    func main() {

        var err error
        startTime := time.Now()

        defer func() {
            if p := recover(); nil != p {
                fmt.Printf("panic recover! : %v\r\n", p)
            }
            if nil != err {
                fmt.Printf("error : %v\r\n", err)
            }

            fmt.Println("耗时:", time.Since(startTime))
            fmt.Println("按任意键结束")

            // Block until the user enters something so the output stays visible.
            var data int
            fmt.Scanf("%d", &data)
        }()

        const (
            todoDir = "./1-加拼音的docx-待处理"
            doneDir = "./2-加拼音的docx-结果"
        )

        if err = initDir([]string{todoDir, doneDir}); nil != err {
            return
        }

        if err = parseConfig(); nil != err {
            return
        }

        files, err := ioutil.ReadDir(todoDir)
        if nil != err {
            return
        }

        for _, f := range files {
            name := f.Name()
            ext := path.Ext(name)

            skip := f.IsDir() ||
                strings.HasPrefix(path.Base(name), "~$") ||
                !strings.EqualFold(".docx", ext)
            if skip {
                continue
            }

            fmt.Println("正在处理文件:" + name)

            // The timestamp layout also supplies the new ".docx" extension.
            stamp := time.Now().Format("_20060102_150405.docx")
            fromPath := todoDir + "/" + name
            toPath := doneDir + "/" + strings.TrimSuffix(path.Base(name), ext) + stamp

            if err = procOneDocxFile(fromPath, toPath); nil != err {
                return
            }
        }
    }
    
    

    相关文章

      网友评论

          本文标题:加拼音(新格式)-代码

          本文链接:https://www.haomeiwen.com/subject/atouhktx.html