word文档全篇加拼音(批量注音)+批量修改读音
加拼音(新格式)-代码
package main
import (
"archive/zip"
"runtime"
"bufio"
"bytes"
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"path"
_ "regexp"
"strconv"
"strings"
"time"
"unicode"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
// "github.com/aurelien-rainone/assertgo"
"github.com/etree"
"github.com/mozillazg/go-pinyin"
)
// assert panics when cond is false.
//
// The panic value is a string of the form
// "<file>:<line>:<function> <failPrompt>", where file, line and function
// describe the caller of assert.
func assert(cond bool, failPrompt string) {
	if cond {
		return
	}
	// Skip one stack frame so the reported location is the call site rather
	// than assert itself.
	callerPC, callerFile, callerLine, _ := runtime.Caller(1)
	location := callerFile + ":" + strconv.Itoa(callerLine) + ":" + runtime.FuncForPC(callerPC).Name()
	panic(location + " " + failPrompt)
}
// Pinyin rendering settings. They start at the defaults below and are
// overwritten by parseConfigLine when the configuration file sets them.
var (
	// g_configPinyinFont is the font name written into the pinyin run.
	g_configPinyinFont = "微软雅黑"

	// g_configPinyinFontSize is the configured pinyin font size; the value
	// actually used is this number times two. -1 means "not configured".
	g_configPinyinFontSize = -1

	// g_configPinyinAlignmentMap maps the alignment names accepted in the
	// configuration file to the values written into w:rubyAlign.
	g_configPinyinAlignmentMap = map[string]string{
		"居中":    "center",
		"0-1-0": "distributeLetter",
		"1-2-1": "distributeSpace",
		"左对齐":   "left",
		"右对齐":   "right",
	}

	// g_configPinyinAlignment is the alignment in effect; centered by default.
	g_configPinyinAlignment = "center"

	// g_configPinyinRaise is the configured extra pinyin offset.
	// -1 means "not configured".
	g_configPinyinRaise = -1
)
// parseConfigLine applies one "key=value" line of the configuration file to
// the package-level pinyin settings.
//
// Recognized keys:
//   - 拼音字体: font name for the pinyin text;
//   - 拼音字号: pinyin font size, an integer >= 1;
//   - 拼音对齐: one of the keys of g_configPinyinAlignmentMap;
//   - 拼音偏移: extra pinyin offset, an integer >= 0.
//
// An empty value leaves the corresponding setting untouched. A line that does
// not split into exactly two parts on "=", an unknown key, or an invalid value
// trips assert, which panics.
//
// The returned error is always nil; the result is kept so callers that check
// it continue to compile.
func parseConfigLine(line string) error {
	vals := strings.Split(strings.TrimSpace(line), "=")
	assert(2 == len(vals), "无效配置:"+line)
	key, value := vals[0], vals[1]

	switch key {
	case "拼音字体":
		if "" != value {
			g_configPinyinFont = value
		}
	case "拼音字号":
		if "" != value {
			v, err := strconv.Atoi(value)
			assert((nil == err) && (1 <= v), "无效配置:"+line)
			g_configPinyinFontSize = v
		}
	case "拼音对齐":
		if "" != value {
			v := g_configPinyinAlignmentMap[value]
			assert("" != v, "无效配置:"+line)
			g_configPinyinAlignment = v
		}
	case "拼音偏移":
		// An empty value must mean "not configured" here as well: the template
		// that parseConfig writes contains a bare "拼音偏移=" line, which would
		// otherwise fail the integer check on every later run.
		if "" != value {
			v, err := strconv.Atoi(value)
			assert((nil == err) && (0 <= v), "无效配置:"+line)
			g_configPinyinRaise = v
		}
	default:
		assert(false, "无效配置:"+line)
	}
	return nil
}
func parseConfig() error {
fileName := "拼音配置.txt"
fileinfo, err := os.Stat(fileName)
if nil != err {
// 文件不存在,创建一个新的,并写入注释说明
newFile, err := os.Create(fileName)
if err != nil {
return err
}
defer newFile.Close()
newFile.WriteString(
fmt.Sprintf(
// 默认不填写具体值
`# 说明:只支持docx格式的word文档;
# 拼音字号默认动态变化,指定字号后全篇拼音使用指定字号;
拼音字体=
拼音字号=
拼音对齐=
拼音偏移=
`,
// g_configPinyinFont,
// g_configPinyinFontSize/2,
// g_configPinyinOffset,
),
// fmt.Sprintf("# 说明:只支持docx格式的word文档\r\n拼音字体=%s",
// g_configPinyinFont,
// ),
)
} else if fileinfo.IsDir() {
return errors.New("无法创建配置文件:" + fileName)
} else {
f, err := os.Open(fileName)
if err != nil {
return nil
}
br := bufio.NewReader(f)
for {
line, _, err := br.ReadLine()
if err == io.EOF {
break
}
// line = strings.TrimSpace(line)
strLine := strings.ReplaceAll(string(line), " ", "") // 去掉所有空格
if "" == strLine {
continue
}
// strLine := strings.TrimLeft(string(line), " ")
if strings.HasPrefix(strLine, "#") {
continue
}
if err = parseConfigLine(strLine); nil != err {
return err
}
}
}
return nil
}
// func createWrPrNode(oldWrPrNode *etree.Element) *etree.Element {
// newWrPr := oldWrPrNode.Copy()
// wrFonts := newWrPr.FindElement("w:rFonts")
// if nil == wrFonts {
// // panic(fileLine() + "no w:rFonts")
// // wrFonts = newWrPr.CreateElement("w:rFonts")
// } else {
// whint := wrFonts.SelectAttr("w:hint")
// if nil != whint {
// whint.Value = "default"
// } else {
// wrFonts.CreateAttr("w:hint", "default")
// }
// }
// return newWrPr
// }
// func createWrBegin(wrPrNode *etree.Element) *etree.Element {
// wr := etree.NewElement("w:r")
// if nil != wrPrNode {
// wr.AddChild(createWrPrNode(wrPrNode))
// }
// wfldChar := etree.NewElement("w:fldChar")
// wfldChar.CreateAttr("w:fldCharType", "begin")
// wr.AddChild(wfldChar)
// return wr
// }
// func createWrinstrText(text string, wrPrNode *etree.Element) *etree.Element {
// if nil == wrPrNode {
// // panic(fileLine() + "nil == wrPrNode")
// }
// wr := etree.NewElement("w:r")
// if nil != wrPrNode {
// wr.AddChild(createWrPrNode(wrPrNode))
// }
// winstrText := etree.NewElement("w:instrText")
// winstrText.CreateAttr("xml:space", "preserve")
// winstrText.SetText(text)
// // fmt.Println(wfldChar.Text())
// wr.AddChild(winstrText)
// return wr
// }
// func createWrEnd(wrPrNode *etree.Element) *etree.Element {
// wr := etree.NewElement("w:r")
// if nil != wrPrNode {
// wr.AddChild(createWrPrNode(wrPrNode))
// }
// wfldChar := etree.NewElement("w:fldChar")
// wfldChar.CreateAttr("w:fldCharType", "end")
// wr.AddChild(wfldChar)
// return wr
// }
// create__w_r__w_t returns a copy of oldWrNode whose w:t child carries the
// text w. A w:t child is added to the copy first when it has none.
func create__w_r__w_t(w string, oldWrNode *etree.Element) *etree.Element {
	run := oldWrNode.Copy()
	textNode := run.FindElement("w:t")
	if textNode == nil {
		textNode = run.CreateElement("w:t")
	}
	textNode.SetText(w)
	return run
}
// procYI chooses the reading of the character "一" from what follows it
// (tones one to four are yin-ping, yang-ping, shang and qu):
//   - used alone or at the end of a word or sentence: first tone, "yī";
//   - before a fourth-tone syllable: second tone, "yí";
//   - before a first-, second- or third-tone syllable: fourth tone, "yì".
//
// nextHan is the character after "一" ("" when there is none) and nextPinyin
// its tone-marked pinyin. When nextHan starts with a Han character, nextPinyin
// must be non-empty, otherwise assert panics.
func procYI(nextHan string, nextPinyin string) string {
	followedByHan := "" != nextHan && unicode.Is(unicode.Han, []rune(nextHan)[0])
	if !followedByHan {
		// Nothing, or something that is not a Han character, follows.
		return "yī"
	}
	assert("" != nextPinyin, "nextPinyin为空")

	switch {
	case strings.ContainsAny(nextPinyin, "àòèìùǜ"):
		// The next syllable carries a fourth-tone mark.
		return "yí"
	default:
		return "yì"
	}
}
// procBU chooses the reading of the character "不": it is read in the second
// tone, "bú", before a fourth-tone syllable and in the fourth tone, "bù",
// everywhere else.
//
// nextHan is the character after "不" ("" when there is none) and nextPinyin
// its tone-marked pinyin ("" when unknown).
func procBU(nextHan string, nextPinyin string) string {
	// A fourth-tone mark can only be found in a non-empty nextPinyin, so no
	// separate emptiness check is needed.
	if "" != nextHan && strings.ContainsAny(nextPinyin, "àòèìùǜ") {
		return "bú"
	}
	return "bù"
}
// createNodeAttr builds a detached element named tag that carries the single
// attribute attrName="attrValue".
func createNodeAttr(tag, attrName, attrValue string) *etree.Element {
	node := etree.NewElement(tag)
	node.CreateAttr(attrName, attrValue)
	return node
}
// create__w_rubyPr builds the w:rubyPr element of one ruby annotation.
//
// fontSize is written as w:hpsBaseText and pinyinFontSize as w:hps. The
// w:hpsRaise value is fontSize-2, plus twice g_configPinyinRaise when an
// offset is configured. Alignment comes from g_configPinyinAlignment and the
// language id is fixed to "zh-CN".
func create__w_rubyPr(fontSize int, pinyinFontSize int) *etree.Element {
	raise := fontSize - 2
	if g_configPinyinRaise != -1 {
		// The original author noted that an additional -2 seemed to be needed
		// when an offset is set, without knowing why; it is not applied.
		raise += g_configPinyinRaise * 2
	}

	settings := []struct{ tag, val string }{
		{"w:rubyAlign", g_configPinyinAlignment},
		{"w:hps", strconv.Itoa(pinyinFontSize)},
		{"w:hpsRaise", strconv.Itoa(raise)},
		{"w:hpsBaseText", strconv.Itoa(fontSize)},
		{"w:lid", "zh-CN"},
	}
	props := etree.NewElement("w:rubyPr")
	for _, s := range settings {
		props.AddChild(createNodeAttr(s.tag, "w:val", s.val))
	}
	return props
}
// set_attr sets attribute attr_name of node to value, overwriting the value of
// an existing attribute or creating the attribute when it is absent.
func set_attr(node *etree.Element, attr_name string, value string) {
	existing := node.SelectAttr(attr_name)
	if existing == nil {
		node.CreateAttr(attr_name, value)
		return
	}
	existing.Value = value
}
// create__w_rt builds the w:rt element (the pinyin part) of a ruby annotation.
//
// The run inside w:rt gets a copy of w_rPr, or an empty w:rPr when w_rPr is
// nil. In that w:rPr the ascii, eastAsia and hAnsi fonts are set to
// g_configPinyinFont and w:sz is set to pinyinFontSize. w:sz, then w:szCs,
// then w:shd (the latter two only when present) are moved to the end of the
// property list. The run text is pinyin. fontSize is not used.
func create__w_rt(w_rPr *etree.Element, fontSize int,
	pinyinFontSize int, pinyin string) *etree.Element {
	rt := etree.NewElement("w:rt")
	run := rt.CreateElement("w:r")

	var props *etree.Element
	if w_rPr != nil {
		props = w_rPr.Copy()
		run.AddChild(props)
	} else {
		props = run.CreateElement("w:rPr")
	}

	fonts := props.FindElement("w:rFonts")
	if fonts == nil {
		fonts = props.CreateElement("w:rFonts")
	}
	for _, fontAttr := range []string{"w:ascii", "w:eastAsia", "w:hAnsi"} {
		set_attr(fonts, fontAttr, g_configPinyinFont)
	}

	size := props.FindElement("w:sz")
	if size == nil {
		size = props.CreateElement("w:sz")
	}
	set_attr(size, "w:val", strconv.Itoa(pinyinFontSize))

	// Fix the child order: w:sz goes last, followed by w:szCs and w:shd.
	props.RemoveChild(size)
	props.AddChild(size)
	remove_then_add_child(props, "w:szCs")
	remove_then_add_child(props, "w:shd")

	run.CreateElement("w:t").SetText(pinyin)
	return rt
}
// create__w_rubyBase builds the w:rubyBase element (the annotated character)
// of a ruby annotation: one run holding a copy of w_rPr, when it is not nil,
// followed by a w:t with the text han.
func create__w_rubyBase(w_rPr *etree.Element, han string) *etree.Element {
	base := etree.NewElement("w:rubyBase")
	run := base.CreateElement("w:r")
	if w_rPr != nil {
		run.AddChild(w_rPr.Copy())
	}
	run.CreateElement("w:t").SetText(han)
	return base
}
// create__w_ruby builds a w:ruby element that annotates han with pinyin.
//
// The pinyin size is twice g_configPinyinFontSize when one is configured and
// half of fontSize (integer division) otherwise. Children are added in the
// order w:rubyPr, w:rt (pinyin), w:rubyBase (character).
func create__w_ruby(w_rPr *etree.Element, fontSize int,
	pinyin string, han string) *etree.Element {
	pinyinFontSize := fontSize / 2
	if g_configPinyinFontSize != -1 {
		pinyinFontSize = g_configPinyinFontSize * 2
	}

	ruby := etree.NewElement("w:ruby")
	ruby.AddChild(create__w_rubyPr(fontSize, pinyinFontSize))
	ruby.AddChild(create__w_rt(w_rPr, fontSize, pinyinFontSize, pinyin))
	ruby.AddChild(create__w_rubyBase(w_rPr, han))
	return ruby
}
// create__w_r builds a w:r run that shows han with pinyin above it: a copy of
// w_rPr (when it is not nil) followed by the w:ruby element produced by
// create__w_ruby.
func create__w_r(w_rPr *etree.Element, fontSize int,
	pinyin string, han string) *etree.Element {
	run := etree.NewElement("w:r")
	if w_rPr != nil {
		run.AddChild(w_rPr.Copy())
	}
	run.AddChild(create__w_ruby(w_rPr, fontSize, pinyin, han))
	return run
}
// remove_then_add_attr removes the attribute called name from node and
// creates it again with the same value; it is used to adjust attribute order.
// It does nothing when node has no such attribute.
func remove_then_add_attr(node *etree.Element, name string) {
	removed := node.RemoveAttr(name)
	if removed == nil {
		return
	}
	node.CreateAttr(name, removed.Value)
}
// remove_child removes from node the first element found by the path
// child_name; it does nothing when there is no match.
func remove_child(node *etree.Element, child_name string) {
	child := node.FindElement(child_name)
	if child == nil {
		return
	}
	node.RemoveChild(child)
}
// remove_then_add_child takes the first element found by the path child_name
// out of node and adds it back as node's last child. It does nothing when
// there is no match.
func remove_then_add_child(node *etree.Element, child_name string) {
	child := node.FindElement(child_name)
	if child == nil {
		return
	}
	node.RemoveChild(child)
	node.AddChild(child)
}
func addPinyin(buf []byte) (string, error) {
pinyinArg := pinyin.NewArgs()
pinyinArg.Style = pinyin.Tone // 包含声调
doc := etree.NewDocument()
err := doc.ReadFromBytes(buf)
if nil != err {
fmt.Println(err)
transformers := []transform.Transformer{
simplifiedchinese.GBK.NewDecoder(),
simplifiedchinese.HZGB2312.NewDecoder(),
}
fmt.Println("尝试转码")
for _, t := range transformers {
I := bytes.NewReader(buf)
O := transform.NewReader(I, t)
var d []byte
d, err = ioutil.ReadAll(O)
if nil != err {
continue
}
err = doc.ReadFromBytes(d)
if nil == err {
fmt.Println("转码成功")
break
}
}
if nil != err {
fmt.Println("转码失败")
return "", err
}
}
wdocument := doc.SelectElement("w:document")
// w:p是一个段落,一段一段的处理
for _, w_p := range wdocument.FindElements("w:body/w:p") {
if w_p__w_pPr__w_rPr := w_p.FindElement("w:pPr/w:rPr"); nil != w_p__w_pPr__w_rPr {
if w_rFonts := w_p__w_pPr__w_rPr.FindElement("w:rFonts"); nil != w_rFonts {
// 排一下序
remove_then_add_attr(w_rFonts, "w:ascii")
remove_then_add_attr(w_rFonts, "w:eastAsia")
remove_then_add_attr(w_rFonts, "w:hAnsi")
remove_then_add_attr(w_rFonts, "w:cs")
w_rFonts.RemoveAttr("w:hint")
}
remove_child(w_p__w_pPr__w_rPr, "w:lang")
// remove_child(w_p__w_pPr__w_rPr, "w:szCs")
// w_p__w_pPr__w_rPr = w_p__w_pPr__w_rPr.Copy()
// w_lang.CreateAttr("w:lang", "en-US")
// w_lang.CreateAttr("w:eastAsia", "zh-CN")
}
allStrOfWp := "" // 段内所有文字
for _, wr := range w_p.FindElements("w:r") {
if wt := wr.FindElement("w:t"); nil != wt {
allStrOfWp += wt.Text()
}
}
pinyins := pinyin.Pinyin(allStrOfWp, pinyinArg)
pinyinIndex := 0
allHansArr := strings.Split(allStrOfWp, "")
hanIndex := 0
for _, w_r := range w_p.FindElements("w:r") {
fontSize := 21 // 默认值,word中看到默认字号是10.5,这里xml中的数字是word中可视参数的2倍。
// if w_rPr := w_r.FindElement("w:rPr"); nil != w_rPr {
// w_r.RemoveChild("w:rPr")
// }
w_r__w_rPr := w_r.FindElement("w:rPr")
if nil != w_r__w_rPr {
if w_rFonts := w_r__w_rPr.FindElement("w:rFonts"); nil != w_rFonts {
w_rFonts.RemoveAttr("w:hint")
if 0 == len(w_rFonts.Attr) {
w_r__w_rPr.RemoveChild(w_rFonts)
} else {
// 排一下序
remove_then_add_attr(w_rFonts, "w:ascii")
remove_then_add_attr(w_rFonts, "w:eastAsia")
remove_then_add_attr(w_rFonts, "w:hAnsi")
remove_then_add_attr(w_rFonts, "w:cs")
}
}
if w_sz := w_r__w_rPr.FindElement("w:sz"); nil != w_sz {
if w_val := w_sz.SelectAttr("w:val"); nil != w_val {
if v, err := strconv.Atoi(w_val.Value); nil == err {
fontSize = v
}
}
}
// 调整一下顺序
// remove_then_add_child(w_r__w_rPr, "w:szCs")
if 0 == len(w_r__w_rPr.Child) {
w_r__w_rPr.Parent().RemoveChild(w_r__w_rPr)
w_r__w_rPr = nil
}
// if w_rFonts := w_r__w_rPr.FindElement("w:rFonts"); nil != w_rFonts {
// w_rFonts.CreateAttr("w:ascii", g_configPinyinFont)
// w_rFonts.CreateAttr("w:eastAsia", g_configPinyinFont)
// w_rFonts.CreateAttr("w:hAnsi", g_configPinyinFont)
// w_rFonts.CreateAttr("w:cs", g_configPinyinFont)
// }
// if w_lang := w_r__w_rPr.FindElement("w:lang"); nil != w_lang {
// w_r__w_rPr.RemoveChild(w_lang)
// }
}
// var pinyinFontSize int
// if -1 != g_configPinyinFontSize {
// pinyinFontSize = g_configPinyinFontSize
// } else {
// pinyinFontSize = fontSize / 2
// }
// var pinyinRaise int
// if -1 != g_configPinyinRaise {
// pinyinRaise = g_configPinyinRaise
// } else {
// pinyinRaise = pinyinFontSize - 2
// }
if w_t := w_r.FindElement("w:t"); nil != w_t {
text := w_t.Text()
// fmt.Println(text)
lastStr := ""
for _, w := range text {
han := string(w)
assert(han == allHansArr[hanIndex], "出错啦")
if unicode.Is(unicode.Han, w) {
if "" != lastStr {
w_p.InsertChild(w_r, create__w_r__w_t(lastStr, w_r))
lastStr = ""
}
nextHan := ""
nextPinyin := ""
if hanIndex+1 < len(allHansArr) {
nextHan = allHansArr[hanIndex+1]
if unicode.Is(unicode.Han, []rune(nextHan)[0]) {
nextPinyin = pinyins[pinyinIndex+1][0]
}
}
var pinyin string
switch han {
case "一":
{
pinyin = procYI(nextHan, nextPinyin)
}
case "不":
{
pinyin = procBU(nextHan, nextPinyin)
}
default:
{
pinyin = pinyins[pinyinIndex][0]
}
}
w_r.Parent().InsertChild(w_r, create__w_r(w_r__w_rPr, fontSize, pinyin, han))
pinyinIndex++
} else {
lastStr += han
}
hanIndex++
}
if "" != lastStr {
w_p.InsertChild(w_r, create__w_r__w_t(lastStr, w_r))
lastStr = ""
}
w_r.Parent().RemoveChild(w_r)
}
}
}
return doc.WriteToString()
}
func procOneDocxFile(fromPath string, toPath string) error {
zipReader, err := zip.OpenReader(fromPath)
if err != nil {
fmt.Print(err)
return err
}
defer zipReader.Close()
newZipFile, err := os.Create(toPath)
if err != nil {
fmt.Println(err)
return err
}
defer newZipFile.Close()
zipWriter := zip.NewWriter(newZipFile)
defer zipWriter.Close()
var f *zip.File
for _, file := range zipReader.File {
rc, err := file.Open()
if nil != err {
return err
}
buf := make([]byte, file.UncompressedSize)
// zipfile文件一次可能不能读完,循环读完为止
readLen := 0
for file.UncompressedSize != uint32(readLen) {
n, err := rc.Read(buf[readLen:])
if nil != err && (0 != strings.Compare("EOF", err.Error())) {
fmt.Println(err)
return err
}
if 0 == n {
return errors.New("读取zip出错")
}
readLen += n
}
var newBuf []byte
if "word/document.xml" == file.Name {
f = file
assert(file.UncompressedSize == uint32(readLen), "读取错误")
newXmlStr, err := addPinyin(buf)
if nil != err {
return err
}
newBuf = []byte(newXmlStr)
} else {
newBuf = buf
}
newFile, err := zipWriter.Create(file.Name)
if err != nil {
return err
}
_, err = newFile.Write(newBuf)
if err != nil {
return err
}
}
if nil == f {
err = errors.New(fromPath + ": 没有 word/document.xml")
return err
}
return nil
}
// initDir makes sure every entry of paths is a directory.
//
// A path that os.Stat cannot inspect is created with os.Mkdir; if that fails
// the error is printed and returned. A path that exists but is not a directory
// yields an error. Processing stops at the first failure.
func initDir(paths []string) error {
	for _, dir := range paths {
		info, statErr := os.Stat(dir)
		switch {
		case statErr != nil:
			if mkErr := os.Mkdir(dir, os.ModePerm); mkErr != nil {
				fmt.Println(mkErr)
				return mkErr
			}
		case !info.IsDir():
			return errors.New("无法创建目录:" + dir)
		}
	}
	return nil
}
// main annotates every .docx file found in the input directory and writes the
// results, with a timestamp appended to the name, to the output directory.
//
// Both directories are created when missing and the configuration is loaded
// before any file is processed. Sub-directories, files whose name starts with
// "~$" and files without a .docx extension (case-insensitive) are skipped.
// Processing stops at the first error. On exit, any recovered panic and the
// last error are printed together with the elapsed time, and the program waits
// for input before terminating.
func main() {
	var err error
	startTime := time.Now()
	defer func() {
		if p := recover(); nil != p {
			fmt.Printf("panic recover! : %v\r\n", p)
		}
		if nil != err {
			fmt.Printf("error : %v\r\n", err)
		}
		fmt.Println("耗时:", time.Now().Sub(startTime))
		fmt.Println("按任意键结束")
		var data int
		fmt.Scanf("%d", &data)
	}()

	const (
		todoDir = "./1-加拼音的docx-待处理"
		doneDir = "./2-加拼音的docx-结果"
	)
	if err = initDir([]string{todoDir, doneDir}); nil != err {
		return
	}
	if err = parseConfig(); nil != err {
		return
	}

	// err is the variable declared above (same scope), so the deferred
	// function sees a failure of ReadDir as well.
	files, err := ioutil.ReadDir(todoDir)
	if nil != err {
		return
	}
	for _, f := range files {
		name := f.Name()
		ext := path.Ext(name)
		skip := f.IsDir() ||
			strings.HasPrefix(path.Base(name), "~$") ||
			!strings.EqualFold(".docx", ext)
		if skip {
			continue
		}

		fmt.Println("正在处理文件:" + name)
		fromPath := todoDir + "/" + name
		stamp := time.Now().Format("_20060102_150405.docx")
		toPath := doneDir + "/" + strings.TrimSuffix(path.Base(name), ext) + stamp
		if err = procOneDocxFile(fromPath, toPath); nil != err {
			return
		}
	}
}
网友评论