doc html互转

作者: 小李_a98e | 来源:发表于2019-06-12 09:43 被阅读0次

    使用 Java 实现 Word(.doc)文档与富文本(HTML)之间的相互转换

    • 在 Maven 的 pom.xml 中引入所需依赖

        <dependency>
    
            <groupId>org.apache.poi</groupId>
    
            <artifactId>poi</artifactId>
    
            <version>3.14</version>
    
        </dependency>
    
        <dependency>
    
            <groupId>org.apache.poi</groupId>
    
            <artifactId>poi-scratchpad</artifactId>
    
            <version>3.14</version>
    
            <exclusions>
    
                <exclusion>
    
                    <groupId>org.apache.poi</groupId>
    
                    <artifactId>poi</artifactId>
    
                </exclusion>
    
            </exclusions>
    
        </dependency>
    
        <!--poi word 处理部分-->
    
        <dependency>
    
            <groupId>org.apache.poi</groupId>
    
            <artifactId>poi-ooxml</artifactId>
    
            <version>3.14</version>
    
        </dependency>
    
        <dependency>
    
            <groupId>org.apache.poi</groupId>
    
            <artifactId>poi-ooxml-schemas</artifactId>
    
            <version>3.14</version>
    
        </dependency>
    
        <dependency>
    
            <groupId>fr.opensagres.xdocreport</groupId>
    
            <artifactId>xdocreport</artifactId>
    
            <version>1.0.4</version>
    
        </dependency>
    
        <dependency>
    
            <groupId>fr.opensagres.xdocreport</groupId>
    
            <artifactId>org.apache.poi.xwpf.converter.core</artifactId>
    
            <version>1.0.4</version>
    
        </dependency>
    
        <dependency>
    
            <groupId>fr.opensagres.xdocreport</groupId>
    
            <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    
            <version>1.0.4</version>
    
        </dependency>
    
        <dependency>
    
            <groupId>commons-fileupload</groupId>
    
            <artifactId>commons-fileupload</artifactId>
    
            <version>1.3.2</version>
    
        </dependency>
    
        <!-- html2doc 这里采用 docx4j-->
    
        <dependency>
    
            <groupId>org.docx4j</groupId>
    
            <artifactId>docx4j</artifactId>
    
            <version>3.3.6</version>
    
        </dependency>
    
        <!--jsoup-->
    
        <dependency>
    
            <groupId>org.jsoup</groupId>
    
            <artifactId>jsoup</artifactId>
    
            <version>1.10.2</version>
    
        </dependency>
    
    • 工具类


      package com.example.word.common;
      
      import org.apache.commons.io.FileUtils;
      import org.apache.commons.lang3.StringUtils;
      import org.apache.poi.hwpf.HWPFDocument;
      import org.apache.poi.hwpf.converter.WordToHtmlConverter;
      import org.apache.poi.poifs.filesystem.POIFSFileSystem;
      import org.docx4j.openpackaging.exceptions.Docx4JException;
      import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
      import org.docx4j.openpackaging.parts.WordprocessingML.AltChunkType;
      import org.jsoup.Jsoup;
      import org.jsoup.nodes.Document;
      import org.jsoup.nodes.Element;
      import org.jsoup.select.Elements;
      import org.slf4j.Logger;
      import org.slf4j.LoggerFactory;
      
      import javax.xml.parsers.DocumentBuilderFactory;
      import javax.xml.transform.OutputKeys;
      import javax.xml.transform.Transformer;
      import javax.xml.transform.TransformerFactory;
      import javax.xml.transform.dom.DOMSource;
      import javax.xml.transform.stream.StreamResult;
      import java.io.File;
      import java.io.FileOutputStream;
      import java.util.StringJoiner;
      
      /**
       * Static helpers for converting between legacy Word ({@code .doc}) files and HTML
       * rich text.
       *
       * <p>{@link #doc2Html(File)} renders a {@code .doc} file to an HTML string using
       * Apache POI's {@code WordToHtmlConverter} and then post-processes it with jsoup.
       * {@link #html2doc(File, String)} wraps an HTML fragment in a page skeleton and
       * stores it in a docx4j package as an HTML alt-chunk.
       */
      public class DocumentTransformUtil {

          private static final Logger logger = LoggerFactory.getLogger(DocumentTransformUtil.class);

          /** Inline image widths (in inches) above this value are clamped. */
          private static final float MAX_IMAGE_WIDTH_INCHES = 7.43f;

          /**
           * Pixel width an over-wide image is clamped to.
           * NOTE(review): presumably 7.43in at 96 dpi (~713px) - confirm against the editor width.
           */
          private static final int MAX_IMAGE_WIDTH_PX = 713;

          /**
           * Converts a {@code .doc} file to HTML text.
           *
           * <p>The HTML is serialized in memory. Earlier revisions wrote a temporary
           * {@code .html} file beside the input and deleted it afterwards, which could
           * overwrite and remove an unrelated file of the same name and leaked the output
           * stream.
           *
           * @param docFile the {@code .doc} file to convert
           * @return the HTML text; an empty string if the file is missing or the conversion
           *         fails before any HTML is produced
           */
          public static String doc2Html(File docFile) {
              if (docFile == null || !docFile.exists()) {
                  // The name already carries its extension, so do not append ".doc" again.
                  logger.error("{}不存在", docFile == null ? null : docFile.getAbsolutePath());
                  return "";
              }

              String result = "";
              // Close the POIFS file system so the handle on the input file is released.
              try (POIFSFileSystem fileSystem = new POIFSFileSystem(docFile)) {
                  HWPFDocument wordDocument = new HWPFDocument(fileSystem);
                  org.w3c.dom.Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
                  WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
                  // Store each picture and return the URL the generated <img> should point to.
                  wordToHtmlConverter.setPicturesManager((content, pictureType, name, width, height) -> {
                      // Arguments: picture bytes, picture type, picture name, width, height.
                      // Placeholder: a real implementation would upload the bytes and return
                      // the resulting URL, e.g. FileUploadUtil.upload(content, name, true).
                      String url = "https://gss2.bdstatic.com/-fo3dSag_xI4khGkpoWK1HF6hhy/baike/w%3D268%3Bg%3D0/sign=bcd0f6384290f60304b09b410129d426/91ef76c6a7efce1bab44b2c3a751f3deb48f654f.jpg";
                      return url;
                  });
                  wordToHtmlConverter.processDocument(wordDocument);
                  org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();

                  TransformerFactory tf = TransformerFactory.newInstance();
                  Transformer serializer = tf.newTransformer();
                  serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
                  serializer.setOutputProperty(OutputKeys.INDENT, "yes");
                  serializer.setOutputProperty(OutputKeys.METHOD, "html");

                  java.io.StringWriter htmlWriter = new java.io.StringWriter();
                  serializer.transform(new DOMSource(htmlDocument), new StreamResult(htmlWriter));
                  result = htmlWriter.toString();

                  // Post-process the raw HTML with jsoup. If this step fails, the raw HTML
                  // assigned above is still returned.
                  result = processUeditorStyle(result);
              } catch (Exception e) {
                  // Passing the exception as the last argument logs the full stack trace.
                  logger.error("doc--->html出错:{}", e.getMessage(), e);
              }
              return result;
          }

          /**
           * Adapts converter output for display in UEditor.
           *
           * <p>Sets the editor attributes on {@code <body>} and clamps over-wide images.
           * For each {@code img[src]} the first inline style declaration is inspected: if
           * it is a {@code width} in inches above {@link #MAX_IMAGE_WIDTH_INCHES}, it is
           * replaced by {@link #MAX_IMAGE_WIDTH_PX} pixels. If no usable inline width is
           * found, the {@code width} attribute is clamped instead.
           *
           * @param result the HTML text to process
           * @return the processed HTML document as text
           */
          private static String processUeditorStyle(String result) {
              Document doc = Jsoup.parse(result);
              // Target: <body class="view" contenteditable="true" spellcheck="false"
              //               style="overflow-y: hidden; height: 500px; cursor: text;">
              doc.body().attr("class", "view").attr("contenteditable", "true")
                      .attr("spellcheck", "false").attr("style", "overflow-y: hidden; height: 500px; cursor: text;");

              Elements imgs = doc.select("img[src]");
              for (Element img : imgs) {
                  String[] styles = img.attr("style").split(";");
                  Float inlineWidthInches = styles.length > 0 ? parseInlineWidthInches(styles[0]) : null;
                  if (inlineWidthInches != null) {
                      if (inlineWidthInches > MAX_IMAGE_WIDTH_INCHES) {
                          StringJoiner styleValue = new StringJoiner(";");
                          // The original wrote 173px here, which looks like a transposition
                          // of the 713px limit used for the width attribute below.
                          styleValue.add("width:" + MAX_IMAGE_WIDTH_PX + "px");
                          for (int i = 1; i < styles.length; i++) {
                              styleValue.add(styles[i]);
                          }
                          img.attr("style", styleValue.toString());
                      }
                      continue;
                  }
                  // No usable inline width: fall back to the width attribute (pixels).
                  Integer attributeWidthPx = parseWidthPx(img.attr("width"));
                  if (attributeWidthPx != null && attributeWidthPx > MAX_IMAGE_WIDTH_PX) {
                      img.attr("width", MAX_IMAGE_WIDTH_PX + "px");
                  }
              }
              // Serialize once, after the loop, so the body attributes are kept even when
              // the document contains no images.
              return doc.outerHtml();
          }

          /**
           * Parses a single CSS declaration such as {@code width:5.0in}.
           *
           * @param declaration one {@code property:value} declaration
           * @return the width in inches, or {@code null} if the declaration is not a
           *         {@code width} with a numeric value once {@code in} is stripped
           */
          private static Float parseInlineWidthInches(String declaration) {
              String[] parts = declaration.split(":");
              if (parts.length < 2 || !"width".equalsIgnoreCase(parts[0].trim())) {
                  return null;
              }
              String value = parts[1].replaceAll("in", "").trim();
              if (StringUtils.isBlank(value)) {
                  return null;
              }
              try {
                  return Float.parseFloat(value);
              } catch (NumberFormatException e) {
                  // Not an inch value (for example a pixel or percentage width).
                  return null;
              }
          }

          /**
           * Parses the value of an {@code img} {@code width} attribute such as {@code 800}
           * or {@code 800px}.
           *
           * @param widthAttribute the raw attribute value, possibly empty
           * @return the width in pixels, or {@code null} if it is blank or not an integer
           */
          private static Integer parseWidthPx(String widthAttribute) {
              String value = widthAttribute.replaceAll("px", "").trim();
              if (StringUtils.isBlank(value)) {
                  return null;
              }
              try {
                  return Integer.parseInt(value);
              } catch (NumberFormatException e) {
                  return null;
              }
          }

          /**
           * Normalizes table markup before the HTML is embedded in a Word package.
           *
           * <p>NOTE(review): {@code border-collapse} is a CSS property, so setting it as an
           * element attribute may have no effect - confirm whether it should go into
           * {@code style} instead. It is kept as an attribute here to preserve the output.
           *
           * @param htmlContent the complete HTML document text
           * @return the HTML with table spacing attributes applied
           */
          private static String processDocStyle(String htmlContent) {
              Document doc = Jsoup.parse(htmlContent);
              doc.select("table").attr("cellspacing", "0px")
                      .attr("cellpadding", "0px").attr("border-collapse", "collapse");
              return doc.outerHtml();
          }

          /**
           * Converts HTML text to a Word document.
           *
           * <p>The fragment is wrapped in a minimal page with a table-border style sheet
           * and added to a new docx4j package as an HTML alt-chunk. Failures are logged
           * and not rethrown.
           *
           * @param file the Word file to write
           * @param html the HTML fragment placed inside {@code <body>}
           */
          public static void html2doc(File file, String html)  {
              logger.info("开始html--->doc");
              try {
                  WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
                  String table = "<style type=\"text/css\"> table td{border:1px solid #000000} </style>";
                  String htmlContent = "<html><head><title></title>" + table + "</head><body>" + html + "</body></html>";
                  htmlContent = processDocStyle(htmlContent);
                  // Encode explicitly: the platform default charset can corrupt non-ASCII text.
                  wordMLPackage.getMainDocumentPart().addAltChunk(AltChunkType.Html,
                          htmlContent.getBytes(java.nio.charset.StandardCharsets.UTF_8));
                  wordMLPackage.save(file);
                  // Follow-up idea from the author: upload the result to a file server,
                  // delete the temporary file and return the URL instead of void.
                  // Log completion only when the document was actually written.
                  logger.info("转换完成html--->doc");
              } catch (Docx4JException e) {
                  logger.error("html转doc出错:{}", e.getMessage(), e);
              }
          }

          /**
           * Manual smoke test: converts a sample HTML file into a Word document.
           * The reverse direction can be tried with {@code doc2Html(new File(...))}.
           *
           * @param args unused
           * @throws Exception if the sample HTML file cannot be read
           */
          public static void main(String[] args) throws Exception {
              html2doc(new File("d:/优化规则.doc"), FileUtils.readFileToString(new File("c:/优化规则.html"), "UTF-8"));
          }
      }
       
      

    相关文章

      网友评论

        本文标题:doc html互转

        本文链接:https://www.haomeiwen.com/subject/qwckfctx.html