美文网首页
记录:PDF关键字寻找

记录:PDF关键字寻找

作者: 小鸡在路上 | 来源:发表于2019-06-05 16:46 被阅读0次

    下面是一个可以直接使用的 demo:基于 Apache PDFBox,在 PDF 中查找关键字并返回其所在页码和坐标位置。

    package com.sign;
    
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    import java.io.Writer;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.text.PDFTextStripper;
    import org.apache.pdfbox.text.TextPosition;
    
    /**
     * Finds every occurrence of a keyword in a PDF and reports a position for each hit.
     *
     * <p>The class extends {@link PDFTextStripper} and overrides
     * {@link #writeString(String, List)} so that, instead of emitting text, it compares the
     * glyphs handed to it against the keyword and records where matches start.
     *
     * <p>Each hit is a {@code float[3]}:
     * <ul>
     *   <li>{@code [0]} - X of the first matched glyph plus half of
     *       (keyword length x width of that glyph);</li>
     *   <li>{@code [1]} - Y of the first matched glyph minus its height;</li>
     *   <li>{@code [2]} - 1-based page number.</li>
     * </ul>
     *
     * <p>NOTE(review): a keyword is only found when all of its characters arrive in a single
     * {@code writeString} call and each character maps to exactly one {@link TextPosition};
     * keywords split across calls or containing ligatures are not matched.
     *
     * <p>Instances keep per-page scratch state and are not designed for concurrent use.
     */
    public class BoxKeyPosition extends PDFTextStripper {

        /** Characters of the keyword being searched for. */
        private char[] key;
        /** Raw bytes of the PDF document to scan. */
        private byte[] src;
        /** Hits collected by {@link #writeString(String, List)} for the page being processed. */
        private List<float[]> pagelist = new ArrayList<float[]>();

        /**
         * Creates a finder for the given keyword and PDF content.
         *
         * @param keyWords keyword to look for; must not be {@code null}
         * @param src      bytes of the PDF document
         * @throws IOException if the underlying {@link PDFTextStripper} cannot be initialised
         */
        public BoxKeyPosition(String keyWords, byte[] src) throws IOException {
            super();
            super.setSortByPosition(true);
            this.src = src;
            this.key = keyWords.toCharArray();
        }

        /** @return the keyword characters currently searched for */
        public char[] getKey() {
            return key;
        }

        /** @param key the keyword characters to search for */
        public void setKey(char[] key) {
            this.key = key;
        }

        /** @return the bytes of the PDF document to scan */
        public byte[] getSrc() {
            return src;
        }

        /** @param src the bytes of the PDF document to scan */
        public void setSrc(byte[] src) {
            this.src = src;
        }

        /**
         * Scans the document page by page and returns the position of every keyword hit.
         *
         * <p>A fresh list is built on every call, so repeated calls do not accumulate
         * results from earlier scans.
         *
         * @return one {@code float[3]} per hit (see the class description), in page order;
         *         empty when the keyword does not occur
         * @throws IOException if the document cannot be loaded or its text cannot be extracted
         */
        public List<float[]> getPosition() throws IOException {
            List<float[]> result = new ArrayList<float[]>();
            PDDocument doc = null;
            try {
                doc = PDDocument.load(src);
                int pages = doc.getNumberOfPages();
                super.setSortByPosition(true);

                for (int page = 1; page <= pages; page++) {
                    pagelist.clear();
                    // Restrict extraction to one page so the hits can be tagged with it.
                    super.setStartPage(page);
                    super.setEndPage(page);
                    // writeText needs a Writer; writeString below never writes to it.
                    Writer sink = new OutputStreamWriter(new ByteArrayOutputStream());
                    super.writeText(doc, sink);
                    for (float[] hit : pagelist) {
                        hit[2] = page;
                    }
                    result.addAll(pagelist);
                }
                return result;
            } finally {
                if (doc != null) {
                    doc.close();
                }
            }
        }

        /**
         * Records the position of every keyword occurrence inside the supplied glyph run.
         * No text is written to the stripper's output.
         *
         * @param string        the text of the run (unused)
         * @param textPositions the glyphs making up the run
         */
        @Override
        protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
            if (key == null || key.length == 0) {
                // Nothing to search for; avoids indexing key[0] on an empty keyword.
                return;
            }
            // Last index at which a complete keyword still fits into the run.
            int lastStart = textPositions.size() - key.length;
            for (int i = 0; i <= lastStart; i++) {
                if (!matchesAt(textPositions, i)) {
                    continue;
                }
                TextPosition first = textPositions.get(i);
                float[] idx = new float[3];
                idx[0] = first.getX() + key.length * first.getWidth() / 2;
                idx[1] = first.getY() - first.getHeight();
                // idx[2] (page number) is filled in by getPosition().
                pagelist.add(idx);
            }
        }

        /** Returns whether the keyword matches the glyphs starting at {@code start}; the caller guarantees it fits. */
        private boolean matchesAt(List<TextPosition> textPositions, int start) {
            for (int j = 0; j < key.length; j++) {
                String unicode = textPositions.get(start + j).getUnicode();
                if (!String.valueOf(key[j]).equals(unicode)) {
                    return false;
                }
            }
            return true;
        }
    }
    
    package com.sign;
    
    
    
    import java.io.*;
    import java.util.Arrays;
    import java.util.List;
    
    /**
     * Manual smoke test for {@link BoxKeyPosition}: reads a PDF from a hard-coded local path,
     * searches it for the keyword {@code "borrower"} and prints each hit to standard output.
     *
     * <p>Created 2018/12/15.
     *
     * @author MG01857
     * @version 1.0
     */
    public class SignPostionTest {

        /**
         * Reads the stream to its end and returns everything read.
         *
         * @param in stream to drain; it is not closed by this method
         * @return all bytes read from {@code in}
         * @throws IOException if reading fails
         */
        private static byte[] toByteArray(InputStream in) throws IOException {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024 * 4];
            int n;
            while ((n = in.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
            return out.toByteArray();
        }

        /**
         * Loads the sample PDF, looks up the keyword and prints every position found.
         *
         * @param args unused
         * @throws Exception if the file cannot be read or the PDF cannot be processed
         */
        public static void main(String[] args) throws Exception {
            String filePath = "C:\\Users\\MG01857\\Desktop\\pdf生成浏览\\世联信贷征信查询授权书_word转PDF_黄智炜.pdf";
            byte[] data;
            InputStream in = new FileInputStream(filePath);
            try {
                data = toByteArray(in);
            } finally {
                // Close even when reading fails so the file handle is not leaked.
                in.close();
            }
            BoxKeyPosition boxKeyPosition = new BoxKeyPosition("borrower", data);
            List<float[]> position = boxKeyPosition.getPosition();
            for (float[] f : position) {
                // Arrays.toString prints the values; float[].toString() prints only an identity string.
                System.out.println(Arrays.toString(f));
            }
        }
    }
    
    

    相关文章

      网友评论

          本文标题:记录:PDF关键字寻找

          本文链接:https://www.haomeiwen.com/subject/jwjsxctx.html