美文网首页Java学习笔记程序员
使用Apache-Tika进行文本抽取

使用Apache-Tika进行文本抽取

作者: 固安李庆海 | 来源:发表于2018-09-28 09:14 被阅读3次

    功能简介

    Apache Tika是一个用Java编写的内容检测与分析框架,能够检测多种不同类型的文件,并提取其中的元数据和结构化文本。主要功能包括文档类型检测、内容提取、元数据提取和语言检测。支持的文档类型包括但不限于Excel、Word、PPT、TXT、类文本文件(如.java、.sql、.css等)、PDF、XML、HTML、GZIP、ZIP。

    抽取文本

    添加Maven依赖

    新建一个Maven工程,然后在pom.xml中增加Tika以及POI的相关依赖。

    <!--
      Maven dependencies for the text-extraction example: three Apache POI
      artifacts (all pinned to 3.13) followed by two Apache Tika artifacts
      (both pinned to 1.13). Paste these inside the <dependencies> element
      of pom.xml.
      NOTE(review): tika-parsers presumably pulls in tika-core and its own POI
      version transitively; confirm that pinning POI 3.13 here matches the POI
      version Tika 1.13 was built against, otherwise drop the explicit POI
      entries.
    -->
    <!-- Apache POI core artifact -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>3.13</version>
    </dependency>
    <!-- Apache POI OOXML artifact -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>3.13</version>
    </dependency>
    <!-- Apache POI OOXML schema artifact -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml-schemas</artifactId>
        <version>3.13</version>
    </dependency>
    <!-- Apache Tika parser implementations -->
    <dependency>
        <groupId>org.apache.tika</groupId>
        <artifactId>tika-parsers</artifactId>
        <version>1.13</version>
    </dependency>
    <!-- Apache Tika core API (Parser, Metadata, ParseContext, ...) -->
    <dependency>
        <groupId>org.apache.tika</groupId>
        <artifactId>tika-core</artifactId>
        <version>1.13</version>
    </dependency>
    

    文档抽取工具类

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.HashMap;
    import java.util.Map;
    import javax.xml.transform.OutputKeys;
    import javax.xml.transform.TransformerConfigurationException;
    import javax.xml.transform.sax.SAXTransformerFactory;
    import javax.xml.transform.sax.TransformerHandler;
    import javax.xml.transform.stream.StreamResult;
    import org.apache.tika.exception.TikaException;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.Parser;
    import org.apache.tika.sax.BodyContentHandler;
    import org.apache.tika.sax.ExpandedTitleContentHandler;
    import org.xml.sax.ContentHandler;
    import org.xml.sax.SAXException;
    import com.google.common.io.Files;
    
    /**
     * Helper methods that use Apache Tika to pull plain text, metadata and an
     * HTML rendition out of a file whose type is auto-detected.
     *
     * @author Li Qinghai
     */
    public class TikaTool {

        /** Write limit handed to {@link BodyContentHandler}: 10 * 1024 * 1024 characters. */
        private static final int MAX_CONTENT_LENGTH = 10 * 1024 * 1024;

        /** Map key under which {@link #parseFile(File)} stores the extracted body text. */
        private static final String CONTENT_KEY = "content";

        /**
         * Parses a file and returns its metadata together with the extracted text.
         *
         * <p>Every metadata name reported by Tika becomes a key of the returned map;
         * the value is what {@code Metadata.get(name)} returns for that name, so for
         * a multi-valued entry only that single value is kept. The body text is
         * stored under the key {@code "content"}.
         *
         * <p>This method never throws: any failure (missing or unreadable file,
         * {@code null} argument, parse error, write limit exceeded, failure while
         * closing the stream) is printed via {@code printStackTrace()} and reported
         * to the caller as {@code null}.
         *
         * @param file the file to parse
         * @return a map of metadata names to values plus the {@code "content"} entry,
         *         or {@code null} if anything went wrong
         */
        public static Map<String,Object> parseFile(File file) {
            Map<String,Object> meta = new HashMap<>();
            Parser parser = new AutoDetectParser();
            // try-with-resources closes the stream on every exit path.
            try (InputStream input = new FileInputStream(file)) {
                Metadata metadata = new Metadata();
                metadata.set(Metadata.CONTENT_ENCODING, "utf-8");
                // The file name is passed as a hint for type detection.
                metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
                ContentHandler handler = new BodyContentHandler(MAX_CONTENT_LENGTH);

                ParseContext context = new ParseContext();
                // Register the parser in the context so it is available for
                // documents embedded in the one being parsed.
                context.set(Parser.class, parser);
                parser.parse(input, handler, metadata, context);

                for (String name : metadata.names()) {
                    meta.put(name, metadata.get(name));
                }
                meta.put(CONTENT_KEY, handler.toString());
                return meta;
            } catch (Exception e) {
                // Deliberately broad: callers rely on a null result, not an exception.
                e.printStackTrace();
                return null;
            }
        }

        /**
         * Converts a file to an HTML document using Tika's auto-detecting parser.
         *
         * <p>The parser's SAX events are serialized by a JAXP transformer configured
         * for indented HTML output in UTF-8. The file is streamed to the parser, and
         * its name is supplied as a detection hint in the same way as in
         * {@link #parseFile(File)}, so both methods identify a given file
         * consistently.
         *
         * @param file the file to convert
         * @return the HTML rendition of the file, decoded as UTF-8
         * @throws IOException if the file cannot be read, or wrapping the underlying
         *         {@link TransformerConfigurationException}, {@link SAXException} or
         *         {@link TikaException} when the transformer cannot be created or
         *         parsing fails
         */
        public static String extractHtml(File file) throws IOException {
            AutoDetectParser tikaParser = new AutoDetectParser();
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
            TransformerHandler handler;
            try {
                handler = factory.newTransformerHandler();
            } catch (TransformerConfigurationException ex) {
                throw new IOException(ex);
            }
            handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
            handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
            handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            handler.setResult(new StreamResult(out));

            Metadata metadata = new Metadata();
            // Same file-name hint as parseFile, so type detection does not depend
            // on the bytes alone.
            metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());

            try (InputStream input = new FileInputStream(file)) {
                tikaParser.parse(input, new ExpandedTitleContentHandler(handler), metadata);
            } catch (SAXException | TikaException ex) {
                throw new IOException(ex);
            }
            // Decode with the same charset the transformer was told to emit.
            return out.toString("UTF-8");
        }
    }
    

    相关文章

      网友评论

        本文标题:使用Apache-Tika进行文本抽取

        本文链接:https://www.haomeiwen.com/subject/lmsuoftx.html