Feature Overview
Apache Tika is a content detection and analysis framework written in Java. It can detect many different file types and extract their metadata and structured text. Its main features are document type detection, content extraction, metadata extraction, and language detection. Supported document types include, but are not limited to, Excel, Word, PPT, TXT, text-like files (such as .java, .sql, .css), PDF, XML, HTML, GZIP, and ZIP.
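To get a quick feel for these features before building the utility class below, the Tika facade class offers one-line calls for type detection and text extraction, and LanguageIdentifier (in tika-core 1.x) gives a basic language guess. This is only a minimal sketch; the file name sample.docx is a placeholder.

import java.io.File;
import org.apache.tika.Tika;
import org.apache.tika.language.LanguageIdentifier;

public class TikaQuickStart {
    public static void main(String[] args) throws Exception {
        File file = new File("sample.docx"); // hypothetical sample file
        Tika tika = new Tika();
        // Document type detection: returns a MIME type string
        String mimeType = tika.detect(file);
        // Content extraction: parses the file and returns its plain text
        String text = tika.parseToString(file);
        // Language detection on the extracted text (e.g. "en", "zh")
        String language = new LanguageIdentifier(text).getLanguage();
        System.out.println(mimeType + " / " + language);
        System.out.println(text);
    }
}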
Extracting Text
Adding Maven Dependencies
Create a new Maven project, then add the Tika (and Apache POI) dependencies to pom.xml.
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.13</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.13</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.13</version>
</dependency>
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-parsers</artifactId>
    <version>1.13</version>
</dependency>
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-core</artifactId>
    <version>1.13</version>
</dependency>
Document Extraction Utility Class
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import com.google.common.io.Files;
/**
 * Extracts full-text content with Tika.
 *
 * @author 李庆海
 *
 */
public class TikaTool {

    /**
     * Parses a file with AutoDetectParser and returns its metadata fields
     * plus the extracted plain text (stored under the "content" key).
     */
    public static Map<String, Object> parseFile(File file) {
        Map<String, Object> meta = new HashMap<String, Object>();
        Parser parser = new AutoDetectParser();
        InputStream input = null;
        try {
            Metadata metadata = new Metadata();
            metadata.set(Metadata.CONTENT_ENCODING, "utf-8");
            metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
            input = new FileInputStream(file);
            // Raise the write limit to ~10M characters (the default is 100,000)
            ContentHandler handler = new BodyContentHandler(10 * 1024 * 1024);
            ParseContext context = new ParseContext();
            context.set(Parser.class, parser);
            parser.parse(input, handler, metadata, context);
            // Copy every metadata field into the result map
            for (String name : metadata.names()) {
                meta.put(name, metadata.get(name));
            }
            meta.put("content", handler.toString());
            return meta;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (input != null) {
                    input.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }
    /**
     * Parses a file and returns its content rendered as an HTML string,
     * preserving the document's basic structure.
     */
    public static String extractHtml(File file) throws IOException {
        byte[] bytes = Files.toByteArray(file);
        AutoDetectParser tikaParser = new AutoDetectParser();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        TransformerHandler handler;
        try {
            handler = factory.newTransformerHandler();
        } catch (TransformerConfigurationException ex) {
            throw new IOException(ex);
        }
        // Serialize the SAX events produced by Tika as indented UTF-8 HTML
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        handler.setResult(new StreamResult(out));
        // Decorator that avoids problems serializing the <title> element in HTML output
        ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);
        try {
            tikaParser.parse(new ByteArrayInputStream(bytes), handler1, new Metadata());
        } catch (SAXException | TikaException ex) {
            throw new IOException(ex);
        }
        return new String(out.toByteArray(), "UTF-8");
    }
}
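A short usage sketch for the class above; the file path sample.pdf and the printed metadata key are only illustrative (the exact keys returned depend on the parsed document).

import java.io.File;
import java.util.Map;

public class TikaToolDemo {
    public static void main(String[] args) throws Exception {
        File file = new File("sample.pdf"); // hypothetical input file
        // Plain-text extraction plus metadata
        Map<String, Object> result = TikaTool.parseFile(file);
        if (result != null) {
            System.out.println("Content-Type: " + result.get("Content-Type"));
            System.out.println(result.get("content"));
        }
        // The same document rendered as HTML
        String html = TikaTool.extractHtml(file);
        System.out.println(html);
    }
}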