功能簡介
Apache Tika是一個用Java編寫的內容檢測和分析框架,能夠檢測很多不同類型的文件,並提取文件的元數據和結構化文本。主要功能包括文檔類型檢測、內容提取、元數據提取、語言檢測。支持的文檔類型包括但不限於Excel、Word、PPT、TXT、類文本文件(如.java、.sql、.css等)、PDF、XML、HTML、GZIP、ZIP。
抽取文本
添加Maven依賴
新建一個Maven工程,然後在pom.xml中增加Tika以及POI相關依賴。
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.13</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.13</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.13</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.13</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.13</version>
</dependency>
文檔抽取工具類
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import com.google.common.io.Files;
/**
 * Extracts full-text content and metadata from documents using Apache Tika.
 *
 * <p>All methods are static; a new {@link AutoDetectParser} is created per call.
 *
 * @author 李慶海
 */
public class TikaTool {

    private static final Logger LOG = Logger.getLogger(TikaTool.class.getName());

    /** Size passed to {@link BodyContentHandler} as its write limit in {@link #parseFile(File)}. */
    private static final int MAX_BODY_CHARS = 10 * 1024 * 1024;

    /**
     * Parses a file and returns its metadata together with its extracted body text.
     *
     * <p>Each metadata name reported by Tika becomes a key whose value is
     * {@code metadata.get(name)}. The extracted body text is stored under the key
     * {@code "content"}.
     *
     * <p>NOTE(review): per the Tika documentation, a {@link BodyContentHandler} created with a
     * write limit throws once the limit is exceeded; in that case this method takes the failure
     * path and returns {@code null} rather than a truncated result — confirm this is intended.
     *
     * @param file the document to parse
     * @return a map of metadata entries plus the {@code "content"} entry, or {@code null} if any
     *         exception occurs while opening, parsing or closing the file (the failure is logged)
     */
    public static Map<String, Object> parseFile(File file) {
        Parser parser = new AutoDetectParser();
        // try-with-resources closes the stream on every path, replacing the manual finally block.
        try (InputStream input = new FileInputStream(file)) {
            Metadata metadata = new Metadata();
            metadata.set(Metadata.CONTENT_ENCODING, "utf-8");
            // The file name is handed to Tika alongside the stream.
            metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());

            ContentHandler handler = new BodyContentHandler(MAX_BODY_CHARS);
            ParseContext context = new ParseContext();
            // Register the parser in the context (presumably for embedded documents — confirm).
            context.set(Parser.class, parser);
            parser.parse(input, handler, metadata, context);

            Map<String, Object> meta = new HashMap<>();
            for (String name : metadata.names()) {
                meta.put(name, metadata.get(name));
            }
            meta.put("content", handler.toString());
            return meta;
        } catch (Exception e) {
            // Best-effort contract kept from the original: report the failure and return null.
            LOG.log(Level.WARNING, "Tika failed to parse " + file, e);
            return null;
        }
    }

    /**
     * Converts a file to an HTML string by serializing Tika's SAX output.
     *
     * <p>The file is streamed to the parser instead of being loaded into a byte array first, and
     * its name is set in the metadata, as {@link #parseFile(File)} does.
     *
     * @param file the document to convert
     * @return the serialized HTML, decoded as UTF-8
     * @throws IOException if the file cannot be read, the XML transformer cannot be configured,
     *         or Tika fails to parse the document (the original exception is kept as the cause)
     */
    public static String extractHtml(File file) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        TransformerHandler serializer = newHtmlSerializer(out);

        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());

        AutoDetectParser tikaParser = new AutoDetectParser();
        try (InputStream input = new FileInputStream(file)) {
            tikaParser.parse(input, new ExpandedTitleContentHandler(serializer), metadata);
        } catch (SAXException | TikaException ex) {
            throw new IOException(ex);
        }
        return new String(out.toByteArray(), StandardCharsets.UTF_8);
    }

    /** Builds a SAX handler that serializes incoming events as indented UTF-8 HTML into {@code out}. */
    private static TransformerHandler newHtmlSerializer(ByteArrayOutputStream out) throws IOException {
        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        TransformerHandler handler;
        try {
            handler = factory.newTransformerHandler();
        } catch (TransformerConfigurationException ex) {
            throw new IOException(ex);
        }
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        handler.setResult(new StreamResult(out));
        return handler;
    }
}