반응형
tika의 목적
- 다양한 형식의 문서(PPT, XML, PDF 등)에서 하나의 프로그램으로 메타데이터와 텍스트를 추출하기 위한 목적
- Apache Tika (콘텐츠 분석 툴킷, content analysis toolkit)
tika의 장점
- 무료
- 범용성
tika의 단점
- 아직 단점은 잘 모르겠음
구현
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.27</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.27</version>
</dependency>
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
/**
 * Small demo utility that extracts the text content of a document with Apache Tika.
 *
 * <p>Two variants are provided: {@link #readPdfByTika1(String)} parses with an explicit
 * {@link ParseContext} (PDF inline-image extraction enabled, OCR config registered), while
 * {@link #readPdfByTika2(String)} uses the plain three-argument {@code parse} call.
 */
public class PdfReaderUtil {

    /**
     * Runs both extraction variants on the file given as the first command-line argument
     * and prints the extracted text to standard output.
     *
     * @param args {@code args[0]} is the path of the document to read
     */
    public static void main(String[] args) {
        System.out.println("start!");
        if (args == null || args.length < 1) {
            // The path must come from the caller; there is no sensible default file.
            System.err.println("usage: java PdfReaderUtil <path-to-document>");
        } else {
            String filePath = args[0];
            try {
                // Both readers are static, so no instance is needed.
                String text3 = readPdfByTika1(filePath);
                String text4 = readPdfByTika2(filePath);
                System.out.println("readPdfByTika1:" + text3);
                System.out.println("readPdfByTika2:" + text4);
            } catch (Exception e) {
                // Top-level boundary of the demo: report the failure and fall through to "end !".
                System.out.println("exception : " + e);
            }
        }
        System.out.println("end !");
    }

    /**
     * Extracts the body text of the given file using an {@link AutoDetectParser} and a
     * {@link ParseContext} configured with {@link PDFParserConfig} (inline image extraction
     * enabled) and a default {@link TesseractOCRConfig}.
     *
     * @param filePath path of the document to parse
     * @return the text collected by the body content handler
     * @throws IOException if the file cannot be opened or read
     * @throws TikaException if Tika fails to parse the document
     * @throws SAXException if the content handler reports an error
     */
    public static String readPdfByTika1(String filePath) throws IOException, TikaException, SAXException {
        Parser parser = new AutoDetectParser();
        // Integer.MAX_VALUE is passed as the handler's write limit.
        BodyContentHandler bodyContentHandler = new BodyContentHandler(Integer.MAX_VALUE);

        PDFParserConfig pdfParserConfig = new PDFParserConfig();
        pdfParserConfig.setExtractInlineImages(true);

        ParseContext parseContext = new ParseContext();
        // NOTE(review): whether OCR actually runs presumably depends on Tesseract being
        // installed on the host -- confirm in the target environment.
        parseContext.set(TesseractOCRConfig.class, new TesseractOCRConfig());
        parseContext.set(PDFParserConfig.class, pdfParserConfig);
        // Registers the parser itself in the context (presumably so embedded content is
        // handled by the same parser -- confirm against the Tika documentation).
        parseContext.set(Parser.class, parser);

        Metadata metadata = new Metadata();
        // try-with-resources: the stream is closed even when parsing throws.
        try (InputStream inputStream = new FileInputStream(filePath)) {
            parser.parse(inputStream, bodyContentHandler, metadata, parseContext);
        }
        return bodyContentHandler.toString();
    }

    /**
     * Extracts the body text of the given file using an {@link AutoDetectParser} with its
     * default settings.
     *
     * <p>NOTE(review): the no-argument {@link BodyContentHandler} is documented by Tika as
     * having a bounded internal buffer, so large documents may fail here while
     * {@link #readPdfByTika1(String)} succeeds -- confirm against the Tika version in use.
     *
     * @param filePath path of the document to parse
     * @return the text collected by the body content handler
     * @throws Exception if the file cannot be read or the document cannot be parsed
     */
    public static String readPdfByTika2(String filePath) throws Exception {
        File file = new File(filePath);
        BodyContentHandler bodyContentHandler = new BodyContentHandler();
        AutoDetectParser autoDetectParser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        try (InputStream inputStream = new FileInputStream(file)) {
            autoDetectParser.parse(inputStream, bodyContentHandler, metadata);
            return bodyContentHandler.toString();
        }
    }
}
반응형