docs/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java

83 lines
3.1 KiB
Java

package com.sismics.docs.core.util.format;
import com.google.common.io.Closer;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.util.mime.MimeType;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.awt.image.BufferedImage;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
/**
* PDF format handler.
*
* @author bgamard
*/
public class PdfFormatHandler implements FormatHandler {
    /**
     * Logger.
     */
    private static final Logger log = LoggerFactory.getLogger(PdfFormatHandler.class);

    /**
     * Resolution, in DPI, at which pages are rendered before being passed to the OCR routine.
     */
    private static final int OCR_DPI = 300;

    /**
     * Tells whether this handler supports the given MIME type.
     *
     * @param mimeType MIME type to test, may be null
     * @return true only if the MIME type equals {@link MimeType#APPLICATION_PDF}; false for null
     */
    @Override
    public boolean accept(String mimeType) {
        // Null guard: a missing MIME type means "not a PDF" instead of a NullPointerException.
        return mimeType != null && mimeType.equals(MimeType.APPLICATION_PDF);
    }

    /**
     * Generates a thumbnail by rendering the page at index 0 of the PDF.
     *
     * @param file PDF file
     * @return Rendered image of the page at index 0
     * @throws Exception If the file cannot be read, parsed or rendered
     */
    @Override
    public BufferedImage generateThumbnail(Path file) throws Exception {
        try (InputStream inputStream = Files.newInputStream(file);
             PDDocument pdfDocument = PDDocument.load(inputStream)) {
            PDFRenderer renderer = new PDFRenderer(pdfDocument);
            return renderer.renderImage(0);
        }
    }

    /**
     * Extracts the text content of the PDF. When the PDF yields only blank text and a
     * language is provided, every page is rendered and OCR-ized instead.
     * Errors are logged, never propagated.
     *
     * @param language Language passed to the OCR routine; OCR is skipped when null
     * @param file PDF file
     * @return The OCR result when OCR ran successfully; otherwise the text extracted from
     *         the PDF, which is null if the text extraction itself failed
     */
    @Override
    public String extractContent(String language, Path file) {
        String content = null;
        try (InputStream inputStream = Files.newInputStream(file);
             PDDocument pdfDocument = PDDocument.load(inputStream)) {
            content = new PDFTextStripper().getText(pdfDocument);
        } catch (Exception e) {
            log.error("Error while extracting text from the PDF", e);
        }

        // OCR is only attempted when extraction succeeded but produced nothing but whitespace.
        // A failed extraction (content == null) is returned as is.
        boolean blankText = content != null && content.trim().isEmpty();
        if (language == null || !blankText) {
            return content;
        }

        try {
            return ocrPages(language, file);
        } catch (Exception e) {
            log.error("Error while OCR-izing the PDF", e);
            // Fall back to the (blank) extracted text.
            return content;
        }
    }

    /**
     * Renders each page of the PDF in grayscale at {@link #OCR_DPI} and concatenates the
     * OCR output of all pages, each page's output being preceded by a single space.
     *
     * @param language Language passed to the OCR routine
     * @param file PDF file
     * @return Concatenated OCR output of all pages
     * @throws Exception If the file cannot be read, parsed, rendered or OCR-ized
     */
    private String ocrPages(String language, Path file) throws Exception {
        StringBuilder sb = new StringBuilder();
        try (InputStream inputStream = Files.newInputStream(file);
             PDDocument pdfDocument = PDDocument.load(inputStream)) {
            PDFRenderer renderer = new PDFRenderer(pdfDocument);
            int pageCount = pdfDocument.getNumberOfPages();
            for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
                log.info("OCR page {}/{} of PDF file containing only images", pageIndex + 1, pageCount);
                sb.append(" ");
                sb.append(FileUtil.ocrFile(language, renderer.renderImageWithDPI(pageIndex, OCR_DPI, ImageType.GRAY)));
            }
        }
        return sb.toString();
    }

    /**
     * Appends the pages of the given PDF file to an existing document.
     * The {@code fitImageToPage} and {@code margin} parameters are not used by this handler.
     *
     * @param file PDF file to append
     * @param doc Destination document
     * @param fitImageToPage Not used by this handler
     * @param margin Not used by this handler
     * @param memUsageSettings Memory usage settings used to load the PDF file
     * @param closer Closer with which the loaded document is registered
     * @throws Exception If the file cannot be loaded or merged
     */
    @Override
    public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
        PDDocument mergeDoc = PDDocument.load(file.toFile(), memUsageSettings);
        // Registered immediately after loading, before the merge that may throw.
        closer.register(mergeDoc);
        PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
        pdfMergerUtility.appendDocument(doc, mergeDoc);
    }
}