mirror of https://github.com/sismics/docs.git
83 lines
3.1 KiB
Java
83 lines
3.1 KiB
Java
package com.sismics.docs.core.util.format;
|
|
|
|
import com.google.common.io.Closer;
|
|
import com.sismics.docs.core.util.FileUtil;
|
|
import com.sismics.util.mime.MimeType;
|
|
import org.apache.pdfbox.io.MemoryUsageSetting;
|
|
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.apache.pdfbox.rendering.ImageType;
|
|
import org.apache.pdfbox.rendering.PDFRenderer;
|
|
import org.apache.pdfbox.text.PDFTextStripper;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.awt.image.BufferedImage;
|
|
import java.io.InputStream;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
|
|
/**
|
|
* PDF format handler.
|
|
*
|
|
* @author bgamard
|
|
*/
|
|
public class PdfFormatHandler implements FormatHandler {
|
|
/**
|
|
* Logger.
|
|
*/
|
|
private static final Logger log = LoggerFactory.getLogger(PdfFormatHandler.class);
|
|
|
|
@Override
|
|
public boolean accept(String mimeType) {
|
|
return mimeType.equals(MimeType.APPLICATION_PDF);
|
|
}
|
|
|
|
@Override
|
|
public BufferedImage generateThumbnail(Path file) throws Exception {
|
|
try (InputStream inputStream = Files.newInputStream(file);
|
|
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
|
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
|
return renderer.renderImage(0);
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public String extractContent(String language, Path file) {
|
|
String content = null;
|
|
try (InputStream inputStream = Files.newInputStream(file);
|
|
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
|
content = new PDFTextStripper().getText(pdfDocument);
|
|
} catch (Exception e) {
|
|
log.error("Error while extracting text from the PDF", e);
|
|
}
|
|
|
|
// No text content, try to OCR it
|
|
if (language != null && content != null && content.trim().isEmpty()) {
|
|
StringBuilder sb = new StringBuilder();
|
|
try (InputStream inputStream = Files.newInputStream(file);
|
|
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
|
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
|
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
|
|
log.info("OCR page " + (pageIndex + 1) + "/" + pdfDocument.getNumberOfPages() + " of PDF file containing only images");
|
|
sb.append(" ");
|
|
sb.append(FileUtil.ocrFile(language, renderer.renderImageWithDPI(pageIndex, 300, ImageType.GRAY)));
|
|
}
|
|
return sb.toString();
|
|
} catch (Exception e) {
|
|
log.error("Error while OCR-izing the PDF", e);
|
|
}
|
|
}
|
|
|
|
return content;
|
|
}
|
|
|
|
@Override
|
|
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
|
|
PDDocument mergeDoc = PDDocument.load(file.toFile(), memUsageSettings);
|
|
closer.register(mergeDoc);
|
|
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
|
|
pdfMergerUtility.appendDocument(doc, mergeDoc);
|
|
}
|
|
}
|