Closes #182: format handling refactoring

This commit is contained in:
Benjamin Gamard 2018-03-18 16:16:32 +01:00
parent 996585d7ac
commit 7ea8d0c0f7
16 changed files with 592 additions and 382 deletions

View File

@ -87,4 +87,9 @@ public class Constants {
* Email template for route step validate.
*/
public static final String EMAIL_TEMPLATE_ROUTE_STEP_VALIDATE = "route_step_validate";
/**
* Conversion factor that margins are multiplied by before being used as PDF page coordinates.
* The expression evaluates to 72 / 25.4 (about 2.835).
* NOTE(review): despite the name, this looks like PDF points (1/72 inch) per millimetre, not mm per inch - confirm.
* NOTE(review): the field is not final, so it can be reassigned at runtime - confirm whether that is intended.
*/
public static float MM_PER_INCH = 1 / (10 * 2.54f) * 72f;
}

View File

@ -7,15 +7,22 @@ import com.sismics.docs.core.dao.lucene.LuceneDao;
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.model.jpa.User;
import com.sismics.docs.core.util.DirectoryUtil;
import com.sismics.docs.core.util.EncryptionUtil;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.docs.core.util.PdfUtil;
import com.sismics.docs.core.util.TransactionUtil;
import com.sismics.util.mime.MimeTypeUtil;
import com.sismics.docs.core.util.format.FormatHandler;
import com.sismics.docs.core.util.format.FormatHandlerUtil;
import com.sismics.util.ImageUtil;
import com.sismics.util.Scalr;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.crypto.Cipher;
import javax.crypto.CipherOutputStream;
import java.awt.image.BufferedImage;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.MessageFormat;
import java.util.concurrent.atomic.AtomicReference;
@ -42,16 +49,12 @@ public class FileCreatedAsyncListener {
log.info("File created event: " + event.toString());
}
// Guess the mime type a second time, for open document format (first detected as simple ZIP file)
// Find a format handler
final File file = event.getFile();
file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, event.getUnencryptedFile()));
// Convert to PDF if necessary (for thumbnail and text extraction)
Path unencryptedPdfFile = null;
try {
unencryptedPdfFile = PdfUtil.convertToPdf(file, event.getUnencryptedFile());
} catch (Exception e) {
log.error("Unable to convert to PDF", e);
FormatHandler formatHandler = FormatHandlerUtil.find(file.getMimeType());
if (formatHandler == null) {
log.error("Format unhandled: " + file.getMimeType());
return;
}
// Get the user from the database
@ -71,15 +74,37 @@ public class FileCreatedAsyncListener {
// Generate file variations
try {
Cipher cipher = EncryptionUtil.getEncryptionCipher(user.get().getPrivateKey());
FileUtil.saveVariations(file, event.getUnencryptedFile(), unencryptedPdfFile, cipher);
BufferedImage image = formatHandler.generateThumbnail(event.getUnencryptedFile());
if (image != null) {
// Generate thumbnails from image
BufferedImage web = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 1280);
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 256);
image.flush();
// Write "web" encrypted image
Path outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_web");
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
ImageUtil.writeJpeg(web, outputStream);
}
// Write "thumb" encrypted image
outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_thumb");
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
ImageUtil.writeJpeg(thumbnail, outputStream);
}
}
} catch (Exception e) {
log.error("Unable to generate thumbnails", e);
}
// Extract text content from the file
long startTime = System.currentTimeMillis();
final String content = FileUtil.extractContent(event.getLanguage(), file,
event.getUnencryptedFile(), unencryptedPdfFile);
final AtomicReference<String> content = new AtomicReference<>();
try {
content.set(formatHandler.extractContent(event.getLanguage(), event.getUnencryptedFile()));
} catch (Exception e) {
log.error("Error extracting content from: " + event.getFile());
}
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
// Save the file to database
@ -91,8 +116,8 @@ public class FileCreatedAsyncListener {
// The file has been deleted since the text extraction started, ignore the result
return;
}
file.setContent(content);
file.setContent(content.get());
fileDao.update(file);
}
});

View File

@ -10,9 +10,7 @@ import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.model.jpa.User;
import com.sismics.tess4j.Tesseract;
import com.sismics.util.ImageDeskew;
import com.sismics.util.ImageUtil;
import com.sismics.util.Scalr;
import com.sismics.util.VideoUtil;
import com.sismics.util.context.ThreadLocalContext;
import com.sismics.util.mime.MimeTypeUtil;
import org.apache.commons.lang.StringUtils;
@ -21,12 +19,9 @@ import org.slf4j.LoggerFactory;
import javax.crypto.Cipher;
import javax.crypto.CipherInputStream;
import javax.crypto.CipherOutputStream;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
@ -49,40 +44,14 @@ public class FileUtil {
*/
private static Set<String> processingFileSet = Collections.synchronizedSet(new HashSet<String>());
/**
* Extract content from a file.
*
* @param language Language to extract
* @param file File to extract
* @param unencryptedFile Unencrypted file
* @param unencryptedPdfFile Unencrypted PDF file
* @return Content extract
*/
public static String extractContent(String language, File file, Path unencryptedFile, Path unencryptedPdfFile) {
String content = null;
if (language == null) {
return null;
}
if (ImageUtil.isImage(file.getMimeType())) {
content = ocrFile(unencryptedFile, language);
} else if (VideoUtil.isVideo(file.getMimeType())) {
content = VideoUtil.getMetadata(unencryptedFile);
} else if (unencryptedPdfFile != null) {
content = PdfUtil.extractPdf(unencryptedPdfFile, language);
}
return content;
}
/**
* Optical character recognition on an image.
*
* @param image Buffered image
* @param language Language to OCR
* @param image Buffered image
* @return Content extracted
*/
public static String ocrFile(BufferedImage image, String language) {
public static String ocrFile(String language, BufferedImage image) {
// Upscale, grayscale and deskew the image
String content = null;
BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
@ -105,66 +74,6 @@ public class FileUtil {
return content;
}
/**
* Optical character recognition on a file.
*
* @param unecryptedFile Unencrypted file
* @param language Language to OCR
* @return Content extracted
*/
private static String ocrFile(Path unecryptedFile, String language) {
BufferedImage image;
try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
image = ImageIO.read(inputStream);
} catch (IOException e) {
log.error("Error reading the image", e);
return null;
}
return ocrFile(image, language);
}
/**
* Generate file variations.
*
* @param file File from database
* @param unencryptedFile Unencrypted file
* @param unencryptedPdfFile Unencrypted PDF file
* @param cipher Cipher to use for encryption
*/
public static void saveVariations(File file, Path unencryptedFile, Path unencryptedPdfFile, Cipher cipher) throws Exception {
BufferedImage image = null;
if (ImageUtil.isImage(file.getMimeType())) {
try (InputStream inputStream = Files.newInputStream(unencryptedFile)) {
image = ImageIO.read(inputStream);
}
} else if (VideoUtil.isVideo(file.getMimeType())) {
image = VideoUtil.getThumbnail(unencryptedFile);
} else if (unencryptedPdfFile != null) {
// Generate preview from the first page of the PDF
image = PdfUtil.renderFirstPage(unencryptedPdfFile);
}
if (image != null) {
// Generate thumbnails from image
BufferedImage web = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 1280);
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 256);
image.flush();
// Write "web" encrypted image
Path outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_web");
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
ImageUtil.writeJpeg(web, outputStream);
}
// Write "thumb" encrypted image
outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_thumb");
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
ImageUtil.writeJpeg(thumbnail, outputStream);
}
}
}
/**
* Remove a file from the storage filesystem.
*

View File

@ -1,39 +1,23 @@
package com.sismics.docs.core.util;
import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.io.ByteStreams;
import com.google.common.io.Closer;
import com.google.common.io.Resources;
import com.lowagie.text.*;
import com.lowagie.text.pdf.PdfWriter;
import com.lowagie.text.FontFactory;
import com.sismics.docs.core.constant.Constants;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.format.FormatHandler;
import com.sismics.docs.core.util.format.FormatHandlerUtil;
import com.sismics.docs.core.util.pdf.PdfPage;
import com.sismics.util.ImageUtil;
import com.sismics.util.context.ThreadLocalContext;
import com.sismics.util.mime.MimeType;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.DocsPDType1Font;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.odftoolkit.odfdom.converter.pdf.PdfConverter;
import org.odftoolkit.odfdom.converter.pdf.PdfOptions;
import org.odftoolkit.odfdom.doc.OdfTextDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
@ -54,128 +38,7 @@ public class PdfUtil {
* Logger.
*/
private static final Logger log = LoggerFactory.getLogger(PdfUtil.class);
/**
* Extract text from a PDF.
*
* @param unencryptedPdfFile Unencrypted PDF file
* @param language Language
* @return Content extracted
*/
public static String extractPdf(Path unencryptedPdfFile, String language) {
String content = null;
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
PDDocument pdfDocument = PDDocument.load(inputStream)) {
content = new PDFTextStripper().getText(pdfDocument);
} catch (Exception e) {
log.error("Error while extracting text from the PDF", e);
}
// No text content, try to OCR it
if (language != null && content != null && content.trim().isEmpty()) {
StringBuilder sb = new StringBuilder();
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
PDDocument pdfDocument = PDDocument.load(inputStream)) {
PDFRenderer renderer = new PDFRenderer(pdfDocument);
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
sb.append(" ");
sb.append(FileUtil.ocrFile(renderer.renderImage(pageIndex), language));
}
return sb.toString();
} catch (Exception e) {
log.error("Error while OCR-izing the PDF", e);
}
}
return content;
}
/**
* Convert a file to PDF if necessary.
*
* @param file File
* @param unencryptedFile Unencrypted file
* @return PDF temporary file
*/
public static Path convertToPdf(File file, Path unencryptedFile) throws Exception {
if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
// It's already PDF, just return the file
return unencryptedFile;
}
if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
return convertOfficeDocument(unencryptedFile);
}
if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
return convertOpenDocumentText(unencryptedFile);
}
if (file.getMimeType().equals(MimeType.TEXT_PLAIN) || file.getMimeType().equals(MimeType.TEXT_CSV)) {
return convertTextPlain(unencryptedFile);
}
// PDF conversion not necessary/possible
return null;
}
/**
* Convert a text plain document to PDF.
*
* @param unencryptedFile Unencrypted file
* @return PDF file
*/
private static Path convertTextPlain(Path unencryptedFile) throws Exception {
Document output = new Document(PageSize.A4, 40, 40, 40, 40);
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
OutputStream pdfOutputStream = Files.newOutputStream(tempFile);
PdfWriter.getInstance(output, pdfOutputStream);
output.open();
String content = new String(Files.readAllBytes(unencryptedFile), Charsets.UTF_8);
Font font = FontFactory.getFont("LiberationMono-Regular");
Paragraph paragraph = new Paragraph(content, font);
paragraph.setAlignment(Element.ALIGN_LEFT);
output.add(paragraph);
output.close();
return tempFile;
}
/**
* Convert an open document text file to PDF.
*
* @param unencryptedFile Unencrypted file
* @return PDF file
*/
private static Path convertOpenDocumentText(Path unencryptedFile) throws Exception {
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
OutputStream outputStream = Files.newOutputStream(tempFile)) {
OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
PdfOptions options = PdfOptions.create();
PdfConverter.getInstance().convert(document, outputStream, options);
}
return tempFile;
}
/**
* Convert an Office document to PDF.
*
* @param unencryptedFile Unencrypted file
* @return PDF file
*/
private static Path convertOfficeDocument(Path unencryptedFile) throws Exception {
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
OutputStream outputStream = Files.newOutputStream(tempFile)) {
XWPFDocument document = new XWPFDocument(inputStream);
org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, outputStream, options);
}
return tempFile;
}
/**
* Convert a document and its files to a merged PDF file.
*
@ -192,15 +55,14 @@ public class PdfUtil {
Closer closer = Closer.create();
MemoryUsageSetting memUsageSettings = MemoryUsageSetting.setupMixed(1000000); // 1MB max memory usage
memUsageSettings.setTempDir(new java.io.File(System.getProperty("java.io.tmpdir"))); // To OS temp
float mmPerInch = 1 / (10 * 2.54f) * 72f;
// Create a blank PDF
try (PDDocument doc = new PDDocument(memUsageSettings)) {
// Add metadata
if (metadata) {
PDPage page = new PDPage();
doc.addPage(page);
try (PdfPage pdfPage = new PdfPage(doc, page, margin * mmPerInch, DocsPDType1Font.HELVETICA, 12)) {
try (PdfPage pdfPage = new PdfPage(doc, page, margin * Constants.MM_PER_INCH, DocsPDType1Font.HELVETICA, 12)) {
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
pdfPage.addText(documentDto.getTitle(), true, DocsPDType1Font.HELVETICA_BOLD, 16)
.newLine()
@ -245,55 +107,9 @@ public class PdfUtil {
// Decrypt the file to a temporary file
Path unencryptedFile = EncryptionUtil.decryptFile(storedFile, file.getPrivateKey());
if (ImageUtil.isImage(file.getMimeType())) {
PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page);
InputStream storedFileInputStream = Files.newInputStream(unencryptedFile)) {
// Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension
PDImageXObject pdImage = null;
if (file.getMimeType().equals(MimeType.IMAGE_JPEG)) {
pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream);
} else if (file.getMimeType().equals(MimeType.IMAGE_GIF) || file.getMimeType().equals(MimeType.IMAGE_PNG)) {
BufferedImage bim = ImageIO.read(storedFileInputStream);
pdImage = LosslessFactory.createFromImage(doc, bim);
}
// Do we want to fill the page with the image?
if (fitImageToPage) {
// Fill the page with the image
float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * mmPerInch;
float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * mmPerInch;
// Compare page format and image format
if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) {
float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight();
contentStream.drawImage(pdImage, margin * mmPerInch, heightAvailable + margin * mmPerInch - imageHeight,
widthAvailable, imageHeight);
} else {
float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth();
contentStream.drawImage(pdImage, margin * mmPerInch, margin * mmPerInch,
imageWidth, heightAvailable);
}
} else {
// Draw the image as is
contentStream.drawImage(pdImage, margin * mmPerInch,
page.getMediaBox().getHeight() - pdImage.getHeight() - margin * mmPerInch);
}
}
doc.addPage(page);
} else {
// Try to convert the file to PDF
Path unencryptedPdfFile = convertToPdf(file, unencryptedFile);
if (unencryptedPdfFile != null) {
// This file is convertible to PDF, just add it to the end
PDDocument mergeDoc = PDDocument.load(unencryptedPdfFile.toFile(), memUsageSettings);
closer.register(mergeDoc);
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.appendDocument(doc, mergeDoc);
}
// All other non-PDF-convertible files are ignored
FormatHandler formatHandler = FormatHandlerUtil.find(file.getMimeType());
if (formatHandler != null) {
formatHandler.appendToPdf(unencryptedFile, doc, fitImageToPage, margin, memUsageSettings, closer);
}
}
@ -302,20 +118,6 @@ public class PdfUtil {
}
}
/**
* Render the first page of a PDF.
*
* @param unencryptedFile PDF document
* @return Render of the first page
*/
public static BufferedImage renderFirstPage(Path unencryptedFile) throws IOException {
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
PDDocument pdfDocument = PDDocument.load(inputStream)) {
PDFRenderer renderer = new PDFRenderer(pdfDocument);
return renderer.renderImage(0);
}
}
/**
* Register fonts.
*/

View File

@ -0,0 +1,70 @@
package com.sismics.docs.core.util.format;
import com.google.common.io.Closer;
import com.sismics.util.context.ThreadLocalContext;
import com.sismics.util.mime.MimeType;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.awt.image.BufferedImage;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * DOCX format handler.
 *
 * Converts the DOCX file to a temporary PDF and delegates thumbnail generation,
 * content extraction and PDF merging to {@link PdfFormatHandler}.
 * The generated PDF is cached per handler instance, so one instance should only
 * be used for a single source file.
 *
 * @author bgamard
 */
public class DocxFormatHandler implements FormatHandler {
    /**
     * Temporary PDF file, set only once the conversion has succeeded.
     */
    private Path temporaryPdfFile;

    @Override
    public boolean accept(String mimeType) {
        return MimeType.OFFICE_DOCUMENT.equals(mimeType);
    }

    @Override
    public BufferedImage generateThumbnail(Path file) throws Exception {
        // Use the PDF format handler
        return new PdfFormatHandler().generateThumbnail(getGeneratedPdf(file));
    }

    @Override
    public String extractContent(String language, Path file) throws Exception {
        // Use the PDF format handler
        return new PdfFormatHandler().extractContent(language, getGeneratedPdf(file));
    }

    @Override
    public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
        // Use the PDF format handler
        new PdfFormatHandler().appendToPdf(getGeneratedPdf(file), doc, fitImageToPage, margin, memUsageSettings, closer);
    }

    /**
     * Generate a PDF from this DOCX, reusing the PDF generated by a previous successful call.
     *
     * @param file File
     * @return PDF file
     * @throws Exception If the DOCX cannot be read or converted; nothing is cached in that case
     */
    private Path getGeneratedPdf(Path file) throws Exception {
        if (temporaryPdfFile != null) {
            return temporaryPdfFile;
        }

        // Convert into a local path first: caching the path before the conversion succeeds
        // would make later calls on this handler reuse an empty or truncated PDF
        Path generatedPdf = ThreadLocalContext.get().createTemporaryFile();
        try (InputStream inputStream = Files.newInputStream(file);
             OutputStream outputStream = Files.newOutputStream(generatedPdf)) {
            XWPFDocument document = new XWPFDocument(inputStream);
            org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
            org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, outputStream, options);
        }

        temporaryPdfFile = generatedPdf;
        return generatedPdf;
    }
}

View File

@ -0,0 +1,55 @@
package com.sismics.docs.core.util.format;
import com.google.common.io.Closer;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import java.awt.image.BufferedImage;
import java.nio.file.Path;
/**
* A format handler.
* Groups the per-format operations used on a stored file: thumbnail generation,
* text content extraction and appending the file to a PDF document.
*
* @author bgamard
*/
public interface FormatHandler {
/**
* Returns true if this format handler can handle this MIME type.
* Some implementations (e.g. ImageFormatHandler) remember the MIME type passed here
* and use it in later calls, so this method should be called first on a new instance.
*
* @param mimeType MIME type
* @return True if accepted
*/
boolean accept(String mimeType);
/**
* Generate a thumbnail.
*
* @param file File
* @return Thumbnail (NOTE(review): FileCreatedAsyncListener checks the result for null, so null appears to mean "no thumbnail" - confirm)
* @throws Exception e
*/
BufferedImage generateThumbnail(Path file) throws Exception;
/**
* Extract text content.
*
* @param language Language (used for OCR by the PDF and image handlers)
* @param file File
* @return Text content
* @throws Exception e
*/
String extractContent(String language, Path file) throws Exception;
/**
* Append to a PDF.
*
* @param file File
* @param doc PDF document
* @param fitImageToPage Fit image to page
* @param margin Margin (the image handler multiplies it by Constants.MM_PER_INCH before drawing)
* @param memUsageSettings Memory usage
* @param closer Closer (the PDF handler registers the document it loads on it instead of closing it itself)
* @throws Exception e
*/
void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception;
}

View File

@ -0,0 +1,45 @@
package com.sismics.docs.core.util.format;
import com.google.common.collect.Lists;
import java.util.List;
/**
 * Format handler utilities.
 *
 * @author bgamard
 */
public class FormatHandlerUtil {
    /**
     * List of format handlers, tried in declaration order: the first one accepting the MIME type is used.
     */
    private static final List<Class<? extends FormatHandler>> FORMAT_HANDLERS = Lists.newArrayList(
            DocxFormatHandler.class,
            OdtFormatHandler.class,
            VideoFormatHandler.class,
            PdfFormatHandler.class,
            TextPlainFormatHandler.class,
            ImageFormatHandler.class
    );

    /**
     * Find a suitable format handler for this MIME type.
     * A new handler instance is created on every call.
     *
     * @param mimeType MIME type, may be null
     * @return Instantiated format handler, or null if the MIME type is null or no handler accepts it
     */
    public static FormatHandler find(String mimeType) {
        if (mimeType == null) {
            // Not every handler tolerates a null MIME type in accept()
            return null;
        }

        for (Class<? extends FormatHandler> formatHandlerClass : FORMAT_HANDLERS) {
            FormatHandler formatHandler;
            try {
                // Class#newInstance is deprecated and rethrows checked constructor exceptions undeclared
                formatHandler = formatHandlerClass.getDeclaredConstructor().newInstance();
            } catch (ReflectiveOperationException ignored) {
                // A handler that cannot be instantiated must not prevent the remaining ones from being tried
                continue;
            }
            if (formatHandler.accept(mimeType)) {
                return formatHandler;
            }
        }

        return null;
    }
}

View File

@ -0,0 +1,108 @@
package com.sismics.docs.core.util.format;
import com.google.common.io.Closer;
import com.sismics.docs.core.constant.Constants;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.util.mime.MimeType;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * Image format handler.
 *
 * Handles GIF, PNG and JPEG images. The MIME type passed to {@link #accept(String)} is
 * remembered and used by {@link #appendToPdf} to pick the right image decoder,
 * so accept must be called before appendToPdf on the same instance.
 *
 * @author bgamard
 */
public class ImageFormatHandler implements FormatHandler {
    /**
     * Logger.
     */
    private static final Logger log = LoggerFactory.getLogger(ImageFormatHandler.class);

    /**
     * Saved MIME type.
     */
    private String mimeType;

    @Override
    public boolean accept(String mimeType) {
        this.mimeType = mimeType;
        // Constant-first comparisons so that a null MIME type is rejected instead of throwing
        return MimeType.IMAGE_GIF.equals(mimeType)
                || MimeType.IMAGE_PNG.equals(mimeType)
                || MimeType.IMAGE_JPEG.equals(mimeType);
    }

    @Override
    public BufferedImage generateThumbnail(Path file) throws IOException {
        try (InputStream inputStream = Files.newInputStream(file)) {
            return ImageIO.read(inputStream);
        }
    }

    @Override
    public String extractContent(String language, Path file) {
        if (language == null) {
            // Without a language there is nothing to OCR (same rule as the PDF handler)
            return null;
        }

        try (InputStream inputStream = Files.newInputStream(file)) {
            return FileUtil.ocrFile(language, ImageIO.read(inputStream));
        } catch (IOException e) {
            log.error("Error reading the image", e);
            return null;
        }
    }

    @Override
    public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
        if (mimeType == null) {
            // accept() was never called with a MIME type: treat it like an unsupported image format
            return;
        }

        PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages
        try (PDPageContentStream contentStream = new PDPageContentStream(doc, page);
             InputStream storedFileInputStream = Files.newInputStream(file)) {
            // Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension
            PDImageXObject pdImage;
            switch (mimeType) {
                case MimeType.IMAGE_JPEG:
                    pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream);
                    break;
                case MimeType.IMAGE_GIF:
                case MimeType.IMAGE_PNG:
                    BufferedImage bim = ImageIO.read(storedFileInputStream);
                    pdImage = LosslessFactory.createFromImage(doc, bim);
                    break;
                default:
                    // Unsupported image format: nothing is appended
                    return;
            }

            // Margin converted once with the shared constant
            float scaledMargin = margin * Constants.MM_PER_INCH;

            // Do we want to fill the page with the image?
            if (fitImageToPage) {
                // Fill the page with the image
                float widthAvailable = page.getMediaBox().getWidth() - 2 * scaledMargin;
                float heightAvailable = page.getMediaBox().getHeight() - 2 * scaledMargin;

                // Compare page format and image format
                if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) {
                    // Image is proportionally wider than the page: use the full width, anchored at the top
                    float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight();
                    contentStream.drawImage(pdImage, scaledMargin, heightAvailable + scaledMargin - imageHeight,
                            widthAvailable, imageHeight);
                } else {
                    // Image is proportionally taller than the page: use the full height
                    float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth();
                    contentStream.drawImage(pdImage, scaledMargin, scaledMargin,
                            imageWidth, heightAvailable);
                }
            } else {
                // Draw the image as is
                contentStream.drawImage(pdImage, scaledMargin,
                        page.getMediaBox().getHeight() - pdImage.getHeight() - scaledMargin);
            }
        }
        doc.addPage(page);
    }
}

View File

@ -0,0 +1,72 @@
package com.sismics.docs.core.util.format;
import com.google.common.io.Closer;
import com.sismics.util.context.ThreadLocalContext;
import com.sismics.util.mime.MimeType;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.odftoolkit.odfdom.converter.pdf.PdfConverter;
import org.odftoolkit.odfdom.converter.pdf.PdfOptions;
import org.odftoolkit.odfdom.doc.OdfTextDocument;
import java.awt.image.BufferedImage;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * ODT format handler.
 *
 * Converts the ODT file to a temporary PDF and delegates thumbnail generation,
 * content extraction and PDF merging to {@link PdfFormatHandler}.
 * The generated PDF is cached per handler instance, so one instance should only
 * be used for a single source file.
 *
 * @author bgamard
 */
public class OdtFormatHandler implements FormatHandler {
    /**
     * Temporary PDF file, set only once the conversion has succeeded.
     */
    private Path temporaryPdfFile;

    @Override
    public boolean accept(String mimeType) {
        return MimeType.OPEN_DOCUMENT_TEXT.equals(mimeType);
    }

    @Override
    public BufferedImage generateThumbnail(Path file) throws Exception {
        // Use the PDF format handler
        return new PdfFormatHandler().generateThumbnail(getGeneratedPdf(file));
    }

    @Override
    public String extractContent(String language, Path file) throws Exception {
        // Use the PDF format handler
        return new PdfFormatHandler().extractContent(language, getGeneratedPdf(file));
    }

    @Override
    public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
        // Use the PDF format handler
        new PdfFormatHandler().appendToPdf(getGeneratedPdf(file), doc, fitImageToPage, margin, memUsageSettings, closer);
    }

    /**
     * Generate a PDF from this ODT, reusing the PDF generated by a previous successful call.
     *
     * @param file File
     * @return PDF file
     * @throws Exception If the ODT cannot be read or converted; nothing is cached in that case
     */
    private Path getGeneratedPdf(Path file) throws Exception {
        if (temporaryPdfFile != null) {
            return temporaryPdfFile;
        }

        // Convert into a local path first: caching the path before the conversion succeeds
        // would make later calls on this handler reuse an empty or truncated PDF
        Path generatedPdf = ThreadLocalContext.get().createTemporaryFile();
        try (InputStream inputStream = Files.newInputStream(file);
             OutputStream outputStream = Files.newOutputStream(generatedPdf)) {
            OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
            PdfOptions options = PdfOptions.create();
            PdfConverter.getInstance().convert(document, outputStream, options);
        }

        temporaryPdfFile = generatedPdf;
        return generatedPdf;
    }
}

View File

@ -0,0 +1,80 @@
package com.sismics.docs.core.util.format;
import com.google.common.io.Closer;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.util.mime.MimeType;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.awt.image.BufferedImage;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
/**
* PDF format handler.
*
* @author bgamard
*/
public class PdfFormatHandler implements FormatHandler {
/**
* Logger.
*/
private static final Logger log = LoggerFactory.getLogger(PdfFormatHandler.class);
@Override
public boolean accept(String mimeType) {
return mimeType.equals(MimeType.APPLICATION_PDF);
}
@Override
public BufferedImage generateThumbnail(Path file) throws Exception {
try (InputStream inputStream = Files.newInputStream(file);
PDDocument pdfDocument = PDDocument.load(inputStream)) {
PDFRenderer renderer = new PDFRenderer(pdfDocument);
return renderer.renderImage(0);
}
}
@Override
public String extractContent(String language, Path file) {
String content = null;
try (InputStream inputStream = Files.newInputStream(file);
PDDocument pdfDocument = PDDocument.load(inputStream)) {
content = new PDFTextStripper().getText(pdfDocument);
} catch (Exception e) {
log.error("Error while extracting text from the PDF", e);
}
// No text content, try to OCR it
if (language != null && content != null && content.trim().isEmpty()) {
StringBuilder sb = new StringBuilder();
try (InputStream inputStream = Files.newInputStream(file);
PDDocument pdfDocument = PDDocument.load(inputStream)) {
PDFRenderer renderer = new PDFRenderer(pdfDocument);
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
sb.append(" ");
sb.append(FileUtil.ocrFile(language, renderer.renderImage(pageIndex)));
}
return sb.toString();
} catch (Exception e) {
log.error("Error while OCR-izing the PDF", e);
}
}
return content;
}
@Override
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
PDDocument mergeDoc = PDDocument.load(file.toFile(), memUsageSettings);
closer.register(mergeDoc);
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.appendDocument(doc, mergeDoc);
}
}

View File

@ -0,0 +1,56 @@
package com.sismics.docs.core.util.format;
import com.google.common.base.Charsets;
import com.google.common.io.Closer;
import com.lowagie.text.*;
import com.lowagie.text.pdf.PdfWriter;
import com.sismics.util.context.ThreadLocalContext;
import com.sismics.util.mime.MimeType;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import java.awt.image.BufferedImage;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
/**
* Text plain format handler.
*
* @author bgamard
*/
public class TextPlainFormatHandler implements FormatHandler {
@Override
public boolean accept(String mimeType) {
return mimeType.equals(MimeType.TEXT_CSV) || mimeType.equals(MimeType.TEXT_PLAIN);
}
@Override
public BufferedImage generateThumbnail(Path file) throws Exception {
Document output = new Document(PageSize.A4, 40, 40, 40, 40);
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
OutputStream pdfOutputStream = Files.newOutputStream(tempFile);
PdfWriter.getInstance(output, pdfOutputStream);
output.open();
String content = new String(Files.readAllBytes(file), Charsets.UTF_8);
Font font = FontFactory.getFont("LiberationMono-Regular");
Paragraph paragraph = new Paragraph(content, font);
paragraph.setAlignment(Element.ALIGN_LEFT);
output.add(paragraph);
output.close();
// Use the PDF format handler
return new PdfFormatHandler().generateThumbnail(tempFile);
}
@Override
public String extractContent(String language, Path file) throws Exception {
return new String(Files.readAllBytes(file), "UTF-8");
}
@Override
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) {
// TODO Append the text file to the PDF
}
}

View File

@ -1,10 +1,13 @@
package com.sismics.util;
package com.sismics.docs.core.util.format;
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.ByteStreams;
import com.google.common.io.Closer;
import com.sismics.util.io.InputStreamReaderThread;
import com.sismics.util.mime.MimeType;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
@ -15,27 +18,18 @@ import java.util.Arrays;
import java.util.List;
/**
* Video processing utilities.
* Video format handler.
*
* @author bgamard
*/
public class VideoUtil {
/**
* Returns true if this MIME type is a video.
* @param mimeType MIME type
* @return True if video
*/
public static boolean isVideo(String mimeType) {
public class VideoFormatHandler implements FormatHandler {
/**
 * Returns true if this handler supports the given MIME type.
 *
 * @param mimeType MIME type
 * @return True for MP4 and WebM videos
 */
@Override
public boolean accept(String mimeType) {
    if (mimeType.equals(MimeType.VIDEO_MP4)) {
        return true;
    }
    return mimeType.equals(MimeType.VIDEO_WEBM);
}
/**
* Generate a thumbnail from a video file.
*
* @param file Video file
* @return Thumbnail
*/
public static BufferedImage getThumbnail(Path file) throws Exception {
@Override
public BufferedImage generateThumbnail(Path file) throws IOException {
List<String> result = Lists.newLinkedList(Arrays.asList("ffmpeg", "-i"));
result.add(file.toAbsolutePath().toString());
result.addAll(Arrays.asList("-vf", "thumbnail", "-frames:v", "1", "-f", "mjpeg", "-"));
@ -52,13 +46,8 @@ public class VideoUtil {
}
}
/**
* Extract metadata from a video file.
*
* @param file Video file
* @return Metadata
*/
public static String getMetadata(Path file) {
@Override
public String extractContent(String language, Path file) {
List<String> result = Lists.newLinkedList();
result.add("mediainfo");
result.add(file.toAbsolutePath().toString());
@ -81,4 +70,9 @@ public class VideoUtil {
return null;
}
}
/**
 * Intentionally does nothing: none of the arguments are read and the
 * target document is left untouched.
 */
@Override
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) {
// Video cannot be appended to PDF files
}
}

View File

@ -2,7 +2,6 @@ package com.sismics.util;
import com.google.common.base.Charsets;
import com.google.common.hash.Hashing;
import com.sismics.util.mime.MimeType;
import javax.imageio.IIOImage;
import javax.imageio.ImageIO;
@ -67,15 +66,6 @@ public class ImageUtil {
}
}
/**
 * Tells whether a MIME type denotes a GIF, PNG or JPEG image.
 *
 * @param mimeType MIME type
 * @return True if image
 */
public static boolean isImage(String mimeType) {
    if (mimeType.equals(MimeType.IMAGE_GIF)) {
        return true;
    }
    if (mimeType.equals(MimeType.IMAGE_PNG)) {
        return true;
    }
    return mimeType.equals(MimeType.IMAGE_JPEG);
}
/**
* Compute Gravatar hash.
* See https://en.gravatar.com/site/implement/hash/.

View File

@ -1,7 +1,6 @@
package com.sismics.util.mime;
import com.google.common.base.Charsets;
import com.sismics.docs.core.model.jpa.File;
import org.apache.commons.compress.utils.IOUtils;
import java.io.IOException;
@ -15,7 +14,7 @@ import java.util.zip.ZipInputStream;
/**
* Utility to check MIME types.
*
* @author jtremeaux
* @author bgamard
*/
public class MimeTypeUtil {
/**
@ -27,11 +26,14 @@ public class MimeTypeUtil {
* @throws IOException e
*/
public static String guessMimeType(Path file, String name) throws IOException {
    String mimeType;
    try (InputStream is = Files.newInputStream(file)) {
        // A single read() may return fewer bytes than requested, so keep reading until
        // the header buffer is full or the end of the file is reached. Bytes beyond the
        // end of a short file stay zeroed.
        byte[] headerBytes = new byte[64];
        int offset = 0;
        while (offset < headerBytes.length) {
            int read = is.read(headerBytes, offset, headerBytes.length - offset);
            if (read < 0) {
                break;
            }
            offset += read;
        }
        mimeType = guessMimeType(headerBytes, name);
    }

    // Second pass: a ZIP header may hide an open document format, which requires
    // looking inside the archive
    return guessOpenDocumentFormat(mimeType, file);
}
/**
@ -116,18 +118,17 @@ public class MimeTypeUtil {
* It's more costly than the simple header check, but needed because open document formats
* are simple ZIP files on the outside and much bigger on the inside.
*
* @param file File
* @param unencryptedFile File on disk
* @param mimeType Currently detected MIME type
* @param file File on disk
* @return MIME type
*/
public static String guessOpenDocumentFormat(File file, Path unencryptedFile) {
if (!MimeType.APPLICATION_ZIP.equals(file.getMimeType())) {
private static String guessOpenDocumentFormat(String mimeType, Path file) {
if (!MimeType.APPLICATION_ZIP.equals(mimeType)) {
// open document formats are ZIP files
return file.getMimeType();
return mimeType;
}
String mimeType = file.getMimeType();
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
try (InputStream inputStream = Files.newInputStream(file);
ZipInputStream zipInputStream = new ZipInputStream(inputStream, Charsets.ISO_8859_1)) {
ZipEntry archiveEntry = zipInputStream.getNextEntry();
while (archiveEntry != null) {
@ -151,7 +152,7 @@ public class MimeTypeUtil {
}
} catch (Exception e) {
// In case of any error, just give up and keep the ZIP MIME type
return file.getMimeType();
return mimeType;
}
return mimeType;

View File

@ -4,7 +4,9 @@ import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.format.*;
import com.sismics.util.mime.MimeType;
import com.sismics.util.mime.MimeTypeUtil;
import org.junit.Assert;
import org.junit.Test;
@ -25,39 +27,40 @@ public class TestFileUtil {
/**
 * Checks that an ODT file is resolved to the ODT format handler
 * and that its text content is extracted.
 */
@Test
public void extractContentOpenDocumentTextTest() throws Exception {
    Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());

    // The handler is resolved from the detected MIME type
    FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "document.odt"));
    Assert.assertNotNull(formatHandler);
    Assert.assertTrue(formatHandler instanceof OdtFormatHandler);

    String content = formatHandler.extractContent("eng", path);
    Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
/**
 * Checks that a DOCX file is resolved to the DOCX format handler
 * and that its text content is extracted.
 */
@Test
public void extractContentOfficeDocumentTest() throws Exception {
    Path path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());

    // The handler is resolved from the detected MIME type
    FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "document.docx"));
    Assert.assertNotNull(formatHandler);
    Assert.assertTrue(formatHandler instanceof DocxFormatHandler);

    String content = formatHandler.extractContent("eng", path);
    Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
/**
 * Checks that a PDF file is resolved to the PDF format handler
 * and that its embedded text is extracted.
 */
@Test
public void extractContentPdf() throws Exception {
    Path path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI());

    // The handler is resolved from the detected MIME type
    FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "udhr.pdf"));
    Assert.assertNotNull(formatHandler);
    Assert.assertTrue(formatHandler instanceof PdfFormatHandler);

    String content = formatHandler.extractContent("eng", path);
    Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
}
/**
 * Checks that a scanned PDF is resolved to the PDF format handler and that
 * the expected sentence is recovered from it.
 */
@Test
public void extractContentScannedPdf() throws Exception {
    Path path = Paths.get(ClassLoader.getSystemResource("file/scanned.pdf").toURI());

    // The handler is resolved from the detected MIME type
    FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "scanned.pdf"));
    Assert.assertNotNull(formatHandler);
    Assert.assertTrue(formatHandler instanceof PdfFormatHandler);

    // A language is provided so that the handler may fall back to OCR
    // NOTE(review): assumes scanned.pdf holds no embedded text - confirm against the fixture
    String content = formatHandler.extractContent("eng", path);
    Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
}

View File

@ -1,6 +1,5 @@
package com.sismics.util;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.util.mime.MimeType;
import com.sismics.util.mime.MimeTypeUtil;
import org.junit.Assert;
@ -19,14 +18,10 @@ public class TestMimeTypeUtil {
public void guessOpenDocumentFormatTest() throws Exception {
    // The open document detection is exercised through the public guessMimeType entry point

    // Detect ODT files
    Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
    Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessMimeType(path, "document.odt"));

    // Detect DOCX files (the name passed along must match the file under test)
    path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
    Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessMimeType(path, "document.docx"));
}
}