mirror of
https://github.com/sismics/docs.git
synced 2024-11-22 05:57:57 +01:00
Closes #182: format handling refactoring
This commit is contained in:
parent
996585d7ac
commit
7ea8d0c0f7
@ -87,4 +87,9 @@ public class Constants {
|
||||
* Email template for route step validate.
|
||||
*/
|
||||
public static final String EMAIL_TEMPLATE_ROUTE_STEP_VALIDATE = "route_step_validate";
|
||||
|
||||
/**
|
||||
* mm per inch.
|
||||
*/
|
||||
public static float MM_PER_INCH = 1 / (10 * 2.54f) * 72f;
|
||||
}
|
||||
|
@ -7,15 +7,22 @@ import com.sismics.docs.core.dao.lucene.LuceneDao;
|
||||
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.docs.core.model.jpa.User;
|
||||
import com.sismics.docs.core.util.DirectoryUtil;
|
||||
import com.sismics.docs.core.util.EncryptionUtil;
|
||||
import com.sismics.docs.core.util.FileUtil;
|
||||
import com.sismics.docs.core.util.PdfUtil;
|
||||
import com.sismics.docs.core.util.TransactionUtil;
|
||||
import com.sismics.util.mime.MimeTypeUtil;
|
||||
import com.sismics.docs.core.util.format.FormatHandler;
|
||||
import com.sismics.docs.core.util.format.FormatHandlerUtil;
|
||||
import com.sismics.util.ImageUtil;
|
||||
import com.sismics.util.Scalr;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.crypto.Cipher;
|
||||
import javax.crypto.CipherOutputStream;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.text.MessageFormat;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
@ -42,16 +49,12 @@ public class FileCreatedAsyncListener {
|
||||
log.info("File created event: " + event.toString());
|
||||
}
|
||||
|
||||
// Guess the mime type a second time, for open document format (first detected as simple ZIP file)
|
||||
// Find a format handler
|
||||
final File file = event.getFile();
|
||||
file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, event.getUnencryptedFile()));
|
||||
|
||||
// Convert to PDF if necessary (for thumbnail and text extraction)
|
||||
Path unencryptedPdfFile = null;
|
||||
try {
|
||||
unencryptedPdfFile = PdfUtil.convertToPdf(file, event.getUnencryptedFile());
|
||||
} catch (Exception e) {
|
||||
log.error("Unable to convert to PDF", e);
|
||||
FormatHandler formatHandler = FormatHandlerUtil.find(file.getMimeType());
|
||||
if (formatHandler == null) {
|
||||
log.error("Format unhandled: " + file.getMimeType());
|
||||
return;
|
||||
}
|
||||
|
||||
// Get the user from the database
|
||||
@ -71,15 +74,37 @@ public class FileCreatedAsyncListener {
|
||||
// Generate file variations
|
||||
try {
|
||||
Cipher cipher = EncryptionUtil.getEncryptionCipher(user.get().getPrivateKey());
|
||||
FileUtil.saveVariations(file, event.getUnencryptedFile(), unencryptedPdfFile, cipher);
|
||||
BufferedImage image = formatHandler.generateThumbnail(event.getUnencryptedFile());
|
||||
if (image != null) {
|
||||
// Generate thumbnails from image
|
||||
BufferedImage web = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 1280);
|
||||
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 256);
|
||||
image.flush();
|
||||
|
||||
// Write "web" encrypted image
|
||||
Path outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_web");
|
||||
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
|
||||
ImageUtil.writeJpeg(web, outputStream);
|
||||
}
|
||||
|
||||
// Write "thumb" encrypted image
|
||||
outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_thumb");
|
||||
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
|
||||
ImageUtil.writeJpeg(thumbnail, outputStream);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("Unable to generate thumbnails", e);
|
||||
}
|
||||
|
||||
// Extract text content from the file
|
||||
long startTime = System.currentTimeMillis();
|
||||
final String content = FileUtil.extractContent(event.getLanguage(), file,
|
||||
event.getUnencryptedFile(), unencryptedPdfFile);
|
||||
final AtomicReference<String> content = new AtomicReference<>();
|
||||
try {
|
||||
content.set(formatHandler.extractContent(event.getLanguage(), event.getUnencryptedFile()));
|
||||
} catch (Exception e) {
|
||||
log.error("Error extracting content from: " + event.getFile());
|
||||
}
|
||||
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
|
||||
|
||||
// Save the file to database
|
||||
@ -91,8 +116,8 @@ public class FileCreatedAsyncListener {
|
||||
// The file has been deleted since the text extraction started, ignore the result
|
||||
return;
|
||||
}
|
||||
|
||||
file.setContent(content);
|
||||
|
||||
file.setContent(content.get());
|
||||
fileDao.update(file);
|
||||
}
|
||||
});
|
||||
|
@ -10,9 +10,7 @@ import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.docs.core.model.jpa.User;
|
||||
import com.sismics.tess4j.Tesseract;
|
||||
import com.sismics.util.ImageDeskew;
|
||||
import com.sismics.util.ImageUtil;
|
||||
import com.sismics.util.Scalr;
|
||||
import com.sismics.util.VideoUtil;
|
||||
import com.sismics.util.context.ThreadLocalContext;
|
||||
import com.sismics.util.mime.MimeTypeUtil;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
@ -21,12 +19,9 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.crypto.Cipher;
|
||||
import javax.crypto.CipherInputStream;
|
||||
import javax.crypto.CipherOutputStream;
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Collections;
|
||||
@ -49,40 +44,14 @@ public class FileUtil {
|
||||
*/
|
||||
private static Set<String> processingFileSet = Collections.synchronizedSet(new HashSet<String>());
|
||||
|
||||
/**
|
||||
* Extract content from a file.
|
||||
*
|
||||
* @param language Language to extract
|
||||
* @param file File to extract
|
||||
* @param unencryptedFile Unencrypted file
|
||||
* @param unencryptedPdfFile Unencrypted PDF file
|
||||
* @return Content extract
|
||||
*/
|
||||
public static String extractContent(String language, File file, Path unencryptedFile, Path unencryptedPdfFile) {
|
||||
String content = null;
|
||||
if (language == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (ImageUtil.isImage(file.getMimeType())) {
|
||||
content = ocrFile(unencryptedFile, language);
|
||||
} else if (VideoUtil.isVideo(file.getMimeType())) {
|
||||
content = VideoUtil.getMetadata(unencryptedFile);
|
||||
} else if (unencryptedPdfFile != null) {
|
||||
content = PdfUtil.extractPdf(unencryptedPdfFile, language);
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Optical character recognition on an image.
|
||||
*
|
||||
* @param image Buffered image
|
||||
* @param language Language to OCR
|
||||
* @param image Buffered image
|
||||
* @return Content extracted
|
||||
*/
|
||||
public static String ocrFile(BufferedImage image, String language) {
|
||||
public static String ocrFile(String language, BufferedImage image) {
|
||||
// Upscale, grayscale and deskew the image
|
||||
String content = null;
|
||||
BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
|
||||
@ -105,66 +74,6 @@ public class FileUtil {
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Optical character recognition on a file.
|
||||
*
|
||||
* @param unecryptedFile Unencrypted file
|
||||
* @param language Language to OCR
|
||||
* @return Content extracted
|
||||
*/
|
||||
private static String ocrFile(Path unecryptedFile, String language) {
|
||||
BufferedImage image;
|
||||
try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
|
||||
image = ImageIO.read(inputStream);
|
||||
} catch (IOException e) {
|
||||
log.error("Error reading the image", e);
|
||||
return null;
|
||||
}
|
||||
|
||||
return ocrFile(image, language);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate file variations.
|
||||
*
|
||||
* @param file File from database
|
||||
* @param unencryptedFile Unencrypted file
|
||||
* @param unencryptedPdfFile Unencrypted PDF file
|
||||
* @param cipher Cipher to use for encryption
|
||||
*/
|
||||
public static void saveVariations(File file, Path unencryptedFile, Path unencryptedPdfFile, Cipher cipher) throws Exception {
|
||||
BufferedImage image = null;
|
||||
if (ImageUtil.isImage(file.getMimeType())) {
|
||||
try (InputStream inputStream = Files.newInputStream(unencryptedFile)) {
|
||||
image = ImageIO.read(inputStream);
|
||||
}
|
||||
} else if (VideoUtil.isVideo(file.getMimeType())) {
|
||||
image = VideoUtil.getThumbnail(unencryptedFile);
|
||||
} else if (unencryptedPdfFile != null) {
|
||||
// Generate preview from the first page of the PDF
|
||||
image = PdfUtil.renderFirstPage(unencryptedPdfFile);
|
||||
}
|
||||
|
||||
if (image != null) {
|
||||
// Generate thumbnails from image
|
||||
BufferedImage web = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 1280);
|
||||
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 256);
|
||||
image.flush();
|
||||
|
||||
// Write "web" encrypted image
|
||||
Path outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_web");
|
||||
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
|
||||
ImageUtil.writeJpeg(web, outputStream);
|
||||
}
|
||||
|
||||
// Write "thumb" encrypted image
|
||||
outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_thumb");
|
||||
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
|
||||
ImageUtil.writeJpeg(thumbnail, outputStream);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove a file from the storage filesystem.
|
||||
*
|
||||
|
@ -1,39 +1,23 @@
|
||||
package com.sismics.docs.core.util;
|
||||
|
||||
import com.google.common.base.Charsets;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.common.io.ByteStreams;
|
||||
import com.google.common.io.Closer;
|
||||
import com.google.common.io.Resources;
|
||||
import com.lowagie.text.*;
|
||||
import com.lowagie.text.pdf.PdfWriter;
|
||||
import com.lowagie.text.FontFactory;
|
||||
import com.sismics.docs.core.constant.Constants;
|
||||
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.docs.core.util.format.FormatHandler;
|
||||
import com.sismics.docs.core.util.format.FormatHandlerUtil;
|
||||
import com.sismics.docs.core.util.pdf.PdfPage;
|
||||
import com.sismics.util.ImageUtil;
|
||||
import com.sismics.util.context.ThreadLocalContext;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.font.DocsPDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
import org.odftoolkit.odfdom.converter.pdf.PdfConverter;
|
||||
import org.odftoolkit.odfdom.converter.pdf.PdfOptions;
|
||||
import org.odftoolkit.odfdom.doc.OdfTextDocument;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
@ -54,128 +38,7 @@ public class PdfUtil {
|
||||
* Logger.
|
||||
*/
|
||||
private static final Logger log = LoggerFactory.getLogger(PdfUtil.class);
|
||||
|
||||
/**
|
||||
* Extract text from a PDF.
|
||||
*
|
||||
* @param unencryptedPdfFile Unencrypted PDF file
|
||||
* @param language Language
|
||||
* @return Content extracted
|
||||
*/
|
||||
public static String extractPdf(Path unencryptedPdfFile, String language) {
|
||||
String content = null;
|
||||
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
|
||||
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||
content = new PDFTextStripper().getText(pdfDocument);
|
||||
} catch (Exception e) {
|
||||
log.error("Error while extracting text from the PDF", e);
|
||||
}
|
||||
|
||||
// No text content, try to OCR it
|
||||
if (language != null && content != null && content.trim().isEmpty()) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
|
||||
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
||||
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
|
||||
sb.append(" ");
|
||||
sb.append(FileUtil.ocrFile(renderer.renderImage(pageIndex), language));
|
||||
}
|
||||
return sb.toString();
|
||||
} catch (Exception e) {
|
||||
log.error("Error while OCR-izing the PDF", e);
|
||||
}
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a file to PDF if necessary.
|
||||
*
|
||||
* @param file File
|
||||
* @param unencryptedFile Unencrypted file
|
||||
* @return PDF temporary file
|
||||
*/
|
||||
public static Path convertToPdf(File file, Path unencryptedFile) throws Exception {
|
||||
if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
|
||||
// It's already PDF, just return the file
|
||||
return unencryptedFile;
|
||||
}
|
||||
|
||||
if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
|
||||
return convertOfficeDocument(unencryptedFile);
|
||||
}
|
||||
|
||||
if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
|
||||
return convertOpenDocumentText(unencryptedFile);
|
||||
}
|
||||
|
||||
if (file.getMimeType().equals(MimeType.TEXT_PLAIN) || file.getMimeType().equals(MimeType.TEXT_CSV)) {
|
||||
return convertTextPlain(unencryptedFile);
|
||||
}
|
||||
|
||||
// PDF conversion not necessary/possible
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a text plain document to PDF.
|
||||
*
|
||||
* @param unencryptedFile Unencrypted file
|
||||
* @return PDF file
|
||||
*/
|
||||
private static Path convertTextPlain(Path unencryptedFile) throws Exception {
|
||||
Document output = new Document(PageSize.A4, 40, 40, 40, 40);
|
||||
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
|
||||
OutputStream pdfOutputStream = Files.newOutputStream(tempFile);
|
||||
PdfWriter.getInstance(output, pdfOutputStream);
|
||||
|
||||
output.open();
|
||||
String content = new String(Files.readAllBytes(unencryptedFile), Charsets.UTF_8);
|
||||
Font font = FontFactory.getFont("LiberationMono-Regular");
|
||||
Paragraph paragraph = new Paragraph(content, font);
|
||||
paragraph.setAlignment(Element.ALIGN_LEFT);
|
||||
output.add(paragraph);
|
||||
output.close();
|
||||
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert an open document text file to PDF.
|
||||
*
|
||||
* @param unencryptedFile Unencrypted file
|
||||
* @return PDF file
|
||||
*/
|
||||
private static Path convertOpenDocumentText(Path unencryptedFile) throws Exception {
|
||||
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
|
||||
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
|
||||
OutputStream outputStream = Files.newOutputStream(tempFile)) {
|
||||
OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
|
||||
PdfOptions options = PdfOptions.create();
|
||||
PdfConverter.getInstance().convert(document, outputStream, options);
|
||||
}
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert an Office document to PDF.
|
||||
*
|
||||
* @param unencryptedFile Unencrypted file
|
||||
* @return PDF file
|
||||
*/
|
||||
private static Path convertOfficeDocument(Path unencryptedFile) throws Exception {
|
||||
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
|
||||
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
|
||||
OutputStream outputStream = Files.newOutputStream(tempFile)) {
|
||||
XWPFDocument document = new XWPFDocument(inputStream);
|
||||
org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
|
||||
org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, outputStream, options);
|
||||
}
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a document and its files to a merged PDF file.
|
||||
*
|
||||
@ -192,15 +55,14 @@ public class PdfUtil {
|
||||
Closer closer = Closer.create();
|
||||
MemoryUsageSetting memUsageSettings = MemoryUsageSetting.setupMixed(1000000); // 1MB max memory usage
|
||||
memUsageSettings.setTempDir(new java.io.File(System.getProperty("java.io.tmpdir"))); // To OS temp
|
||||
float mmPerInch = 1 / (10 * 2.54f) * 72f;
|
||||
|
||||
|
||||
// Create a blank PDF
|
||||
try (PDDocument doc = new PDDocument(memUsageSettings)) {
|
||||
// Add metadata
|
||||
if (metadata) {
|
||||
PDPage page = new PDPage();
|
||||
doc.addPage(page);
|
||||
try (PdfPage pdfPage = new PdfPage(doc, page, margin * mmPerInch, DocsPDType1Font.HELVETICA, 12)) {
|
||||
try (PdfPage pdfPage = new PdfPage(doc, page, margin * Constants.MM_PER_INCH, DocsPDType1Font.HELVETICA, 12)) {
|
||||
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
|
||||
pdfPage.addText(documentDto.getTitle(), true, DocsPDType1Font.HELVETICA_BOLD, 16)
|
||||
.newLine()
|
||||
@ -245,55 +107,9 @@ public class PdfUtil {
|
||||
|
||||
// Decrypt the file to a temporary file
|
||||
Path unencryptedFile = EncryptionUtil.decryptFile(storedFile, file.getPrivateKey());
|
||||
|
||||
if (ImageUtil.isImage(file.getMimeType())) {
|
||||
PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages
|
||||
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page);
|
||||
InputStream storedFileInputStream = Files.newInputStream(unencryptedFile)) {
|
||||
// Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension
|
||||
PDImageXObject pdImage = null;
|
||||
if (file.getMimeType().equals(MimeType.IMAGE_JPEG)) {
|
||||
pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream);
|
||||
} else if (file.getMimeType().equals(MimeType.IMAGE_GIF) || file.getMimeType().equals(MimeType.IMAGE_PNG)) {
|
||||
BufferedImage bim = ImageIO.read(storedFileInputStream);
|
||||
pdImage = LosslessFactory.createFromImage(doc, bim);
|
||||
}
|
||||
|
||||
// Do we want to fill the page with the image?
|
||||
if (fitImageToPage) {
|
||||
// Fill the page with the image
|
||||
float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * mmPerInch;
|
||||
float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * mmPerInch;
|
||||
|
||||
// Compare page format and image format
|
||||
if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) {
|
||||
float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight();
|
||||
contentStream.drawImage(pdImage, margin * mmPerInch, heightAvailable + margin * mmPerInch - imageHeight,
|
||||
widthAvailable, imageHeight);
|
||||
} else {
|
||||
float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth();
|
||||
contentStream.drawImage(pdImage, margin * mmPerInch, margin * mmPerInch,
|
||||
imageWidth, heightAvailable);
|
||||
}
|
||||
} else {
|
||||
// Draw the image as is
|
||||
contentStream.drawImage(pdImage, margin * mmPerInch,
|
||||
page.getMediaBox().getHeight() - pdImage.getHeight() - margin * mmPerInch);
|
||||
}
|
||||
}
|
||||
doc.addPage(page);
|
||||
} else {
|
||||
// Try to convert the file to PDF
|
||||
Path unencryptedPdfFile = convertToPdf(file, unencryptedFile);
|
||||
if (unencryptedPdfFile != null) {
|
||||
// This file is convertible to PDF, just add it to the end
|
||||
PDDocument mergeDoc = PDDocument.load(unencryptedPdfFile.toFile(), memUsageSettings);
|
||||
closer.register(mergeDoc);
|
||||
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
|
||||
pdfMergerUtility.appendDocument(doc, mergeDoc);
|
||||
}
|
||||
|
||||
// All other non-PDF-convertible files are ignored
|
||||
FormatHandler formatHandler = FormatHandlerUtil.find(file.getMimeType());
|
||||
if (formatHandler != null) {
|
||||
formatHandler.appendToPdf(unencryptedFile, doc, fitImageToPage, margin, memUsageSettings, closer);
|
||||
}
|
||||
}
|
||||
|
||||
@ -302,20 +118,6 @@ public class PdfUtil {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Render the first page of a PDF.
|
||||
*
|
||||
* @param unencryptedFile PDF document
|
||||
* @return Render of the first page
|
||||
*/
|
||||
public static BufferedImage renderFirstPage(Path unencryptedFile) throws IOException {
|
||||
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
|
||||
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
||||
return renderer.renderImage(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Register fonts.
|
||||
*/
|
||||
|
@ -0,0 +1,70 @@
|
||||
package com.sismics.docs.core.util.format;
|
||||
|
||||
import com.google.common.io.Closer;
|
||||
import com.sismics.util.context.ThreadLocalContext;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
/**
|
||||
* DOCX format handler.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class DocxFormatHandler implements FormatHandler {
|
||||
/**
|
||||
* Temporary PDF file.
|
||||
*/
|
||||
private Path temporaryPdfFile;
|
||||
|
||||
@Override
|
||||
public boolean accept(String mimeType) {
|
||||
return MimeType.OFFICE_DOCUMENT.equals(mimeType);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BufferedImage generateThumbnail(Path file) throws Exception {
|
||||
// Use the PDF format handler
|
||||
return new PdfFormatHandler().generateThumbnail(getGeneratedPdf(file));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String extractContent(String language, Path file) throws Exception {
|
||||
// Use the PDF format handler
|
||||
return new PdfFormatHandler().extractContent(language, getGeneratedPdf(file));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
|
||||
// Use the PDF format handler
|
||||
new PdfFormatHandler().appendToPdf(getGeneratedPdf(file), doc, fitImageToPage, margin, memUsageSettings, closer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a PDF from this DOCX.
|
||||
*
|
||||
* @param file File
|
||||
* @return PDF file
|
||||
* @throws Exception e
|
||||
*/
|
||||
private Path getGeneratedPdf(Path file) throws Exception {
|
||||
if (temporaryPdfFile == null) {
|
||||
temporaryPdfFile = ThreadLocalContext.get().createTemporaryFile();
|
||||
try (InputStream inputStream = Files.newInputStream(file);
|
||||
OutputStream outputStream = Files.newOutputStream(temporaryPdfFile)) {
|
||||
XWPFDocument document = new XWPFDocument(inputStream);
|
||||
org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
|
||||
org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, outputStream, options);
|
||||
}
|
||||
}
|
||||
|
||||
return temporaryPdfFile;
|
||||
}
|
||||
}
|
@ -0,0 +1,55 @@
|
||||
package com.sismics.docs.core.util.format;
|
||||
|
||||
import com.google.common.io.Closer;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.nio.file.Path;
|
||||
|
||||
/**
|
||||
* A format handler.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public interface FormatHandler {
|
||||
/**
|
||||
* Returns true if this format handler can handle this MIME type.
|
||||
*
|
||||
* @param mimeType MIME type
|
||||
* @return True if accepted
|
||||
*/
|
||||
boolean accept(String mimeType);
|
||||
|
||||
/**
|
||||
* Generate a thumbnail.
|
||||
*
|
||||
* @param file File
|
||||
* @return Thumbnail
|
||||
* @throws Exception e
|
||||
*/
|
||||
BufferedImage generateThumbnail(Path file) throws Exception;
|
||||
|
||||
/**
|
||||
* Extract text content.
|
||||
*
|
||||
* @param language Language
|
||||
* @param file File
|
||||
* @return Text content
|
||||
* @throws Exception e
|
||||
*/
|
||||
String extractContent(String language, Path file) throws Exception;
|
||||
|
||||
/**
|
||||
* Append to a PDF.
|
||||
*
|
||||
* @param file File
|
||||
* @param doc PDF document
|
||||
* @param fitImageToPage Fit image to page
|
||||
* @param margin Margin
|
||||
* @param memUsageSettings Memory usage
|
||||
* @param closer Closer
|
||||
* @throws Exception e
|
||||
*/
|
||||
void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception;
|
||||
}
|
@ -0,0 +1,45 @@
|
||||
package com.sismics.docs.core.util.format;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Format handler utilities.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class FormatHandlerUtil {
|
||||
/**
|
||||
* List of format handlers.
|
||||
*/
|
||||
private static final List<Class<? extends FormatHandler>> FORMAT_HANDLERS = Lists.newArrayList(
|
||||
DocxFormatHandler.class,
|
||||
OdtFormatHandler.class,
|
||||
VideoFormatHandler.class,
|
||||
PdfFormatHandler.class,
|
||||
TextPlainFormatHandler.class,
|
||||
ImageFormatHandler.class
|
||||
);
|
||||
|
||||
/**
|
||||
* Find a suitable format handler for this MIME type.
|
||||
*
|
||||
* @param mimeType MIME type
|
||||
* @return Instancied format handler
|
||||
*/
|
||||
public static FormatHandler find(String mimeType) {
|
||||
try {
|
||||
for (Class<? extends FormatHandler> formatHandlerClass : FORMAT_HANDLERS) {
|
||||
FormatHandler formatHandler = formatHandlerClass.newInstance();
|
||||
if (formatHandler.accept(mimeType)) {
|
||||
return formatHandler;
|
||||
}
|
||||
}
|
||||
} catch (InstantiationException | IllegalAccessException e) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
@ -0,0 +1,108 @@
|
||||
package com.sismics.docs.core.util.format;
|
||||
|
||||
import com.google.common.io.Closer;
|
||||
import com.sismics.docs.core.constant.Constants;
|
||||
import com.sismics.docs.core.util.FileUtil;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
/**
|
||||
* Image format handler.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class ImageFormatHandler implements FormatHandler {
|
||||
/**
|
||||
* Logger.
|
||||
*/
|
||||
private static final Logger log = LoggerFactory.getLogger(PdfFormatHandler.class);
|
||||
|
||||
/**
|
||||
* Saved MIME type.
|
||||
*/
|
||||
private String mimeType;
|
||||
|
||||
@Override
|
||||
public boolean accept(String mimeType) {
|
||||
this.mimeType = mimeType;
|
||||
return mimeType.equals(MimeType.IMAGE_GIF) || mimeType.equals(MimeType.IMAGE_PNG) || mimeType.equals(MimeType.IMAGE_JPEG);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BufferedImage generateThumbnail(Path file) throws IOException {
|
||||
try (InputStream inputStream = Files.newInputStream(file)) {
|
||||
return ImageIO.read(inputStream);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String extractContent(String language, Path file) {
|
||||
try (InputStream inputStream = Files.newInputStream(file)) {
|
||||
return FileUtil.ocrFile(language, ImageIO.read(inputStream));
|
||||
} catch (IOException e) {
|
||||
log.error("Error reading the image", e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
|
||||
PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages
|
||||
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page);
|
||||
InputStream storedFileInputStream = Files.newInputStream(file)) {
|
||||
// Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension
|
||||
PDImageXObject pdImage;
|
||||
switch (mimeType) {
|
||||
case MimeType.IMAGE_JPEG:
|
||||
pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream);
|
||||
break;
|
||||
case MimeType.IMAGE_GIF:
|
||||
case MimeType.IMAGE_PNG:
|
||||
BufferedImage bim = ImageIO.read(storedFileInputStream);
|
||||
pdImage = LosslessFactory.createFromImage(doc, bim);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
|
||||
// Do we want to fill the page with the image?
|
||||
if (fitImageToPage) {
|
||||
// Fill the page with the image
|
||||
float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * Constants.MM_PER_INCH;
|
||||
float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * Constants.MM_PER_INCH;
|
||||
|
||||
// Compare page format and image format
|
||||
if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) {
|
||||
float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight();
|
||||
contentStream.drawImage(pdImage, margin * Constants.MM_PER_INCH, heightAvailable + margin * Constants.MM_PER_INCH - imageHeight,
|
||||
widthAvailable, imageHeight);
|
||||
} else {
|
||||
float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth();
|
||||
contentStream.drawImage(pdImage, margin * Constants.MM_PER_INCH, margin * Constants.MM_PER_INCH,
|
||||
imageWidth, heightAvailable);
|
||||
}
|
||||
} else {
|
||||
// Draw the image as is
|
||||
contentStream.drawImage(pdImage, margin * Constants.MM_PER_INCH,
|
||||
page.getMediaBox().getHeight() - pdImage.getHeight() - margin * Constants.MM_PER_INCH);
|
||||
}
|
||||
}
|
||||
doc.addPage(page);
|
||||
}
|
||||
}
|
@ -0,0 +1,72 @@
|
||||
package com.sismics.docs.core.util.format;
|
||||
|
||||
import com.google.common.io.Closer;
|
||||
import com.sismics.util.context.ThreadLocalContext;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.odftoolkit.odfdom.converter.pdf.PdfConverter;
|
||||
import org.odftoolkit.odfdom.converter.pdf.PdfOptions;
|
||||
import org.odftoolkit.odfdom.doc.OdfTextDocument;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
/**
|
||||
* ODT format handler.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class OdtFormatHandler implements FormatHandler {
|
||||
/**
|
||||
* Temporary PDF file.
|
||||
*/
|
||||
private Path temporaryPdfFile;
|
||||
|
||||
@Override
|
||||
public boolean accept(String mimeType) {
|
||||
return MimeType.OPEN_DOCUMENT_TEXT.equals(mimeType);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BufferedImage generateThumbnail(Path file) throws Exception {
|
||||
// Use the PDF format handler
|
||||
return new PdfFormatHandler().generateThumbnail(getGeneratedPdf(file));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String extractContent(String language, Path file) throws Exception {
|
||||
// Use the PDF format handler
|
||||
return new PdfFormatHandler().extractContent(language, getGeneratedPdf(file));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
|
||||
// Use the PDF format handler
|
||||
new PdfFormatHandler().appendToPdf(getGeneratedPdf(file), doc, fitImageToPage, margin, memUsageSettings, closer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a PDF from this ODT.
|
||||
*
|
||||
* @param file File
|
||||
* @return PDF file
|
||||
* @throws Exception e
|
||||
*/
|
||||
private Path getGeneratedPdf(Path file) throws Exception {
|
||||
if (temporaryPdfFile == null) {
|
||||
temporaryPdfFile = ThreadLocalContext.get().createTemporaryFile();
|
||||
try (InputStream inputStream = Files.newInputStream(file);
|
||||
OutputStream outputStream = Files.newOutputStream(temporaryPdfFile)) {
|
||||
OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
|
||||
PdfOptions options = PdfOptions.create();
|
||||
PdfConverter.getInstance().convert(document, outputStream, options);
|
||||
}
|
||||
}
|
||||
|
||||
return temporaryPdfFile;
|
||||
}
|
||||
}
|
@ -0,0 +1,80 @@
|
||||
package com.sismics.docs.core.util.format;
|
||||
|
||||
import com.google.common.io.Closer;
|
||||
import com.sismics.docs.core.util.FileUtil;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
/**
|
||||
* PDF format handler.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class PdfFormatHandler implements FormatHandler {
|
||||
/**
|
||||
* Logger.
|
||||
*/
|
||||
private static final Logger log = LoggerFactory.getLogger(PdfFormatHandler.class);
|
||||
|
||||
@Override
|
||||
public boolean accept(String mimeType) {
|
||||
return mimeType.equals(MimeType.APPLICATION_PDF);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BufferedImage generateThumbnail(Path file) throws Exception {
|
||||
try (InputStream inputStream = Files.newInputStream(file);
|
||||
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
||||
return renderer.renderImage(0);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String extractContent(String language, Path file) {
|
||||
String content = null;
|
||||
try (InputStream inputStream = Files.newInputStream(file);
|
||||
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||
content = new PDFTextStripper().getText(pdfDocument);
|
||||
} catch (Exception e) {
|
||||
log.error("Error while extracting text from the PDF", e);
|
||||
}
|
||||
|
||||
// No text content, try to OCR it
|
||||
if (language != null && content != null && content.trim().isEmpty()) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
try (InputStream inputStream = Files.newInputStream(file);
|
||||
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
||||
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
|
||||
sb.append(" ");
|
||||
sb.append(FileUtil.ocrFile(language, renderer.renderImage(pageIndex)));
|
||||
}
|
||||
return sb.toString();
|
||||
} catch (Exception e) {
|
||||
log.error("Error while OCR-izing the PDF", e);
|
||||
}
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
|
||||
PDDocument mergeDoc = PDDocument.load(file.toFile(), memUsageSettings);
|
||||
closer.register(mergeDoc);
|
||||
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
|
||||
pdfMergerUtility.appendDocument(doc, mergeDoc);
|
||||
}
|
||||
}
|
@ -0,0 +1,56 @@
|
||||
package com.sismics.docs.core.util.format;
|
||||
|
||||
import com.google.common.base.Charsets;
|
||||
import com.google.common.io.Closer;
|
||||
import com.lowagie.text.*;
|
||||
import com.lowagie.text.pdf.PdfWriter;
|
||||
import com.sismics.util.context.ThreadLocalContext;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
/**
|
||||
* Text plain format handler.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class TextPlainFormatHandler implements FormatHandler {
|
||||
@Override
|
||||
public boolean accept(String mimeType) {
|
||||
return mimeType.equals(MimeType.TEXT_CSV) || mimeType.equals(MimeType.TEXT_PLAIN);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BufferedImage generateThumbnail(Path file) throws Exception {
|
||||
Document output = new Document(PageSize.A4, 40, 40, 40, 40);
|
||||
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
|
||||
OutputStream pdfOutputStream = Files.newOutputStream(tempFile);
|
||||
PdfWriter.getInstance(output, pdfOutputStream);
|
||||
|
||||
output.open();
|
||||
String content = new String(Files.readAllBytes(file), Charsets.UTF_8);
|
||||
Font font = FontFactory.getFont("LiberationMono-Regular");
|
||||
Paragraph paragraph = new Paragraph(content, font);
|
||||
paragraph.setAlignment(Element.ALIGN_LEFT);
|
||||
output.add(paragraph);
|
||||
output.close();
|
||||
|
||||
// Use the PDF format handler
|
||||
return new PdfFormatHandler().generateThumbnail(tempFile);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String extractContent(String language, Path file) throws Exception {
|
||||
return new String(Files.readAllBytes(file), "UTF-8");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) {
|
||||
// TODO Append the text file to the PDF
|
||||
}
|
||||
}
|
@ -1,10 +1,13 @@
|
||||
package com.sismics.util;
|
||||
package com.sismics.docs.core.util.format;
|
||||
|
||||
import com.google.common.base.Charsets;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.io.ByteStreams;
|
||||
import com.google.common.io.Closer;
|
||||
import com.sismics.util.io.InputStreamReaderThread;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
@ -15,27 +18,18 @@ import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Video processing utilities.
|
||||
* Video format handler.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class VideoUtil {
|
||||
/**
|
||||
* Returns true if this MIME type is a video.
|
||||
* @param mimeType MIME type
|
||||
* @return True if video
|
||||
*/
|
||||
public static boolean isVideo(String mimeType) {
|
||||
public class VideoFormatHandler implements FormatHandler {
|
||||
@Override
|
||||
public boolean accept(String mimeType) {
|
||||
return mimeType.equals(MimeType.VIDEO_MP4) || mimeType.equals(MimeType.VIDEO_WEBM);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a thumbnail from a video file.
|
||||
*
|
||||
* @param file Video file
|
||||
* @return Thumbnail
|
||||
*/
|
||||
public static BufferedImage getThumbnail(Path file) throws Exception {
|
||||
@Override
|
||||
public BufferedImage generateThumbnail(Path file) throws IOException {
|
||||
List<String> result = Lists.newLinkedList(Arrays.asList("ffmpeg", "-i"));
|
||||
result.add(file.toAbsolutePath().toString());
|
||||
result.addAll(Arrays.asList("-vf", "thumbnail", "-frames:v", "1", "-f", "mjpeg", "-"));
|
||||
@ -52,13 +46,8 @@ public class VideoUtil {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract metadata from a video file.
|
||||
*
|
||||
* @param file Video file
|
||||
* @return Metadata
|
||||
*/
|
||||
public static String getMetadata(Path file) {
|
||||
@Override
|
||||
public String extractContent(String language, Path file) {
|
||||
List<String> result = Lists.newLinkedList();
|
||||
result.add("mediainfo");
|
||||
result.add(file.toAbsolutePath().toString());
|
||||
@ -81,4 +70,9 @@ public class VideoUtil {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) {
|
||||
// Video cannot be appended to PDF files
|
||||
}
|
||||
}
|
@ -2,7 +2,6 @@ package com.sismics.util;
|
||||
|
||||
import com.google.common.base.Charsets;
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
|
||||
import javax.imageio.IIOImage;
|
||||
import javax.imageio.ImageIO;
|
||||
@ -67,15 +66,6 @@ public class ImageUtil {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this MIME type is an image.
|
||||
* @param mimeType MIME type
|
||||
* @return True if image
|
||||
*/
|
||||
public static boolean isImage(String mimeType) {
|
||||
return mimeType.equals(MimeType.IMAGE_GIF) || mimeType.equals(MimeType.IMAGE_PNG) || mimeType.equals(MimeType.IMAGE_JPEG);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute Gravatar hash.
|
||||
* See https://en.gravatar.com/site/implement/hash/.
|
||||
|
@ -1,7 +1,6 @@
|
||||
package com.sismics.util.mime;
|
||||
|
||||
import com.google.common.base.Charsets;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import org.apache.commons.compress.utils.IOUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
@ -15,7 +14,7 @@ import java.util.zip.ZipInputStream;
|
||||
/**
|
||||
* Utility to check MIME types.
|
||||
*
|
||||
* @author jtremeaux
|
||||
* @author bgamard
|
||||
*/
|
||||
public class MimeTypeUtil {
|
||||
/**
|
||||
@ -27,11 +26,14 @@ public class MimeTypeUtil {
|
||||
* @throws IOException e
|
||||
*/
|
||||
public static String guessMimeType(Path file, String name) throws IOException {
|
||||
String mimeType;
|
||||
try (InputStream is = Files.newInputStream(file)) {
|
||||
byte[] headerBytes = new byte[64];
|
||||
is.read(headerBytes);
|
||||
return guessMimeType(headerBytes, name);
|
||||
mimeType = guessMimeType(headerBytes, name);
|
||||
}
|
||||
|
||||
return guessOpenDocumentFormat(mimeType, file);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -116,18 +118,17 @@ public class MimeTypeUtil {
|
||||
* It's more costly than the simple header check, but needed because open document formats
|
||||
* are simple ZIP files on the outside and much bigger on the inside.
|
||||
*
|
||||
* @param file File
|
||||
* @param unencryptedFile File on disk
|
||||
* @param mimeType Currently detected MIME type
|
||||
* @param file File on disk
|
||||
* @return MIME type
|
||||
*/
|
||||
public static String guessOpenDocumentFormat(File file, Path unencryptedFile) {
|
||||
if (!MimeType.APPLICATION_ZIP.equals(file.getMimeType())) {
|
||||
private static String guessOpenDocumentFormat(String mimeType, Path file) {
|
||||
if (!MimeType.APPLICATION_ZIP.equals(mimeType)) {
|
||||
// open document formats are ZIP files
|
||||
return file.getMimeType();
|
||||
return mimeType;
|
||||
}
|
||||
|
||||
String mimeType = file.getMimeType();
|
||||
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
|
||||
try (InputStream inputStream = Files.newInputStream(file);
|
||||
ZipInputStream zipInputStream = new ZipInputStream(inputStream, Charsets.ISO_8859_1)) {
|
||||
ZipEntry archiveEntry = zipInputStream.getNextEntry();
|
||||
while (archiveEntry != null) {
|
||||
@ -151,7 +152,7 @@ public class MimeTypeUtil {
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// In case of any error, just give up and keep the ZIP MIME type
|
||||
return file.getMimeType();
|
||||
return mimeType;
|
||||
}
|
||||
|
||||
return mimeType;
|
||||
|
@ -4,7 +4,9 @@ import com.google.common.collect.Lists;
|
||||
import com.google.common.io.Resources;
|
||||
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.docs.core.util.format.*;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
import com.sismics.util.mime.MimeTypeUtil;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
@ -25,39 +27,40 @@ public class TestFileUtil {
|
||||
@Test
|
||||
public void extractContentOpenDocumentTextTest() throws Exception {
|
||||
Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
|
||||
File file = new File();
|
||||
file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
|
||||
Path pdfPath = PdfUtil.convertToPdf(file, path);
|
||||
String content = FileUtil.extractContent("eng", file, path, pdfPath);
|
||||
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "document.odt"));
|
||||
Assert.assertNotNull(formatHandler);
|
||||
Assert.assertTrue(formatHandler instanceof OdtFormatHandler);
|
||||
String content = formatHandler.extractContent("eng", path);
|
||||
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractContentOfficeDocumentTest() throws Exception {
|
||||
Path path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
|
||||
File file = new File();
|
||||
file.setMimeType(MimeType.OFFICE_DOCUMENT);
|
||||
Path pdfPath = PdfUtil.convertToPdf(file, path);
|
||||
String content = FileUtil.extractContent("eng", file, path, pdfPath);
|
||||
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "document.docx"));
|
||||
Assert.assertNotNull(formatHandler);
|
||||
Assert.assertTrue(formatHandler instanceof DocxFormatHandler);
|
||||
String content = formatHandler.extractContent("eng", path);
|
||||
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractContentPdf() throws Exception {
|
||||
Path path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI());
|
||||
File file = new File();
|
||||
file.setMimeType(MimeType.APPLICATION_PDF);
|
||||
String content = FileUtil.extractContent("eng", file, path, path);
|
||||
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "udhr.pdf"));
|
||||
Assert.assertNotNull(formatHandler);
|
||||
Assert.assertTrue(formatHandler instanceof PdfFormatHandler);
|
||||
String content = formatHandler.extractContent("eng", path);
|
||||
Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractContentScannedPdf() throws Exception {
|
||||
Path path = Paths.get(ClassLoader.getSystemResource("file/scanned.pdf").toURI());
|
||||
File file = new File();
|
||||
file.setMimeType(MimeType.APPLICATION_PDF);
|
||||
String content = FileUtil.extractContent("eng", file, path, path);
|
||||
System.out.println(content);
|
||||
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "scanned.pdf"));
|
||||
Assert.assertNotNull(formatHandler);
|
||||
Assert.assertTrue(formatHandler instanceof PdfFormatHandler);
|
||||
String content = formatHandler.extractContent("eng", path);
|
||||
Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
package com.sismics.util;
|
||||
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
import com.sismics.util.mime.MimeTypeUtil;
|
||||
import org.junit.Assert;
|
||||
@ -19,14 +18,10 @@ public class TestMimeTypeUtil {
|
||||
public void guessOpenDocumentFormatTest() throws Exception {
|
||||
// Detect ODT files
|
||||
Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
|
||||
File file = new File();
|
||||
file.setMimeType(MimeType.APPLICATION_ZIP);
|
||||
Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessOpenDocumentFormat(file, path));
|
||||
Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessMimeType(path, "document.odt"));
|
||||
|
||||
// Detect DOCX files
|
||||
path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
|
||||
file = new File();
|
||||
file.setMimeType(MimeType.APPLICATION_ZIP);
|
||||
Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessOpenDocumentFormat(file, path));
|
||||
Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessMimeType(path, "document.odt"));
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user