mirror of
https://github.com/sismics/docs.git
synced 2024-11-25 23:27:57 +01:00
Closes #182: format handling refactoring
This commit is contained in:
parent
996585d7ac
commit
7ea8d0c0f7
@ -87,4 +87,9 @@ public class Constants {
|
|||||||
* Email template for route step validate.
|
* Email template for route step validate.
|
||||||
*/
|
*/
|
||||||
public static final String EMAIL_TEMPLATE_ROUTE_STEP_VALIDATE = "route_step_validate";
|
public static final String EMAIL_TEMPLATE_ROUTE_STEP_VALIDATE = "route_step_validate";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* mm per inch.
|
||||||
|
*/
|
||||||
|
public static float MM_PER_INCH = 1 / (10 * 2.54f) * 72f;
|
||||||
}
|
}
|
||||||
|
@ -7,15 +7,22 @@ import com.sismics.docs.core.dao.lucene.LuceneDao;
|
|||||||
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
|
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
|
||||||
import com.sismics.docs.core.model.jpa.File;
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
import com.sismics.docs.core.model.jpa.User;
|
import com.sismics.docs.core.model.jpa.User;
|
||||||
|
import com.sismics.docs.core.util.DirectoryUtil;
|
||||||
import com.sismics.docs.core.util.EncryptionUtil;
|
import com.sismics.docs.core.util.EncryptionUtil;
|
||||||
import com.sismics.docs.core.util.FileUtil;
|
import com.sismics.docs.core.util.FileUtil;
|
||||||
import com.sismics.docs.core.util.PdfUtil;
|
|
||||||
import com.sismics.docs.core.util.TransactionUtil;
|
import com.sismics.docs.core.util.TransactionUtil;
|
||||||
import com.sismics.util.mime.MimeTypeUtil;
|
import com.sismics.docs.core.util.format.FormatHandler;
|
||||||
|
import com.sismics.docs.core.util.format.FormatHandlerUtil;
|
||||||
|
import com.sismics.util.ImageUtil;
|
||||||
|
import com.sismics.util.Scalr;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.crypto.Cipher;
|
import javax.crypto.Cipher;
|
||||||
|
import javax.crypto.CipherOutputStream;
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.text.MessageFormat;
|
import java.text.MessageFormat;
|
||||||
import java.util.concurrent.atomic.AtomicReference;
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
@ -42,16 +49,12 @@ public class FileCreatedAsyncListener {
|
|||||||
log.info("File created event: " + event.toString());
|
log.info("File created event: " + event.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Guess the mime type a second time, for open document format (first detected as simple ZIP file)
|
// Find a format handler
|
||||||
final File file = event.getFile();
|
final File file = event.getFile();
|
||||||
file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, event.getUnencryptedFile()));
|
FormatHandler formatHandler = FormatHandlerUtil.find(file.getMimeType());
|
||||||
|
if (formatHandler == null) {
|
||||||
// Convert to PDF if necessary (for thumbnail and text extraction)
|
log.error("Format unhandled: " + file.getMimeType());
|
||||||
Path unencryptedPdfFile = null;
|
return;
|
||||||
try {
|
|
||||||
unencryptedPdfFile = PdfUtil.convertToPdf(file, event.getUnencryptedFile());
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.error("Unable to convert to PDF", e);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the user from the database
|
// Get the user from the database
|
||||||
@ -71,15 +74,37 @@ public class FileCreatedAsyncListener {
|
|||||||
// Generate file variations
|
// Generate file variations
|
||||||
try {
|
try {
|
||||||
Cipher cipher = EncryptionUtil.getEncryptionCipher(user.get().getPrivateKey());
|
Cipher cipher = EncryptionUtil.getEncryptionCipher(user.get().getPrivateKey());
|
||||||
FileUtil.saveVariations(file, event.getUnencryptedFile(), unencryptedPdfFile, cipher);
|
BufferedImage image = formatHandler.generateThumbnail(event.getUnencryptedFile());
|
||||||
|
if (image != null) {
|
||||||
|
// Generate thumbnails from image
|
||||||
|
BufferedImage web = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 1280);
|
||||||
|
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 256);
|
||||||
|
image.flush();
|
||||||
|
|
||||||
|
// Write "web" encrypted image
|
||||||
|
Path outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_web");
|
||||||
|
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
|
||||||
|
ImageUtil.writeJpeg(web, outputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write "thumb" encrypted image
|
||||||
|
outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_thumb");
|
||||||
|
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
|
||||||
|
ImageUtil.writeJpeg(thumbnail, outputStream);
|
||||||
|
}
|
||||||
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("Unable to generate thumbnails", e);
|
log.error("Unable to generate thumbnails", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract text content from the file
|
// Extract text content from the file
|
||||||
long startTime = System.currentTimeMillis();
|
long startTime = System.currentTimeMillis();
|
||||||
final String content = FileUtil.extractContent(event.getLanguage(), file,
|
final AtomicReference<String> content = new AtomicReference<>();
|
||||||
event.getUnencryptedFile(), unencryptedPdfFile);
|
try {
|
||||||
|
content.set(formatHandler.extractContent(event.getLanguage(), event.getUnencryptedFile()));
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("Error extracting content from: " + event.getFile());
|
||||||
|
}
|
||||||
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
|
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
|
||||||
|
|
||||||
// Save the file to database
|
// Save the file to database
|
||||||
@ -92,7 +117,7 @@ public class FileCreatedAsyncListener {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
file.setContent(content);
|
file.setContent(content.get());
|
||||||
fileDao.update(file);
|
fileDao.update(file);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -10,9 +10,7 @@ import com.sismics.docs.core.model.jpa.File;
|
|||||||
import com.sismics.docs.core.model.jpa.User;
|
import com.sismics.docs.core.model.jpa.User;
|
||||||
import com.sismics.tess4j.Tesseract;
|
import com.sismics.tess4j.Tesseract;
|
||||||
import com.sismics.util.ImageDeskew;
|
import com.sismics.util.ImageDeskew;
|
||||||
import com.sismics.util.ImageUtil;
|
|
||||||
import com.sismics.util.Scalr;
|
import com.sismics.util.Scalr;
|
||||||
import com.sismics.util.VideoUtil;
|
|
||||||
import com.sismics.util.context.ThreadLocalContext;
|
import com.sismics.util.context.ThreadLocalContext;
|
||||||
import com.sismics.util.mime.MimeTypeUtil;
|
import com.sismics.util.mime.MimeTypeUtil;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
@ -21,12 +19,9 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import javax.crypto.Cipher;
|
import javax.crypto.Cipher;
|
||||||
import javax.crypto.CipherInputStream;
|
import javax.crypto.CipherInputStream;
|
||||||
import javax.crypto.CipherOutputStream;
|
|
||||||
import javax.imageio.ImageIO;
|
|
||||||
import java.awt.image.BufferedImage;
|
import java.awt.image.BufferedImage;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
@ -49,40 +44,14 @@ public class FileUtil {
|
|||||||
*/
|
*/
|
||||||
private static Set<String> processingFileSet = Collections.synchronizedSet(new HashSet<String>());
|
private static Set<String> processingFileSet = Collections.synchronizedSet(new HashSet<String>());
|
||||||
|
|
||||||
/**
|
|
||||||
* Extract content from a file.
|
|
||||||
*
|
|
||||||
* @param language Language to extract
|
|
||||||
* @param file File to extract
|
|
||||||
* @param unencryptedFile Unencrypted file
|
|
||||||
* @param unencryptedPdfFile Unencrypted PDF file
|
|
||||||
* @return Content extract
|
|
||||||
*/
|
|
||||||
public static String extractContent(String language, File file, Path unencryptedFile, Path unencryptedPdfFile) {
|
|
||||||
String content = null;
|
|
||||||
if (language == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ImageUtil.isImage(file.getMimeType())) {
|
|
||||||
content = ocrFile(unencryptedFile, language);
|
|
||||||
} else if (VideoUtil.isVideo(file.getMimeType())) {
|
|
||||||
content = VideoUtil.getMetadata(unencryptedFile);
|
|
||||||
} else if (unencryptedPdfFile != null) {
|
|
||||||
content = PdfUtil.extractPdf(unencryptedPdfFile, language);
|
|
||||||
}
|
|
||||||
|
|
||||||
return content;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Optical character recognition on an image.
|
* Optical character recognition on an image.
|
||||||
*
|
*
|
||||||
* @param image Buffered image
|
|
||||||
* @param language Language to OCR
|
* @param language Language to OCR
|
||||||
|
* @param image Buffered image
|
||||||
* @return Content extracted
|
* @return Content extracted
|
||||||
*/
|
*/
|
||||||
public static String ocrFile(BufferedImage image, String language) {
|
public static String ocrFile(String language, BufferedImage image) {
|
||||||
// Upscale, grayscale and deskew the image
|
// Upscale, grayscale and deskew the image
|
||||||
String content = null;
|
String content = null;
|
||||||
BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
|
BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
|
||||||
@ -105,66 +74,6 @@ public class FileUtil {
|
|||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Optical character recognition on a file.
|
|
||||||
*
|
|
||||||
* @param unecryptedFile Unencrypted file
|
|
||||||
* @param language Language to OCR
|
|
||||||
* @return Content extracted
|
|
||||||
*/
|
|
||||||
private static String ocrFile(Path unecryptedFile, String language) {
|
|
||||||
BufferedImage image;
|
|
||||||
try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
|
|
||||||
image = ImageIO.read(inputStream);
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.error("Error reading the image", e);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ocrFile(image, language);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Generate file variations.
|
|
||||||
*
|
|
||||||
* @param file File from database
|
|
||||||
* @param unencryptedFile Unencrypted file
|
|
||||||
* @param unencryptedPdfFile Unencrypted PDF file
|
|
||||||
* @param cipher Cipher to use for encryption
|
|
||||||
*/
|
|
||||||
public static void saveVariations(File file, Path unencryptedFile, Path unencryptedPdfFile, Cipher cipher) throws Exception {
|
|
||||||
BufferedImage image = null;
|
|
||||||
if (ImageUtil.isImage(file.getMimeType())) {
|
|
||||||
try (InputStream inputStream = Files.newInputStream(unencryptedFile)) {
|
|
||||||
image = ImageIO.read(inputStream);
|
|
||||||
}
|
|
||||||
} else if (VideoUtil.isVideo(file.getMimeType())) {
|
|
||||||
image = VideoUtil.getThumbnail(unencryptedFile);
|
|
||||||
} else if (unencryptedPdfFile != null) {
|
|
||||||
// Generate preview from the first page of the PDF
|
|
||||||
image = PdfUtil.renderFirstPage(unencryptedPdfFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (image != null) {
|
|
||||||
// Generate thumbnails from image
|
|
||||||
BufferedImage web = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 1280);
|
|
||||||
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 256);
|
|
||||||
image.flush();
|
|
||||||
|
|
||||||
// Write "web" encrypted image
|
|
||||||
Path outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_web");
|
|
||||||
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
|
|
||||||
ImageUtil.writeJpeg(web, outputStream);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write "thumb" encrypted image
|
|
||||||
outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_thumb");
|
|
||||||
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
|
|
||||||
ImageUtil.writeJpeg(thumbnail, outputStream);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove a file from the storage filesystem.
|
* Remove a file from the storage filesystem.
|
||||||
*
|
*
|
||||||
|
@ -1,39 +1,23 @@
|
|||||||
package com.sismics.docs.core.util;
|
package com.sismics.docs.core.util;
|
||||||
|
|
||||||
import com.google.common.base.Charsets;
|
|
||||||
import com.google.common.base.Strings;
|
import com.google.common.base.Strings;
|
||||||
import com.google.common.io.ByteStreams;
|
import com.google.common.io.ByteStreams;
|
||||||
import com.google.common.io.Closer;
|
import com.google.common.io.Closer;
|
||||||
import com.google.common.io.Resources;
|
import com.google.common.io.Resources;
|
||||||
import com.lowagie.text.*;
|
import com.lowagie.text.FontFactory;
|
||||||
import com.lowagie.text.pdf.PdfWriter;
|
import com.sismics.docs.core.constant.Constants;
|
||||||
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
||||||
import com.sismics.docs.core.model.jpa.File;
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
|
import com.sismics.docs.core.util.format.FormatHandler;
|
||||||
|
import com.sismics.docs.core.util.format.FormatHandlerUtil;
|
||||||
import com.sismics.docs.core.util.pdf.PdfPage;
|
import com.sismics.docs.core.util.pdf.PdfPage;
|
||||||
import com.sismics.util.ImageUtil;
|
|
||||||
import com.sismics.util.context.ThreadLocalContext;
|
|
||||||
import com.sismics.util.mime.MimeType;
|
|
||||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.DocsPDType1Font;
|
import org.apache.pdfbox.pdmodel.font.DocsPDType1Font;
|
||||||
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
|
|
||||||
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
|
|
||||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
|
||||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
|
||||||
import org.apache.pdfbox.text.PDFTextStripper;
|
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
|
||||||
import org.odftoolkit.odfdom.converter.pdf.PdfConverter;
|
|
||||||
import org.odftoolkit.odfdom.converter.pdf.PdfOptions;
|
|
||||||
import org.odftoolkit.odfdom.doc.OdfTextDocument;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.imageio.ImageIO;
|
|
||||||
import java.awt.image.BufferedImage;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
@ -55,127 +39,6 @@ public class PdfUtil {
|
|||||||
*/
|
*/
|
||||||
private static final Logger log = LoggerFactory.getLogger(PdfUtil.class);
|
private static final Logger log = LoggerFactory.getLogger(PdfUtil.class);
|
||||||
|
|
||||||
/**
|
|
||||||
* Extract text from a PDF.
|
|
||||||
*
|
|
||||||
* @param unencryptedPdfFile Unencrypted PDF file
|
|
||||||
* @param language Language
|
|
||||||
* @return Content extracted
|
|
||||||
*/
|
|
||||||
public static String extractPdf(Path unencryptedPdfFile, String language) {
|
|
||||||
String content = null;
|
|
||||||
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
|
|
||||||
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
|
||||||
content = new PDFTextStripper().getText(pdfDocument);
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.error("Error while extracting text from the PDF", e);
|
|
||||||
}
|
|
||||||
|
|
||||||
// No text content, try to OCR it
|
|
||||||
if (language != null && content != null && content.trim().isEmpty()) {
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
|
|
||||||
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
|
||||||
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
|
||||||
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
|
|
||||||
sb.append(" ");
|
|
||||||
sb.append(FileUtil.ocrFile(renderer.renderImage(pageIndex), language));
|
|
||||||
}
|
|
||||||
return sb.toString();
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.error("Error while OCR-izing the PDF", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return content;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert a file to PDF if necessary.
|
|
||||||
*
|
|
||||||
* @param file File
|
|
||||||
* @param unencryptedFile Unencrypted file
|
|
||||||
* @return PDF temporary file
|
|
||||||
*/
|
|
||||||
public static Path convertToPdf(File file, Path unencryptedFile) throws Exception {
|
|
||||||
if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
|
|
||||||
// It's already PDF, just return the file
|
|
||||||
return unencryptedFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
|
|
||||||
return convertOfficeDocument(unencryptedFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
|
|
||||||
return convertOpenDocumentText(unencryptedFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (file.getMimeType().equals(MimeType.TEXT_PLAIN) || file.getMimeType().equals(MimeType.TEXT_CSV)) {
|
|
||||||
return convertTextPlain(unencryptedFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
// PDF conversion not necessary/possible
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert a text plain document to PDF.
|
|
||||||
*
|
|
||||||
* @param unencryptedFile Unencrypted file
|
|
||||||
* @return PDF file
|
|
||||||
*/
|
|
||||||
private static Path convertTextPlain(Path unencryptedFile) throws Exception {
|
|
||||||
Document output = new Document(PageSize.A4, 40, 40, 40, 40);
|
|
||||||
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
|
|
||||||
OutputStream pdfOutputStream = Files.newOutputStream(tempFile);
|
|
||||||
PdfWriter.getInstance(output, pdfOutputStream);
|
|
||||||
|
|
||||||
output.open();
|
|
||||||
String content = new String(Files.readAllBytes(unencryptedFile), Charsets.UTF_8);
|
|
||||||
Font font = FontFactory.getFont("LiberationMono-Regular");
|
|
||||||
Paragraph paragraph = new Paragraph(content, font);
|
|
||||||
paragraph.setAlignment(Element.ALIGN_LEFT);
|
|
||||||
output.add(paragraph);
|
|
||||||
output.close();
|
|
||||||
|
|
||||||
return tempFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert an open document text file to PDF.
|
|
||||||
*
|
|
||||||
* @param unencryptedFile Unencrypted file
|
|
||||||
* @return PDF file
|
|
||||||
*/
|
|
||||||
private static Path convertOpenDocumentText(Path unencryptedFile) throws Exception {
|
|
||||||
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
|
|
||||||
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
|
|
||||||
OutputStream outputStream = Files.newOutputStream(tempFile)) {
|
|
||||||
OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
|
|
||||||
PdfOptions options = PdfOptions.create();
|
|
||||||
PdfConverter.getInstance().convert(document, outputStream, options);
|
|
||||||
}
|
|
||||||
return tempFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert an Office document to PDF.
|
|
||||||
*
|
|
||||||
* @param unencryptedFile Unencrypted file
|
|
||||||
* @return PDF file
|
|
||||||
*/
|
|
||||||
private static Path convertOfficeDocument(Path unencryptedFile) throws Exception {
|
|
||||||
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
|
|
||||||
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
|
|
||||||
OutputStream outputStream = Files.newOutputStream(tempFile)) {
|
|
||||||
XWPFDocument document = new XWPFDocument(inputStream);
|
|
||||||
org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
|
|
||||||
org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, outputStream, options);
|
|
||||||
}
|
|
||||||
return tempFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert a document and its files to a merged PDF file.
|
* Convert a document and its files to a merged PDF file.
|
||||||
*
|
*
|
||||||
@ -192,7 +55,6 @@ public class PdfUtil {
|
|||||||
Closer closer = Closer.create();
|
Closer closer = Closer.create();
|
||||||
MemoryUsageSetting memUsageSettings = MemoryUsageSetting.setupMixed(1000000); // 1MB max memory usage
|
MemoryUsageSetting memUsageSettings = MemoryUsageSetting.setupMixed(1000000); // 1MB max memory usage
|
||||||
memUsageSettings.setTempDir(new java.io.File(System.getProperty("java.io.tmpdir"))); // To OS temp
|
memUsageSettings.setTempDir(new java.io.File(System.getProperty("java.io.tmpdir"))); // To OS temp
|
||||||
float mmPerInch = 1 / (10 * 2.54f) * 72f;
|
|
||||||
|
|
||||||
// Create a blank PDF
|
// Create a blank PDF
|
||||||
try (PDDocument doc = new PDDocument(memUsageSettings)) {
|
try (PDDocument doc = new PDDocument(memUsageSettings)) {
|
||||||
@ -200,7 +62,7 @@ public class PdfUtil {
|
|||||||
if (metadata) {
|
if (metadata) {
|
||||||
PDPage page = new PDPage();
|
PDPage page = new PDPage();
|
||||||
doc.addPage(page);
|
doc.addPage(page);
|
||||||
try (PdfPage pdfPage = new PdfPage(doc, page, margin * mmPerInch, DocsPDType1Font.HELVETICA, 12)) {
|
try (PdfPage pdfPage = new PdfPage(doc, page, margin * Constants.MM_PER_INCH, DocsPDType1Font.HELVETICA, 12)) {
|
||||||
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
|
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
|
||||||
pdfPage.addText(documentDto.getTitle(), true, DocsPDType1Font.HELVETICA_BOLD, 16)
|
pdfPage.addText(documentDto.getTitle(), true, DocsPDType1Font.HELVETICA_BOLD, 16)
|
||||||
.newLine()
|
.newLine()
|
||||||
@ -245,55 +107,9 @@ public class PdfUtil {
|
|||||||
|
|
||||||
// Decrypt the file to a temporary file
|
// Decrypt the file to a temporary file
|
||||||
Path unencryptedFile = EncryptionUtil.decryptFile(storedFile, file.getPrivateKey());
|
Path unencryptedFile = EncryptionUtil.decryptFile(storedFile, file.getPrivateKey());
|
||||||
|
FormatHandler formatHandler = FormatHandlerUtil.find(file.getMimeType());
|
||||||
if (ImageUtil.isImage(file.getMimeType())) {
|
if (formatHandler != null) {
|
||||||
PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages
|
formatHandler.appendToPdf(unencryptedFile, doc, fitImageToPage, margin, memUsageSettings, closer);
|
||||||
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page);
|
|
||||||
InputStream storedFileInputStream = Files.newInputStream(unencryptedFile)) {
|
|
||||||
// Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension
|
|
||||||
PDImageXObject pdImage = null;
|
|
||||||
if (file.getMimeType().equals(MimeType.IMAGE_JPEG)) {
|
|
||||||
pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream);
|
|
||||||
} else if (file.getMimeType().equals(MimeType.IMAGE_GIF) || file.getMimeType().equals(MimeType.IMAGE_PNG)) {
|
|
||||||
BufferedImage bim = ImageIO.read(storedFileInputStream);
|
|
||||||
pdImage = LosslessFactory.createFromImage(doc, bim);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Do we want to fill the page with the image?
|
|
||||||
if (fitImageToPage) {
|
|
||||||
// Fill the page with the image
|
|
||||||
float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * mmPerInch;
|
|
||||||
float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * mmPerInch;
|
|
||||||
|
|
||||||
// Compare page format and image format
|
|
||||||
if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) {
|
|
||||||
float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight();
|
|
||||||
contentStream.drawImage(pdImage, margin * mmPerInch, heightAvailable + margin * mmPerInch - imageHeight,
|
|
||||||
widthAvailable, imageHeight);
|
|
||||||
} else {
|
|
||||||
float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth();
|
|
||||||
contentStream.drawImage(pdImage, margin * mmPerInch, margin * mmPerInch,
|
|
||||||
imageWidth, heightAvailable);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Draw the image as is
|
|
||||||
contentStream.drawImage(pdImage, margin * mmPerInch,
|
|
||||||
page.getMediaBox().getHeight() - pdImage.getHeight() - margin * mmPerInch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
doc.addPage(page);
|
|
||||||
} else {
|
|
||||||
// Try to convert the file to PDF
|
|
||||||
Path unencryptedPdfFile = convertToPdf(file, unencryptedFile);
|
|
||||||
if (unencryptedPdfFile != null) {
|
|
||||||
// This file is convertible to PDF, just add it to the end
|
|
||||||
PDDocument mergeDoc = PDDocument.load(unencryptedPdfFile.toFile(), memUsageSettings);
|
|
||||||
closer.register(mergeDoc);
|
|
||||||
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
|
|
||||||
pdfMergerUtility.appendDocument(doc, mergeDoc);
|
|
||||||
}
|
|
||||||
|
|
||||||
// All other non-PDF-convertible files are ignored
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -302,20 +118,6 @@ public class PdfUtil {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Render the first page of a PDF.
|
|
||||||
*
|
|
||||||
* @param unencryptedFile PDF document
|
|
||||||
* @return Render of the first page
|
|
||||||
*/
|
|
||||||
public static BufferedImage renderFirstPage(Path unencryptedFile) throws IOException {
|
|
||||||
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
|
|
||||||
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
|
||||||
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
|
||||||
return renderer.renderImage(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Register fonts.
|
* Register fonts.
|
||||||
*/
|
*/
|
||||||
|
@ -0,0 +1,70 @@
|
|||||||
|
package com.sismics.docs.core.util.format;
|
||||||
|
|
||||||
|
import com.google.common.io.Closer;
|
||||||
|
import com.sismics.util.context.ThreadLocalContext;
|
||||||
|
import com.sismics.util.mime.MimeType;
|
||||||
|
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||||
|
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* DOCX format handler.
|
||||||
|
*
|
||||||
|
* @author bgamard
|
||||||
|
*/
|
||||||
|
public class DocxFormatHandler implements FormatHandler {
|
||||||
|
/**
|
||||||
|
* Temporary PDF file.
|
||||||
|
*/
|
||||||
|
private Path temporaryPdfFile;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean accept(String mimeType) {
|
||||||
|
return MimeType.OFFICE_DOCUMENT.equals(mimeType);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BufferedImage generateThumbnail(Path file) throws Exception {
|
||||||
|
// Use the PDF format handler
|
||||||
|
return new PdfFormatHandler().generateThumbnail(getGeneratedPdf(file));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String extractContent(String language, Path file) throws Exception {
|
||||||
|
// Use the PDF format handler
|
||||||
|
return new PdfFormatHandler().extractContent(language, getGeneratedPdf(file));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
|
||||||
|
// Use the PDF format handler
|
||||||
|
new PdfFormatHandler().appendToPdf(getGeneratedPdf(file), doc, fitImageToPage, margin, memUsageSettings, closer);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a PDF from this DOCX.
|
||||||
|
*
|
||||||
|
* @param file File
|
||||||
|
* @return PDF file
|
||||||
|
* @throws Exception e
|
||||||
|
*/
|
||||||
|
private Path getGeneratedPdf(Path file) throws Exception {
|
||||||
|
if (temporaryPdfFile == null) {
|
||||||
|
temporaryPdfFile = ThreadLocalContext.get().createTemporaryFile();
|
||||||
|
try (InputStream inputStream = Files.newInputStream(file);
|
||||||
|
OutputStream outputStream = Files.newOutputStream(temporaryPdfFile)) {
|
||||||
|
XWPFDocument document = new XWPFDocument(inputStream);
|
||||||
|
org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
|
||||||
|
org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, outputStream, options);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return temporaryPdfFile;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,55 @@
|
|||||||
|
package com.sismics.docs.core.util.format;
|
||||||
|
|
||||||
|
import com.google.common.io.Closer;
|
||||||
|
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A format handler.
|
||||||
|
*
|
||||||
|
* @author bgamard
|
||||||
|
*/
|
||||||
|
public interface FormatHandler {
|
||||||
|
/**
|
||||||
|
* Returns true if this format handler can handle this MIME type.
|
||||||
|
*
|
||||||
|
* @param mimeType MIME type
|
||||||
|
* @return True if accepted
|
||||||
|
*/
|
||||||
|
boolean accept(String mimeType);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a thumbnail.
|
||||||
|
*
|
||||||
|
* @param file File
|
||||||
|
* @return Thumbnail
|
||||||
|
* @throws Exception e
|
||||||
|
*/
|
||||||
|
BufferedImage generateThumbnail(Path file) throws Exception;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract text content.
|
||||||
|
*
|
||||||
|
* @param language Language
|
||||||
|
* @param file File
|
||||||
|
* @return Text content
|
||||||
|
* @throws Exception e
|
||||||
|
*/
|
||||||
|
String extractContent(String language, Path file) throws Exception;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Append to a PDF.
|
||||||
|
*
|
||||||
|
* @param file File
|
||||||
|
* @param doc PDF document
|
||||||
|
* @param fitImageToPage Fit image to page
|
||||||
|
* @param margin Margin
|
||||||
|
* @param memUsageSettings Memory usage
|
||||||
|
* @param closer Closer
|
||||||
|
* @throws Exception e
|
||||||
|
*/
|
||||||
|
void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception;
|
||||||
|
}
|
@ -0,0 +1,45 @@
|
|||||||
|
package com.sismics.docs.core.util.format;
|
||||||
|
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Format handler utilities.
|
||||||
|
*
|
||||||
|
* @author bgamard
|
||||||
|
*/
|
||||||
|
public class FormatHandlerUtil {
|
||||||
|
/**
|
||||||
|
* List of format handlers.
|
||||||
|
*/
|
||||||
|
private static final List<Class<? extends FormatHandler>> FORMAT_HANDLERS = Lists.newArrayList(
|
||||||
|
DocxFormatHandler.class,
|
||||||
|
OdtFormatHandler.class,
|
||||||
|
VideoFormatHandler.class,
|
||||||
|
PdfFormatHandler.class,
|
||||||
|
TextPlainFormatHandler.class,
|
||||||
|
ImageFormatHandler.class
|
||||||
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find a suitable format handler for this MIME type.
|
||||||
|
*
|
||||||
|
* @param mimeType MIME type
|
||||||
|
* @return Instancied format handler
|
||||||
|
*/
|
||||||
|
public static FormatHandler find(String mimeType) {
|
||||||
|
try {
|
||||||
|
for (Class<? extends FormatHandler> formatHandlerClass : FORMAT_HANDLERS) {
|
||||||
|
FormatHandler formatHandler = formatHandlerClass.newInstance();
|
||||||
|
if (formatHandler.accept(mimeType)) {
|
||||||
|
return formatHandler;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (InstantiationException | IllegalAccessException e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,108 @@
|
|||||||
|
package com.sismics.docs.core.util.format;
|
||||||
|
|
||||||
|
import com.google.common.io.Closer;
|
||||||
|
import com.sismics.docs.core.constant.Constants;
|
||||||
|
import com.sismics.docs.core.util.FileUtil;
|
||||||
|
import com.sismics.util.mime.MimeType;
|
||||||
|
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||||
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import javax.imageio.ImageIO;
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Image format handler.
|
||||||
|
*
|
||||||
|
* @author bgamard
|
||||||
|
*/
|
||||||
|
public class ImageFormatHandler implements FormatHandler {
|
||||||
|
/**
|
||||||
|
* Logger.
|
||||||
|
*/
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(PdfFormatHandler.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Saved MIME type.
|
||||||
|
*/
|
||||||
|
private String mimeType;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean accept(String mimeType) {
|
||||||
|
this.mimeType = mimeType;
|
||||||
|
return mimeType.equals(MimeType.IMAGE_GIF) || mimeType.equals(MimeType.IMAGE_PNG) || mimeType.equals(MimeType.IMAGE_JPEG);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BufferedImage generateThumbnail(Path file) throws IOException {
|
||||||
|
try (InputStream inputStream = Files.newInputStream(file)) {
|
||||||
|
return ImageIO.read(inputStream);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String extractContent(String language, Path file) {
|
||||||
|
try (InputStream inputStream = Files.newInputStream(file)) {
|
||||||
|
return FileUtil.ocrFile(language, ImageIO.read(inputStream));
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.error("Error reading the image", e);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
|
||||||
|
PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages
|
||||||
|
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page);
|
||||||
|
InputStream storedFileInputStream = Files.newInputStream(file)) {
|
||||||
|
// Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension
|
||||||
|
PDImageXObject pdImage;
|
||||||
|
switch (mimeType) {
|
||||||
|
case MimeType.IMAGE_JPEG:
|
||||||
|
pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream);
|
||||||
|
break;
|
||||||
|
case MimeType.IMAGE_GIF:
|
||||||
|
case MimeType.IMAGE_PNG:
|
||||||
|
BufferedImage bim = ImageIO.read(storedFileInputStream);
|
||||||
|
pdImage = LosslessFactory.createFromImage(doc, bim);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Do we want to fill the page with the image?
|
||||||
|
if (fitImageToPage) {
|
||||||
|
// Fill the page with the image
|
||||||
|
float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * Constants.MM_PER_INCH;
|
||||||
|
float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * Constants.MM_PER_INCH;
|
||||||
|
|
||||||
|
// Compare page format and image format
|
||||||
|
if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) {
|
||||||
|
float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight();
|
||||||
|
contentStream.drawImage(pdImage, margin * Constants.MM_PER_INCH, heightAvailable + margin * Constants.MM_PER_INCH - imageHeight,
|
||||||
|
widthAvailable, imageHeight);
|
||||||
|
} else {
|
||||||
|
float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth();
|
||||||
|
contentStream.drawImage(pdImage, margin * Constants.MM_PER_INCH, margin * Constants.MM_PER_INCH,
|
||||||
|
imageWidth, heightAvailable);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Draw the image as is
|
||||||
|
contentStream.drawImage(pdImage, margin * Constants.MM_PER_INCH,
|
||||||
|
page.getMediaBox().getHeight() - pdImage.getHeight() - margin * Constants.MM_PER_INCH);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
doc.addPage(page);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,72 @@
|
|||||||
|
package com.sismics.docs.core.util.format;
|
||||||
|
|
||||||
|
import com.google.common.io.Closer;
|
||||||
|
import com.sismics.util.context.ThreadLocalContext;
|
||||||
|
import com.sismics.util.mime.MimeType;
|
||||||
|
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.odftoolkit.odfdom.converter.pdf.PdfConverter;
|
||||||
|
import org.odftoolkit.odfdom.converter.pdf.PdfOptions;
|
||||||
|
import org.odftoolkit.odfdom.doc.OdfTextDocument;
|
||||||
|
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* ODT format handler.
|
||||||
|
*
|
||||||
|
* @author bgamard
|
||||||
|
*/
|
||||||
|
public class OdtFormatHandler implements FormatHandler {
|
||||||
|
/**
|
||||||
|
* Temporary PDF file.
|
||||||
|
*/
|
||||||
|
private Path temporaryPdfFile;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean accept(String mimeType) {
|
||||||
|
return MimeType.OPEN_DOCUMENT_TEXT.equals(mimeType);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BufferedImage generateThumbnail(Path file) throws Exception {
|
||||||
|
// Use the PDF format handler
|
||||||
|
return new PdfFormatHandler().generateThumbnail(getGeneratedPdf(file));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String extractContent(String language, Path file) throws Exception {
|
||||||
|
// Use the PDF format handler
|
||||||
|
return new PdfFormatHandler().extractContent(language, getGeneratedPdf(file));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
|
||||||
|
// Use the PDF format handler
|
||||||
|
new PdfFormatHandler().appendToPdf(getGeneratedPdf(file), doc, fitImageToPage, margin, memUsageSettings, closer);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a PDF from this ODT.
|
||||||
|
*
|
||||||
|
* @param file File
|
||||||
|
* @return PDF file
|
||||||
|
* @throws Exception e
|
||||||
|
*/
|
||||||
|
private Path getGeneratedPdf(Path file) throws Exception {
|
||||||
|
if (temporaryPdfFile == null) {
|
||||||
|
temporaryPdfFile = ThreadLocalContext.get().createTemporaryFile();
|
||||||
|
try (InputStream inputStream = Files.newInputStream(file);
|
||||||
|
OutputStream outputStream = Files.newOutputStream(temporaryPdfFile)) {
|
||||||
|
OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
|
||||||
|
PdfOptions options = PdfOptions.create();
|
||||||
|
PdfConverter.getInstance().convert(document, outputStream, options);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return temporaryPdfFile;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,80 @@
|
|||||||
|
package com.sismics.docs.core.util.format;
|
||||||
|
|
||||||
|
import com.google.common.io.Closer;
|
||||||
|
import com.sismics.docs.core.util.FileUtil;
|
||||||
|
import com.sismics.util.mime.MimeType;
|
||||||
|
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||||
|
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||||
|
import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PDF format handler.
|
||||||
|
*
|
||||||
|
* @author bgamard
|
||||||
|
*/
|
||||||
|
public class PdfFormatHandler implements FormatHandler {
|
||||||
|
/**
|
||||||
|
* Logger.
|
||||||
|
*/
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(PdfFormatHandler.class);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean accept(String mimeType) {
|
||||||
|
return mimeType.equals(MimeType.APPLICATION_PDF);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BufferedImage generateThumbnail(Path file) throws Exception {
|
||||||
|
try (InputStream inputStream = Files.newInputStream(file);
|
||||||
|
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||||
|
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
||||||
|
return renderer.renderImage(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String extractContent(String language, Path file) {
|
||||||
|
String content = null;
|
||||||
|
try (InputStream inputStream = Files.newInputStream(file);
|
||||||
|
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||||
|
content = new PDFTextStripper().getText(pdfDocument);
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("Error while extracting text from the PDF", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
// No text content, try to OCR it
|
||||||
|
if (language != null && content != null && content.trim().isEmpty()) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
try (InputStream inputStream = Files.newInputStream(file);
|
||||||
|
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||||
|
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
||||||
|
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
|
||||||
|
sb.append(" ");
|
||||||
|
sb.append(FileUtil.ocrFile(language, renderer.renderImage(pageIndex)));
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("Error while OCR-izing the PDF", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return content;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
|
||||||
|
PDDocument mergeDoc = PDDocument.load(file.toFile(), memUsageSettings);
|
||||||
|
closer.register(mergeDoc);
|
||||||
|
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
|
||||||
|
pdfMergerUtility.appendDocument(doc, mergeDoc);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,56 @@
|
|||||||
|
package com.sismics.docs.core.util.format;
|
||||||
|
|
||||||
|
import com.google.common.base.Charsets;
|
||||||
|
import com.google.common.io.Closer;
|
||||||
|
import com.lowagie.text.*;
|
||||||
|
import com.lowagie.text.pdf.PdfWriter;
|
||||||
|
import com.sismics.util.context.ThreadLocalContext;
|
||||||
|
import com.sismics.util.mime.MimeType;
|
||||||
|
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Text plain format handler.
|
||||||
|
*
|
||||||
|
* @author bgamard
|
||||||
|
*/
|
||||||
|
public class TextPlainFormatHandler implements FormatHandler {
|
||||||
|
@Override
|
||||||
|
public boolean accept(String mimeType) {
|
||||||
|
return mimeType.equals(MimeType.TEXT_CSV) || mimeType.equals(MimeType.TEXT_PLAIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BufferedImage generateThumbnail(Path file) throws Exception {
|
||||||
|
Document output = new Document(PageSize.A4, 40, 40, 40, 40);
|
||||||
|
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
|
||||||
|
OutputStream pdfOutputStream = Files.newOutputStream(tempFile);
|
||||||
|
PdfWriter.getInstance(output, pdfOutputStream);
|
||||||
|
|
||||||
|
output.open();
|
||||||
|
String content = new String(Files.readAllBytes(file), Charsets.UTF_8);
|
||||||
|
Font font = FontFactory.getFont("LiberationMono-Regular");
|
||||||
|
Paragraph paragraph = new Paragraph(content, font);
|
||||||
|
paragraph.setAlignment(Element.ALIGN_LEFT);
|
||||||
|
output.add(paragraph);
|
||||||
|
output.close();
|
||||||
|
|
||||||
|
// Use the PDF format handler
|
||||||
|
return new PdfFormatHandler().generateThumbnail(tempFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String extractContent(String language, Path file) throws Exception {
|
||||||
|
return new String(Files.readAllBytes(file), "UTF-8");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) {
|
||||||
|
// TODO Append the text file to the PDF
|
||||||
|
}
|
||||||
|
}
|
@ -1,10 +1,13 @@
|
|||||||
package com.sismics.util;
|
package com.sismics.docs.core.util.format;
|
||||||
|
|
||||||
import com.google.common.base.Charsets;
|
import com.google.common.base.Charsets;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.io.ByteStreams;
|
import com.google.common.io.ByteStreams;
|
||||||
|
import com.google.common.io.Closer;
|
||||||
import com.sismics.util.io.InputStreamReaderThread;
|
import com.sismics.util.io.InputStreamReaderThread;
|
||||||
import com.sismics.util.mime.MimeType;
|
import com.sismics.util.mime.MimeType;
|
||||||
|
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
|
||||||
import javax.imageio.ImageIO;
|
import javax.imageio.ImageIO;
|
||||||
import java.awt.image.BufferedImage;
|
import java.awt.image.BufferedImage;
|
||||||
@ -15,27 +18,18 @@ import java.util.Arrays;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Video processing utilities.
|
* Video format handler.
|
||||||
*
|
*
|
||||||
* @author bgamard
|
* @author bgamard
|
||||||
*/
|
*/
|
||||||
public class VideoUtil {
|
public class VideoFormatHandler implements FormatHandler {
|
||||||
/**
|
@Override
|
||||||
* Returns true if this MIME type is a video.
|
public boolean accept(String mimeType) {
|
||||||
* @param mimeType MIME type
|
|
||||||
* @return True if video
|
|
||||||
*/
|
|
||||||
public static boolean isVideo(String mimeType) {
|
|
||||||
return mimeType.equals(MimeType.VIDEO_MP4) || mimeType.equals(MimeType.VIDEO_WEBM);
|
return mimeType.equals(MimeType.VIDEO_MP4) || mimeType.equals(MimeType.VIDEO_WEBM);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
@Override
|
||||||
* Generate a thumbnail from a video file.
|
public BufferedImage generateThumbnail(Path file) throws IOException {
|
||||||
*
|
|
||||||
* @param file Video file
|
|
||||||
* @return Thumbnail
|
|
||||||
*/
|
|
||||||
public static BufferedImage getThumbnail(Path file) throws Exception {
|
|
||||||
List<String> result = Lists.newLinkedList(Arrays.asList("ffmpeg", "-i"));
|
List<String> result = Lists.newLinkedList(Arrays.asList("ffmpeg", "-i"));
|
||||||
result.add(file.toAbsolutePath().toString());
|
result.add(file.toAbsolutePath().toString());
|
||||||
result.addAll(Arrays.asList("-vf", "thumbnail", "-frames:v", "1", "-f", "mjpeg", "-"));
|
result.addAll(Arrays.asList("-vf", "thumbnail", "-frames:v", "1", "-f", "mjpeg", "-"));
|
||||||
@ -52,13 +46,8 @@ public class VideoUtil {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
@Override
|
||||||
* Extract metadata from a video file.
|
public String extractContent(String language, Path file) {
|
||||||
*
|
|
||||||
* @param file Video file
|
|
||||||
* @return Metadata
|
|
||||||
*/
|
|
||||||
public static String getMetadata(Path file) {
|
|
||||||
List<String> result = Lists.newLinkedList();
|
List<String> result = Lists.newLinkedList();
|
||||||
result.add("mediainfo");
|
result.add("mediainfo");
|
||||||
result.add(file.toAbsolutePath().toString());
|
result.add(file.toAbsolutePath().toString());
|
||||||
@ -81,4 +70,9 @@ public class VideoUtil {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) {
|
||||||
|
// Video cannot be appended to PDF files
|
||||||
|
}
|
||||||
}
|
}
|
@ -2,7 +2,6 @@ package com.sismics.util;
|
|||||||
|
|
||||||
import com.google.common.base.Charsets;
|
import com.google.common.base.Charsets;
|
||||||
import com.google.common.hash.Hashing;
|
import com.google.common.hash.Hashing;
|
||||||
import com.sismics.util.mime.MimeType;
|
|
||||||
|
|
||||||
import javax.imageio.IIOImage;
|
import javax.imageio.IIOImage;
|
||||||
import javax.imageio.ImageIO;
|
import javax.imageio.ImageIO;
|
||||||
@ -67,15 +66,6 @@ public class ImageUtil {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns true if this MIME type is an image.
|
|
||||||
* @param mimeType MIME type
|
|
||||||
* @return True if image
|
|
||||||
*/
|
|
||||||
public static boolean isImage(String mimeType) {
|
|
||||||
return mimeType.equals(MimeType.IMAGE_GIF) || mimeType.equals(MimeType.IMAGE_PNG) || mimeType.equals(MimeType.IMAGE_JPEG);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compute Gravatar hash.
|
* Compute Gravatar hash.
|
||||||
* See https://en.gravatar.com/site/implement/hash/.
|
* See https://en.gravatar.com/site/implement/hash/.
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
package com.sismics.util.mime;
|
package com.sismics.util.mime;
|
||||||
|
|
||||||
import com.google.common.base.Charsets;
|
import com.google.common.base.Charsets;
|
||||||
import com.sismics.docs.core.model.jpa.File;
|
|
||||||
import org.apache.commons.compress.utils.IOUtils;
|
import org.apache.commons.compress.utils.IOUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -15,7 +14,7 @@ import java.util.zip.ZipInputStream;
|
|||||||
/**
|
/**
|
||||||
* Utility to check MIME types.
|
* Utility to check MIME types.
|
||||||
*
|
*
|
||||||
* @author jtremeaux
|
* @author bgamard
|
||||||
*/
|
*/
|
||||||
public class MimeTypeUtil {
|
public class MimeTypeUtil {
|
||||||
/**
|
/**
|
||||||
@ -27,11 +26,14 @@ public class MimeTypeUtil {
|
|||||||
* @throws IOException e
|
* @throws IOException e
|
||||||
*/
|
*/
|
||||||
public static String guessMimeType(Path file, String name) throws IOException {
|
public static String guessMimeType(Path file, String name) throws IOException {
|
||||||
|
String mimeType;
|
||||||
try (InputStream is = Files.newInputStream(file)) {
|
try (InputStream is = Files.newInputStream(file)) {
|
||||||
byte[] headerBytes = new byte[64];
|
byte[] headerBytes = new byte[64];
|
||||||
is.read(headerBytes);
|
is.read(headerBytes);
|
||||||
return guessMimeType(headerBytes, name);
|
mimeType = guessMimeType(headerBytes, name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return guessOpenDocumentFormat(mimeType, file);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -116,18 +118,17 @@ public class MimeTypeUtil {
|
|||||||
* It's more costly than the simple header check, but needed because open document formats
|
* It's more costly than the simple header check, but needed because open document formats
|
||||||
* are simple ZIP files on the outside and much bigger on the inside.
|
* are simple ZIP files on the outside and much bigger on the inside.
|
||||||
*
|
*
|
||||||
* @param file File
|
* @param mimeType Currently detected MIME type
|
||||||
* @param unencryptedFile File on disk
|
* @param file File on disk
|
||||||
* @return MIME type
|
* @return MIME type
|
||||||
*/
|
*/
|
||||||
public static String guessOpenDocumentFormat(File file, Path unencryptedFile) {
|
private static String guessOpenDocumentFormat(String mimeType, Path file) {
|
||||||
if (!MimeType.APPLICATION_ZIP.equals(file.getMimeType())) {
|
if (!MimeType.APPLICATION_ZIP.equals(mimeType)) {
|
||||||
// open document formats are ZIP files
|
// open document formats are ZIP files
|
||||||
return file.getMimeType();
|
return mimeType;
|
||||||
}
|
}
|
||||||
|
|
||||||
String mimeType = file.getMimeType();
|
try (InputStream inputStream = Files.newInputStream(file);
|
||||||
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
|
|
||||||
ZipInputStream zipInputStream = new ZipInputStream(inputStream, Charsets.ISO_8859_1)) {
|
ZipInputStream zipInputStream = new ZipInputStream(inputStream, Charsets.ISO_8859_1)) {
|
||||||
ZipEntry archiveEntry = zipInputStream.getNextEntry();
|
ZipEntry archiveEntry = zipInputStream.getNextEntry();
|
||||||
while (archiveEntry != null) {
|
while (archiveEntry != null) {
|
||||||
@ -151,7 +152,7 @@ public class MimeTypeUtil {
|
|||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// In case of any error, just give up and keep the ZIP MIME type
|
// In case of any error, just give up and keep the ZIP MIME type
|
||||||
return file.getMimeType();
|
return mimeType;
|
||||||
}
|
}
|
||||||
|
|
||||||
return mimeType;
|
return mimeType;
|
||||||
|
@ -4,7 +4,9 @@ import com.google.common.collect.Lists;
|
|||||||
import com.google.common.io.Resources;
|
import com.google.common.io.Resources;
|
||||||
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
||||||
import com.sismics.docs.core.model.jpa.File;
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
|
import com.sismics.docs.core.util.format.*;
|
||||||
import com.sismics.util.mime.MimeType;
|
import com.sismics.util.mime.MimeType;
|
||||||
|
import com.sismics.util.mime.MimeTypeUtil;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
@ -25,39 +27,40 @@ public class TestFileUtil {
|
|||||||
@Test
|
@Test
|
||||||
public void extractContentOpenDocumentTextTest() throws Exception {
|
public void extractContentOpenDocumentTextTest() throws Exception {
|
||||||
Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
|
Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
|
||||||
File file = new File();
|
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "document.odt"));
|
||||||
file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
|
Assert.assertNotNull(formatHandler);
|
||||||
Path pdfPath = PdfUtil.convertToPdf(file, path);
|
Assert.assertTrue(formatHandler instanceof OdtFormatHandler);
|
||||||
String content = FileUtil.extractContent("eng", file, path, pdfPath);
|
String content = formatHandler.extractContent("eng", path);
|
||||||
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
|
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void extractContentOfficeDocumentTest() throws Exception {
|
public void extractContentOfficeDocumentTest() throws Exception {
|
||||||
Path path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
|
Path path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
|
||||||
File file = new File();
|
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "document.docx"));
|
||||||
file.setMimeType(MimeType.OFFICE_DOCUMENT);
|
Assert.assertNotNull(formatHandler);
|
||||||
Path pdfPath = PdfUtil.convertToPdf(file, path);
|
Assert.assertTrue(formatHandler instanceof DocxFormatHandler);
|
||||||
String content = FileUtil.extractContent("eng", file, path, pdfPath);
|
String content = formatHandler.extractContent("eng", path);
|
||||||
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
|
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void extractContentPdf() throws Exception {
|
public void extractContentPdf() throws Exception {
|
||||||
Path path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI());
|
Path path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI());
|
||||||
File file = new File();
|
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "udhr.pdf"));
|
||||||
file.setMimeType(MimeType.APPLICATION_PDF);
|
Assert.assertNotNull(formatHandler);
|
||||||
String content = FileUtil.extractContent("eng", file, path, path);
|
Assert.assertTrue(formatHandler instanceof PdfFormatHandler);
|
||||||
|
String content = formatHandler.extractContent("eng", path);
|
||||||
Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
|
Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void extractContentScannedPdf() throws Exception {
|
public void extractContentScannedPdf() throws Exception {
|
||||||
Path path = Paths.get(ClassLoader.getSystemResource("file/scanned.pdf").toURI());
|
Path path = Paths.get(ClassLoader.getSystemResource("file/scanned.pdf").toURI());
|
||||||
File file = new File();
|
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "scanned.pdf"));
|
||||||
file.setMimeType(MimeType.APPLICATION_PDF);
|
Assert.assertNotNull(formatHandler);
|
||||||
String content = FileUtil.extractContent("eng", file, path, path);
|
Assert.assertTrue(formatHandler instanceof PdfFormatHandler);
|
||||||
System.out.println(content);
|
String content = formatHandler.extractContent("eng", path);
|
||||||
Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
|
Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
package com.sismics.util;
|
package com.sismics.util;
|
||||||
|
|
||||||
import com.sismics.docs.core.model.jpa.File;
|
|
||||||
import com.sismics.util.mime.MimeType;
|
import com.sismics.util.mime.MimeType;
|
||||||
import com.sismics.util.mime.MimeTypeUtil;
|
import com.sismics.util.mime.MimeTypeUtil;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
@ -19,14 +18,10 @@ public class TestMimeTypeUtil {
|
|||||||
public void guessOpenDocumentFormatTest() throws Exception {
|
public void guessOpenDocumentFormatTest() throws Exception {
|
||||||
// Detect ODT files
|
// Detect ODT files
|
||||||
Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
|
Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
|
||||||
File file = new File();
|
Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessMimeType(path, "document.odt"));
|
||||||
file.setMimeType(MimeType.APPLICATION_ZIP);
|
|
||||||
Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessOpenDocumentFormat(file, path));
|
|
||||||
|
|
||||||
// Detect DOCX files
|
// Detect DOCX files
|
||||||
path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
|
path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
|
||||||
file = new File();
|
Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessMimeType(path, "document.odt"));
|
||||||
file.setMimeType(MimeType.APPLICATION_ZIP);
|
|
||||||
Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessOpenDocumentFormat(file, path));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user