diff --git a/docs-core/src/main/java/com/sismics/docs/core/constant/Constants.java b/docs-core/src/main/java/com/sismics/docs/core/constant/Constants.java index f3626b33..83249389 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/constant/Constants.java +++ b/docs-core/src/main/java/com/sismics/docs/core/constant/Constants.java @@ -87,4 +87,9 @@ public class Constants { * Email template for route step validate. */ public static final String EMAIL_TEMPLATE_ROUTE_STEP_VALIDATE = "route_step_validate"; + + /** + * PDF points (1/72 inch) per millimeter, not millimeters per inch as the name suggests. + */ + public static float MM_PER_INCH = 1 / (10 * 2.54f) * 72f; } diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java index d90c3b15..c0e7e890 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java @@ -7,15 +7,22 @@ import com.sismics.docs.core.dao.lucene.LuceneDao; import com.sismics.docs.core.event.FileCreatedAsyncEvent; import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.model.jpa.User; +import com.sismics.docs.core.util.DirectoryUtil; import com.sismics.docs.core.util.EncryptionUtil; import com.sismics.docs.core.util.FileUtil; -import com.sismics.docs.core.util.PdfUtil; import com.sismics.docs.core.util.TransactionUtil; -import com.sismics.util.mime.MimeTypeUtil; +import com.sismics.docs.core.util.format.FormatHandler; +import com.sismics.docs.core.util.format.FormatHandlerUtil; +import com.sismics.util.ImageUtil; +import com.sismics.util.Scalr; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.crypto.Cipher; +import javax.crypto.CipherOutputStream; +import java.awt.image.BufferedImage; +import java.io.OutputStream; +import java.nio.file.Files; import java.nio.file.Path; import java.text.MessageFormat; 
import java.util.concurrent.atomic.AtomicReference; @@ -42,16 +49,12 @@ public class FileCreatedAsyncListener { log.info("File created event: " + event.toString()); } - // Guess the mime type a second time, for open document format (first detected as simple ZIP file) + // Find a format handler final File file = event.getFile(); - file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, event.getUnencryptedFile())); - - // Convert to PDF if necessary (for thumbnail and text extraction) - Path unencryptedPdfFile = null; - try { - unencryptedPdfFile = PdfUtil.convertToPdf(file, event.getUnencryptedFile()); - } catch (Exception e) { - log.error("Unable to convert to PDF", e); + FormatHandler formatHandler = FormatHandlerUtil.find(file.getMimeType()); + if (formatHandler == null) { + log.error("Format unhandled: " + file.getMimeType()); + return; } // Get the user from the database @@ -71,15 +74,37 @@ public class FileCreatedAsyncListener { // Generate file variations try { Cipher cipher = EncryptionUtil.getEncryptionCipher(user.get().getPrivateKey()); - FileUtil.saveVariations(file, event.getUnencryptedFile(), unencryptedPdfFile, cipher); + BufferedImage image = formatHandler.generateThumbnail(event.getUnencryptedFile()); + if (image != null) { + // Generate thumbnails from image + BufferedImage web = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 1280); + BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 256); + image.flush(); + + // Write "web" encrypted image + Path outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_web"); + try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) { + ImageUtil.writeJpeg(web, outputStream); + } + + // Write "thumb" encrypted image + outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_thumb"); + try (OutputStream outputStream = new 
CipherOutputStream(Files.newOutputStream(outputFile), cipher)) { + ImageUtil.writeJpeg(thumbnail, outputStream); + } + } } catch (Exception e) { log.error("Unable to generate thumbnails", e); } // Extract text content from the file long startTime = System.currentTimeMillis(); - final String content = FileUtil.extractContent(event.getLanguage(), file, - event.getUnencryptedFile(), unencryptedPdfFile); + final AtomicReference content = new AtomicReference<>(); + try { + content.set(formatHandler.extractContent(event.getLanguage(), event.getUnencryptedFile())); + } catch (Exception e) { + log.error("Error extracting content from: " + event.getFile()); + } log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime)); // Save the file to database @@ -91,8 +116,8 @@ public class FileCreatedAsyncListener { // The file has been deleted since the text extraction started, ignore the result return; } - - file.setContent(content); + + file.setContent(content.get()); fileDao.update(file); } }); diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index 6ede2e45..ddf4ec81 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -10,9 +10,7 @@ import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.model.jpa.User; import com.sismics.tess4j.Tesseract; import com.sismics.util.ImageDeskew; -import com.sismics.util.ImageUtil; import com.sismics.util.Scalr; -import com.sismics.util.VideoUtil; import com.sismics.util.context.ThreadLocalContext; import com.sismics.util.mime.MimeTypeUtil; import org.apache.commons.lang.StringUtils; @@ -21,12 +19,9 @@ import org.slf4j.LoggerFactory; import javax.crypto.Cipher; import javax.crypto.CipherInputStream; -import javax.crypto.CipherOutputStream; -import javax.imageio.ImageIO; import 
java.awt.image.BufferedImage; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.Collections; @@ -49,40 +44,14 @@ public class FileUtil { */ private static Set processingFileSet = Collections.synchronizedSet(new HashSet()); - /** - * Extract content from a file. - * - * @param language Language to extract - * @param file File to extract - * @param unencryptedFile Unencrypted file - * @param unencryptedPdfFile Unencrypted PDF file - * @return Content extract - */ - public static String extractContent(String language, File file, Path unencryptedFile, Path unencryptedPdfFile) { - String content = null; - if (language == null) { - return null; - } - - if (ImageUtil.isImage(file.getMimeType())) { - content = ocrFile(unencryptedFile, language); - } else if (VideoUtil.isVideo(file.getMimeType())) { - content = VideoUtil.getMetadata(unencryptedFile); - } else if (unencryptedPdfFile != null) { - content = PdfUtil.extractPdf(unencryptedPdfFile, language); - } - - return content; - } - /** * Optical character recognition on an image. * - * @param image Buffered image * @param language Language to OCR + * @param image Buffered image * @return Content extracted */ - public static String ocrFile(BufferedImage image, String language) { + public static String ocrFile(String language, BufferedImage image) { // Upscale, grayscale and deskew the image String content = null; BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE); @@ -105,66 +74,6 @@ public class FileUtil { return content; } - /** - * Optical character recognition on a file. 
- * - * @param unecryptedFile Unencrypted file - * @param language Language to OCR - * @return Content extracted - */ - private static String ocrFile(Path unecryptedFile, String language) { - BufferedImage image; - try (InputStream inputStream = Files.newInputStream(unecryptedFile)) { - image = ImageIO.read(inputStream); - } catch (IOException e) { - log.error("Error reading the image", e); - return null; - } - - return ocrFile(image, language); - } - - /** - * Generate file variations. - * - * @param file File from database - * @param unencryptedFile Unencrypted file - * @param unencryptedPdfFile Unencrypted PDF file - * @param cipher Cipher to use for encryption - */ - public static void saveVariations(File file, Path unencryptedFile, Path unencryptedPdfFile, Cipher cipher) throws Exception { - BufferedImage image = null; - if (ImageUtil.isImage(file.getMimeType())) { - try (InputStream inputStream = Files.newInputStream(unencryptedFile)) { - image = ImageIO.read(inputStream); - } - } else if (VideoUtil.isVideo(file.getMimeType())) { - image = VideoUtil.getThumbnail(unencryptedFile); - } else if (unencryptedPdfFile != null) { - // Generate preview from the first page of the PDF - image = PdfUtil.renderFirstPage(unencryptedPdfFile); - } - - if (image != null) { - // Generate thumbnails from image - BufferedImage web = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 1280); - BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 256); - image.flush(); - - // Write "web" encrypted image - Path outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_web"); - try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) { - ImageUtil.writeJpeg(web, outputStream); - } - - // Write "thumb" encrypted image - outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_thumb"); - try (OutputStream outputStream = new 
CipherOutputStream(Files.newOutputStream(outputFile), cipher)) { - ImageUtil.writeJpeg(thumbnail, outputStream); - } - } - } - /** * Remove a file from the storage filesystem. * diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java index decc88b7..fb786619 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java @@ -1,39 +1,23 @@ package com.sismics.docs.core.util; -import com.google.common.base.Charsets; import com.google.common.base.Strings; import com.google.common.io.ByteStreams; import com.google.common.io.Closer; import com.google.common.io.Resources; -import com.lowagie.text.*; -import com.lowagie.text.pdf.PdfWriter; +import com.lowagie.text.FontFactory; +import com.sismics.docs.core.constant.Constants; import com.sismics.docs.core.dao.jpa.dto.DocumentDto; import com.sismics.docs.core.model.jpa.File; +import com.sismics.docs.core.util.format.FormatHandler; +import com.sismics.docs.core.util.format.FormatHandlerUtil; import com.sismics.docs.core.util.pdf.PdfPage; -import com.sismics.util.ImageUtil; -import com.sismics.util.context.ThreadLocalContext; -import com.sismics.util.mime.MimeType; import org.apache.pdfbox.io.MemoryUsageSetting; -import org.apache.pdfbox.multipdf.PDFMergerUtility; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.font.DocsPDType1Font; -import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory; -import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory; -import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.apache.pdfbox.rendering.PDFRenderer; -import org.apache.pdfbox.text.PDFTextStripper; -import org.apache.poi.xwpf.usermodel.XWPFDocument; -import 
org.odftoolkit.odfdom.converter.pdf.PdfConverter; -import org.odftoolkit.odfdom.converter.pdf.PdfOptions; -import org.odftoolkit.odfdom.doc.OdfTextDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.imageio.ImageIO; -import java.awt.image.BufferedImage; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -54,128 +38,7 @@ public class PdfUtil { * Logger. */ private static final Logger log = LoggerFactory.getLogger(PdfUtil.class); - - /** - * Extract text from a PDF. - * - * @param unencryptedPdfFile Unencrypted PDF file - * @param language Language - * @return Content extracted - */ - public static String extractPdf(Path unencryptedPdfFile, String language) { - String content = null; - try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile); - PDDocument pdfDocument = PDDocument.load(inputStream)) { - content = new PDFTextStripper().getText(pdfDocument); - } catch (Exception e) { - log.error("Error while extracting text from the PDF", e); - } - // No text content, try to OCR it - if (language != null && content != null && content.trim().isEmpty()) { - StringBuilder sb = new StringBuilder(); - try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile); - PDDocument pdfDocument = PDDocument.load(inputStream)) { - PDFRenderer renderer = new PDFRenderer(pdfDocument); - for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) { - sb.append(" "); - sb.append(FileUtil.ocrFile(renderer.renderImage(pageIndex), language)); - } - return sb.toString(); - } catch (Exception e) { - log.error("Error while OCR-izing the PDF", e); - } - } - - return content; - } - - /** - * Convert a file to PDF if necessary. 
- * - * @param file File - * @param unencryptedFile Unencrypted file - * @return PDF temporary file - */ - public static Path convertToPdf(File file, Path unencryptedFile) throws Exception { - if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { - // It's already PDF, just return the file - return unencryptedFile; - } - - if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) { - return convertOfficeDocument(unencryptedFile); - } - - if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { - return convertOpenDocumentText(unencryptedFile); - } - - if (file.getMimeType().equals(MimeType.TEXT_PLAIN) || file.getMimeType().equals(MimeType.TEXT_CSV)) { - return convertTextPlain(unencryptedFile); - } - - // PDF conversion not necessary/possible - return null; - } - - /** - * Convert a text plain document to PDF. - * - * @param unencryptedFile Unencrypted file - * @return PDF file - */ - private static Path convertTextPlain(Path unencryptedFile) throws Exception { - Document output = new Document(PageSize.A4, 40, 40, 40, 40); - Path tempFile = ThreadLocalContext.get().createTemporaryFile(); - OutputStream pdfOutputStream = Files.newOutputStream(tempFile); - PdfWriter.getInstance(output, pdfOutputStream); - - output.open(); - String content = new String(Files.readAllBytes(unencryptedFile), Charsets.UTF_8); - Font font = FontFactory.getFont("LiberationMono-Regular"); - Paragraph paragraph = new Paragraph(content, font); - paragraph.setAlignment(Element.ALIGN_LEFT); - output.add(paragraph); - output.close(); - - return tempFile; - } - - /** - * Convert an open document text file to PDF. 
- * - * @param unencryptedFile Unencrypted file - * @return PDF file - */ - private static Path convertOpenDocumentText(Path unencryptedFile) throws Exception { - Path tempFile = ThreadLocalContext.get().createTemporaryFile(); - try (InputStream inputStream = Files.newInputStream(unencryptedFile); - OutputStream outputStream = Files.newOutputStream(tempFile)) { - OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); - PdfOptions options = PdfOptions.create(); - PdfConverter.getInstance().convert(document, outputStream, options); - } - return tempFile; - } - - /** - * Convert an Office document to PDF. - * - * @param unencryptedFile Unencrypted file - * @return PDF file - */ - private static Path convertOfficeDocument(Path unencryptedFile) throws Exception { - Path tempFile = ThreadLocalContext.get().createTemporaryFile(); - try (InputStream inputStream = Files.newInputStream(unencryptedFile); - OutputStream outputStream = Files.newOutputStream(tempFile)) { - XWPFDocument document = new XWPFDocument(inputStream); - org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); - org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, outputStream, options); - } - return tempFile; - } - /** * Convert a document and its files to a merged PDF file. 
* @@ -192,15 +55,14 @@ public class PdfUtil { Closer closer = Closer.create(); MemoryUsageSetting memUsageSettings = MemoryUsageSetting.setupMixed(1000000); // 1MB max memory usage memUsageSettings.setTempDir(new java.io.File(System.getProperty("java.io.tmpdir"))); // To OS temp - float mmPerInch = 1 / (10 * 2.54f) * 72f; - + // Create a blank PDF try (PDDocument doc = new PDDocument(memUsageSettings)) { // Add metadata if (metadata) { PDPage page = new PDPage(); doc.addPage(page); - try (PdfPage pdfPage = new PdfPage(doc, page, margin * mmPerInch, DocsPDType1Font.HELVETICA, 12)) { + try (PdfPage pdfPage = new PdfPage(doc, page, margin * Constants.MM_PER_INCH, DocsPDType1Font.HELVETICA, 12)) { SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); pdfPage.addText(documentDto.getTitle(), true, DocsPDType1Font.HELVETICA_BOLD, 16) .newLine() @@ -245,55 +107,9 @@ public class PdfUtil { // Decrypt the file to a temporary file Path unencryptedFile = EncryptionUtil.decryptFile(storedFile, file.getPrivateKey()); - - if (ImageUtil.isImage(file.getMimeType())) { - PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages - try (PDPageContentStream contentStream = new PDPageContentStream(doc, page); - InputStream storedFileInputStream = Files.newInputStream(unencryptedFile)) { - // Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension - PDImageXObject pdImage = null; - if (file.getMimeType().equals(MimeType.IMAGE_JPEG)) { - pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream); - } else if (file.getMimeType().equals(MimeType.IMAGE_GIF) || file.getMimeType().equals(MimeType.IMAGE_PNG)) { - BufferedImage bim = ImageIO.read(storedFileInputStream); - pdImage = LosslessFactory.createFromImage(doc, bim); - } - - // Do we want to fill the page with the image? 
- if (fitImageToPage) { - // Fill the page with the image - float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * mmPerInch; - float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * mmPerInch; - - // Compare page format and image format - if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) { - float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight(); - contentStream.drawImage(pdImage, margin * mmPerInch, heightAvailable + margin * mmPerInch - imageHeight, - widthAvailable, imageHeight); - } else { - float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth(); - contentStream.drawImage(pdImage, margin * mmPerInch, margin * mmPerInch, - imageWidth, heightAvailable); - } - } else { - // Draw the image as is - contentStream.drawImage(pdImage, margin * mmPerInch, - page.getMediaBox().getHeight() - pdImage.getHeight() - margin * mmPerInch); - } - } - doc.addPage(page); - } else { - // Try to convert the file to PDF - Path unencryptedPdfFile = convertToPdf(file, unencryptedFile); - if (unencryptedPdfFile != null) { - // This file is convertible to PDF, just add it to the end - PDDocument mergeDoc = PDDocument.load(unencryptedPdfFile.toFile(), memUsageSettings); - closer.register(mergeDoc); - PDFMergerUtility pdfMergerUtility = new PDFMergerUtility(); - pdfMergerUtility.appendDocument(doc, mergeDoc); - } - - // All other non-PDF-convertible files are ignored + FormatHandler formatHandler = FormatHandlerUtil.find(file.getMimeType()); + if (formatHandler != null) { + formatHandler.appendToPdf(unencryptedFile, doc, fitImageToPage, margin, memUsageSettings, closer); } } @@ -302,20 +118,6 @@ public class PdfUtil { } } - /** - * Render the first page of a PDF. 
- * - * @param unencryptedFile PDF document - * @return Render of the first page - */ - public static BufferedImage renderFirstPage(Path unencryptedFile) throws IOException { - try (InputStream inputStream = Files.newInputStream(unencryptedFile); - PDDocument pdfDocument = PDDocument.load(inputStream)) { - PDFRenderer renderer = new PDFRenderer(pdfDocument); - return renderer.renderImage(0); - } - } - /** * Register fonts. */ diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/format/DocxFormatHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/DocxFormatHandler.java new file mode 100644 index 00000000..10e4f7f9 --- /dev/null +++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/DocxFormatHandler.java @@ -0,0 +1,70 @@ +package com.sismics.docs.core.util.format; + +import com.google.common.io.Closer; +import com.sismics.util.context.ThreadLocalContext; +import com.sismics.util.mime.MimeType; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.poi.xwpf.usermodel.XWPFDocument; + +import java.awt.image.BufferedImage; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * DOCX format handler. + * + * @author bgamard + */ +public class DocxFormatHandler implements FormatHandler { + /** + * Temporary PDF file. 
+ */ + private Path temporaryPdfFile; + + @Override + public boolean accept(String mimeType) { + return MimeType.OFFICE_DOCUMENT.equals(mimeType); + } + + @Override + public BufferedImage generateThumbnail(Path file) throws Exception { + // Use the PDF format handler + return new PdfFormatHandler().generateThumbnail(getGeneratedPdf(file)); + } + + @Override + public String extractContent(String language, Path file) throws Exception { + // Use the PDF format handler + return new PdfFormatHandler().extractContent(language, getGeneratedPdf(file)); + } + + @Override + public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception { + // Use the PDF format handler + new PdfFormatHandler().appendToPdf(getGeneratedPdf(file), doc, fitImageToPage, margin, memUsageSettings, closer); + } + + /** + * Generate a PDF from this DOCX. + * + * @param file File + * @return PDF file + * @throws Exception e + */ + private Path getGeneratedPdf(Path file) throws Exception { + if (temporaryPdfFile == null) { + temporaryPdfFile = ThreadLocalContext.get().createTemporaryFile(); + try (InputStream inputStream = Files.newInputStream(file); + OutputStream outputStream = Files.newOutputStream(temporaryPdfFile)) { + XWPFDocument document = new XWPFDocument(inputStream); + org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); + org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, outputStream, options); + } + } + + return temporaryPdfFile; + } +} diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandler.java new file mode 100644 index 00000000..945ca9b1 --- /dev/null +++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandler.java @@ -0,0 +1,55 @@ +package com.sismics.docs.core.util.format; + +import 
com.google.common.io.Closer; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; + +import java.awt.image.BufferedImage; +import java.nio.file.Path; + +/** + * A format handler. + * + * @author bgamard + */ +public interface FormatHandler { + /** + * Returns true if this format handler can handle this MIME type. + * + * @param mimeType MIME type + * @return True if accepted + */ + boolean accept(String mimeType); + + /** + * Generate a thumbnail. + * + * @param file File + * @return Thumbnail + * @throws Exception e + */ + BufferedImage generateThumbnail(Path file) throws Exception; + + /** + * Extract text content. + * + * @param language Language + * @param file File + * @return Text content + * @throws Exception e + */ + String extractContent(String language, Path file) throws Exception; + + /** + * Append to a PDF. + * + * @param file File + * @param doc PDF document + * @param fitImageToPage Fit image to page + * @param margin Margin + * @param memUsageSettings Memory usage + * @param closer Closer + * @throws Exception e + */ + void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception; +} diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandlerUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandlerUtil.java new file mode 100644 index 00000000..47cb778c --- /dev/null +++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandlerUtil.java @@ -0,0 +1,45 @@ +package com.sismics.docs.core.util.format; + +import com.google.common.collect.Lists; + +import java.util.List; + +/** + * Format handler utilities. + * + * @author bgamard + */ +public class FormatHandlerUtil { + /** + * List of format handlers. 
+ */ + private static final List<Class<? extends FormatHandler>> FORMAT_HANDLERS = Lists.newArrayList( + DocxFormatHandler.class, + OdtFormatHandler.class, + VideoFormatHandler.class, + PdfFormatHandler.class, + TextPlainFormatHandler.class, + ImageFormatHandler.class + ); + + /** + * Find a suitable format handler for this MIME type. + * + * @param mimeType MIME type + * @return Instantiated format handler + */ + public static FormatHandler find(String mimeType) { + try { + for (Class<? extends FormatHandler> formatHandlerClass : FORMAT_HANDLERS) { + FormatHandler formatHandler = formatHandlerClass.newInstance(); + if (formatHandler.accept(mimeType)) { + return formatHandler; + } + } + } catch (InstantiationException | IllegalAccessException e) { + return null; + } + + return null; + } +} diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/format/ImageFormatHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/ImageFormatHandler.java new file mode 100644 index 00000000..e4016e1f --- /dev/null +++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/ImageFormatHandler.java @@ -0,0 +1,108 @@ +package com.sismics.docs.core.util.format; + +import com.google.common.io.Closer; +import com.sismics.docs.core.constant.Constants; +import com.sismics.docs.core.util.FileUtil; +import com.sismics.util.mime.MimeType; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory; +import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + 
* Image format handler. + * + * @author bgamard + */ +public class ImageFormatHandler implements FormatHandler { + /** + * Logger. + */ + private static final Logger log = LoggerFactory.getLogger(PdfFormatHandler.class); + + /** + * Saved MIME type. + */ + private String mimeType; + + @Override + public boolean accept(String mimeType) { + this.mimeType = mimeType; + return mimeType.equals(MimeType.IMAGE_GIF) || mimeType.equals(MimeType.IMAGE_PNG) || mimeType.equals(MimeType.IMAGE_JPEG); + } + + @Override + public BufferedImage generateThumbnail(Path file) throws IOException { + try (InputStream inputStream = Files.newInputStream(file)) { + return ImageIO.read(inputStream); + } + } + + @Override + public String extractContent(String language, Path file) { + try (InputStream inputStream = Files.newInputStream(file)) { + return FileUtil.ocrFile(language, ImageIO.read(inputStream)); + } catch (IOException e) { + log.error("Error reading the image", e); + return null; + } + } + + @Override + public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception { + PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages + try (PDPageContentStream contentStream = new PDPageContentStream(doc, page); + InputStream storedFileInputStream = Files.newInputStream(file)) { + // Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension + PDImageXObject pdImage; + switch (mimeType) { + case MimeType.IMAGE_JPEG: + pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream); + break; + case MimeType.IMAGE_GIF: + case MimeType.IMAGE_PNG: + BufferedImage bim = ImageIO.read(storedFileInputStream); + pdImage = LosslessFactory.createFromImage(doc, bim); + break; + default: + return; + } + + // Do we want to fill the page with the image? 
+ if (fitImageToPage) { + // Fill the page with the image + float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * Constants.MM_PER_INCH; + float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * Constants.MM_PER_INCH; + + // Compare page format and image format + if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) { + float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight(); + contentStream.drawImage(pdImage, margin * Constants.MM_PER_INCH, heightAvailable + margin * Constants.MM_PER_INCH - imageHeight, + widthAvailable, imageHeight); + } else { + float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth(); + contentStream.drawImage(pdImage, margin * Constants.MM_PER_INCH, margin * Constants.MM_PER_INCH, + imageWidth, heightAvailable); + } + } else { + // Draw the image as is + contentStream.drawImage(pdImage, margin * Constants.MM_PER_INCH, + page.getMediaBox().getHeight() - pdImage.getHeight() - margin * Constants.MM_PER_INCH); + } + } + doc.addPage(page); + } +} diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/format/OdtFormatHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/OdtFormatHandler.java new file mode 100644 index 00000000..1e9802f3 --- /dev/null +++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/OdtFormatHandler.java @@ -0,0 +1,72 @@ +package com.sismics.docs.core.util.format; + +import com.google.common.io.Closer; +import com.sismics.util.context.ThreadLocalContext; +import com.sismics.util.mime.MimeType; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.odftoolkit.odfdom.converter.pdf.PdfConverter; +import org.odftoolkit.odfdom.converter.pdf.PdfOptions; +import org.odftoolkit.odfdom.doc.OdfTextDocument; + +import java.awt.image.BufferedImage; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; 
+import java.nio.file.Path; + +/** + * ODT format handler. + * + * @author bgamard + */ +public class OdtFormatHandler implements FormatHandler { + /** + * Temporary PDF file. + */ + private Path temporaryPdfFile; + + @Override + public boolean accept(String mimeType) { + return MimeType.OPEN_DOCUMENT_TEXT.equals(mimeType); + } + + @Override + public BufferedImage generateThumbnail(Path file) throws Exception { + // Use the PDF format handler + return new PdfFormatHandler().generateThumbnail(getGeneratedPdf(file)); + } + + @Override + public String extractContent(String language, Path file) throws Exception { + // Use the PDF format handler + return new PdfFormatHandler().extractContent(language, getGeneratedPdf(file)); + } + + @Override + public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception { + // Use the PDF format handler + new PdfFormatHandler().appendToPdf(getGeneratedPdf(file), doc, fitImageToPage, margin, memUsageSettings, closer); + } + + /** + * Generate a PDF from this ODT. 
+ * + * @param file File + * @return PDF file + * @throws Exception e + */ + private Path getGeneratedPdf(Path file) throws Exception { + if (temporaryPdfFile == null) { + temporaryPdfFile = ThreadLocalContext.get().createTemporaryFile(); + try (InputStream inputStream = Files.newInputStream(file); + OutputStream outputStream = Files.newOutputStream(temporaryPdfFile)) { + OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); + PdfOptions options = PdfOptions.create(); + PdfConverter.getInstance().convert(document, outputStream, options); + } + } + + return temporaryPdfFile; + } +} diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java new file mode 100644 index 00000000..f47f8522 --- /dev/null +++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java @@ -0,0 +1,80 @@ +package com.sismics.docs.core.util.format; + +import com.google.common.io.Closer; +import com.sismics.docs.core.util.FileUtil; +import com.sismics.util.mime.MimeType; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.multipdf.PDFMergerUtility; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.rendering.PDFRenderer; +import org.apache.pdfbox.text.PDFTextStripper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.awt.image.BufferedImage; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * PDF format handler. + * + * @author bgamard + */ +public class PdfFormatHandler implements FormatHandler { + /** + * Logger. 
+ */ + private static final Logger log = LoggerFactory.getLogger(PdfFormatHandler.class); + + @Override + public boolean accept(String mimeType) { + return mimeType.equals(MimeType.APPLICATION_PDF); + } + + @Override + public BufferedImage generateThumbnail(Path file) throws Exception { + try (InputStream inputStream = Files.newInputStream(file); + PDDocument pdfDocument = PDDocument.load(inputStream)) { + PDFRenderer renderer = new PDFRenderer(pdfDocument); + return renderer.renderImage(0); + } + } + + @Override + public String extractContent(String language, Path file) { + String content = null; + try (InputStream inputStream = Files.newInputStream(file); + PDDocument pdfDocument = PDDocument.load(inputStream)) { + content = new PDFTextStripper().getText(pdfDocument); + } catch (Exception e) { + log.error("Error while extracting text from the PDF", e); + } + + // No text content, try to OCR it + if (language != null && content != null && content.trim().isEmpty()) { + StringBuilder sb = new StringBuilder(); + try (InputStream inputStream = Files.newInputStream(file); + PDDocument pdfDocument = PDDocument.load(inputStream)) { + PDFRenderer renderer = new PDFRenderer(pdfDocument); + for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) { + sb.append(" "); + sb.append(FileUtil.ocrFile(language, renderer.renderImage(pageIndex))); + } + return sb.toString(); + } catch (Exception e) { + log.error("Error while OCR-izing the PDF", e); + } + } + + return content; + } + + @Override + public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception { + PDDocument mergeDoc = PDDocument.load(file.toFile(), memUsageSettings); + closer.register(mergeDoc); + PDFMergerUtility pdfMergerUtility = new PDFMergerUtility(); + pdfMergerUtility.appendDocument(doc, mergeDoc); + } +} diff --git 
a/docs-core/src/main/java/com/sismics/docs/core/util/format/TextPlainFormatHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/TextPlainFormatHandler.java new file mode 100644 index 00000000..2da22b05 --- /dev/null +++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/TextPlainFormatHandler.java @@ -0,0 +1,56 @@ +package com.sismics.docs.core.util.format; + +import com.google.common.base.Charsets; +import com.google.common.io.Closer; +import com.lowagie.text.*; +import com.lowagie.text.pdf.PdfWriter; +import com.sismics.util.context.ThreadLocalContext; +import com.sismics.util.mime.MimeType; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; + +import java.awt.image.BufferedImage; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Text plain format handler. + * + * @author bgamard + */ +public class TextPlainFormatHandler implements FormatHandler { + @Override + public boolean accept(String mimeType) { + return mimeType.equals(MimeType.TEXT_CSV) || mimeType.equals(MimeType.TEXT_PLAIN); + } + + @Override + public BufferedImage generateThumbnail(Path file) throws Exception { + Document output = new Document(PageSize.A4, 40, 40, 40, 40); + Path tempFile = ThreadLocalContext.get().createTemporaryFile(); + OutputStream pdfOutputStream = Files.newOutputStream(tempFile); + PdfWriter.getInstance(output, pdfOutputStream); + + output.open(); + String content = new String(Files.readAllBytes(file), Charsets.UTF_8); + Font font = FontFactory.getFont("LiberationMono-Regular"); + Paragraph paragraph = new Paragraph(content, font); + paragraph.setAlignment(Element.ALIGN_LEFT); + output.add(paragraph); + output.close(); + + // Use the PDF format handler + return new PdfFormatHandler().generateThumbnail(tempFile); + } + + @Override + public String extractContent(String language, Path file) throws Exception { + return new String(Files.readAllBytes(file), "UTF-8"); 
+ } + + @Override + public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) { + // TODO Append the text file to the PDF + } +} diff --git a/docs-core/src/main/java/com/sismics/util/VideoUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/VideoFormatHandler.java similarity index 74% rename from docs-core/src/main/java/com/sismics/util/VideoUtil.java rename to docs-core/src/main/java/com/sismics/docs/core/util/format/VideoFormatHandler.java index 12dcbfd0..1365aca5 100644 --- a/docs-core/src/main/java/com/sismics/util/VideoUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/VideoFormatHandler.java @@ -1,10 +1,13 @@ -package com.sismics.util; +package com.sismics.docs.core.util.format; import com.google.common.base.Charsets; import com.google.common.collect.Lists; import com.google.common.io.ByteStreams; +import com.google.common.io.Closer; import com.sismics.util.io.InputStreamReaderThread; import com.sismics.util.mime.MimeType; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; @@ -15,27 +18,18 @@ import java.util.Arrays; import java.util.List; /** - * Video processing utilities. + * Video format handler. * * @author bgamard */ -public class VideoUtil { - /** - * Returns true if this MIME type is a video. - * @param mimeType MIME type - * @return True if video - */ - public static boolean isVideo(String mimeType) { +public class VideoFormatHandler implements FormatHandler { + @Override + public boolean accept(String mimeType) { return mimeType.equals(MimeType.VIDEO_MP4) || mimeType.equals(MimeType.VIDEO_WEBM); } - /** - * Generate a thumbnail from a video file. 
- * - * @param file Video file - * @return Thumbnail - */ - public static BufferedImage getThumbnail(Path file) throws Exception { + @Override + public BufferedImage generateThumbnail(Path file) throws IOException { List result = Lists.newLinkedList(Arrays.asList("ffmpeg", "-i")); result.add(file.toAbsolutePath().toString()); result.addAll(Arrays.asList("-vf", "thumbnail", "-frames:v", "1", "-f", "mjpeg", "-")); @@ -52,13 +46,8 @@ public class VideoUtil { } } - /** - * Extract metadata from a video file. - * - * @param file Video file - * @return Metadata - */ - public static String getMetadata(Path file) { + @Override + public String extractContent(String language, Path file) { List result = Lists.newLinkedList(); result.add("mediainfo"); result.add(file.toAbsolutePath().toString()); @@ -81,4 +70,9 @@ public class VideoUtil { return null; } } + + @Override + public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) { + // Video cannot be appended to PDF files + } } diff --git a/docs-core/src/main/java/com/sismics/util/ImageUtil.java b/docs-core/src/main/java/com/sismics/util/ImageUtil.java index 33a6ddee..4a61e625 100644 --- a/docs-core/src/main/java/com/sismics/util/ImageUtil.java +++ b/docs-core/src/main/java/com/sismics/util/ImageUtil.java @@ -2,7 +2,6 @@ package com.sismics.util; import com.google.common.base.Charsets; import com.google.common.hash.Hashing; -import com.sismics.util.mime.MimeType; import javax.imageio.IIOImage; import javax.imageio.ImageIO; @@ -67,15 +66,6 @@ public class ImageUtil { } } - /** - * Returns true if this MIME type is an image. - * @param mimeType MIME type - * @return True if image - */ - public static boolean isImage(String mimeType) { - return mimeType.equals(MimeType.IMAGE_GIF) || mimeType.equals(MimeType.IMAGE_PNG) || mimeType.equals(MimeType.IMAGE_JPEG); - } - /** * Compute Gravatar hash. * See https://en.gravatar.com/site/implement/hash/. 
diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java index b08b471f..3ca6e1af 100644 --- a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java +++ b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java @@ -1,7 +1,6 @@ package com.sismics.util.mime; import com.google.common.base.Charsets; -import com.sismics.docs.core.model.jpa.File; import org.apache.commons.compress.utils.IOUtils; import java.io.IOException; @@ -15,7 +14,7 @@ import java.util.zip.ZipInputStream; /** * Utility to check MIME types. * - * @author jtremeaux + * @author bgamard */ public class MimeTypeUtil { /** @@ -27,11 +26,14 @@ public class MimeTypeUtil { * @throws IOException e */ public static String guessMimeType(Path file, String name) throws IOException { + String mimeType; try (InputStream is = Files.newInputStream(file)) { byte[] headerBytes = new byte[64]; is.read(headerBytes); - return guessMimeType(headerBytes, name); + mimeType = guessMimeType(headerBytes, name); } + + return guessOpenDocumentFormat(mimeType, file); } /** @@ -116,18 +118,17 @@ public class MimeTypeUtil { * It's more costly than the simple header check, but needed because open document formats * are simple ZIP files on the outside and much bigger on the inside. 
* - * @param file File - * @param unencryptedFile File on disk + * @param mimeType Currently detected MIME type + * @param file File on disk * @return MIME type */ - public static String guessOpenDocumentFormat(File file, Path unencryptedFile) { - if (!MimeType.APPLICATION_ZIP.equals(file.getMimeType())) { + private static String guessOpenDocumentFormat(String mimeType, Path file) { + if (!MimeType.APPLICATION_ZIP.equals(mimeType)) { // open document formats are ZIP files - return file.getMimeType(); + return mimeType; } - String mimeType = file.getMimeType(); - try (InputStream inputStream = Files.newInputStream(unencryptedFile); + try (InputStream inputStream = Files.newInputStream(file); ZipInputStream zipInputStream = new ZipInputStream(inputStream, Charsets.ISO_8859_1)) { ZipEntry archiveEntry = zipInputStream.getNextEntry(); while (archiveEntry != null) { @@ -151,7 +152,7 @@ public class MimeTypeUtil { } } catch (Exception e) { // In case of any error, just give up and keep the ZIP MIME type - return file.getMimeType(); + return mimeType; } return mimeType; diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java index c62561df..bf4f344f 100644 --- a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java +++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java @@ -4,7 +4,9 @@ import com.google.common.collect.Lists; import com.google.common.io.Resources; import com.sismics.docs.core.dao.jpa.dto.DocumentDto; import com.sismics.docs.core.model.jpa.File; +import com.sismics.docs.core.util.format.*; import com.sismics.util.mime.MimeType; +import com.sismics.util.mime.MimeTypeUtil; import org.junit.Assert; import org.junit.Test; @@ -25,39 +27,40 @@ public class TestFileUtil { @Test public void extractContentOpenDocumentTextTest() throws Exception { Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI()); - File 
file = new File(); - file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT); - Path pdfPath = PdfUtil.convertToPdf(file, path); - String content = FileUtil.extractContent("eng", file, path, pdfPath); + FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "document.odt")); + Assert.assertNotNull(formatHandler); + Assert.assertTrue(formatHandler instanceof OdtFormatHandler); + String content = formatHandler.extractContent("eng", path); Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen.")); } @Test public void extractContentOfficeDocumentTest() throws Exception { Path path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI()); - File file = new File(); - file.setMimeType(MimeType.OFFICE_DOCUMENT); - Path pdfPath = PdfUtil.convertToPdf(file, path); - String content = FileUtil.extractContent("eng", file, path, pdfPath); + FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "document.docx")); + Assert.assertNotNull(formatHandler); + Assert.assertTrue(formatHandler instanceof DocxFormatHandler); + String content = formatHandler.extractContent("eng", path); Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen.")); } @Test public void extractContentPdf() throws Exception { Path path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI()); - File file = new File(); - file.setMimeType(MimeType.APPLICATION_PDF); - String content = FileUtil.extractContent("eng", file, path, path); + FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "udhr.pdf")); + Assert.assertNotNull(formatHandler); + Assert.assertTrue(formatHandler instanceof PdfFormatHandler); + String content = formatHandler.extractContent("eng", path); Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights.")); } @Test public void extractContentScannedPdf() throws Exception { Path path = 
Paths.get(ClassLoader.getSystemResource("file/scanned.pdf").toURI()); - File file = new File(); - file.setMimeType(MimeType.APPLICATION_PDF); - String content = FileUtil.extractContent("eng", file, path, path); - System.out.println(content); + FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "scanned.pdf")); + Assert.assertNotNull(formatHandler); + Assert.assertTrue(formatHandler instanceof PdfFormatHandler); + String content = formatHandler.extractContent("eng", path); Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights.")); } diff --git a/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java index 06a0e159..e7e2c286 100644 --- a/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java +++ b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java @@ -1,6 +1,5 @@ package com.sismics.util; -import com.sismics.docs.core.model.jpa.File; import com.sismics.util.mime.MimeType; import com.sismics.util.mime.MimeTypeUtil; import org.junit.Assert; @@ -19,14 +18,10 @@ public class TestMimeTypeUtil { public void guessOpenDocumentFormatTest() throws Exception { // Detect ODT files Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI()); - File file = new File(); - file.setMimeType(MimeType.APPLICATION_ZIP); - Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessOpenDocumentFormat(file, path)); + Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessMimeType(path, "document.odt")); // Detect DOCX files path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI()); - file = new File(); - file.setMimeType(MimeType.APPLICATION_ZIP); - Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessOpenDocumentFormat(file, path)); + Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessMimeType(path, "document.odt")); } }