diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index 82f9de31..64d0552e 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -1,47 +1,27 @@ package com.sismics.docs.core.util; import java.awt.image.BufferedImage; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; -import java.util.List; import javax.crypto.Cipher; import javax.crypto.CipherInputStream; import javax.crypto.CipherOutputStream; import javax.imageio.ImageIO; -import org.apache.pdfbox.io.MemoryUsageSetting; -import org.apache.pdfbox.multipdf.PDFMergerUtility; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory; -import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory; -import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.apache.pdfbox.rendering.PDFRenderer; -import org.apache.pdfbox.text.PDFTextStripper; -import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.imgscalr.Scalr; import org.imgscalr.Scalr.Method; import org.imgscalr.Scalr.Mode; -import org.odftoolkit.odfdom.converter.pdf.PdfConverter; -import org.odftoolkit.odfdom.converter.pdf.PdfOptions; -import org.odftoolkit.odfdom.doc.OdfTextDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.io.Closer; import com.sismics.docs.core.model.jpa.Document; import com.sismics.docs.core.model.jpa.File; import com.sismics.tess4j.Tesseract; import com.sismics.util.ImageUtil; -import com.sismics.util.mime.MimeType; /** * File entity utilities. @@ -69,7 +49,7 @@ public class FileUtil { if (ImageUtil.isImage(file.getMimeType())) { content = ocrFile(inputStream, document); } else if (pdfInputStream != null) { - content = extractPdf(pdfInputStream); + content = PdfUtil.extractPdf(pdfInputStream); } return content; @@ -109,99 +89,6 @@ public class FileUtil { return content; } - /** - * Extract text from a PDF. - * - * @param inputStream Unencrypted input stream - * @return Content extracted - */ - private static String extractPdf(InputStream inputStream) { - String content = null; - PDDocument pdfDocument = null; - try { - PDFTextStripper stripper = new PDFTextStripper(); - pdfDocument = PDDocument.load(inputStream); - content = stripper.getText(pdfDocument); - } catch (IOException e) { - log.error("Error while extracting text from the PDF", e); - } finally { - if (pdfDocument != null) { - try { - pdfDocument.close(); - } catch (IOException e) { - // NOP - } - } - } - - return content; - } - - /** - * Convert a file to PDF if necessary. - * - * @param file File - * @param inputStream InputStream - * @param reset Reset the stream after usage - * @return PDF input stream - * @throws Exception - */ - public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception { - if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { - // It's already PDF, just return the input - return inputStream; - } - - if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) { - return convertOfficeDocument(inputStream, reset); - } - - if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { - return convertOpenDocumentText(inputStream, reset); - } - - // PDF conversion not necessary/possible - return null; - } - - /** - * Convert an open document text file to PDF. - * - * @param inputStream Unencrypted input stream - * @param reset Reset the stream after usage - * @return PDF input stream - * @throws Exception - */ - private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception { - ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); - OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); - PdfOptions options = PdfOptions.create(); - PdfConverter.getInstance().convert(document, pdfOutputStream, options); - if (reset) { - inputStream.reset(); - } - return new ByteArrayInputStream(pdfOutputStream.toByteArray()); - } - - /** - * Convert an Office document to PDF. - * - * @param inputStream Unencrypted input stream - * @param reset Reset the stream after usage - * @return PDF input stream - * @throws Exception - */ - private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception { - ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); - XWPFDocument document = new XWPFDocument(inputStream); - org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); - org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, pdfOutputStream, options); - if (reset) { - inputStream.reset(); - } - return new ByteArrayInputStream(pdfOutputStream.toByteArray()); - } - /** * Save a file on the storage filesystem. * @@ -237,15 +124,8 @@ public class FileUtil { inputStream.reset(); } else if(pdfInputStream != null) { // Generate preview from the first page of the PDF - PDDocument pdfDocument = null; - try { - pdfDocument = PDDocument.load(pdfInputStream); - PDFRenderer renderer = new PDFRenderer(pdfDocument); - image = renderer.renderImage(0); - pdfInputStream.reset(); - } finally { - pdfDocument.close(); - } + image = PdfUtil.renderFirstPage(pdfInputStream); + pdfInputStream.reset(); } if (image != null) { @@ -289,94 +169,4 @@ public class FileUtil { Files.delete(thumbnailFile); } } - - /** - * Convert a document and its files to a merged PDF file. - * - * @param fileList List of files - * @param fitImageToPage Fill images to the page - * @param margin Margins in millimeters - * @return PDF input stream - * @throws IOException - */ - public static InputStream convertToPdf(List fileList, boolean fitImageToPage, int margin) throws Exception { - // TODO PDF Export: Option to add a front page with: - // document title, document description, creator, date created, language, - // list of all files (and information if it is in this document or not) - // TODO PDF Export: Option to add the comments - - // Create a blank PDF - Closer closer = Closer.create(); - MemoryUsageSetting memUsageSettings = MemoryUsageSetting.setupMixed(1000000); // 1MB max memory usage - memUsageSettings.setTempDir(new java.io.File(System.getProperty("java.io.tmpdir"))); // To OS temp - float mmPerInch = 1 / (10 * 2.54f) * 72f; - - try (PDDocument doc = new PDDocument(memUsageSettings)) { - // Add files - for (File file : fileList) { - Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId()); - try (InputStream storedFileInputStream = file.getPrivateKey() == null ? // Try to decrypt the file if we have a private key available - Files.newInputStream(storedFile) : EncryptionUtil.decryptInputStream(Files.newInputStream(storedFile), file.getPrivateKey())) { - if (ImageUtil.isImage(file.getMimeType())) { - PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages - try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) { - // Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension - PDImageXObject pdImage = null; - if (file.getMimeType().equals(MimeType.IMAGE_JPEG)) { - pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream); - } else if (file.getMimeType().equals(MimeType.IMAGE_GIF) || file.getMimeType().equals(MimeType.IMAGE_PNG)) { - BufferedImage bim = ImageIO.read(storedFileInputStream); - pdImage = LosslessFactory.createFromImage(doc, bim); - } - - if (fitImageToPage) { - // Fill the page with the image - float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * mmPerInch; - float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * mmPerInch; - - // Compare page format and image format - if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) { - float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight(); - contentStream.drawImage(pdImage, margin * mmPerInch, heightAvailable + margin * mmPerInch - imageHeight, - widthAvailable, imageHeight); - } else { - float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth(); - contentStream.drawImage(pdImage, margin * mmPerInch, margin * mmPerInch, - imageWidth, heightAvailable); - } - } else { - // Draw the image as is - contentStream.drawImage(pdImage, margin * mmPerInch, - page.getMediaBox().getHeight() - pdImage.getHeight() - margin * mmPerInch); - } - } - doc.addPage(page); - } else { - // Try to convert the file to PDF - InputStream pdfInputStream = convertToPdf(file, storedFileInputStream, false); - if (pdfInputStream != null) { - // This file is convertible to PDF, just add it to the end - try { - PDDocument mergeDoc = PDDocument.load(pdfInputStream, memUsageSettings); - closer.register(mergeDoc); - PDFMergerUtility pdfMergerUtility = new PDFMergerUtility(); - pdfMergerUtility.appendDocument(doc, mergeDoc); - } finally { - pdfInputStream.close(); - } - } - - // All other non-PDF-convertible files are ignored - } - } - } - - // Save to a temporary file - try (TemporaryFileStream temporaryFileStream = new TemporaryFileStream()) { - doc.save(temporaryFileStream.openWriteStream()); - closer.close(); // Close all remaining opened PDF - return temporaryFileStream.openReadStream(); - } - } - } } diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java new file mode 100644 index 00000000..ebf64351 --- /dev/null +++ b/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java @@ -0,0 +1,245 @@ +package com.sismics.docs.core.util; + +import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import javax.imageio.ImageIO; + +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.multipdf.PDFMergerUtility; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory; +import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.apache.pdfbox.rendering.PDFRenderer; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.odftoolkit.odfdom.converter.pdf.PdfConverter; +import org.odftoolkit.odfdom.converter.pdf.PdfOptions; +import org.odftoolkit.odfdom.doc.OdfTextDocument; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.io.Closer; +import com.sismics.docs.core.model.jpa.File; +import com.sismics.util.ImageUtil; +import com.sismics.util.mime.MimeType; + +/** + * PDF utilities. + * + * @author bgamard + */ +public class PdfUtil { + /** + * Logger. + */ + private static final Logger log = LoggerFactory.getLogger(PdfUtil.class); + + /** + * Extract text from a PDF. + * + * @param inputStream Unencrypted input stream + * @return Content extracted + */ + public static String extractPdf(InputStream inputStream) { + String content = null; + PDDocument pdfDocument = null; + try { + PDFTextStripper stripper = new PDFTextStripper(); + pdfDocument = PDDocument.load(inputStream); + content = stripper.getText(pdfDocument); + } catch (IOException e) { + log.error("Error while extracting text from the PDF", e); + } finally { + if (pdfDocument != null) { + try { + pdfDocument.close(); + } catch (IOException e) { + // NOP + } + } + } + + return content; + } + + /** + * Convert a file to PDF if necessary. + * + * @param file File + * @param inputStream InputStream + * @param reset Reset the stream after usage + * @return PDF input stream + * @throws Exception + */ + public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception { + if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { + // It's already PDF, just return the input + return inputStream; + } + + if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) { + return convertOfficeDocument(inputStream, reset); + } + + if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { + return convertOpenDocumentText(inputStream, reset); + } + + // PDF conversion not necessary/possible + return null; + } + + /** + * Convert an open document text file to PDF. + * + * @param inputStream Unencrypted input stream + * @param reset Reset the stream after usage + * @return PDF input stream + * @throws Exception + */ + private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception { + ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); + OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); + PdfOptions options = PdfOptions.create(); + PdfConverter.getInstance().convert(document, pdfOutputStream, options); + if (reset) { + inputStream.reset(); + } + return new ByteArrayInputStream(pdfOutputStream.toByteArray()); + } + + /** + * Convert an Office document to PDF. + * + * @param inputStream Unencrypted input stream + * @param reset Reset the stream after usage + * @return PDF input stream + * @throws Exception + */ + private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception { + ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); + XWPFDocument document = new XWPFDocument(inputStream); + org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); + org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, pdfOutputStream, options); + if (reset) { + inputStream.reset(); + } + return new ByteArrayInputStream(pdfOutputStream.toByteArray()); + } + + /** + * Convert a document and its files to a merged PDF file. + * + * @param fileList List of files + * @param fitImageToPage Fit images to the page + * @param margin Margins in millimeters + * @return PDF input stream + * @throws IOException + */ + public static InputStream convertToPdf(List fileList, boolean fitImageToPage, int margin) throws Exception { + // TODO PDF Export: Option to add a front page with: + // document title, document description, creator, date created, language, + // list of all files (and information if it is in this document or not) + // TODO PDF Export: Option to add the comments + + // Create a blank PDF + Closer closer = Closer.create(); + MemoryUsageSetting memUsageSettings = MemoryUsageSetting.setupMixed(1000000); // 1MB max memory usage + memUsageSettings.setTempDir(new java.io.File(System.getProperty("java.io.tmpdir"))); // To OS temp + float mmPerInch = 1 / (10 * 2.54f) * 72f; + + try (PDDocument doc = new PDDocument(memUsageSettings)) { + // Add files + for (File file : fileList) { + Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId()); + try (InputStream storedFileInputStream = file.getPrivateKey() == null ? // Try to decrypt the file if we have a private key available + Files.newInputStream(storedFile) : EncryptionUtil.decryptInputStream(Files.newInputStream(storedFile), file.getPrivateKey())) { + if (ImageUtil.isImage(file.getMimeType())) { + PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages + try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) { + // Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension + PDImageXObject pdImage = null; + if (file.getMimeType().equals(MimeType.IMAGE_JPEG)) { + pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream); + } else if (file.getMimeType().equals(MimeType.IMAGE_GIF) || file.getMimeType().equals(MimeType.IMAGE_PNG)) { + BufferedImage bim = ImageIO.read(storedFileInputStream); + pdImage = LosslessFactory.createFromImage(doc, bim); + } + + // Do we want to fill the page with the image? + if (fitImageToPage) { + // Fill the page with the image + float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * mmPerInch; + float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * mmPerInch; + + // Compare page format and image format + if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) { + float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight(); + contentStream.drawImage(pdImage, margin * mmPerInch, heightAvailable + margin * mmPerInch - imageHeight, + widthAvailable, imageHeight); + } else { + float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth(); + contentStream.drawImage(pdImage, margin * mmPerInch, margin * mmPerInch, + imageWidth, heightAvailable); + } + } else { + // Draw the image as is + contentStream.drawImage(pdImage, margin * mmPerInch, + page.getMediaBox().getHeight() - pdImage.getHeight() - margin * mmPerInch); + } + } + doc.addPage(page); + } else { + // Try to convert the file to PDF + InputStream pdfInputStream = convertToPdf(file, storedFileInputStream, false); + if (pdfInputStream != null) { + // This file is convertible to PDF, just add it to the end + try { + PDDocument mergeDoc = PDDocument.load(pdfInputStream, memUsageSettings); + closer.register(mergeDoc); + PDFMergerUtility pdfMergerUtility = new PDFMergerUtility(); + pdfMergerUtility.appendDocument(doc, mergeDoc); + } finally { + pdfInputStream.close(); + } + } + + // All other non-PDF-convertible files are ignored + } + } + } + + // Save to a temporary file + try (TemporaryFileStream temporaryFileStream = new TemporaryFileStream()) { + doc.save(temporaryFileStream.openWriteStream()); + closer.close(); // Close all remaining opened PDF + return temporaryFileStream.openReadStream(); + } + } + } + + /** + * Render the first page of a PDF. + * + * @param inputStream PDF document + * @return Render of the first page + * @throws IOException + */ + public static BufferedImage renderFirstPage(InputStream inputStream) throws IOException { + try (PDDocument pdfDocument = PDDocument.load(inputStream)) { + PDFRenderer renderer = new PDFRenderer(pdfDocument); + return renderer.renderImage(0); + } + } +} diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java index e8dff58a..7ec18c99 100644 --- a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java +++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java @@ -4,8 +4,6 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.StandardCopyOption; -import junit.framework.Assert; - import org.junit.Test; import com.google.common.collect.Lists; @@ -13,6 +11,8 @@ import com.google.common.io.Resources; import com.sismics.docs.core.model.jpa.File; import com.sismics.util.mime.MimeType; +import junit.framework.Assert; + /** * Test of the file entity utilities. * @@ -24,7 +24,7 @@ public class TestFileUtil { try (InputStream inputStream = Resources.getResource("file/document.odt").openStream()) { File file = new File(); file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT); - try (InputStream pdfInputStream = FileUtil.convertToPdf(file, inputStream, false)) { + try (InputStream pdfInputStream = PdfUtil.convertToPdf(file, inputStream, false)) { String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream); Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen.")); } @@ -36,7 +36,7 @@ public class TestFileUtil { try (InputStream inputStream = Resources.getResource("file/document.docx").openStream()) { File file = new File(); file.setMimeType(MimeType.OFFICE_DOCUMENT); - try (InputStream pdfInputStream = FileUtil.convertToPdf(file, inputStream, false)) { + try (InputStream pdfInputStream = PdfUtil.convertToPdf(file, inputStream, false)) { String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream); Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen.")); } @@ -81,7 +81,7 @@ public class TestFileUtil { file4.setId("document_odt"); file4.setMimeType(MimeType.OPEN_DOCUMENT_TEXT); - FileUtil.convertToPdf(Lists.newArrayList(file0, file1, file2, file3, file4), true, 10).close(); + PdfUtil.convertToPdf(Lists.newArrayList(file0, file1, file2, file3, file4), true, 10).close(); } } } diff --git a/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java b/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java index 539f79e2..c79dba41 100644 --- a/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java +++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java @@ -53,6 +53,7 @@ import com.sismics.docs.core.model.jpa.User; import com.sismics.docs.core.util.DirectoryUtil; import com.sismics.docs.core.util.EncryptionUtil; import com.sismics.docs.core.util.FileUtil; +import com.sismics.docs.core.util.PdfUtil; import com.sismics.rest.exception.ClientException; import com.sismics.rest.exception.ForbiddenClientException; import com.sismics.rest.exception.ServerException; @@ -150,7 +151,7 @@ public class FileResource extends BaseResource { file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileInputStream)); // Convert to PDF if necessary (for thumbnail and text extraction) - InputStream pdfIntputStream = FileUtil.convertToPdf(file, fileInputStream, true); + InputStream pdfIntputStream = PdfUtil.convertToPdf(file, fileInputStream, true); // Save the file FileUtil.save(fileInputStream, pdfIntputStream, file, user.getPrivateKey());