diff --git a/README.md b/README.md index 95f20e5f..6383b142 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Features - Responsive user interface - Optical character recognition -- Support image and PDF files +- Support image, PDF, ODT and DOCX files - Flexible search engine - Full text search in image and PDF - 256-bit AES encryption diff --git a/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java b/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java index 41a128f3..8eab7115 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java +++ b/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java @@ -28,58 +28,43 @@ public class FileCreatedAsyncEvent { private InputStream inputStream; /** - * Getter of file. - * - * @return the file + * Unencrypted input stream containing a PDF representation + * of the file. May be null if the PDF conversion is not + * necessary or not possible. */ + private InputStream pdfInputStream; + public File getFile() { return file; } - /** - * Setter of file. - * - * @param file file - */ public void setFile(File file) { this.file = file; } - /** - * Getter of document. - * - * @return the document - */ public Document getDocument() { return document; } - /** - * Setter of document. - * - * @param document document - */ public void setDocument(Document document) { this.document = document; } - /** - * Getter of inputStream. - * - * @return the inputStream - */ public InputStream getInputStream() { return inputStream; } - /** - * Setter de inputStream. 
- * - * @param inputStream inputStream - */ public void setInputStream(InputStream inputStream) { this.inputStream = inputStream; } + + public InputStream getPdfInputStream() { + return pdfInputStream; + } + + public void setPdfInputStream(InputStream pdfInputStream) { + this.pdfInputStream = pdfInputStream; + } @Override public String toString() { diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java index bda947ab..0eacdeca 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java @@ -12,7 +12,6 @@ import com.sismics.docs.core.event.FileCreatedAsyncEvent; import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.util.FileUtil; import com.sismics.docs.core.util.TransactionUtil; -import com.sismics.util.mime.MimeTypeUtil; /** * Listener on file created. 
@@ -39,12 +38,15 @@ public class FileCreatedAsyncListener { // Guess the mime type a second time, for open document format (first detected as simple ZIP file) final File file = fileCreatedAsyncEvent.getFile(); - file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileCreatedAsyncEvent.getInputStream())); // Extract text content from the file long startTime = System.currentTimeMillis(); - final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, fileCreatedAsyncEvent.getInputStream()); + final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, + fileCreatedAsyncEvent.getInputStream(), fileCreatedAsyncEvent.getPdfInputStream()); fileCreatedAsyncEvent.getInputStream().close(); + if (fileCreatedAsyncEvent.getPdfInputStream() != null) { + fileCreatedAsyncEvent.getPdfInputStream().close(); + } log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime)); // Store the text content in the database diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index 951590c6..59b30bec 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -1,6 +1,8 @@ package com.sismics.docs.core.util; import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -48,19 +50,16 @@ public class FileUtil { * @param document Document linked to the file * @param file File to extract * @param inputStream Unencrypted input stream + * @param pdfInputStream Unencrypted PDF input stream * @return Content extract */ - public static String extractContent(Document document, File file, InputStream inputStream) { + public static String extractContent(Document document, File 
file, InputStream inputStream, InputStream pdfInputStream) { String content = null; if (ImageUtil.isImage(file.getMimeType())) { content = ocrFile(inputStream, document); - } else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { - content = extractPdf(inputStream); - } else if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { - content = extractOpenDocumentText(inputStream); - } else if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) { - content = extractOfficeDocument(inputStream); + } else if (pdfInputStream != null) { + content = extractPdf(pdfInputStream); } return content; @@ -129,92 +128,80 @@ public class FileUtil { } /** - * Extract text from an open document text file. + * Convert a file to PDF if necessary. * - * @param inputStream Unencrypted input stream - * @return Content extracted + * @param inputStream InputStream + * @param file File + * @return PDF input stream + * @throws Exception */ - private static String extractOpenDocumentText(InputStream inputStream) { - String content = null; - Path tempFile = null; - try { - // Convert the ODT file to a temporary PDF file - tempFile = Files.createTempFile("sismicsdocs_", ".pdf"); - try (OutputStream out = Files.newOutputStream(tempFile)) { - OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); - PdfOptions options = PdfOptions.create(); - PdfConverter.getInstance().convert(document, out, options); - } - - // Extract content from the PDF file - try (InputStream pdfInputStream = Files.newInputStream(tempFile)) { - content = extractPdf(pdfInputStream); - } - - } catch (Exception e) { - log.error("Error while extracting text from the ODT", e); - } finally { - try { - Files.delete(tempFile); // Delete the temporary PDF file - } catch (IOException e) { - // Should not happen - } + public static InputStream convertToPdf(InputStream inputStream, File file) throws Exception { + if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { + // It's already PDF, just return 
the input + return inputStream; } - return content; + + if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) { + return convertOfficeDocument(inputStream); + } + + if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { + return convertOpenDocumentText(inputStream); + } + + // PDF conversion not necessary/possible + return null; } /** - * Extract text from an Office document. + * Convert an open document text file to PDF. * * @param inputStream Unencrypted input stream - * @return Content extracted + * @return PDF input stream + * @throws Exception */ - private static String extractOfficeDocument(InputStream inputStream) { - String content = null; - Path tempFile = null; - try { - // Convert the DOCX file to a temporary PDF file - tempFile = Files.createTempFile("sismicsdocs_", ".pdf"); - try (OutputStream out = Files.newOutputStream(tempFile)) { - XWPFDocument document = new XWPFDocument(inputStream); - org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); - org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, out, options); - } - - // Extract content from the PDF file - try (InputStream pdfInputStream = Files.newInputStream(tempFile)) { - content = extractPdf(pdfInputStream); - } - - } catch (Exception e) { - log.error("Error while extracting text from the DOCX", e); - } finally { - try { - Files.delete(tempFile); // Delete the temporary PDF file - } catch (IOException e) { - // Should not happen - } - } - return content; + private static InputStream convertOpenDocumentText(InputStream inputStream) throws Exception { + ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); + OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); + PdfOptions options = PdfOptions.create(); + PdfConverter.getInstance().convert(document, pdfOutputStream, options); + inputStream.reset(); + return new ByteArrayInputStream(pdfOutputStream.toByteArray()); + } + + /** 
+ * Convert an Office document to PDF. + * + * @param inputStream Unencrypted input stream + * @return PDF input stream + * @throws Exception + */ + private static InputStream convertOfficeDocument(InputStream inputStream) throws Exception { + ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); + XWPFDocument document = new XWPFDocument(inputStream); + org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); + org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, pdfOutputStream, options); + inputStream.reset(); + return new ByteArrayInputStream(pdfOutputStream.toByteArray()); } /** * Save a file on the storage filesystem. * * @param inputStream Unencrypted input stream + * @param pdfInputStream Unencrypted PDF input stream (may be null) * @param file File to save * @param privateKey Private key used for encryption * @throws Exception */ - public static void save(InputStream inputStream, File file, String privateKey) throws Exception { + public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception { Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey); Path path = DirectoryUtil.getStorageDirectory().resolve(file.getId()); Files.copy(new CipherInputStream(inputStream, cipher), path); + inputStream.reset(); // Generate file variations - inputStream.reset(); - saveVariations(file, inputStream, cipher); - inputStream.reset(); + saveVariations(file, inputStream, pdfInputStream, cipher); } /** @@ -222,25 +209,27 @@ public class FileUtil { * * @param file File from database * @param inputStream Unencrypted input stream + * @param pdfInputStream Unencrypted PDF input stream * @param cipher Cipher to use for encryption * @throws Exception */ - public static void saveVariations(File file, InputStream inputStream, Cipher cipher) throws Exception { + public static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) 
throws Exception { BufferedImage image = null; if (ImageUtil.isImage(file.getMimeType())) { image = ImageIO.read(inputStream); - } else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) { + inputStream.reset(); + } else if(pdfInputStream != null) { // Generate preview from the first page of the PDF PDDocument pdfDocument = null; try { - pdfDocument = PDDocument.load(inputStream); + pdfDocument = PDDocument.load(pdfInputStream); PDFRenderer renderer = new PDFRenderer(pdfDocument); image = renderer.renderImage(0); + pdfInputStream.reset(); } finally { pdfDocument.close(); } } - // TODO Generate thumbnails for DOCX/ODT documents (guess the MIME type earlier and build a PDF version now?) if (image != null) { // Generate thumbnails from image diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java index c9ecdeeb..2bb2dd38 100644 --- a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java +++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java @@ -1,9 +1,11 @@ package com.sismics.docs.core.util; +import java.io.ByteArrayInputStream; import java.io.InputStream; import junit.framework.Assert; +import org.apache.pdfbox.io.IOUtils; import org.junit.Test; import com.google.common.io.Resources; @@ -18,19 +20,25 @@ import com.sismics.util.mime.MimeType; public class TestFileUtil { @Test public void extractContentOpenDocumentTextTest() throws Exception { - try (InputStream inputStream = Resources.getResource("file/document.odt").openStream()) { + try (InputStream inputStream = Resources.getResource("file/document.odt").openStream(); + InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { File file = new File(); file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT); - Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream)); + try (InputStream pdfInputStream = 
FileUtil.convertToPdf(bytesInputStream, file)) { + Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream, pdfInputStream)); + } } } @Test public void extractContentOfficeDocumentTest() throws Exception { - try (InputStream inputStream = Resources.getResource("file/document.docx").openStream()) { + try (InputStream inputStream = Resources.getResource("file/document.docx").openStream(); + InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { File file = new File(); file.setMimeType(MimeType.OFFICE_DOCUMENT); - Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream)); + try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) { + Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream, pdfInputStream)); + } } } } diff --git a/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java b/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java index 69052f27..4ddc0621 100644 --- a/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java +++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java @@ -146,8 +146,14 @@ public class FileResource extends BaseResource { file.setUserId(principal.getId()); String fileId = fileDao.create(file); + // Guess the mime type a second time, for open document format (first detected as simple ZIP file) + file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileInputStream)); + + // Convert to PDF if necessary (for thumbnail and text extraction) + InputStream pdfIntputStream = FileUtil.convertToPdf(fileInputStream, file); + // Save the file - FileUtil.save(fileInputStream, file, user.getPrivateKey()); + FileUtil.save(fileInputStream, pdfIntputStream, file, user.getPrivateKey()); // Update the user quota user.setStorageCurrent(user.getStorageCurrent() + 
fileData.length); @@ -159,6 +165,7 @@ public class FileResource extends BaseResource { fileCreatedAsyncEvent.setDocument(document); fileCreatedAsyncEvent.setFile(file); fileCreatedAsyncEvent.setInputStream(fileInputStream); + fileCreatedAsyncEvent.setPdfInputStream(pdfIntputStream); AppContext.getInstance().getAsyncEventBus().post(fileCreatedAsyncEvent); } diff --git a/docs-web/src/main/webapp/src/partial/docs/document.edit.html b/docs-web/src/main/webapp/src/partial/docs/document.edit.html index 717fc3a5..736f1136 100644 --- a/docs-web/src/main/webapp/src/partial/docs/document.edit.html +++ b/docs-web/src/main/webapp/src/partial/docs/document.edit.html @@ -39,7 +39,8 @@