diff --git a/README.md b/README.md index 95f20e5f..6383b142 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Features - Responsive user interface - Optical character recognition -- Support image and PDF files +- Support image, PDF, ODT and DOCX files - Flexible search engine - Full text search in image and PDF - 256-bit AES encryption diff --git a/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java b/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java index 41a128f3..8eab7115 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java +++ b/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java @@ -28,58 +28,43 @@ public class FileCreatedAsyncEvent { private InputStream inputStream; /** - * Getter of file. - * - * @return the file + * Unencrypted input stream containing a PDF representation + * of the file. May be null if the PDF conversion is not + * necessary or not possible. */ + private InputStream pdfInputStream; + public File getFile() { return file; } - /** - * Setter of file. - * - * @param file file - */ public void setFile(File file) { this.file = file; } - /** - * Getter of document. - * - * @return the document - */ public Document getDocument() { return document; } - /** - * Setter of document. - * - * @param document document - */ public void setDocument(Document document) { this.document = document; } - /** - * Getter of inputStream. - * - * @return the inputStream - */ public InputStream getInputStream() { return inputStream; } - /** - * Setter de inputStream. 
- * - * @param inputStream inputStream - */ public void setInputStream(InputStream inputStream) { this.inputStream = inputStream; } + + public InputStream getPdfInputStream() { + return pdfInputStream; + } + + public void setPdfInputStream(InputStream pdfInputStream) { + this.pdfInputStream = pdfInputStream; + } @Override public String toString() { diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java index bda947ab..0eacdeca 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java @@ -12,7 +12,6 @@ import com.sismics.docs.core.event.FileCreatedAsyncEvent; import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.util.FileUtil; import com.sismics.docs.core.util.TransactionUtil; -import com.sismics.util.mime.MimeTypeUtil; /** * Listener on file created. 
@@ -39,12 +38,15 @@ public class FileCreatedAsyncListener { // Guess the mime type a second time, for open document format (first detected as simple ZIP file) final File file = fileCreatedAsyncEvent.getFile(); - file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileCreatedAsyncEvent.getInputStream())); // Extract text content from the file long startTime = System.currentTimeMillis(); - final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, fileCreatedAsyncEvent.getInputStream()); + final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, + fileCreatedAsyncEvent.getInputStream(), fileCreatedAsyncEvent.getPdfInputStream()); fileCreatedAsyncEvent.getInputStream().close(); + if (fileCreatedAsyncEvent.getPdfInputStream() != null) { + fileCreatedAsyncEvent.getPdfInputStream().close(); + } log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime)); // Store the text content in the database diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index 951590c6..59b30bec 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -1,6 +1,8 @@ package com.sismics.docs.core.util; import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -48,19 +50,16 @@ public class FileUtil { * @param document Document linked to the file * @param file File to extract * @param inputStream Unencrypted input stream + * @param pdfInputStream Unencrypted PDF input stream * @return Content extract */ - public static String extractContent(Document document, File file, InputStream inputStream) { + public static String extractContent(Document document, File 
file, InputStream inputStream, InputStream pdfInputStream) { String content = null; if (ImageUtil.isImage(file.getMimeType())) { content = ocrFile(inputStream, document); - } else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { - content = extractPdf(inputStream); - } else if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { - content = extractOpenDocumentText(inputStream); - } else if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) { - content = extractOfficeDocument(inputStream); + } else if (pdfInputStream != null) { + content = extractPdf(pdfInputStream); } return content; @@ -129,92 +128,80 @@ public class FileUtil { } /** - * Extract text from an open document text file. + * Convert a file to PDF if necessary. * - * @param inputStream Unencrypted input stream - * @return Content extracted + * @param inputStream InputStream + * @param file File + * @return PDF input stream + * @throws Exception */ - private static String extractOpenDocumentText(InputStream inputStream) { - String content = null; - Path tempFile = null; - try { - // Convert the ODT file to a temporary PDF file - tempFile = Files.createTempFile("sismicsdocs_", ".pdf"); - try (OutputStream out = Files.newOutputStream(tempFile)) { - OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); - PdfOptions options = PdfOptions.create(); - PdfConverter.getInstance().convert(document, out, options); - } - - // Extract content from the PDF file - try (InputStream pdfInputStream = Files.newInputStream(tempFile)) { - content = extractPdf(pdfInputStream); - } - - } catch (Exception e) { - log.error("Error while extracting text from the ODT", e); - } finally { - try { - Files.delete(tempFile); // Delete the temporary PDF file - } catch (IOException e) { - // Should not happen - } + public static InputStream convertToPdf(InputStream inputStream, File file) throws Exception { + if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { + // It's already PDF, just return 
the input + return inputStream; } - return content; + + if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) { + return convertOfficeDocument(inputStream); + } + + if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { + return convertOpenDocumentText(inputStream); + } + + // PDF conversion not necessary/possible + return null; } /** - * Extract text from an Office document. + * Convert an open document text file to PDF. * * @param inputStream Unencrypted input stream - * @return Content extracted + * @return PDF input stream + * @throws Exception */ - private static String extractOfficeDocument(InputStream inputStream) { - String content = null; - Path tempFile = null; - try { - // Convert the DOCX file to a temporary PDF file - tempFile = Files.createTempFile("sismicsdocs_", ".pdf"); - try (OutputStream out = Files.newOutputStream(tempFile)) { - XWPFDocument document = new XWPFDocument(inputStream); - org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); - org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, out, options); - } - - // Extract content from the PDF file - try (InputStream pdfInputStream = Files.newInputStream(tempFile)) { - content = extractPdf(pdfInputStream); - } - - } catch (Exception e) { - log.error("Error while extracting text from the DOCX", e); - } finally { - try { - Files.delete(tempFile); // Delete the temporary PDF file - } catch (IOException e) { - // Should not happen - } - } - return content; + private static InputStream convertOpenDocumentText(InputStream inputStream) throws Exception { + ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); + OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); + PdfOptions options = PdfOptions.create(); + PdfConverter.getInstance().convert(document, pdfOutputStream, options); + inputStream.reset(); + return new ByteArrayInputStream(pdfOutputStream.toByteArray()); + } + + /** 
+ * Convert an Office document to PDF. + * + * @param inputStream Unencrypted input stream + * @return PDF input stream + * @throws Exception + */ + private static InputStream convertOfficeDocument(InputStream inputStream) throws Exception { + ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); + XWPFDocument document = new XWPFDocument(inputStream); + org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); + org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, pdfOutputStream, options); + inputStream.reset(); + return new ByteArrayInputStream(pdfOutputStream.toByteArray()); } /** * Save a file on the storage filesystem. * * @param inputStream Unencrypted input stream + * @param pdfInputStream Unencrypted PDF input stream (may be null) * @param file File to save * @param privateKey Private key used for encryption * @throws Exception */ - public static void save(InputStream inputStream, File file, String privateKey) throws Exception { + public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception { Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey); Path path = DirectoryUtil.getStorageDirectory().resolve(file.getId()); Files.copy(new CipherInputStream(inputStream, cipher), path); + inputStream.reset(); // Generate file variations - inputStream.reset(); - saveVariations(file, inputStream, cipher); - inputStream.reset(); + saveVariations(file, inputStream, pdfInputStream, cipher); } /** @@ -222,25 +209,27 @@ public class FileUtil { * * @param file File from database * @param inputStream Unencrypted input stream + * @param pdfInputStream Unencrypted PDF input stream * @param cipher Cipher to use for encryption * @throws Exception */ - public static void saveVariations(File file, InputStream inputStream, Cipher cipher) throws Exception { + public static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) 
throws Exception { BufferedImage image = null; if (ImageUtil.isImage(file.getMimeType())) { image = ImageIO.read(inputStream); - } else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) { + inputStream.reset(); + } else if(pdfInputStream != null) { // Generate preview from the first page of the PDF PDDocument pdfDocument = null; try { - pdfDocument = PDDocument.load(inputStream); + pdfDocument = PDDocument.load(pdfInputStream); PDFRenderer renderer = new PDFRenderer(pdfDocument); image = renderer.renderImage(0); + pdfInputStream.reset(); } finally { pdfDocument.close(); } } - // TODO Generate thumbnails for DOCX/ODT documents (guess the MIME type earlier and build a PDF version now?) if (image != null) { // Generate thumbnails from image diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java index c9ecdeeb..2bb2dd38 100644 --- a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java +++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java @@ -1,9 +1,11 @@ package com.sismics.docs.core.util; +import java.io.ByteArrayInputStream; import java.io.InputStream; import junit.framework.Assert; +import org.apache.pdfbox.io.IOUtils; import org.junit.Test; import com.google.common.io.Resources; @@ -18,19 +20,25 @@ import com.sismics.util.mime.MimeType; public class TestFileUtil { @Test public void extractContentOpenDocumentTextTest() throws Exception { - try (InputStream inputStream = Resources.getResource("file/document.odt").openStream()) { + try (InputStream inputStream = Resources.getResource("file/document.odt").openStream(); + InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { File file = new File(); file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT); - Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream)); + try (InputStream pdfInputStream = 
FileUtil.convertToPdf(bytesInputStream, file)) { + Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream, pdfInputStream)); + } } } @Test public void extractContentOfficeDocumentTest() throws Exception { - try (InputStream inputStream = Resources.getResource("file/document.docx").openStream()) { + try (InputStream inputStream = Resources.getResource("file/document.docx").openStream(); + InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { File file = new File(); file.setMimeType(MimeType.OFFICE_DOCUMENT); - Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream)); + try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) { + Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream, pdfInputStream)); + } } } } diff --git a/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java b/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java index 69052f27..4ddc0621 100644 --- a/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java +++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java @@ -146,8 +146,14 @@ public class FileResource extends BaseResource { file.setUserId(principal.getId()); String fileId = fileDao.create(file); + // Guess the mime type a second time, for open document format (first detected as simple ZIP file) + file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileInputStream)); + + // Convert to PDF if necessary (for thumbnail and text extraction) + InputStream pdfIntputStream = FileUtil.convertToPdf(fileInputStream, file); + // Save the file - FileUtil.save(fileInputStream, file, user.getPrivateKey()); + FileUtil.save(fileInputStream, pdfIntputStream, file, user.getPrivateKey()); // Update the user quota user.setStorageCurrent(user.getStorageCurrent() + 
fileData.length); @@ -159,6 +165,7 @@ public class FileResource extends BaseResource { fileCreatedAsyncEvent.setDocument(document); fileCreatedAsyncEvent.setFile(file); fileCreatedAsyncEvent.setInputStream(fileInputStream); + fileCreatedAsyncEvent.setPdfInputStream(pdfIntputStream); AppContext.getInstance().getAsyncEventBus().post(fileCreatedAsyncEvent); } diff --git a/docs-web/src/main/webapp/src/partial/docs/document.edit.html b/docs-web/src/main/webapp/src/partial/docs/document.edit.html index 717fc3a5..736f1136 100644 --- a/docs-web/src/main/webapp/src/partial/docs/document.edit.html +++ b/docs-web/src/main/webapp/src/partial/docs/document.edit.html @@ -39,7 +39,8 @@
+ accept="image/png,image/jpg,image/jpeg,image/gif,application/pdf,application/vnd.oasis.opendocument.text,application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ng-disabled="fileIsUploading">
+ {{ orphanFiles.length }} file{{ orphanFiles.length > 1 ? 's' : '' }} diff --git a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java index 3f1433fb..00ca2853 100644 --- a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java +++ b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java @@ -267,6 +267,124 @@ public class TestDocumentResource extends BaseJerseyTest { return json.getJsonArray("documents").size(); } + /** + * Test ODT extraction. + * + * @throws Exception + */ + @Test + public void testOdtExtraction() throws Exception { + // Login document_odt + clientUtil.createUser("document_odt"); + String documentOdtToken = clientUtil.login("document_odt"); + + // Create a document + long create1Date = new Date().getTime(); + JsonObject json = target().path("/document").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .put(Entity.form(new Form() + .param("title", "My super title document 1") + .param("description", "My super description for document 1") + .param("language", "eng") + .param("create_date", Long.toString(create1Date))), JsonObject.class); + String document1Id = json.getString("id"); + Assert.assertNotNull(document1Id); + + // Add an ODT file + String file1Id = null; + try (InputStream is = Resources.getResource("file/document.odt").openStream()) { + StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.odt"); + try (FormDataMultiPart multiPart = new FormDataMultiPart()) { + json = target() + .register(MultiPartFeature.class) + .path("/file").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart), + MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class); + file1Id = json.getString("id"); + Assert.assertNotNull(file1Id); + } + } + + // Search documents by query in 
full content + json = target().path("/document/list") + .queryParam("search", "full:ipsum") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .get(JsonObject.class); + Assert.assertTrue(json.getJsonArray("documents").size() == 1); + + // Get the file thumbnail data + Response response = target().path("/file/" + file1Id + "/data") + .queryParam("size", "thumb") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .get(); + InputStream is = (InputStream) response.getEntity(); + byte[] fileBytes = ByteStreams.toByteArray(is); + Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues + Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes)); + } + + /** + * Test DOCX extraction. + * + * @throws Exception + */ + @Test + public void testDocxExtraction() throws Exception { + // Login document_docx + clientUtil.createUser("document_docx"); + String documentDocxToken = clientUtil.login("document_docx"); + + // Create a document + long create1Date = new Date().getTime(); + JsonObject json = target().path("/document").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken) + .put(Entity.form(new Form() + .param("title", "My super title document 1") + .param("description", "My super description for document 1") + .param("language", "eng") + .param("create_date", Long.toString(create1Date))), JsonObject.class); + String document1Id = json.getString("id"); + Assert.assertNotNull(document1Id); + + // Add a DOCX file + String file1Id = null; + try (InputStream is = Resources.getResource("file/document.docx").openStream()) { + StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.docx"); + try (FormDataMultiPart multiPart = new FormDataMultiPart()) { + json = target() + .register(MultiPartFeature.class) + .path("/file").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, 
documentDocxToken) + .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart), + MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class); + file1Id = json.getString("id"); + Assert.assertNotNull(file1Id); + } + } + + // Search documents by query in full content + json = target().path("/document/list") + .queryParam("search", "full:dolor") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken) + .get(JsonObject.class); + Assert.assertTrue(json.getJsonArray("documents").size() == 1); + + // Get the file thumbnail data + Response response = target().path("/file/" + file1Id + "/data") + .queryParam("size", "thumb") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken) + .get(); + InputStream is = (InputStream) response.getEntity(); + byte[] fileBytes = ByteStreams.toByteArray(is); + Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues + Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes)); + } + /** * Test PDF extraction. 
* @@ -274,14 +392,14 @@ public class TestDocumentResource extends BaseJerseyTest { */ @Test public void testPdfExtraction() throws Exception { - // Login document2 - clientUtil.createUser("document2"); - String document2Token = clientUtil.login("document2"); + // Login document_pdf + clientUtil.createUser("document_pdf"); + String documentPdfToken = clientUtil.login("document_pdf"); // Create a document long create1Date = new Date().getTime(); JsonObject json = target().path("/document").request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .put(Entity.form(new Form() .param("title", "My super title document 1") .param("description", "My super description for document 1") @@ -298,7 +416,7 @@ public class TestDocumentResource extends BaseJerseyTest { json = target() .register(MultiPartFeature.class) .path("/file").request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart), MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class); file1Id = json.getString("id"); @@ -310,7 +428,7 @@ public class TestDocumentResource extends BaseJerseyTest { json = target().path("/document/list") .queryParam("search", "full:vrandecic") .request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .get(JsonObject.class); Assert.assertTrue(json.getJsonArray("documents").size() == 1); @@ -318,7 +436,7 @@ public class TestDocumentResource extends BaseJerseyTest { Response response = target().path("/file/" + file1Id + "/data") .queryParam("size", "thumb") .request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .get(); InputStream is = (InputStream) response.getEntity(); byte[] fileBytes 
= ByteStreams.toByteArray(is); diff --git a/docs-web/src/test/resources/file/document.docx b/docs-web/src/test/resources/file/document.docx new file mode 100644 index 00000000..fb1e6c2c Binary files /dev/null and b/docs-web/src/test/resources/file/document.docx differ diff --git a/docs-web/src/test/resources/file/document.odt b/docs-web/src/test/resources/file/document.odt new file mode 100644 index 00000000..b7062de0 Binary files /dev/null and b/docs-web/src/test/resources/file/document.odt differ