From 046984a447c4012df45b5def14fdedf8e7b46f0b Mon Sep 17 00:00:00 2001 From: jendib Date: Sat, 5 Dec 2015 20:00:51 +0100 Subject: [PATCH 1/4] Closes #51: File sizes displayed in kB or MB --- .../webapp/src/app/docs/filter/Filesize.js | 18 ++++++++++++++++++ .../main/webapp/src/app/docs/filter/Newline.js | 4 ++-- .../main/webapp/src/app/docs/filter/Shorten.js | 2 +- .../webapp/src/app/share/filter/Filesize.js | 18 ++++++++++++++++++ .../webapp/src/app/share/filter/Newline.js | 4 ++-- docs-web/src/main/webapp/src/index.html | 1 + .../src/partial/docs/document.default.html | 2 +- .../partial/docs/document.view.content.html | 2 +- .../main/webapp/src/partial/share/share.html | 4 +++- docs-web/src/main/webapp/src/share.html | 1 + 10 files changed, 48 insertions(+), 8 deletions(-) create mode 100644 docs-web/src/main/webapp/src/app/docs/filter/Filesize.js create mode 100644 docs-web/src/main/webapp/src/app/share/filter/Filesize.js diff --git a/docs-web/src/main/webapp/src/app/docs/filter/Filesize.js b/docs-web/src/main/webapp/src/app/docs/filter/Filesize.js new file mode 100644 index 00000000..e3dfda68 --- /dev/null +++ b/docs-web/src/main/webapp/src/app/docs/filter/Filesize.js @@ -0,0 +1,18 @@ +'use strict'; + +/** + * Format file sizes. + */ +angular.module('docs').filter('filesize', function() { + return function(text) { + if (!text) { + return ''; + } + + var size = parseInt(text); + if (size > 1000000) { // 1MB + return Math.round(size / 1000000) + 'MB'; + } + return Math.round(size / 1000) + 'kB'; + } +}); \ No newline at end of file diff --git a/docs-web/src/main/webapp/src/app/docs/filter/Newline.js b/docs-web/src/main/webapp/src/app/docs/filter/Newline.js index c7cce4b7..e56e6bac 100644 --- a/docs-web/src/main/webapp/src/app/docs/filter/Newline.js +++ b/docs-web/src/main/webapp/src/app/docs/filter/Newline.js @@ -1,7 +1,7 @@ 'use strict'; /** - * Filter converting new lines in
+ * Filter converting new lines in
. */ angular.module('docs').filter('newline', function() { return function(text) { @@ -10,4 +10,4 @@ angular.module('docs').filter('newline', function() { } return text.replace(/\n/g, '
'); } -}) \ No newline at end of file +}); \ No newline at end of file diff --git a/docs-web/src/main/webapp/src/app/docs/filter/Shorten.js b/docs-web/src/main/webapp/src/app/docs/filter/Shorten.js index d2b055e1..d479cce1 100644 --- a/docs-web/src/main/webapp/src/app/docs/filter/Shorten.js +++ b/docs-web/src/main/webapp/src/app/docs/filter/Shorten.js @@ -10,4 +10,4 @@ angular.module('docs').filter('shorten', function() { } return text.substring(0, 1).toUpperCase(); } -}) \ No newline at end of file +}); \ No newline at end of file diff --git a/docs-web/src/main/webapp/src/app/share/filter/Filesize.js b/docs-web/src/main/webapp/src/app/share/filter/Filesize.js new file mode 100644 index 00000000..b6765b45 --- /dev/null +++ b/docs-web/src/main/webapp/src/app/share/filter/Filesize.js @@ -0,0 +1,18 @@ +'use strict'; + +/** + * Format file sizes. + */ +angular.module('share').filter('filesize', function() { + return function(text) { + if (!text) { + return ''; + } + + var size = parseInt(text); + if (size > 1000000) { // 1MB + return Math.round(size / 1000000) + 'MB'; + } + return Math.round(size / 1000) + 'kB'; + } +}); \ No newline at end of file diff --git a/docs-web/src/main/webapp/src/app/share/filter/Newline.js b/docs-web/src/main/webapp/src/app/share/filter/Newline.js index 571fa4d5..5e5c44f8 100644 --- a/docs-web/src/main/webapp/src/app/share/filter/Newline.js +++ b/docs-web/src/main/webapp/src/app/share/filter/Newline.js @@ -1,7 +1,7 @@ 'use strict'; /** - * Filter converting new lines in
+ * Filter converting new lines in
. */ angular.module('share').filter('newline', function() { return function(text) { @@ -10,4 +10,4 @@ angular.module('share').filter('newline', function() { } return text.replace(/\n/g, '
'); } -}) \ No newline at end of file +}); \ No newline at end of file diff --git a/docs-web/src/main/webapp/src/index.html b/docs-web/src/main/webapp/src/index.html index c71cf81d..59f8b030 100644 --- a/docs-web/src/main/webapp/src/index.html +++ b/docs-web/src/main/webapp/src/index.html @@ -63,6 +63,7 @@ + diff --git a/docs-web/src/main/webapp/src/partial/docs/document.default.html b/docs-web/src/main/webapp/src/partial/docs/document.default.html index d4842b17..0d814030 100644 --- a/docs-web/src/main/webapp/src/partial/docs/document.default.html +++ b/docs-web/src/main/webapp/src/partial/docs/document.default.html @@ -8,7 +8,7 @@
- +
diff --git a/docs-web/src/main/webapp/src/partial/docs/document.view.content.html b/docs-web/src/main/webapp/src/partial/docs/document.view.content.html index 5042d3ce..4d5d7a9d 100644 --- a/docs-web/src/main/webapp/src/partial/docs/document.view.content.html +++ b/docs-web/src/main/webapp/src/partial/docs/document.view.content.html @@ -6,7 +6,7 @@
- +
diff --git a/docs-web/src/main/webapp/src/partial/share/share.html b/docs-web/src/main/webapp/src/partial/share/share.html index 25833a68..d03fe326 100644 --- a/docs-web/src/main/webapp/src/partial/share/share.html +++ b/docs-web/src/main/webapp/src/partial/share/share.html @@ -18,7 +18,9 @@ diff --git a/docs-web/src/main/webapp/src/share.html b/docs-web/src/main/webapp/src/share.html index 6e401014..a8e8ea7d 100644 --- a/docs-web/src/main/webapp/src/share.html +++ b/docs-web/src/main/webapp/src/share.html @@ -37,6 +37,7 @@ + From 1a37d97a6179c927c055fc28b12d81b6ac6e8f7f Mon Sep 17 00:00:00 2001 From: jendib Date: Mon, 7 Dec 2015 23:53:30 +0100 Subject: [PATCH 2/4] #53: Handle and extract text content from DOCX and ODT files --- docs-core/pom.xml | 10 +++ .../sismics/docs/core/dao/jpa/FileDao.java | 1 + .../async/FileCreatedAsyncListener.java | 10 ++- .../com/sismics/docs/core/util/FileUtil.java | 79 ++++++++++++++++++ .../java/com/sismics/util/mime/MimeType.java | 4 + .../com/sismics/util/mime/MimeTypeUtil.java | 58 +++++++++++++ .../docs/core/util/TestEncryptUtil.java | 1 - .../sismics/docs/core/util/TestFileUtil.java | 36 ++++++++ .../com/sismics/util/TestMimeTypeUtil.java | 40 +++++++++ .../src/test/resources/file/document.docx | Bin 0 -> 4827 bytes .../src/test/resources/file/document.odt | Bin 0 -> 9267 bytes docs-parent/pom.xml | 13 +++ 12 files changed, 248 insertions(+), 4 deletions(-) create mode 100644 docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java create mode 100644 docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java create mode 100644 docs-core/src/test/resources/file/document.docx create mode 100644 docs-core/src/test/resources/file/document.odt diff --git a/docs-core/pom.xml b/docs-core/pom.xml index cf4d3719..3ad8e390 100644 --- a/docs-core/pom.xml +++ b/docs-core/pom.xml @@ -117,6 +117,16 @@ com.levigo.jbig2 levigo-jbig2-imageio + + + fr.opensagres.xdocreport + org.odftoolkit.odfdom.converter.pdf + + + + fr.opensagres.xdocreport + org.apache.poi.xwpf.converter.pdf + diff --git a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java index 28b7a92c..3aa3e310 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java +++ b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java @@ -142,6 +142,7 @@ public class FileDao { fileFromDb.setDocumentId(file.getDocumentId()); fileFromDb.setContent(file.getContent()); fileFromDb.setOrder(file.getOrder()); + fileFromDb.setMimeType(file.getMimeType()); return file; } diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java index 1b76cb0d..bda947ab 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java @@ -12,6 +12,7 @@ import com.sismics.docs.core.event.FileCreatedAsyncEvent; import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.util.FileUtil; import com.sismics.docs.core.util.TransactionUtil; +import com.sismics.util.mime.MimeTypeUtil; /** * Listener on file created. @@ -36,20 +37,23 @@ public class FileCreatedAsyncListener { log.info("File created event: " + fileCreatedAsyncEvent.toString()); } - // OCR the file + // Guess the mime type a second time, for open document format (first detected as simple ZIP file) final File file = fileCreatedAsyncEvent.getFile(); + file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileCreatedAsyncEvent.getInputStream())); + + // Extract text content from the file long startTime = System.currentTimeMillis(); final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, fileCreatedAsyncEvent.getInputStream()); fileCreatedAsyncEvent.getInputStream().close(); log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime)); - // Store the OCR-ization result in the database + // Store the text content in the database TransactionUtil.handle(new Runnable() { @Override public void run() { FileDao fileDao = new FileDao(); if (fileDao.getById(file.getId()) == null) { - // The file has been deleted since the OCR-ization started, ignore the result + // The file has been deleted since the text extraction started, ignore the result return; } diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index d836e94b..951590c6 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -15,9 +15,13 @@ import javax.imageio.ImageIO; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.imgscalr.Scalr; import org.imgscalr.Scalr.Method; import org.imgscalr.Scalr.Mode; +import org.odftoolkit.odfdom.converter.pdf.PdfConverter; +import org.odftoolkit.odfdom.converter.pdf.PdfOptions; +import org.odftoolkit.odfdom.doc.OdfTextDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -53,6 +57,10 @@ public class FileUtil { content = ocrFile(inputStream, document); } else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { content = extractPdf(inputStream); + } else if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { + content = extractOpenDocumentText(inputStream); + } else if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) { + content = extractOfficeDocument(inputStream); } return content; @@ -120,6 +128,76 @@ public class FileUtil { return content; } + /** + * Extract text from an open document text file. + * + * @param inputStream Unencrypted input stream + * @return Content extracted + */ + private static String extractOpenDocumentText(InputStream inputStream) { + String content = null; + Path tempFile = null; + try { + // Convert the ODT file to a temporary PDF file + tempFile = Files.createTempFile("sismicsdocs_", ".pdf"); + try (OutputStream out = Files.newOutputStream(tempFile)) { + OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); + PdfOptions options = PdfOptions.create(); + PdfConverter.getInstance().convert(document, out, options); + } + + // Extract content from the PDF file + try (InputStream pdfInputStream = Files.newInputStream(tempFile)) { + content = extractPdf(pdfInputStream); + } + + } catch (Exception e) { + log.error("Error while extracting text from the ODT", e); + } finally { + try { + Files.delete(tempFile); // Delete the temporary PDF file + } catch (IOException e) { + // Should not happen + } + } + return content; + } + + /** + * Extract text from an Office document. + * + * @param inputStream Unencrypted input stream + * @return Content extracted + */ + private static String extractOfficeDocument(InputStream inputStream) { + String content = null; + Path tempFile = null; + try { + // Convert the DOCX file to a temporary PDF file + tempFile = Files.createTempFile("sismicsdocs_", ".pdf"); + try (OutputStream out = Files.newOutputStream(tempFile)) { + XWPFDocument document = new XWPFDocument(inputStream); + org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); + org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, out, options); + } + + // Extract content from the PDF file + try (InputStream pdfInputStream = Files.newInputStream(tempFile)) { + content = extractPdf(pdfInputStream); + } + + } catch (Exception e) { + log.error("Error while extracting text from the DOCX", e); + } finally { + try { + Files.delete(tempFile); // Delete the temporary PDF file + } catch (IOException e) { + // Should not happen + } + } + return content; + } + /** * Save a file on the storage filesystem. * @@ -162,6 +240,7 @@ public class FileUtil { pdfDocument.close(); } } + // TODO Generate thumbnails for DOCX/ODT documents (guess the MIME type earlier and build a PDF version now?) if (image != null) { // Generate thumbnails from image diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java index 9e1d6916..e5821a62 100644 --- a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java +++ b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java @@ -18,4 +18,8 @@ public class MimeType { public static final String APPLICATION_ZIP = "application/zip"; public static final String APPLICATION_PDF = "application/pdf"; + + public static final String OPEN_DOCUMENT_TEXT = "application/vnd.oasis.opendocument.text"; + + public static final String OFFICE_DOCUMENT = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"; } diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java index 76c49ba1..058f7f7d 100644 --- a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java +++ b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java @@ -3,6 +3,13 @@ package com.sismics.util.mime; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import org.apache.commons.compress.archivers.ArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; +import org.apache.commons.compress.utils.IOUtils; + +import com.google.common.base.Charsets; +import com.sismics.docs.core.model.jpa.File; + /** * Utility to check MIME types. * @@ -77,8 +84,59 @@ public class MimeTypeUtil { return "ico"; case MimeType.APPLICATION_PDF: return "pdf"; + case MimeType.OPEN_DOCUMENT_TEXT: + return "odt"; + case MimeType.OFFICE_DOCUMENT: + return "docx"; default: return null; } } + + /** + * Guess the MIME type of open document formats (docx and odt). + * It's more costly than the simple header check, but needed because open document formats + * are simple ZIP files on the outside and much bigger on the inside. + * + * @param file File + * @param inputStream Input stream + * @return MIME type + */ + public static String guessOpenDocumentFormat(File file, InputStream inputStream) { + if (!MimeType.APPLICATION_ZIP.equals(file.getMimeType())) { + // open document formats are ZIP files + return file.getMimeType(); + } + + String mimeType = file.getMimeType(); + try (ZipArchiveInputStream archiveInputStream = new ZipArchiveInputStream(inputStream, Charsets.ISO_8859_1.name())) { + ArchiveEntry archiveEntry = archiveInputStream.getNextEntry(); + while (archiveEntry != null) { + if (archiveEntry.getName().equals("mimetype")) { + // Maybe it's an ODT file + String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1); + if (MimeType.OPEN_DOCUMENT_TEXT.equals(content.trim())) { + mimeType = MimeType.OPEN_DOCUMENT_TEXT; + break; + } + } else if (archiveEntry.getName().equals("[Content_Types].xml")) { + // Maybe it's a DOCX file + String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1); + if (content.contains(MimeType.OFFICE_DOCUMENT)) { + mimeType = MimeType.OFFICE_DOCUMENT; + break; + } + } + + archiveEntry = archiveInputStream.getNextEntry(); + } + + inputStream.reset(); + } catch (Exception e) { + // In case of any error, just give up and keep the ZIP MIME type + return file.getMimeType(); + } + + return mimeType; + } } diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java index 2e161157..b6f54964 100644 --- a/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java +++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java @@ -18,7 +18,6 @@ import com.google.common.io.ByteStreams; * @author bgamard */ public class TestEncryptUtil { - /** * Test private key. */ diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java new file mode 100644 index 00000000..c9ecdeeb --- /dev/null +++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java @@ -0,0 +1,36 @@ +package com.sismics.docs.core.util; + +import java.io.InputStream; + +import junit.framework.Assert; + +import org.junit.Test; + +import com.google.common.io.Resources; +import com.sismics.docs.core.model.jpa.File; +import com.sismics.util.mime.MimeType; + +/** + * Test of the file entity utilities. + * + * @author bgamard + */ +public class TestFileUtil { + @Test + public void extractContentOpenDocumentTextTest() throws Exception { + try (InputStream inputStream = Resources.getResource("file/document.odt").openStream()) { + File file = new File(); + file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT); + Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream)); + } + } + + @Test + public void extractContentOfficeDocumentTest() throws Exception { + try (InputStream inputStream = Resources.getResource("file/document.docx").openStream()) { + File file = new File(); + file.setMimeType(MimeType.OFFICE_DOCUMENT); + Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream)); + } + } +} diff --git a/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java new file mode 100644 index 00000000..b16e91c7 --- /dev/null +++ b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java @@ -0,0 +1,40 @@ +package com.sismics.util; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; + +import org.apache.commons.compress.utils.IOUtils; +import org.junit.Assert; +import org.junit.Test; + +import com.google.common.io.Resources; +import com.sismics.docs.core.model.jpa.File; +import com.sismics.util.mime.MimeType; +import com.sismics.util.mime.MimeTypeUtil; + +/** + * Test of the utilities to check MIME types. + * + * @author bgamard + */ +public class TestMimeTypeUtil { + + @Test + public void guessOpenDocumentFormatTest() throws Exception { + // Detect ODT files + try (InputStream inputStream = Resources.getResource("file/document.odt").openStream(); + InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { + File file = new File(); + file.setMimeType(MimeType.APPLICATION_ZIP); + Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream)); + } + + // Detect DOCX files + try (InputStream inputStream = Resources.getResource("file/document.docx").openStream(); + InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { + File file = new File(); + file.setMimeType(MimeType.APPLICATION_ZIP); + Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream)); + } + } +} diff --git a/docs-core/src/test/resources/file/document.docx b/docs-core/src/test/resources/file/document.docx new file mode 100644 index 0000000000000000000000000000000000000000..fb1e6c2c6783058e041c09cabcec60e2a99fbccb GIT binary patch literal 4827 zcma)A2UJtp+6}!|sZs?33Jd{3K#CeVp$3qSNS`1*v;fkp(vd1K6cOo)H0dprK_Ju! zQlwV_0YUmpeDj{@`)AhwW!pduCGL)>6=@q0U}XY*~JW)v|js3PIn_> zZ-;f*ao{{hGxO?%A|FKqAqQ5MAUa0FCua3_8gl5WahJQJE{L8}@UiB#V# z5)*NK_57NxmQJ^fCD+@+1lQ@B!?b{#Ie&BLOu9)&5xE%4tNYb#THu-viiIUd^Ow8G@ zud&-<=)wHc{=AL}mi<4p)rg2!Q8?TanDxry0sz{7%LonT3TtO8Z8v8ZILOM`%|^)E z(IGjpO|wH7!;6e!vhVYkBM+ZUZUWXxc<|*pouKQUG#Xcd6ZK89u!;I$&$-ppmb=n5 zdtbrTL+qtVSwhsRI&?t{WS2xd~6g($r6R zfZZtHsAOs^6qW1SzfOlo0K4rxQl(gnY5i4jsXkV3lVb!rDpIB@|XsnkLh_!tjMb__wEy$}un zp#G0!Q(q?A!o>y4bymEFT8A)G;P&|w+QxF#NskSs$!n2(ZEVHt${CH0+>W$~k_`dQ zdTFxT-*#H}{9kv-#Pl*}8Yme=zQhOSge5=@>3tlt^??w-PsY1LzrR8e$ViQPZLUTW zFw%YFVW_G=Q3p?3Z6X0z4q>BG83T1u5s~HHFE^eADs$iAL*dwkItahVsltC(iRL^O z*czN0eDEe1_4$D8~3Qx)hXS`SAXCwDA~f4RAjv|X55 z@z7r}tx2PVi%VlV@{|5LQK$QH>oWM}lk+AVBvEADs+-fz!|eAw=4S4L1FQ0OF@b`r zLAlVHb#?BC=1yH?hV=P<5lt_=fmK;#f(L1wIY2kHfe?FWU%X)iZ}#Z;Bh@$i6ka-P zHX(kAFX3CWvN_*&tUp2mXZ4w1@2>A~IGdoe+NT7je8r?5$Ny111#bI?4j(Qi>ANi>G7Hfc)kmvg$4q5)P_9LBdmi#SeX(hF?CwH5@{iClpOEAQ9 z+@ipx0S>W!92R8FmFgZdWA|+ojBpvhx%Tu58t5B+|51t(zSX_@QY+l%j$RQI2_jT)Bqb??8FtqRD^SUZbg0pEO}%}-ROUzNJJZt^_=qxqI2YQp2YNH{cxd=! zdf8T#;B-;Nyt-7Ly#8H`$g7M4WKD8oqFATV6ut|Y4qhnpZ8_qH46oPPiDPx*vJh8G z$HP|zZj4S9dmA1QTNLLnPsS z>lhP#!~Z8EQT<0KaCd}*4IFErrc@31q%bwM60%Q6y-Cszk4jQq6SfE+7Jm`JZe!AA zzdr|YBR=|?9IDaEK|=i}(6C|HXVwm#=;pDf9<>)5+L=kvD!qYA^d_=9tsie^!(wy| zPno~RQbkf8$B>J}sr;_lZ-hF(ktD`G7+icv6R20@@TvYLgeGh^+Ab1i-cAq(p+ZiX zxzN}JE2}`*vT4!Vp1wk1X+bxmy`W?Yt~_;%QBqGOJrIeXh$pmt-l40to;}0)y-JOX z?n$@I#;P~7o@6I}Nr48Tq}bKym1r_1+A{qWyz=VtG-Y!rEk(Rp=M zJCVSf?UH~itX#Pr@a6aMtt6=O{ghq?#$cUk2sxf`hkJ^s)l+LgGgv$Vziyd+SCQop zPLjX?sBw6uJ-3e{OijVgy@l-@_?f)0i*DXQB1*25+s$I_K!a5RY^AsOo)Qk$DGvvf?3 z3z}Py=2<_lkm&-o#f2laOhli;4HmtKGeA=%U=V9;Ekq!c7CCE#Y}ggqFNJi6_@B2g z*gUE>)HI+Fhe{!(*z-{wHL5!1rr?nFB}XcJ2J3R?w_8b%2-h>CE%k=`SY~-_E!{-E~F=_>-QV zTYPIWYnTlfc_ZrIGW+_`*SNrz7=P}Q{*H&B{WxM9UD@7gE6~KoexP~{D|^_A2jhkQ zjCJPh4&|Je=C^#f`P<@P3Cfk^d6w0;_68Z_^@IKQd$Ya6kk*q#ENQiz=3?MGdZYMi z_peyTXc*UgwAtX-1s!-gQfjymi;hrMv#Oo=_OSREFB|dh1AX$KTJU0N&6=uO*sAPg z!(@#2*V<;jsu^p_0x??=pK=jnqd2A2fYf8Vf#kCT8;#!Z4eHy~?M^9U+l(vpw#%H? zu0wI^HXn=Y;%66DF;qejxr(6!eajz>Hd7vF0cQ)}Xh5*`3>8&mB6zLwGc$AV5Cj+QoV zFef{#3;RL6CH|CXBD~7ECFGjOOvo5F zBjMm-f7&duk-z^DoeXojcvd3)loI5_*-1z~OLSo6TH^}pQ%y6)w;W}9}zuBQN)RtV2H1@5StR+|~ zDn-0lE+l1X*osf8I2jr<4Fy8R#Mo@q>h7;@JIJJ->bX;^Pa;hECEv63=o zH0APQ39Qqmy1!HzJ*E7%!+>LQ2%fH1ULcMg* zjWcX>{k88Kv_`maHy3Yx`I7^D$Kfg>ATrBHlQO>PZm{2!nA-uJbWJ8Jr*8oMQ59@a zCN9hL-{(vnt{!Q3QiAx1dS*B{7e47!wgR`-oUgZ41JejR2TU@Z5i0@MTs&mhywHou z#R8=NFCo}EJGtvwSUP-fz?-zemagVgmYaN9r2OTM6!ul3||cJ|MtXQgK$*QbqKJ6pSSf!nrWbSjhRaLQ!AY1!#JDs42c2scOs5OOIc zOZ|Bu7OezDV?Lo1NR5bRfljH3d#mW|HDKwNHn94vi*^&02V4kVCq3@S@Hf`Qj2Hb^oIKD}5 zUOfMHkm!M!ru<9098qZP!YOfE>&!f}PRNAB$mibAOgub#YUi(@umDZMPTjWf?DDw1{5&D52mTmO5fNvFOIxyFsJ)Nz z+24EYzx@#0#@!v`u8#tBiz)?D>8iTi-d;()#<&H2p{ZLXure+lmXK4H%CT(ylGkc2p=!O9 z^#cdf&Q*4()|{K(K=m&9P)FBi{}1P8liy1@O^nv>1(+;>!W@6U#w(b!Ch{2RU>ad_ zJ%o!5-0XWff7XxHPw4jJCp`ww>Rj#-1J`h(xc8|Cda5PL6i>M;OeSmXvXp4#RBjZw zL|G$jn~gp~-hFs3^bvgTvjejH>FvH&NMV(k!Da#C{q~2v?BxD7Jlhx5hHo^Y&$Hpr zScGxOR@#O&H-c#rQY`wL;+-uiPHpeBEDcW(i!Y1Vysr1-HARtD77%|?WqQRFK!Lgj z8cwyLQkJ*WSNEQU-QfDVAYIao+6{V^*+eKYxj;|Cbdmic)0A)}!JgoH;*Q`0x0UQ* zP6diWnj6O}_s})QxHb+HMx-B-bc3W4&bETDQhG&m9tE(v$DNa(jZNK%QA!C^(MsZF z@`^@zRuXl|6L}vobWf=^PG`{LSDH4rG07U`0qwceoA6ZZCgc#x#bl|Cv30 zg*dFvp5EA_b|Iv~D3cj39vDNcOL}3!__h_w!Qk00hOtA2m+{{k$b>udl3 literal 0 HcmV?d00001 diff --git a/docs-core/src/test/resources/file/document.odt b/docs-core/src/test/resources/file/document.odt new file mode 100644 index 0000000000000000000000000000000000000000..b7062de00dbdf5945ca643521e6101130c854764 GIT binary patch literal 9267 zcmeHNWmr^O+a5u>l@ckXL0USbyQPtqu7Mf4LsCY%hLY|M=@Jl-8bBJPB^3c__=fYo z9zEyPbG_I1>-+INbM2Wmu%COab+6gae%5nu6?w$l_y7PZ0H9PJB5lyafk_7d0B#QW zp8$4Nc4lCApqVicXlrF^47PHxXLYfE&f;JUvI4O<0L|>5JD57#nc0I`z-De>mA`<& z{r@ylcuU;D-rUN<+361$5C<#J$-%d$?EJqNCD_5i_QxnuQBnV(_)kjUjT@T(&Wf?AshO=AoFoS)R#RsuryFkF z>}(MMD)J~dO9DW+d4t>UDWib@_wTS)cE%QFAXYIeu$?gw^bZXDif?mU2V=0=KdsWQ zc)OWD(AeJ0_TRnziUD-^P(fgKTQktjBzN@;U(Isg@oiH!E_maaZP)kBW7m8v1VIvm zTIgA<=bQY11bQfu9KMWo%TWI3D0~{ZgytD%ZZyJOadeRRyCsJ!wbjHadb<*_AC@+c2)q_ldywFAl#^IJl+2l5FpiC*JELh}n7@n(7B+sZVetihH=| z^r&!dotMg~c$t}1W-q8t`1r#kGwJuwO}nwGSxesqk}#FH>s7h&_?UI=$kQQE^rG7s zw2u=hpK33m^r`f#zOP1Lni*a8AH5jU2dWfJ4Vm|@IC?x&XqX!A95mHWUB#8Ty%m4RVy^LrPGm%+$PUgjF^Z4|qB$LF^!mk|lGy}Q`z z2b*Mipsvv;RrfSQZb9dU77#}-4zBc9Fdp7-N?5lOCBV>qxsbQ^ZH;fEwn(=fxj>aC zWjK-Ze0D|H;Q15uqR=l>D-k>F=8loP!)GFhtw3bb&^a3(=Q%M458(QQLR0{T22%_~ zS;bFDJ!DeU%$B$L@kHgGWXxez0Dr{2AQE2w#{AV2@sa-2M68NU;`W@^y%LzQ8Tl;e zK?lj}gh-13~YrqHS40BUZ1@`-hY*x_ds8W@`>iQnP zesz|IyLzJQ?ztY5J5h7*DS0(pk_bBJ(LFOdX*Bc1wMA8#YUJiVlqtIz&e1kt$J#4> zDFG4%-B_DjHFSJzBJa@Cmzuj(RS3k@ntB6BK6?Z>@g`=25nJ(!`l#Z+l1wmyQH*7I9Dt0~>sm3u|$W~K)<8m~ulBvE@R}I#srS7IF zK2ziY<-!a;E{m+R?`yWH-eOk@${NYovXh}Xp+g`;SQ&}AsL)<0`NEO(DF}tnuQH_y z!fl1g9`|(@FHfvlg|-z0Is7aqK@RMEg<2CY@j9f*NLc@^4(tB-14c_W+(XmbeF(iH zSXqcfhn$mqa@A7xM9H6z@|HoG^hTx@LZ-XP)dj?DGU<_B_yq!f~ibv+Wqz%lW4JC3=m%&KEpdga;*`X%j z63B>6&GStRlMXG@4$RIzY$;`PB+c)vx?ah+8^mwX&SJc1)Y99s;LapR|JJ9B$iK&g zfcwETBm$w`c6r}n<1#&U@|bxUrZ%~ByN_)!TG)lZJYpy*T#cOnk(NZT5P{oiG~S}s z4hn>fh#$8mQB((zX#6H>=0&+aF`CcLBW$X)s0#@hfBeO}&Lf7n;CDiG6>UZn-68-Q zZR!pqb6}6Dwwwx1R(@FS!Kn`o?|y(Ake|5V3FNhph7(zVaF;ak31`Z6!%APFa%r!V zb}$k(v0upum)E^yl7*L_->99bW336jeB76wWdd|N@eR|d_xK>i*8PO1o1&UZxUaWr zP=t}arEP$kb@9wK_CpogN_8`6AuOZxnWJzyl`BjU;cq!rv3$!Ae+BA^g*ULlX#cg~-um61g zn)DS+;1gzg^(6*F@Fs(Fkn_O3Y`h_2$bQeL!1X<7^%PN!kgdi?I@(FbK!ez4gGfP_ zg7!l!Rf_CGORed{A$RvgOwD_1Zi5Se7lB>-mQ=d$8LeM^sg4vZ>;4KZ*o-I4Zdv8w zy({b`O7YsfHA497i=ER;i5XwHG6l&5LtL4IXs_ob?C=mhh5cAHci`KnwbbT0+n)93 zrr6f4^y+tYHJp#bbl;wFzI)dw6|Y5;0eGIj;w3kUn*Tz~uw5{?UI@m~h$0;!Yrkh( z^7dN3(9%P!aXW}d4W=aVF%OtQfy0SsNgIUat_5l`dEfQ>Tuf1V!|CT zail1t%j}YC72o4?nI|J16`L~>XL0&rF2mTuVe;0(XJR;Z?R0zSn4kS@t*d=TMtJN( za#Ln8^HSM;sL^K9cH9k@^gl8i7z(~Qs#Pv=m{YBvxO$iYvz{V*-O=DWCPLN;qK1tY zi}9xpbwLesyyea<)$t5SH@lg{`E_vsWToeh+$*fu2Vdhmn!^(id3oZnM-sj)%aj}3 zgKZz}0$^CVfi*PEyfB_}^i4~C!^1@J44Bx=4a5-~qH&isMEKul)R$v!YWSvnLnTYq zGNh4pFu3=Adk}XBD;Mz(fTar!(Uzp4#LJ_R{F+RcAe&74Sg#9ES!j@{Wjf4TM+j7a zR>0q%pUQOEG>mUtYkE$2NM0TDNsn6K)9qXofA_oV15?FxmzD~{8w+#WmypS*)xM~J z8beW@7O9-kOo!x-2+s3>4^2Jyr1Dd8L`7%ovPK07*BzkXgAE~J?rJUv$dM4vds)*} z)V;BO@{&kpqxlcL{>PGp$U+KfA~jW}-suK(4-KsC!rQ}b*%{W_QDI=@-37rwHL*xr@5=JT z66U#fFMPc?WM7XGa8%59V7e~;5i4DJ+-KJ6Eta66@ELI}5FlSWVTOqOFYal`<*LMMw>F*O>HOL{&kD z`eRcDntmC}+D#_@CYFZWS^k-+Iwpz~vVH5o+ziG>S9?Z2!-N;iJean_)mijzDF=ee z%?~>!cpYUhdukx4XylP%R!6H|Bz!~5Xy#g~;O)#P_hU+-h7k*&xC6B z9<08+Avc*iqUyl6b>;e*!jTi7bH{yRrrMbo)IG+4aGM7jO&GpAPoyPJ)O-sLmnivE zUZ0-c?w6zkKKfGF596!B7?x5UEU8hiysN#%J0VoS^Iy9%f)V2gnjeQ+J*U z%UWux1gw}qmo|$>VqeMu&jh&`Oqv-c#|fbNqB0vq6k*lLdqr)#vRB^hZm6xpOTKO> zb+oRB4~KV-PPI46E0n$(!N_d*L*EE9)ni-|(yFFwpHptF69HYzzwuBvSO?bzAN8KM zXGUt*+K!%#j?MY&Aj-B%#ASQ}#$nqIzu|EQjNvlR*JifAl7gN^*kXF6C4^TI_xcQE z_{d#vA)k9N)O9Fc)be=Pq7mfKV;GQgK)Pv#a-f2C#1AA^6;tq1myqwI+D@KcaTbp> zZ3L!Y79APhEWZPMpV0?j{!jn7{5Mw*$P5g&vbXqS%TueX2b$r>^?B0vu**|U2KLxE z28_wT%#4OfJDA#v=6NgQo$f1Yl#^rqzTkDBY=NB+`$<)Ci}!iy>h^F%5=D!jI$k(b zccn{q7gfT~m^zQAwpVrIr0U!0w+x$9g)BhI17s^qF$$)U6mnJR!$y_Coo|ZF@mXOC zl0^*Km_>zww6v6(6fWvs}E+j@cG z47uf-3ZwhiLt(*>qP%xNoecH$OTMv7YAHLi>Ac+H~a!kjI{F^OJ1lC)`)8?6cj}5EQzU{jIvyc&Ct8S zU`hGqXB|Sp70PEpj?NV)D@ioH(Wu6^mg=fq=#soc&Cp<_e3Pz-z z4#gWkLBp_=O#j9e3w6^JmEHqB*S)WZL(It+S%;ey?&XGWy39PRsVL_d&n;g(ITq#@ zVo<~$p>VWkUHQ?Sk)42B%-f{fHD-`Zr%zxTVsBm!i0|+$nkJnlKGHd1^ugX+;|zwx!Etogt0KXidDODt+gEFz=F* zNZ*Hg<*2SD0f_#p7KBni%fnot=Xn)Pzo&WMy-22B^%@BnmLUF^6+>*BIygH}tch(K zn?mI&|F(L7mG|YoZxJhU+yo1UsQ-fkN?DBP34*z{Q51Rl?IQB~T_mn^hadsVL%O>r zTM*)|L?p8`qh^#xU@>BPdT;VFt5ig$vqJ7B6(P4zBZqvZcp{xjlH!He#cQ&TWkx{> zWcB_al-YQjSWW5&j;2prcT{u}!lsZ;|WwrZ9Y?u>u0sQv(E=7bl6oHeIKY;XZq-T@&u65jTkceH=`qkxCfQadu zFBjzjGIiqIHYGZ)V3Uc0CgMb+Txe@!jDtt`v;}bOH8-DhRs{P)5DxR@@TNZ%P4F%b zJ&a9UseO+T|8tsqT+Ol`{vBtwx;Gr{YF-x6Lm0-qTMfF`xbKg(D|jO126!lx-#n_r z){EmG9+YQ%V9&z7;I8mAp7l|@+Zd7WslT7l$8ra6QLV*eyQN)+n>z|~wv`kyG60Z9 z|2?7jUZ>5o;b6?V4FKF6Hw8O&OJ_S1dt)nG5G(k5lLcsR5vHOfjfGBfQ)R=Fm61?` zf7Sv3w*aW{dlaDavUd{zfEN}O)Fo}4T;b)>m+s)mAn)+dpuot)7ZJ&?V$Q)@R!<>t2a2@2e8^N)P3;ZO*}iU* z25%BQ?rSJc^ng~oX@pm6ho$_mGsEl^Vv`f*wsh=M>L6J!K8);MT$79DlM_9q42)bK zVIk$S>AB?|*s+mW^_=}UQ*HTk=Dz-k()!Ewi!K|h{Hm6URu47mBfHXx=H|~>vysQP zrv;#qLF9R_`grW(mPZhQBWTm(-N)Fyr7ig;3(h`kJZew>@7u7#6PBK`t;^t(#v9^g zC6y$~#Eb&|rDFVQpNRoK%hbUh{BHpyS4-YumKV46SPPZ0f+yK=4k3)&g*5^$4Ty~9 zwC_c~rcc%uR*$LL@m+jCGa6^wFmv`$FZJ_%>7FS1F)J>}PFiJ~l~owiCTgq1>cteN^!vy6 zewzC{QA+Ijv3ir7w3>N-w&8gQndAQUMVb(=IT1-cs1F-MDRMJ6MSkIadOUhFi?!`^ z7=h=|s_!<~#x9L&ylyv3pE<5<Q zboJ2oNj0kp&2mjG^Qdw)S-VK>6roKhL(k(E)5=_!n}tGg3Orwp>`J;7C+G49#6|!E_|V9; zmQ%+7^%8#XJ`Lqs$#`v`a*Do@Oyl)aWG_6+$nUkN%0-~)*qQwJQr5Im&_k+;QX&12 z3Ra^zgWMZ;aGzFUn|rFRHb6PCDHeM~j;LFQKB-zv{GiZ!zr z>;!yr4KG8g-!y-N2)^vftWfFHr(|M(ts{!Y0}Uxl3|Y(Zbag2oqkF{kKG6z6(bEsL z2PcqPGV$~kCUOLh@>fi3`eFJr#L4~(ck~_2vFAB+cvazY0FhXjoDrYc`1?DSD*`3S z!;9C(qzUE;c79Of6su(R$)pdSuCQGqo+rfe-5O1tS@Ej5idWqk3{dcUG<%3?S>0}O z*oIU(I|FLjJeeYaTpQ#*MzY1dzUFil(JQ{Hqiv3ziTgUHkxY@>yOe%NuA;}RI=*5_ zYDga>`RQHE(kDjf3hG{_P=hZ?(aA}OUpC;-Fnor|+~Mw5V%zhJZs}cV71CsN)98NC z%&W(Zb7f%v=@qiDaDe9m3$0I~QQrDVRAEo&T7o%g84}xi=AgTy>nGM!o*oyCb=zjS z=>gpS?V9$*;`oaRZ)vxBWzW(*dPhQ$JzMhlyobiMRuXFTWnhyeMUh8DEp=zaZ7dv? z_>Uj&KX`MBc9WzyTZ$vc!P5y@?C(j+uP$Z+_{FR!sV>Sat0cwxXY~kP0x4COx9(!Y zo!`Khnl&jLEJ78~!S@I_1QZFn+=jiA$&Q+9b!xAacMP?1Iq?2G$fj2NL_jZb`7nwy|KM;EYs$+!sq|%O+UkaO?~= zL^^0tQUcP_ey}|GGL^#eg2{FcJ(o-ST^Wv8J}&aO;AKG0wWAQPg#xe4$3CEtw6-+) zrpbeaWV5L>Vpy?JFENhddAnf4mYEPrlt<$iREg6xL0_s6Q>Vp(myrQdvBy))QN;QD zxZ*wp@~yWUeW9SE_>%*a9iKlWUlM8LS~7T9K^UIusmR|#zz6(ZuD=PaKkU1B|97t+ zX8-^a{qIq3bmSk_0&o7ucz#mx|681+ zU*P<#>i-_+Cb|A$@OtiljOT~S|1XrEfBoO3`Wv19hnfBYC@B$1?)oDf!Ox&sEo( z;@S@*hJXDt)W8eXfA{>kVE4z96@R*FVZlBBU8DWG*Uy`>zxJnz^VibechBE#+Wzbg zPyi2)zuUrnNBKVApX2RKiuA+$;Pn5LF#X-}=Rk53j(?aU`EP==iaZj0`w0Nx!Cydl K{3-q8H}^k%IhZK` literal 0 HcmV?d00001 diff --git a/docs-parent/pom.xml b/docs-parent/pom.xml index 18d616e0..40da3594 100644 --- a/docs-parent/pom.xml +++ b/docs-parent/pom.xml @@ -36,6 +36,7 @@ 4.1.0.Final 3.1.0 1.6.3 + 1.0.5 9.2.13.v20150730 9.2.13.v20150730 @@ -367,6 +368,18 @@ ${org.bouncycastle.bcprov-jdk15on.version} + + fr.opensagres.xdocreport + org.odftoolkit.odfdom.converter.pdf + ${fr.opensagres.xdocreport.version} + + + + fr.opensagres.xdocreport + org.apache.poi.xwpf.converter.pdf + ${fr.opensagres.xdocreport.version} + + com.levigo.jbig2 From 7708f61343562ba26cdc341083d94963329a5b1b Mon Sep 17 00:00:00 2001 From: jendib Date: Fri, 11 Dec 2015 22:00:44 +0100 Subject: [PATCH 3/4] Closes #53: Build thumbnails for DOCX and ODT files --- README.md | 2 +- .../core/event/FileCreatedAsyncEvent.java | 41 ++---- .../async/FileCreatedAsyncListener.java | 8 +- .../com/sismics/docs/core/util/FileUtil.java | 135 ++++++++---------- .../sismics/docs/core/util/TestFileUtil.java | 16 ++- .../docs/rest/resource/FileResource.java | 9 +- .../src/partial/docs/document.edit.html | 3 +- .../docs/rest/TestDocumentResource.java | 132 ++++++++++++++++- .../src/test/resources/file/document.docx | Bin 0 -> 4827 bytes docs-web/src/test/resources/file/document.odt | Bin 0 -> 9267 bytes 10 files changed, 228 insertions(+), 118 deletions(-) create mode 100644 docs-web/src/test/resources/file/document.docx create mode 100644 docs-web/src/test/resources/file/document.odt diff --git a/README.md b/README.md index 95f20e5f..6383b142 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Features - Responsive user interface - Optical character recognition -- Support image and PDF files +- Support image, PDF, ODT and DOCX files - Flexible search engine - Full text search in image and PDF - 256-bit AES encryption diff --git a/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java b/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java index 41a128f3..8eab7115 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java +++ b/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java @@ -28,58 +28,43 @@ public class FileCreatedAsyncEvent { private InputStream inputStream; /** - * Getter of file. - * - * @return the file + * Unencrypted input stream containing a PDF representation + * of the file. May be null if the PDF conversion is not + * necessary or not possible. */ + private InputStream pdfInputStream; + public File getFile() { return file; } - /** - * Setter of file. - * - * @param file file - */ public void setFile(File file) { this.file = file; } - /** - * Getter of document. - * - * @return the document - */ public Document getDocument() { return document; } - /** - * Setter of document. - * - * @param document document - */ public void setDocument(Document document) { this.document = document; } - /** - * Getter of inputStream. - * - * @return the inputStream - */ public InputStream getInputStream() { return inputStream; } - /** - * Setter de inputStream. - * - * @param inputStream inputStream - */ public void setInputStream(InputStream inputStream) { this.inputStream = inputStream; } + + public InputStream getPdfInputStream() { + return pdfInputStream; + } + + public void setPdfInputStream(InputStream pdfInputStream) { + this.pdfInputStream = pdfInputStream; + } @Override public String toString() { diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java index bda947ab..0eacdeca 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java @@ -12,7 +12,6 @@ import com.sismics.docs.core.event.FileCreatedAsyncEvent; import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.util.FileUtil; import com.sismics.docs.core.util.TransactionUtil; -import com.sismics.util.mime.MimeTypeUtil; /** * Listener on file created. @@ -39,12 +38,15 @@ public class FileCreatedAsyncListener { // Guess the mime type a second time, for open document format (first detected as simple ZIP file) final File file = fileCreatedAsyncEvent.getFile(); - file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileCreatedAsyncEvent.getInputStream())); // Extract text content from the file long startTime = System.currentTimeMillis(); - final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, fileCreatedAsyncEvent.getInputStream()); + final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, + fileCreatedAsyncEvent.getInputStream(), fileCreatedAsyncEvent.getPdfInputStream()); fileCreatedAsyncEvent.getInputStream().close(); + if (fileCreatedAsyncEvent.getPdfInputStream() != null) { + fileCreatedAsyncEvent.getPdfInputStream().close(); + } log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime)); // Store the text content in the database diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index 951590c6..59b30bec 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -1,6 +1,8 @@ package com.sismics.docs.core.util; import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -48,19 +50,16 @@ public class FileUtil { * @param document Document linked to the file * @param file File to extract * @param inputStream Unencrypted input stream + * @param pdfInputStream Unencrypted PDF input stream * @return Content extract */ - public static String extractContent(Document document, File file, InputStream inputStream) { + public static String extractContent(Document document, File file, InputStream inputStream, InputStream pdfInputStream) { String content = null; if (ImageUtil.isImage(file.getMimeType())) { content = ocrFile(inputStream, document); - } else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { - content = extractPdf(inputStream); - } else if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { - content = extractOpenDocumentText(inputStream); - } else if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) { - content = extractOfficeDocument(inputStream); + } else if (pdfInputStream != null) { + content = extractPdf(pdfInputStream); } return content; @@ -129,92 +128,80 @@ public class FileUtil { } /** - * Extract text from an open document text file. + * Convert a file to PDF if necessary. * - * @param inputStream Unencrypted input stream - * @return Content extracted + * @param inputStream InputStream + * @param file File + * @return PDF input stream + * @throws Exception */ - private static String extractOpenDocumentText(InputStream inputStream) { - String content = null; - Path tempFile = null; - try { - // Convert the ODT file to a temporary PDF file - tempFile = Files.createTempFile("sismicsdocs_", ".pdf"); - try (OutputStream out = Files.newOutputStream(tempFile)) { - OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); - PdfOptions options = PdfOptions.create(); - PdfConverter.getInstance().convert(document, out, options); - } - - // Extract content from the PDF file - try (InputStream pdfInputStream = Files.newInputStream(tempFile)) { - content = extractPdf(pdfInputStream); - } - - } catch (Exception e) { - log.error("Error while extracting text from the ODT", e); - } finally { - try { - Files.delete(tempFile); // Delete the temporary PDF file - } catch (IOException e) { - // Should not happen - } + public static InputStream convertToPdf(InputStream inputStream, File file) throws Exception { + if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { + // It's already PDF, just return the input + return inputStream; } - return content; + + if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) { + return convertOfficeDocument(inputStream); + } + + if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { + return convertOpenDocumentText(inputStream); + } + + // PDF conversion not necessary/possible + return null; } /** - * Extract text from an Office document. + * Convert an open document text file to PDF. * * @param inputStream Unencrypted input stream - * @return Content extracted + * @return PDF input stream + * @throws Exception */ - private static String extractOfficeDocument(InputStream inputStream) { - String content = null; - Path tempFile = null; - try { - // Convert the DOCX file to a temporary PDF file - tempFile = Files.createTempFile("sismicsdocs_", ".pdf"); - try (OutputStream out = Files.newOutputStream(tempFile)) { - XWPFDocument document = new XWPFDocument(inputStream); - org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); - org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, out, options); - } - - // Extract content from the PDF file - try (InputStream pdfInputStream = Files.newInputStream(tempFile)) { - content = extractPdf(pdfInputStream); - } - - } catch (Exception e) { - log.error("Error while extracting text from the DOCX", e); - } finally { - try { - Files.delete(tempFile); // Delete the temporary PDF file - } catch (IOException e) { - // Should not happen - } - } - return content; + private static InputStream convertOpenDocumentText(InputStream inputStream) throws Exception { + ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); + OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); + PdfOptions options = PdfOptions.create(); + PdfConverter.getInstance().convert(document, pdfOutputStream, options); + inputStream.reset(); + return new ByteArrayInputStream(pdfOutputStream.toByteArray()); + } + + /** + * Convert an Office document to PDF. + * + * @param inputStream Unencrypted input stream + * @return PDF input stream + * @throws Exception + */ + private static InputStream convertOfficeDocument(InputStream inputStream) throws Exception { + ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); + XWPFDocument document = new XWPFDocument(inputStream); + org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); + org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, pdfOutputStream, options); + inputStream.reset(); + return new ByteArrayInputStream(pdfOutputStream.toByteArray()); } /** * Save a file on the storage filesystem. * * @param inputStream Unencrypted input stream + * @param pdf * @param file File to save * @param privateKey Private key used for encryption * @throws Exception */ - public static void save(InputStream inputStream, File file, String privateKey) throws Exception { + public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception { Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey); Path path = DirectoryUtil.getStorageDirectory().resolve(file.getId()); Files.copy(new CipherInputStream(inputStream, cipher), path); + inputStream.reset(); // Generate file variations - inputStream.reset(); - saveVariations(file, inputStream, cipher); - inputStream.reset(); + saveVariations(file, inputStream, pdfInputStream, cipher); } /** @@ -222,25 +209,27 @@ public class FileUtil { * * @param file File from database * @param inputStream Unencrypted input stream + * @param pdfInputStream Unencrypted PDF input stream * @param cipher Cipher to use for encryption * @throws Exception */ - public static void saveVariations(File file, InputStream inputStream, Cipher cipher) throws Exception { + public static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception { BufferedImage image = null; if (ImageUtil.isImage(file.getMimeType())) { image = ImageIO.read(inputStream); - } else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) { + inputStream.reset(); + } else if(pdfInputStream != null) { // Generate preview from the first page of the PDF PDDocument pdfDocument = null; try { - pdfDocument = PDDocument.load(inputStream); + pdfDocument = PDDocument.load(pdfInputStream); PDFRenderer renderer = new PDFRenderer(pdfDocument); image = renderer.renderImage(0); + pdfInputStream.reset(); } finally { pdfDocument.close(); } } - // TODO Generate thumbnails for DOCX/ODT documents (guess the MIME type earlier and build a PDF version now?) if (image != null) { // Generate thumbnails from image diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java index c9ecdeeb..2bb2dd38 100644 --- a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java +++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java @@ -1,9 +1,11 @@ package com.sismics.docs.core.util; +import java.io.ByteArrayInputStream; import java.io.InputStream; import junit.framework.Assert; +import org.apache.pdfbox.io.IOUtils; import org.junit.Test; import com.google.common.io.Resources; @@ -18,19 +20,25 @@ import com.sismics.util.mime.MimeType; public class TestFileUtil { @Test public void extractContentOpenDocumentTextTest() throws Exception { - try (InputStream inputStream = Resources.getResource("file/document.odt").openStream()) { + try (InputStream inputStream = Resources.getResource("file/document.odt").openStream(); + InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { File file = new File(); file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT); - Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream)); + try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) { + Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream, pdfInputStream)); + } } } @Test public void extractContentOfficeDocumentTest() throws Exception { - try (InputStream inputStream = Resources.getResource("file/document.docx").openStream()) { + try (InputStream inputStream = Resources.getResource("file/document.docx").openStream(); + InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { File file = new File(); file.setMimeType(MimeType.OFFICE_DOCUMENT); - Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream)); + try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) { + Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream, pdfInputStream)); + } } } } diff --git a/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java b/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java index 69052f27..4ddc0621 100644 --- a/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java +++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java @@ -146,8 +146,14 @@ public class FileResource extends BaseResource { file.setUserId(principal.getId()); String fileId = fileDao.create(file); + // Guess the mime type a second time, for open document format (first detected as simple ZIP file) + file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileInputStream)); + + // Convert to PDF if necessary (for thumbnail and text extraction) + InputStream pdfIntputStream = FileUtil.convertToPdf(fileInputStream, file); + // Save the file - FileUtil.save(fileInputStream, file, user.getPrivateKey()); + FileUtil.save(fileInputStream, pdfIntputStream, file, user.getPrivateKey()); // Update the user quota user.setStorageCurrent(user.getStorageCurrent() + fileData.length); @@ -159,6 +165,7 @@ public class FileResource extends BaseResource { fileCreatedAsyncEvent.setDocument(document); fileCreatedAsyncEvent.setFile(file); fileCreatedAsyncEvent.setInputStream(fileInputStream); + fileCreatedAsyncEvent.setPdfInputStream(pdfIntputStream); AppContext.getInstance().getAsyncEventBus().post(fileCreatedAsyncEvent); } diff --git a/docs-web/src/main/webapp/src/partial/docs/document.edit.html b/docs-web/src/main/webapp/src/partial/docs/document.edit.html index 717fc3a5..736f1136 100644 --- a/docs-web/src/main/webapp/src/partial/docs/document.edit.html +++ b/docs-web/src/main/webapp/src/partial/docs/document.edit.html @@ -39,7 +39,8 @@
+ accept="image/png,image/jpg,image/jpeg,image/gif,application/pdf,application/vnd.oasis.opendocument.text,application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ng-disabled="fileIsUploading">
+ {{ orphanFiles.length }} file{{ orphanFiles.length > 1 ? 's' : '' }} diff --git a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java index 3f1433fb..00ca2853 100644 --- a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java +++ b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java @@ -267,6 +267,124 @@ public class TestDocumentResource extends BaseJerseyTest { return json.getJsonArray("documents").size(); } + /** + * Test ODT extraction. + * + * @throws Exception + */ + @Test + public void testOdtExtraction() throws Exception { + // Login document_odt + clientUtil.createUser("document_odt"); + String documentOdtToken = clientUtil.login("document_odt"); + + // Create a document + long create1Date = new Date().getTime(); + JsonObject json = target().path("/document").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .put(Entity.form(new Form() + .param("title", "My super title document 1") + .param("description", "My super description for document 1") + .param("language", "eng") + .param("create_date", Long.toString(create1Date))), JsonObject.class); + String document1Id = json.getString("id"); + Assert.assertNotNull(document1Id); + + // Add a PDF file + String file1Id = null; + try (InputStream is = Resources.getResource("file/document.odt").openStream()) { + StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.odt"); + try (FormDataMultiPart multiPart = new FormDataMultiPart()) { + json = target() + .register(MultiPartFeature.class) + .path("/file").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart), + MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class); + file1Id = json.getString("id"); + Assert.assertNotNull(file1Id); + } + } + + // Search documents by query in full content + json = target().path("/document/list") + .queryParam("search", "full:ipsum") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .get(JsonObject.class); + Assert.assertTrue(json.getJsonArray("documents").size() == 1); + + // Get the file thumbnail data + Response response = target().path("/file/" + file1Id + "/data") + .queryParam("size", "thumb") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .get(); + InputStream is = (InputStream) response.getEntity(); + byte[] fileBytes = ByteStreams.toByteArray(is); + Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues + Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes)); + } + + /** + * Test DOCX extraction. + * + * @throws Exception + */ + @Test + public void testDocxExtraction() throws Exception { + // Login document_docx + clientUtil.createUser("document_docx"); + String documentDocxToken = clientUtil.login("document_docx"); + + // Create a document + long create1Date = new Date().getTime(); + JsonObject json = target().path("/document").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken) + .put(Entity.form(new Form() + .param("title", "My super title document 1") + .param("description", "My super description for document 1") + .param("language", "eng") + .param("create_date", Long.toString(create1Date))), JsonObject.class); + String document1Id = json.getString("id"); + Assert.assertNotNull(document1Id); + + // Add a PDF file + String file1Id = null; + try (InputStream is = Resources.getResource("file/document.docx").openStream()) { + StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.docx"); + try (FormDataMultiPart multiPart = new FormDataMultiPart()) { + json = target() + .register(MultiPartFeature.class) + .path("/file").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken) + .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart), + MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class); + file1Id = json.getString("id"); + Assert.assertNotNull(file1Id); + } + } + + // Search documents by query in full content + json = target().path("/document/list") + .queryParam("search", "full:dolor") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken) + .get(JsonObject.class); + Assert.assertTrue(json.getJsonArray("documents").size() == 1); + + // Get the file thumbnail data + Response response = target().path("/file/" + file1Id + "/data") + .queryParam("size", "thumb") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken) + .get(); + InputStream is = (InputStream) response.getEntity(); + byte[] fileBytes = ByteStreams.toByteArray(is); + Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues + Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes)); + } + /** * Test PDF extraction. * @@ -274,14 +392,14 @@ public class TestDocumentResource extends BaseJerseyTest { */ @Test public void testPdfExtraction() throws Exception { - // Login document2 - clientUtil.createUser("document2"); - String document2Token = clientUtil.login("document2"); + // Login document_pdf + clientUtil.createUser("document_pdf"); + String documentPdfToken = clientUtil.login("document_pdf"); // Create a document long create1Date = new Date().getTime(); JsonObject json = target().path("/document").request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .put(Entity.form(new Form() .param("title", "My super title document 1") .param("description", "My super description for document 1") @@ -298,7 +416,7 @@ public class TestDocumentResource extends BaseJerseyTest { json = target() .register(MultiPartFeature.class) .path("/file").request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart), MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class); file1Id = json.getString("id"); @@ -310,7 +428,7 @@ public class TestDocumentResource extends BaseJerseyTest { json = target().path("/document/list") .queryParam("search", "full:vrandecic") .request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .get(JsonObject.class); Assert.assertTrue(json.getJsonArray("documents").size() == 1); @@ -318,7 +436,7 @@ public class TestDocumentResource extends BaseJerseyTest { Response response = target().path("/file/" + file1Id + "/data") .queryParam("size", "thumb") .request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .get(); InputStream is = (InputStream) response.getEntity(); byte[] fileBytes = ByteStreams.toByteArray(is); diff --git a/docs-web/src/test/resources/file/document.docx b/docs-web/src/test/resources/file/document.docx new file mode 100644 index 0000000000000000000000000000000000000000..fb1e6c2c6783058e041c09cabcec60e2a99fbccb GIT binary patch literal 4827 zcma)A2UJtp+6}!|sZs?33Jd{3K#CeVp$3qSNS`1*v;fkp(vd1K6cOo)H0dprK_Ju! zQlwV_0YUmpeDj{@`)AhwW!pduCGL)>6=@q0U}XY*~JW)v|js3PIn_> zZ-;f*ao{{hGxO?%A|FKqAqQ5MAUa0FCua3_8gl5WahJQJE{L8}@UiB#V# z5)*NK_57NxmQJ^fCD+@+1lQ@B!?b{#Ie&BLOu9)&5xE%4tNYb#THu-viiIUd^Ow8G@ zud&-<=)wHc{=AL}mi<4p)rg2!Q8?TanDxry0sz{7%LonT3TtO8Z8v8ZILOM`%|^)E z(IGjpO|wH7!;6e!vhVYkBM+ZUZUWXxc<|*pouKQUG#Xcd6ZK89u!;I$&$-ppmb=n5 zdtbrTL+qtVSwhsRI&?t{WS2xd~6g($r6R zfZZtHsAOs^6qW1SzfOlo0K4rxQl(gnY5i4jsXkV3lVb!rDpIB@|XsnkLh_!tjMb__wEy$}un zp#G0!Q(q?A!o>y4bymEFT8A)G;P&|w+QxF#NskSs$!n2(ZEVHt${CH0+>W$~k_`dQ zdTFxT-*#H}{9kv-#Pl*}8Yme=zQhOSge5=@>3tlt^??w-PsY1LzrR8e$ViQPZLUTW zFw%YFVW_G=Q3p?3Z6X0z4q>BG83T1u5s~HHFE^eADs$iAL*dwkItahVsltC(iRL^O z*czN0eDEe1_4$D8~3Qx)hXS`SAXCwDA~f4RAjv|X55 z@z7r}tx2PVi%VlV@{|5LQK$QH>oWM}lk+AVBvEADs+-fz!|eAw=4S4L1FQ0OF@b`r zLAlVHb#?BC=1yH?hV=P<5lt_=fmK;#f(L1wIY2kHfe?FWU%X)iZ}#Z;Bh@$i6ka-P zHX(kAFX3CWvN_*&tUp2mXZ4w1@2>A~IGdoe+NT7je8r?5$Ny111#bI?4j(Qi>ANi>G7Hfc)kmvg$4q5)P_9LBdmi#SeX(hF?CwH5@{iClpOEAQ9 z+@ipx0S>W!92R8FmFgZdWA|+ojBpvhx%Tu58t5B+|51t(zSX_@QY+l%j$RQI2_jT)Bqb??8FtqRD^SUZbg0pEO}%}-ROUzNJJZt^_=qxqI2YQp2YNH{cxd=! zdf8T#;B-;Nyt-7Ly#8H`$g7M4WKD8oqFATV6ut|Y4qhnpZ8_qH46oPPiDPx*vJh8G z$HP|zZj4S9dmA1QTNLLnPsS z>lhP#!~Z8EQT<0KaCd}*4IFErrc@31q%bwM60%Q6y-Cszk4jQq6SfE+7Jm`JZe!AA zzdr|YBR=|?9IDaEK|=i}(6C|HXVwm#=;pDf9<>)5+L=kvD!qYA^d_=9tsie^!(wy| zPno~RQbkf8$B>J}sr;_lZ-hF(ktD`G7+icv6R20@@TvYLgeGh^+Ab1i-cAq(p+ZiX zxzN}JE2}`*vT4!Vp1wk1X+bxmy`W?Yt~_;%QBqGOJrIeXh$pmt-l40to;}0)y-JOX z?n$@I#;P~7o@6I}Nr48Tq}bKym1r_1+A{qWyz=VtG-Y!rEk(Rp=M zJCVSf?UH~itX#Pr@a6aMtt6=O{ghq?#$cUk2sxf`hkJ^s)l+LgGgv$Vziyd+SCQop zPLjX?sBw6uJ-3e{OijVgy@l-@_?f)0i*DXQB1*25+s$I_K!a5RY^AsOo)Qk$DGvvf?3 z3z}Py=2<_lkm&-o#f2laOhli;4HmtKGeA=%U=V9;Ekq!c7CCE#Y}ggqFNJi6_@B2g z*gUE>)HI+Fhe{!(*z-{wHL5!1rr?nFB}XcJ2J3R?w_8b%2-h>CE%k=`SY~-_E!{-E~F=_>-QV zTYPIWYnTlfc_ZrIGW+_`*SNrz7=P}Q{*H&B{WxM9UD@7gE6~KoexP~{D|^_A2jhkQ zjCJPh4&|Je=C^#f`P<@P3Cfk^d6w0;_68Z_^@IKQd$Ya6kk*q#ENQiz=3?MGdZYMi z_peyTXc*UgwAtX-1s!-gQfjymi;hrMv#Oo=_OSREFB|dh1AX$KTJU0N&6=uO*sAPg z!(@#2*V<;jsu^p_0x??=pK=jnqd2A2fYf8Vf#kCT8;#!Z4eHy~?M^9U+l(vpw#%H? zu0wI^HXn=Y;%66DF;qejxr(6!eajz>Hd7vF0cQ)}Xh5*`3>8&mB6zLwGc$AV5Cj+QoV zFef{#3;RL6CH|CXBD~7ECFGjOOvo5F zBjMm-f7&duk-z^DoeXojcvd3)loI5_*-1z~OLSo6TH^}pQ%y6)w;W}9}zuBQN)RtV2H1@5StR+|~ zDn-0lE+l1X*osf8I2jr<4Fy8R#Mo@q>h7;@JIJJ->bX;^Pa;hECEv63=o zH0APQ39Qqmy1!HzJ*E7%!+>LQ2%fH1ULcMg* zjWcX>{k88Kv_`maHy3Yx`I7^D$Kfg>ATrBHlQO>PZm{2!nA-uJbWJ8Jr*8oMQ59@a zCN9hL-{(vnt{!Q3QiAx1dS*B{7e47!wgR`-oUgZ41JejR2TU@Z5i0@MTs&mhywHou z#R8=NFCo}EJGtvwSUP-fz?-zemagVgmYaN9r2OTM6!ul3||cJ|MtXQgK$*QbqKJ6pSSf!nrWbSjhRaLQ!AY1!#JDs42c2scOs5OOIc zOZ|Bu7OezDV?Lo1NR5bRfljH3d#mW|HDKwNHn94vi*^&02V4kVCq3@S@Hf`Qj2Hb^oIKD}5 zUOfMHkm!M!ru<9098qZP!YOfE>&!f}PRNAB$mibAOgub#YUi(@umDZMPTjWf?DDw1{5&D52mTmO5fNvFOIxyFsJ)Nz z+24EYzx@#0#@!v`u8#tBiz)?D>8iTi-d;()#<&H2p{ZLXure+lmXK4H%CT(ylGkc2p=!O9 z^#cdf&Q*4()|{K(K=m&9P)FBi{}1P8liy1@O^nv>1(+;>!W@6U#w(b!Ch{2RU>ad_ zJ%o!5-0XWff7XxHPw4jJCp`ww>Rj#-1J`h(xc8|Cda5PL6i>M;OeSmXvXp4#RBjZw zL|G$jn~gp~-hFs3^bvgTvjejH>FvH&NMV(k!Da#C{q~2v?BxD7Jlhx5hHo^Y&$Hpr zScGxOR@#O&H-c#rQY`wL;+-uiPHpeBEDcW(i!Y1Vysr1-HARtD77%|?WqQRFK!Lgj z8cwyLQkJ*WSNEQU-QfDVAYIao+6{V^*+eKYxj;|Cbdmic)0A)}!JgoH;*Q`0x0UQ* zP6diWnj6O}_s})QxHb+HMx-B-bc3W4&bETDQhG&m9tE(v$DNa(jZNK%QA!C^(MsZF z@`^@zRuXl|6L}vobWf=^PG`{LSDH4rG07U`0qwceoA6ZZCgc#x#bl|Cv30 zg*dFvp5EA_b|Iv~D3cj39vDNcOL}3!__h_w!Qk00hOtA2m+{{k$b>udl3 literal 0 HcmV?d00001 diff --git a/docs-web/src/test/resources/file/document.odt b/docs-web/src/test/resources/file/document.odt new file mode 100644 index 0000000000000000000000000000000000000000..b7062de00dbdf5945ca643521e6101130c854764 GIT binary patch literal 9267 zcmeHNWmr^O+a5u>l@ckXL0USbyQPtqu7Mf4LsCY%hLY|M=@Jl-8bBJPB^3c__=fYo z9zEyPbG_I1>-+INbM2Wmu%COab+6gae%5nu6?w$l_y7PZ0H9PJB5lyafk_7d0B#QW zp8$4Nc4lCApqVicXlrF^47PHxXLYfE&f;JUvI4O<0L|>5JD57#nc0I`z-De>mA`<& z{r@ylcuU;D-rUN<+361$5C<#J$-%d$?EJqNCD_5i_QxnuQBnV(_)kjUjT@T(&Wf?AshO=AoFoS)R#RsuryFkF z>}(MMD)J~dO9DW+d4t>UDWib@_wTS)cE%QFAXYIeu$?gw^bZXDif?mU2V=0=KdsWQ zc)OWD(AeJ0_TRnziUD-^P(fgKTQktjBzN@;U(Isg@oiH!E_maaZP)kBW7m8v1VIvm zTIgA<=bQY11bQfu9KMWo%TWI3D0~{ZgytD%ZZyJOadeRRyCsJ!wbjHadb<*_AC@+c2)q_ldywFAl#^IJl+2l5FpiC*JELh}n7@n(7B+sZVetihH=| z^r&!dotMg~c$t}1W-q8t`1r#kGwJuwO}nwGSxesqk}#FH>s7h&_?UI=$kQQE^rG7s zw2u=hpK33m^r`f#zOP1Lni*a8AH5jU2dWfJ4Vm|@IC?x&XqX!A95mHWUB#8Ty%m4RVy^LrPGm%+$PUgjF^Z4|qB$LF^!mk|lGy}Q`z z2b*Mipsvv;RrfSQZb9dU77#}-4zBc9Fdp7-N?5lOCBV>qxsbQ^ZH;fEwn(=fxj>aC zWjK-Ze0D|H;Q15uqR=l>D-k>F=8loP!)GFhtw3bb&^a3(=Q%M458(QQLR0{T22%_~ zS;bFDJ!DeU%$B$L@kHgGWXxez0Dr{2AQE2w#{AV2@sa-2M68NU;`W@^y%LzQ8Tl;e zK?lj}gh-13~YrqHS40BUZ1@`-hY*x_ds8W@`>iQnP zesz|IyLzJQ?ztY5J5h7*DS0(pk_bBJ(LFOdX*Bc1wMA8#YUJiVlqtIz&e1kt$J#4> zDFG4%-B_DjHFSJzBJa@Cmzuj(RS3k@ntB6BK6?Z>@g`=25nJ(!`l#Z+l1wmyQH*7I9Dt0~>sm3u|$W~K)<8m~ulBvE@R}I#srS7IF zK2ziY<-!a;E{m+R?`yWH-eOk@${NYovXh}Xp+g`;SQ&}AsL)<0`NEO(DF}tnuQH_y z!fl1g9`|(@FHfvlg|-z0Is7aqK@RMEg<2CY@j9f*NLc@^4(tB-14c_W+(XmbeF(iH zSXqcfhn$mqa@A7xM9H6z@|HoG^hTx@LZ-XP)dj?DGU<_B_yq!f~ibv+Wqz%lW4JC3=m%&KEpdga;*`X%j z63B>6&GStRlMXG@4$RIzY$;`PB+c)vx?ah+8^mwX&SJc1)Y99s;LapR|JJ9B$iK&g zfcwETBm$w`c6r}n<1#&U@|bxUrZ%~ByN_)!TG)lZJYpy*T#cOnk(NZT5P{oiG~S}s z4hn>fh#$8mQB((zX#6H>=0&+aF`CcLBW$X)s0#@hfBeO}&Lf7n;CDiG6>UZn-68-Q zZR!pqb6}6Dwwwx1R(@FS!Kn`o?|y(Ake|5V3FNhph7(zVaF;ak31`Z6!%APFa%r!V zb}$k(v0upum)E^yl7*L_->99bW336jeB76wWdd|N@eR|d_xK>i*8PO1o1&UZxUaWr zP=t}arEP$kb@9wK_CpogN_8`6AuOZxnWJzyl`BjU;cq!rv3$!Ae+BA^g*ULlX#cg~-um61g zn)DS+;1gzg^(6*F@Fs(Fkn_O3Y`h_2$bQeL!1X<7^%PN!kgdi?I@(FbK!ez4gGfP_ zg7!l!Rf_CGORed{A$RvgOwD_1Zi5Se7lB>-mQ=d$8LeM^sg4vZ>;4KZ*o-I4Zdv8w zy({b`O7YsfHA497i=ER;i5XwHG6l&5LtL4IXs_ob?C=mhh5cAHci`KnwbbT0+n)93 zrr6f4^y+tYHJp#bbl;wFzI)dw6|Y5;0eGIj;w3kUn*Tz~uw5{?UI@m~h$0;!Yrkh( z^7dN3(9%P!aXW}d4W=aVF%OtQfy0SsNgIUat_5l`dEfQ>Tuf1V!|CT zail1t%j}YC72o4?nI|J16`L~>XL0&rF2mTuVe;0(XJR;Z?R0zSn4kS@t*d=TMtJN( za#Ln8^HSM;sL^K9cH9k@^gl8i7z(~Qs#Pv=m{YBvxO$iYvz{V*-O=DWCPLN;qK1tY zi}9xpbwLesyyea<)$t5SH@lg{`E_vsWToeh+$*fu2Vdhmn!^(id3oZnM-sj)%aj}3 zgKZz}0$^CVfi*PEyfB_}^i4~C!^1@J44Bx=4a5-~qH&isMEKul)R$v!YWSvnLnTYq zGNh4pFu3=Adk}XBD;Mz(fTar!(Uzp4#LJ_R{F+RcAe&74Sg#9ES!j@{Wjf4TM+j7a zR>0q%pUQOEG>mUtYkE$2NM0TDNsn6K)9qXofA_oV15?FxmzD~{8w+#WmypS*)xM~J z8beW@7O9-kOo!x-2+s3>4^2Jyr1Dd8L`7%ovPK07*BzkXgAE~J?rJUv$dM4vds)*} z)V;BO@{&kpqxlcL{>PGp$U+KfA~jW}-suK(4-KsC!rQ}b*%{W_QDI=@-37rwHL*xr@5=JT z66U#fFMPc?WM7XGa8%59V7e~;5i4DJ+-KJ6Eta66@ELI}5FlSWVTOqOFYal`<*LMMw>F*O>HOL{&kD z`eRcDntmC}+D#_@CYFZWS^k-+Iwpz~vVH5o+ziG>S9?Z2!-N;iJean_)mijzDF=ee z%?~>!cpYUhdukx4XylP%R!6H|Bz!~5Xy#g~;O)#P_hU+-h7k*&xC6B z9<08+Avc*iqUyl6b>;e*!jTi7bH{yRrrMbo)IG+4aGM7jO&GpAPoyPJ)O-sLmnivE zUZ0-c?w6zkKKfGF596!B7?x5UEU8hiysN#%J0VoS^Iy9%f)V2gnjeQ+J*U z%UWux1gw}qmo|$>VqeMu&jh&`Oqv-c#|fbNqB0vq6k*lLdqr)#vRB^hZm6xpOTKO> zb+oRB4~KV-PPI46E0n$(!N_d*L*EE9)ni-|(yFFwpHptF69HYzzwuBvSO?bzAN8KM zXGUt*+K!%#j?MY&Aj-B%#ASQ}#$nqIzu|EQjNvlR*JifAl7gN^*kXF6C4^TI_xcQE z_{d#vA)k9N)O9Fc)be=Pq7mfKV;GQgK)Pv#a-f2C#1AA^6;tq1myqwI+D@KcaTbp> zZ3L!Y79APhEWZPMpV0?j{!jn7{5Mw*$P5g&vbXqS%TueX2b$r>^?B0vu**|U2KLxE z28_wT%#4OfJDA#v=6NgQo$f1Yl#^rqzTkDBY=NB+`$<)Ci}!iy>h^F%5=D!jI$k(b zccn{q7gfT~m^zQAwpVrIr0U!0w+x$9g)BhI17s^qF$$)U6mnJR!$y_Coo|ZF@mXOC zl0^*Km_>zww6v6(6fWvs}E+j@cG z47uf-3ZwhiLt(*>qP%xNoecH$OTMv7YAHLi>Ac+H~a!kjI{F^OJ1lC)`)8?6cj}5EQzU{jIvyc&Ct8S zU`hGqXB|Sp70PEpj?NV)D@ioH(Wu6^mg=fq=#soc&Cp<_e3Pz-z z4#gWkLBp_=O#j9e3w6^JmEHqB*S)WZL(It+S%;ey?&XGWy39PRsVL_d&n;g(ITq#@ zVo<~$p>VWkUHQ?Sk)42B%-f{fHD-`Zr%zxTVsBm!i0|+$nkJnlKGHd1^ugX+;|zwx!Etogt0KXidDODt+gEFz=F* zNZ*Hg<*2SD0f_#p7KBni%fnot=Xn)Pzo&WMy-22B^%@BnmLUF^6+>*BIygH}tch(K zn?mI&|F(L7mG|YoZxJhU+yo1UsQ-fkN?DBP34*z{Q51Rl?IQB~T_mn^hadsVL%O>r zTM*)|L?p8`qh^#xU@>BPdT;VFt5ig$vqJ7B6(P4zBZqvZcp{xjlH!He#cQ&TWkx{> zWcB_al-YQjSWW5&j;2prcT{u}!lsZ;|WwrZ9Y?u>u0sQv(E=7bl6oHeIKY;XZq-T@&u65jTkceH=`qkxCfQadu zFBjzjGIiqIHYGZ)V3Uc0CgMb+Txe@!jDtt`v;}bOH8-DhRs{P)5DxR@@TNZ%P4F%b zJ&a9UseO+T|8tsqT+Ol`{vBtwx;Gr{YF-x6Lm0-qTMfF`xbKg(D|jO126!lx-#n_r z){EmG9+YQ%V9&z7;I8mAp7l|@+Zd7WslT7l$8ra6QLV*eyQN)+n>z|~wv`kyG60Z9 z|2?7jUZ>5o;b6?V4FKF6Hw8O&OJ_S1dt)nG5G(k5lLcsR5vHOfjfGBfQ)R=Fm61?` zf7Sv3w*aW{dlaDavUd{zfEN}O)Fo}4T;b)>m+s)mAn)+dpuot)7ZJ&?V$Q)@R!<>t2a2@2e8^N)P3;ZO*}iU* z25%BQ?rSJc^ng~oX@pm6ho$_mGsEl^Vv`f*wsh=M>L6J!K8);MT$79DlM_9q42)bK zVIk$S>AB?|*s+mW^_=}UQ*HTk=Dz-k()!Ewi!K|h{Hm6URu47mBfHXx=H|~>vysQP zrv;#qLF9R_`grW(mPZhQBWTm(-N)Fyr7ig;3(h`kJZew>@7u7#6PBK`t;^t(#v9^g zC6y$~#Eb&|rDFVQpNRoK%hbUh{BHpyS4-YumKV46SPPZ0f+yK=4k3)&g*5^$4Ty~9 zwC_c~rcc%uR*$LL@m+jCGa6^wFmv`$FZJ_%>7FS1F)J>}PFiJ~l~owiCTgq1>cteN^!vy6 zewzC{QA+Ijv3ir7w3>N-w&8gQndAQUMVb(=IT1-cs1F-MDRMJ6MSkIadOUhFi?!`^ z7=h=|s_!<~#x9L&ylyv3pE<5<Q zboJ2oNj0kp&2mjG^Qdw)S-VK>6roKhL(k(E)5=_!n}tGg3Orwp>`J;7C+G49#6|!E_|V9; zmQ%+7^%8#XJ`Lqs$#`v`a*Do@Oyl)aWG_6+$nUkN%0-~)*qQwJQr5Im&_k+;QX&12 z3Ra^zgWMZ;aGzFUn|rFRHb6PCDHeM~j;LFQKB-zv{GiZ!zr z>;!yr4KG8g-!y-N2)^vftWfFHr(|M(ts{!Y0}Uxl3|Y(Zbag2oqkF{kKG6z6(bEsL z2PcqPGV$~kCUOLh@>fi3`eFJr#L4~(ck~_2vFAB+cvazY0FhXjoDrYc`1?DSD*`3S z!;9C(qzUE;c79Of6su(R$)pdSuCQGqo+rfe-5O1tS@Ej5idWqk3{dcUG<%3?S>0}O z*oIU(I|FLjJeeYaTpQ#*MzY1dzUFil(JQ{Hqiv3ziTgUHkxY@>yOe%NuA;}RI=*5_ zYDga>`RQHE(kDjf3hG{_P=hZ?(aA}OUpC;-Fnor|+~Mw5V%zhJZs}cV71CsN)98NC z%&W(Zb7f%v=@qiDaDe9m3$0I~QQrDVRAEo&T7o%g84}xi=AgTy>nGM!o*oyCb=zjS z=>gpS?V9$*;`oaRZ)vxBWzW(*dPhQ$JzMhlyobiMRuXFTWnhyeMUh8DEp=zaZ7dv? z_>Uj&KX`MBc9WzyTZ$vc!P5y@?C(j+uP$Z+_{FR!sV>Sat0cwxXY~kP0x4COx9(!Y zo!`Khnl&jLEJ78~!S@I_1QZFn+=jiA$&Q+9b!xAacMP?1Iq?2G$fj2NL_jZb`7nwy|KM;EYs$+!sq|%O+UkaO?~= zL^^0tQUcP_ey}|GGL^#eg2{FcJ(o-ST^Wv8J}&aO;AKG0wWAQPg#xe4$3CEtw6-+) zrpbeaWV5L>Vpy?JFENhddAnf4mYEPrlt<$iREg6xL0_s6Q>Vp(myrQdvBy))QN;QD zxZ*wp@~yWUeW9SE_>%*a9iKlWUlM8LS~7T9K^UIusmR|#zz6(ZuD=PaKkU1B|97t+ zX8-^a{qIq3bmSk_0&o7ucz#mx|681+ zU*P<#>i-_+Cb|A$@OtiljOT~S|1XrEfBoO3`Wv19hnfBYC@B$1?)oDf!Ox&sEo( z;@S@*hJXDt)W8eXfA{>kVE4z96@R*FVZlBBU8DWG*Uy`>zxJnz^VibechBE#+Wzbg zPyi2)zuUrnNBKVApX2RKiuA+$;Pn5LF#X-}=Rk53j(?aU`EP==iaZj0`w0Nx!Cydl K{3-q8H}^k%IhZK` literal 0 HcmV?d00001 From 24d8784e1b2af70fa52649bf7e6adafce00a2b83 Mon Sep 17 00:00:00 2001 From: jendib Date: Fri, 11 Dec 2015 22:22:21 +0100 Subject: [PATCH 4/4] Fix Junit for Unix systems --- .../test/java/com/sismics/docs/core/util/TestFileUtil.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java index 2bb2dd38..1482d88c 100644 --- a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java +++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java @@ -25,7 +25,8 @@ public class TestFileUtil { File file = new File(); file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT); try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) { - Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream, pdfInputStream)); + String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream); + Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen.")); } } } @@ -37,7 +38,8 @@ public class TestFileUtil { File file = new File(); file.setMimeType(MimeType.OFFICE_DOCUMENT); try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) { - Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream, pdfInputStream)); + String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream); + Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen.")); } } }