diff --git a/README.md b/README.md index 95f20e5f..6383b142 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Features - Responsive user interface - Optical character recognition -- Support image and PDF files +- Support image, PDF, ODT and DOCX files - Flexible search engine - Full text search in image and PDF - 256-bit AES encryption diff --git a/docs-core/pom.xml b/docs-core/pom.xml index cf4d3719..3ad8e390 100644 --- a/docs-core/pom.xml +++ b/docs-core/pom.xml @@ -117,6 +117,16 @@ com.levigo.jbig2 levigo-jbig2-imageio + + + fr.opensagres.xdocreport + org.odftoolkit.odfdom.converter.pdf + + + + fr.opensagres.xdocreport + org.apache.poi.xwpf.converter.pdf + diff --git a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java index 28b7a92c..3aa3e310 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java +++ b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java @@ -142,6 +142,7 @@ public class FileDao { fileFromDb.setDocumentId(file.getDocumentId()); fileFromDb.setContent(file.getContent()); fileFromDb.setOrder(file.getOrder()); + fileFromDb.setMimeType(file.getMimeType()); return file; } diff --git a/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java b/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java index 41a128f3..8eab7115 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java +++ b/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java @@ -28,58 +28,43 @@ public class FileCreatedAsyncEvent { private InputStream inputStream; /** - * Getter of file. - * - * @return the file + * Unencrypted input stream containing a PDF representation + * of the file. May be null if the PDF conversion is not + * necessary or not possible. */ + private InputStream pdfInputStream; + public File getFile() { return file; } - /** - * Setter of file. - * - * @param file file - */ public void setFile(File file) { this.file = file; } - /** - * Getter of document. - * - * @return the document - */ public Document getDocument() { return document; } - /** - * Setter of document. - * - * @param document document - */ public void setDocument(Document document) { this.document = document; } - /** - * Getter of inputStream. - * - * @return the inputStream - */ public InputStream getInputStream() { return inputStream; } - /** - * Setter de inputStream. - * - * @param inputStream inputStream - */ public void setInputStream(InputStream inputStream) { this.inputStream = inputStream; } + + public InputStream getPdfInputStream() { + return pdfInputStream; + } + + public void setPdfInputStream(InputStream pdfInputStream) { + this.pdfInputStream = pdfInputStream; + } @Override public String toString() { diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java index 1b76cb0d..0eacdeca 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java @@ -36,20 +36,26 @@ public class FileCreatedAsyncListener { log.info("File created event: " + fileCreatedAsyncEvent.toString()); } - // OCR the file + // Guess the mime type a second time, for open document format (first detected as simple ZIP file) final File file = fileCreatedAsyncEvent.getFile(); + + // Extract text content from the file long startTime = System.currentTimeMillis(); - final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, fileCreatedAsyncEvent.getInputStream()); + final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, + fileCreatedAsyncEvent.getInputStream(), fileCreatedAsyncEvent.getPdfInputStream()); fileCreatedAsyncEvent.getInputStream().close(); + if (fileCreatedAsyncEvent.getPdfInputStream() != null) { + fileCreatedAsyncEvent.getPdfInputStream().close(); + } log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime)); - // Store the OCR-ization result in the database + // Store the text content in the database TransactionUtil.handle(new Runnable() { @Override public void run() { FileDao fileDao = new FileDao(); if (fileDao.getById(file.getId()) == null) { - // The file has been deleted since the OCR-ization started, ignore the result + // The file has been deleted since the text extraction started, ignore the result return; } diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index d836e94b..59b30bec 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -1,6 +1,8 @@ package com.sismics.docs.core.util; import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -15,9 +17,13 @@ import javax.imageio.ImageIO; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.imgscalr.Scalr; import org.imgscalr.Scalr.Method; import org.imgscalr.Scalr.Mode; +import org.odftoolkit.odfdom.converter.pdf.PdfConverter; +import org.odftoolkit.odfdom.converter.pdf.PdfOptions; +import org.odftoolkit.odfdom.doc.OdfTextDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,15 +50,16 @@ public class FileUtil { * @param document Document linked to the file * @param file File to extract * @param inputStream Unencrypted input stream + * @param pdfInputStream Unencrypted PDF input stream * @return Content extract */ - public static String extractContent(Document document, File file, InputStream inputStream) { + public static String extractContent(Document document, File file, InputStream inputStream, InputStream pdfInputStream) { String content = null; if (ImageUtil.isImage(file.getMimeType())) { content = ocrFile(inputStream, document); - } else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { - content = extractPdf(inputStream); + } else if (pdfInputStream != null) { + content = extractPdf(pdfInputStream); } return content; @@ -120,23 +127,81 @@ public class FileUtil { return content; } + /** + * Convert a file to PDF if necessary. + * + * @param inputStream InputStream + * @param file File + * @return PDF input stream + * @throws Exception + */ + public static InputStream convertToPdf(InputStream inputStream, File file) throws Exception { + if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { + // It's already PDF, just return the input + return inputStream; + } + + if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) { + return convertOfficeDocument(inputStream); + } + + if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { + return convertOpenDocumentText(inputStream); + } + + // PDF conversion not necessary/possible + return null; + } + + /** + * Convert an open document text file to PDF. + * + * @param inputStream Unencrypted input stream + * @return PDF input stream + * @throws Exception + */ + private static InputStream convertOpenDocumentText(InputStream inputStream) throws Exception { + ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); + OdfTextDocument document = OdfTextDocument.loadDocument(inputStream); + PdfOptions options = PdfOptions.create(); + PdfConverter.getInstance().convert(document, pdfOutputStream, options); + inputStream.reset(); + return new ByteArrayInputStream(pdfOutputStream.toByteArray()); + } + + /** + * Convert an Office document to PDF. + * + * @param inputStream Unencrypted input stream + * @return PDF input stream + * @throws Exception + */ + private static InputStream convertOfficeDocument(InputStream inputStream) throws Exception { + ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); + XWPFDocument document = new XWPFDocument(inputStream); + org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create(); + org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, pdfOutputStream, options); + inputStream.reset(); + return new ByteArrayInputStream(pdfOutputStream.toByteArray()); + } + /** * Save a file on the storage filesystem. * * @param inputStream Unencrypted input stream + * @param pdf * @param file File to save * @param privateKey Private key used for encryption * @throws Exception */ - public static void save(InputStream inputStream, File file, String privateKey) throws Exception { + public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception { Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey); Path path = DirectoryUtil.getStorageDirectory().resolve(file.getId()); Files.copy(new CipherInputStream(inputStream, cipher), path); + inputStream.reset(); // Generate file variations - inputStream.reset(); - saveVariations(file, inputStream, cipher); - inputStream.reset(); + saveVariations(file, inputStream, pdfInputStream, cipher); } /** @@ -144,20 +209,23 @@ public class FileUtil { * * @param file File from database * @param inputStream Unencrypted input stream + * @param pdfInputStream Unencrypted PDF input stream * @param cipher Cipher to use for encryption * @throws Exception */ - public static void saveVariations(File file, InputStream inputStream, Cipher cipher) throws Exception { + public static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception { BufferedImage image = null; if (ImageUtil.isImage(file.getMimeType())) { image = ImageIO.read(inputStream); - } else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) { + inputStream.reset(); + } else if(pdfInputStream != null) { // Generate preview from the first page of the PDF PDDocument pdfDocument = null; try { - pdfDocument = PDDocument.load(inputStream); + pdfDocument = PDDocument.load(pdfInputStream); PDFRenderer renderer = new PDFRenderer(pdfDocument); image = renderer.renderImage(0); + pdfInputStream.reset(); } finally { pdfDocument.close(); } diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java index 9e1d6916..e5821a62 100644 --- a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java +++ b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java @@ -18,4 +18,8 @@ public class MimeType { public static final String APPLICATION_ZIP = "application/zip"; public static final String APPLICATION_PDF = "application/pdf"; + + public static final String OPEN_DOCUMENT_TEXT = "application/vnd.oasis.opendocument.text"; + + public static final String OFFICE_DOCUMENT = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"; } diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java index 76c49ba1..058f7f7d 100644 --- a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java +++ b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java @@ -3,6 +3,13 @@ package com.sismics.util.mime; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import org.apache.commons.compress.archivers.ArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; +import org.apache.commons.compress.utils.IOUtils; + +import com.google.common.base.Charsets; +import com.sismics.docs.core.model.jpa.File; + /** * Utility to check MIME types. * @@ -77,8 +84,59 @@ public class MimeTypeUtil { return "ico"; case MimeType.APPLICATION_PDF: return "pdf"; + case MimeType.OPEN_DOCUMENT_TEXT: + return "odt"; + case MimeType.OFFICE_DOCUMENT: + return "docx"; default: return null; } } + + /** + * Guess the MIME type of open document formats (docx and odt). + * It's more costly than the simple header check, but needed because open document formats + * are simple ZIP files on the outside and much bigger on the inside. + * + * @param file File + * @param inputStream Input stream + * @return MIME type + */ + public static String guessOpenDocumentFormat(File file, InputStream inputStream) { + if (!MimeType.APPLICATION_ZIP.equals(file.getMimeType())) { + // open document formats are ZIP files + return file.getMimeType(); + } + + String mimeType = file.getMimeType(); + try (ZipArchiveInputStream archiveInputStream = new ZipArchiveInputStream(inputStream, Charsets.ISO_8859_1.name())) { + ArchiveEntry archiveEntry = archiveInputStream.getNextEntry(); + while (archiveEntry != null) { + if (archiveEntry.getName().equals("mimetype")) { + // Maybe it's an ODT file + String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1); + if (MimeType.OPEN_DOCUMENT_TEXT.equals(content.trim())) { + mimeType = MimeType.OPEN_DOCUMENT_TEXT; + break; + } + } else if (archiveEntry.getName().equals("[Content_Types].xml")) { + // Maybe it's a DOCX file + String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1); + if (content.contains(MimeType.OFFICE_DOCUMENT)) { + mimeType = MimeType.OFFICE_DOCUMENT; + break; + } + } + + archiveEntry = archiveInputStream.getNextEntry(); + } + + inputStream.reset(); + } catch (Exception e) { + // In case of any error, just give up and keep the ZIP MIME type + return file.getMimeType(); + } + + return mimeType; + } } diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java index 2e161157..b6f54964 100644 --- a/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java +++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java @@ -18,7 +18,6 @@ import com.google.common.io.ByteStreams; * @author bgamard */ public class TestEncryptUtil { - /** * Test private key. */ diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java new file mode 100644 index 00000000..1482d88c --- /dev/null +++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java @@ -0,0 +1,46 @@ +package com.sismics.docs.core.util; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; + +import junit.framework.Assert; + +import org.apache.pdfbox.io.IOUtils; +import org.junit.Test; + +import com.google.common.io.Resources; +import com.sismics.docs.core.model.jpa.File; +import com.sismics.util.mime.MimeType; + +/** + * Test of the file entity utilities. + * + * @author bgamard + */ +public class TestFileUtil { + @Test + public void extractContentOpenDocumentTextTest() throws Exception { + try (InputStream inputStream = Resources.getResource("file/document.odt").openStream(); + InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { + File file = new File(); + file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT); + try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) { + String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream); + Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen.")); + } + } + } + + @Test + public void extractContentOfficeDocumentTest() throws Exception { + try (InputStream inputStream = Resources.getResource("file/document.docx").openStream(); + InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { + File file = new File(); + file.setMimeType(MimeType.OFFICE_DOCUMENT); + try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) { + String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream); + Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen.")); + } + } + } +} diff --git a/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java new file mode 100644 index 00000000..b16e91c7 --- /dev/null +++ b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java @@ -0,0 +1,40 @@ +package com.sismics.util; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; + +import org.apache.commons.compress.utils.IOUtils; +import org.junit.Assert; +import org.junit.Test; + +import com.google.common.io.Resources; +import com.sismics.docs.core.model.jpa.File; +import com.sismics.util.mime.MimeType; +import com.sismics.util.mime.MimeTypeUtil; + +/** + * Test of the utilities to check MIME types. + * + * @author bgamard + */ +public class TestMimeTypeUtil { + + @Test + public void guessOpenDocumentFormatTest() throws Exception { + // Detect ODT files + try (InputStream inputStream = Resources.getResource("file/document.odt").openStream(); + InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { + File file = new File(); + file.setMimeType(MimeType.APPLICATION_ZIP); + Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream)); + } + + // Detect DOCX files + try (InputStream inputStream = Resources.getResource("file/document.docx").openStream(); + InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) { + File file = new File(); + file.setMimeType(MimeType.APPLICATION_ZIP); + Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream)); + } + } +} diff --git a/docs-core/src/test/resources/file/document.docx b/docs-core/src/test/resources/file/document.docx new file mode 100644 index 00000000..fb1e6c2c Binary files /dev/null and b/docs-core/src/test/resources/file/document.docx differ diff --git a/docs-core/src/test/resources/file/document.odt b/docs-core/src/test/resources/file/document.odt new file mode 100644 index 00000000..b7062de0 Binary files /dev/null and b/docs-core/src/test/resources/file/document.odt differ diff --git a/docs-parent/pom.xml b/docs-parent/pom.xml index 18d616e0..40da3594 100644 --- a/docs-parent/pom.xml +++ b/docs-parent/pom.xml @@ -36,6 +36,7 @@ 4.1.0.Final 3.1.0 1.6.3 + 1.0.5 9.2.13.v20150730 9.2.13.v20150730 @@ -367,6 +368,18 @@ ${org.bouncycastle.bcprov-jdk15on.version} + + fr.opensagres.xdocreport + org.odftoolkit.odfdom.converter.pdf + ${fr.opensagres.xdocreport.version} + + + + fr.opensagres.xdocreport + org.apache.poi.xwpf.converter.pdf + ${fr.opensagres.xdocreport.version} + + com.levigo.jbig2 diff --git a/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java b/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java index 69052f27..4ddc0621 100644 --- a/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java +++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java @@ -146,8 +146,14 @@ public class FileResource extends BaseResource { file.setUserId(principal.getId()); String fileId = fileDao.create(file); + // Guess the mime type a second time, for open document format (first detected as simple ZIP file) + file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileInputStream)); + + // Convert to PDF if necessary (for thumbnail and text extraction) + InputStream pdfIntputStream = FileUtil.convertToPdf(fileInputStream, file); + // Save the file - FileUtil.save(fileInputStream, file, user.getPrivateKey()); + FileUtil.save(fileInputStream, pdfIntputStream, file, user.getPrivateKey()); // Update the user quota user.setStorageCurrent(user.getStorageCurrent() + fileData.length); @@ -159,6 +165,7 @@ public class FileResource extends BaseResource { fileCreatedAsyncEvent.setDocument(document); fileCreatedAsyncEvent.setFile(file); fileCreatedAsyncEvent.setInputStream(fileInputStream); + fileCreatedAsyncEvent.setPdfInputStream(pdfIntputStream); AppContext.getInstance().getAsyncEventBus().post(fileCreatedAsyncEvent); } diff --git a/docs-web/src/main/webapp/src/app/docs/filter/Filesize.js b/docs-web/src/main/webapp/src/app/docs/filter/Filesize.js new file mode 100644 index 00000000..e3dfda68 --- /dev/null +++ b/docs-web/src/main/webapp/src/app/docs/filter/Filesize.js @@ -0,0 +1,18 @@ +'use strict'; + +/** + * Format file sizes. + */ +angular.module('docs').filter('filesize', function() { + return function(text) { + if (!text) { + return ''; + } + + var size = parseInt(text); + if (size > 1000000) { // 1MB + return Math.round(size / 1000000) + 'MB'; + } + return Math.round(size / 1000) + 'kB'; + } +}); \ No newline at end of file diff --git a/docs-web/src/main/webapp/src/app/docs/filter/Newline.js b/docs-web/src/main/webapp/src/app/docs/filter/Newline.js index c7cce4b7..e56e6bac 100644 --- a/docs-web/src/main/webapp/src/app/docs/filter/Newline.js +++ b/docs-web/src/main/webapp/src/app/docs/filter/Newline.js @@ -1,7 +1,7 @@ 'use strict'; /** - * Filter converting new lines in
+ * Filter converting new lines in
. */ angular.module('docs').filter('newline', function() { return function(text) { @@ -10,4 +10,4 @@ angular.module('docs').filter('newline', function() { } return text.replace(/\n/g, '
'); } -}) \ No newline at end of file +}); \ No newline at end of file diff --git a/docs-web/src/main/webapp/src/app/docs/filter/Shorten.js b/docs-web/src/main/webapp/src/app/docs/filter/Shorten.js index d2b055e1..d479cce1 100644 --- a/docs-web/src/main/webapp/src/app/docs/filter/Shorten.js +++ b/docs-web/src/main/webapp/src/app/docs/filter/Shorten.js @@ -10,4 +10,4 @@ angular.module('docs').filter('shorten', function() { } return text.substring(0, 1).toUpperCase(); } -}) \ No newline at end of file +}); \ No newline at end of file diff --git a/docs-web/src/main/webapp/src/app/share/filter/Filesize.js b/docs-web/src/main/webapp/src/app/share/filter/Filesize.js new file mode 100644 index 00000000..b6765b45 --- /dev/null +++ b/docs-web/src/main/webapp/src/app/share/filter/Filesize.js @@ -0,0 +1,18 @@ +'use strict'; + +/** + * Format file sizes. + */ +angular.module('share').filter('filesize', function() { + return function(text) { + if (!text) { + return ''; + } + + var size = parseInt(text); + if (size > 1000000) { // 1MB + return Math.round(size / 1000000) + 'MB'; + } + return Math.round(size / 1000) + 'kB'; + } +}); \ No newline at end of file diff --git a/docs-web/src/main/webapp/src/app/share/filter/Newline.js b/docs-web/src/main/webapp/src/app/share/filter/Newline.js index 571fa4d5..5e5c44f8 100644 --- a/docs-web/src/main/webapp/src/app/share/filter/Newline.js +++ b/docs-web/src/main/webapp/src/app/share/filter/Newline.js @@ -1,7 +1,7 @@ 'use strict'; /** - * Filter converting new lines in
+ * Filter converting new lines in
. */ angular.module('share').filter('newline', function() { return function(text) { @@ -10,4 +10,4 @@ angular.module('share').filter('newline', function() { } return text.replace(/\n/g, '
'); } -}) \ No newline at end of file +}); \ No newline at end of file diff --git a/docs-web/src/main/webapp/src/index.html b/docs-web/src/main/webapp/src/index.html index c71cf81d..59f8b030 100644 --- a/docs-web/src/main/webapp/src/index.html +++ b/docs-web/src/main/webapp/src/index.html @@ -63,6 +63,7 @@ + diff --git a/docs-web/src/main/webapp/src/partial/docs/document.default.html b/docs-web/src/main/webapp/src/partial/docs/document.default.html index d4842b17..0d814030 100644 --- a/docs-web/src/main/webapp/src/partial/docs/document.default.html +++ b/docs-web/src/main/webapp/src/partial/docs/document.default.html @@ -8,7 +8,7 @@
- +
diff --git a/docs-web/src/main/webapp/src/partial/docs/document.edit.html b/docs-web/src/main/webapp/src/partial/docs/document.edit.html index 717fc3a5..736f1136 100644 --- a/docs-web/src/main/webapp/src/partial/docs/document.edit.html +++ b/docs-web/src/main/webapp/src/partial/docs/document.edit.html @@ -39,7 +39,8 @@
+ accept="image/png,image/jpg,image/jpeg,image/gif,application/pdf,application/vnd.oasis.opendocument.text,application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ng-disabled="fileIsUploading">
+ {{ orphanFiles.length }} file{{ orphanFiles.length > 1 ? 's' : '' }} diff --git a/docs-web/src/main/webapp/src/partial/docs/document.view.content.html b/docs-web/src/main/webapp/src/partial/docs/document.view.content.html index 5042d3ce..4d5d7a9d 100644 --- a/docs-web/src/main/webapp/src/partial/docs/document.view.content.html +++ b/docs-web/src/main/webapp/src/partial/docs/document.view.content.html @@ -6,7 +6,7 @@
- +
diff --git a/docs-web/src/main/webapp/src/partial/share/share.html b/docs-web/src/main/webapp/src/partial/share/share.html index 25833a68..d03fe326 100644 --- a/docs-web/src/main/webapp/src/partial/share/share.html +++ b/docs-web/src/main/webapp/src/partial/share/share.html @@ -18,7 +18,9 @@ diff --git a/docs-web/src/main/webapp/src/share.html b/docs-web/src/main/webapp/src/share.html index 6e401014..a8e8ea7d 100644 --- a/docs-web/src/main/webapp/src/share.html +++ b/docs-web/src/main/webapp/src/share.html @@ -37,6 +37,7 @@ + diff --git a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java index 3f1433fb..00ca2853 100644 --- a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java +++ b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java @@ -267,6 +267,124 @@ public class TestDocumentResource extends BaseJerseyTest { return json.getJsonArray("documents").size(); } + /** + * Test ODT extraction. + * + * @throws Exception + */ + @Test + public void testOdtExtraction() throws Exception { + // Login document_odt + clientUtil.createUser("document_odt"); + String documentOdtToken = clientUtil.login("document_odt"); + + // Create a document + long create1Date = new Date().getTime(); + JsonObject json = target().path("/document").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .put(Entity.form(new Form() + .param("title", "My super title document 1") + .param("description", "My super description for document 1") + .param("language", "eng") + .param("create_date", Long.toString(create1Date))), JsonObject.class); + String document1Id = json.getString("id"); + Assert.assertNotNull(document1Id); + + // Add a PDF file + String file1Id = null; + try (InputStream is = Resources.getResource("file/document.odt").openStream()) { + StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.odt"); + try (FormDataMultiPart multiPart = new FormDataMultiPart()) { + json = target() + .register(MultiPartFeature.class) + .path("/file").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart), + MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class); + file1Id = json.getString("id"); + Assert.assertNotNull(file1Id); + } + } + + // Search documents by query in full content + json = target().path("/document/list") + .queryParam("search", "full:ipsum") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .get(JsonObject.class); + Assert.assertTrue(json.getJsonArray("documents").size() == 1); + + // Get the file thumbnail data + Response response = target().path("/file/" + file1Id + "/data") + .queryParam("size", "thumb") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken) + .get(); + InputStream is = (InputStream) response.getEntity(); + byte[] fileBytes = ByteStreams.toByteArray(is); + Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues + Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes)); + } + + /** + * Test DOCX extraction. + * + * @throws Exception + */ + @Test + public void testDocxExtraction() throws Exception { + // Login document_docx + clientUtil.createUser("document_docx"); + String documentDocxToken = clientUtil.login("document_docx"); + + // Create a document + long create1Date = new Date().getTime(); + JsonObject json = target().path("/document").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken) + .put(Entity.form(new Form() + .param("title", "My super title document 1") + .param("description", "My super description for document 1") + .param("language", "eng") + .param("create_date", Long.toString(create1Date))), JsonObject.class); + String document1Id = json.getString("id"); + Assert.assertNotNull(document1Id); + + // Add a PDF file + String file1Id = null; + try (InputStream is = Resources.getResource("file/document.docx").openStream()) { + StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.docx"); + try (FormDataMultiPart multiPart = new FormDataMultiPart()) { + json = target() + .register(MultiPartFeature.class) + .path("/file").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken) + .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart), + MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class); + file1Id = json.getString("id"); + Assert.assertNotNull(file1Id); + } + } + + // Search documents by query in full content + json = target().path("/document/list") + .queryParam("search", "full:dolor") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken) + .get(JsonObject.class); + Assert.assertTrue(json.getJsonArray("documents").size() == 1); + + // Get the file thumbnail data + Response response = target().path("/file/" + file1Id + "/data") + .queryParam("size", "thumb") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken) + .get(); + InputStream is = (InputStream) response.getEntity(); + byte[] fileBytes = ByteStreams.toByteArray(is); + Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues + Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes)); + } + /** * Test PDF extraction. * @@ -274,14 +392,14 @@ public class TestDocumentResource extends BaseJerseyTest { */ @Test public void testPdfExtraction() throws Exception { - // Login document2 - clientUtil.createUser("document2"); - String document2Token = clientUtil.login("document2"); + // Login document_pdf + clientUtil.createUser("document_pdf"); + String documentPdfToken = clientUtil.login("document_pdf"); // Create a document long create1Date = new Date().getTime(); JsonObject json = target().path("/document").request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .put(Entity.form(new Form() .param("title", "My super title document 1") .param("description", "My super description for document 1") @@ -298,7 +416,7 @@ public class TestDocumentResource extends BaseJerseyTest { json = target() .register(MultiPartFeature.class) .path("/file").request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart), MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class); file1Id = json.getString("id"); @@ -310,7 +428,7 @@ public class TestDocumentResource extends BaseJerseyTest { json = target().path("/document/list") .queryParam("search", "full:vrandecic") .request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .get(JsonObject.class); Assert.assertTrue(json.getJsonArray("documents").size() == 1); @@ -318,7 +436,7 @@ public class TestDocumentResource extends BaseJerseyTest { Response response = target().path("/file/" + file1Id + "/data") .queryParam("size", "thumb") .request() - .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token) + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken) .get(); InputStream is = (InputStream) response.getEntity(); byte[] fileBytes = ByteStreams.toByteArray(is); diff --git a/docs-web/src/test/resources/file/document.docx b/docs-web/src/test/resources/file/document.docx new file mode 100644 index 00000000..fb1e6c2c Binary files /dev/null and b/docs-web/src/test/resources/file/document.docx differ diff --git a/docs-web/src/test/resources/file/document.odt b/docs-web/src/test/resources/file/document.odt new file mode 100644 index 00000000..b7062de0 Binary files /dev/null and b/docs-web/src/test/resources/file/document.odt differ