Merge pull request #54 from sismics/master

Push to production
2025-06-06 15:49:56 +02:00 · 2015-12-11 22:22:44 +01:00 · 2015-12-11 22:22:44 +01:00 · 737b3299ff
commit 737b3299ff
parent 1934bb71f0 24d8784e1b
29 changed files with 457 additions and 61 deletions
--- a/README.md
+++ b/README.md
@ -21,7 +21,7 @@ Features

 - Responsive user interface
 - Optical character recognition
- Support image and PDF files
+- Support image, PDF, ODT and DOCX files
 - Flexible search engine
 - Full text search in image and PDF
 - 256-bit AES encryption
--- a/docs-core/pom.xml
+++ b/docs-core/pom.xml
@ -118,6 +118,16 @@
      <artifactId>levigo-jbig2-imageio</artifactId>
    </dependency>

+		<dependency>
+			<groupId>fr.opensagres.xdocreport</groupId>
+			<artifactId>org.odftoolkit.odfdom.converter.pdf</artifactId>
+		</dependency>
+
+		<dependency>
+			<groupId>fr.opensagres.xdocreport</groupId>
+			<artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
+		</dependency>
+    
    <!-- OCR dependencies -->
    <dependency>
      <groupId>jna</groupId>
--- a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java
@ -142,6 +142,7 @@ public class FileDao {
        fileFromDb.setDocumentId(file.getDocumentId());
        fileFromDb.setContent(file.getContent());
        fileFromDb.setOrder(file.getOrder());
+        fileFromDb.setMimeType(file.getMimeType());
        
        return file;
    }
--- a/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/event/FileCreatedAsyncEvent.java
@ -28,59 +28,44 @@ public class FileCreatedAsyncEvent {
    private InputStream inputStream;
    
    /**
-     * Getter of file.
-     *
-     * @return the file
+     * Unencrypted input stream containing a PDF representation
+     * of the file. May be null if the PDF conversion is not
+     * necessary or not possible.
     */
+    private InputStream pdfInputStream;
+    
    public File getFile() {
        return file;
    }

-    /**
-     * Setter of file.
-     *
-     * @param file file
-     */
    public void setFile(File file) {
        this.file = file;
    }
    
-    /**
-     * Getter of document.
-     *
-     * @return the document
-     */
    public Document getDocument() {
        return document;
    }

-    /**
-     * Setter of document.
-     *
-     * @param document document
-     */
    public void setDocument(Document document) {
        this.document = document;
    }
    
-    /**
-     * Getter of inputStream.
-     *
-     * @return the inputStream
-     */
    public InputStream getInputStream() {
        return inputStream;
    }

-    /**
-     * Setter de inputStream.
-     *
-     * @param inputStream inputStream
-     */
    public void setInputStream(InputStream inputStream) {
        this.inputStream = inputStream;
    }
    
+    public InputStream getPdfInputStream() {
+        return pdfInputStream;
+    }
+
+    public void setPdfInputStream(InputStream pdfInputStream) {
+        this.pdfInputStream = pdfInputStream;
+    }
+
    @Override
    public String toString() {
        return MoreObjects.toStringHelper(this)
--- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java
@ -36,20 +36,26 @@ public class FileCreatedAsyncListener {
            log.info("File created event: " + fileCreatedAsyncEvent.toString());
        }

-        // OCR the file
+        // Retrieve the file whose text content must be extracted
        final File file = fileCreatedAsyncEvent.getFile();
+        
+        // Extract text content from the file
        long startTime = System.currentTimeMillis();
-        final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, fileCreatedAsyncEvent.getInputStream());
+        final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file,
+                fileCreatedAsyncEvent.getInputStream(), fileCreatedAsyncEvent.getPdfInputStream());
        fileCreatedAsyncEvent.getInputStream().close();
+        if (fileCreatedAsyncEvent.getPdfInputStream() != null) {
+            fileCreatedAsyncEvent.getPdfInputStream().close();
+        }
        log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
        
-        // Store the OCR-ization result in the database
+        // Store the text content in the database
        TransactionUtil.handle(new Runnable() {
            @Override
            public void run() {
                FileDao fileDao = new FileDao();
                if (fileDao.getById(file.getId()) == null) {
-                    // The file has been deleted since the OCR-ization started, ignore the result
+                    // The file has been deleted since the text extraction started, ignore the result
                    return;
                }
                
--- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java
@ -1,6 +1,8 @@
 package com.sismics.docs.core.util;

 import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
@ -15,9 +17,13 @@ import javax.imageio.ImageIO;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.rendering.PDFRenderer;
 import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.imgscalr.Scalr;
 import org.imgscalr.Scalr.Method;
 import org.imgscalr.Scalr.Mode;
+import org.odftoolkit.odfdom.converter.pdf.PdfConverter;
+import org.odftoolkit.odfdom.converter.pdf.PdfOptions;
+import org.odftoolkit.odfdom.doc.OdfTextDocument;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@ -44,15 +50,16 @@ public class FileUtil {
     * @param document Document linked to the file
     * @param file File to extract
     * @param inputStream Unencrypted input stream
+     * @param pdfInputStream Unencrypted PDF input stream
     * @return Content extract
     */
-    public static String extractContent(Document document, File file, InputStream inputStream) {
+    public static String extractContent(Document document, File file, InputStream inputStream, InputStream pdfInputStream) {
        String content = null;
        
        if (ImageUtil.isImage(file.getMimeType())) {
            content = ocrFile(inputStream, document);
-        } else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
-            content = extractPdf(inputStream);
+        } else if (pdfInputStream != null) {
+            content = extractPdf(pdfInputStream);
        }
        
        return content;
@ -120,23 +127,81 @@ public class FileUtil {
        return content;
    }
    
+    /**
+     * Convert a file to PDF if necessary.
+     * 
+     * @param inputStream Unencrypted input stream of the file (returned as is if the file is already a PDF)
+     * @param file File, its MIME type selects the conversion (a null MIME type is treated as not convertible)
+     * @return PDF input stream, or null if the PDF conversion is not necessary or not possible
+     * @throws Exception If the DOCX or ODT conversion fails
+     */
+    public static InputStream convertToPdf(InputStream inputStream, File file) throws Exception {
+        if (MimeType.APPLICATION_PDF.equals(file.getMimeType())) {
+            // It's already PDF, just return the input
+            return inputStream;
+        }
+        
+        if (MimeType.OFFICE_DOCUMENT.equals(file.getMimeType())) {
+            return convertOfficeDocument(inputStream);
+        }
+        
+        if (MimeType.OPEN_DOCUMENT_TEXT.equals(file.getMimeType())) {
+            return convertOpenDocumentText(inputStream);
+        }
+        
+        // PDF conversion not necessary/possible
+        return null;
+    }
+    
+    /**
+     * Convert an open document text (ODT) file to an in-memory PDF.
+     * 
+     * @param inputStream Unencrypted input stream, reset once the conversion is done
+     * @return Input stream over the bytes of the generated PDF
+     * @throws Exception If loading the document, converting it or resetting the input stream fails
+     */
+    private static InputStream convertOpenDocumentText(InputStream inputStream) throws Exception {
+        ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
+        OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
+        PdfOptions options = PdfOptions.create();
+        PdfConverter.getInstance().convert(document, pdfOutputStream, options);
+        inputStream.reset();
+        return new ByteArrayInputStream(pdfOutputStream.toByteArray());
+    }
+    
+    /**
+     * Convert an Office (DOCX) document to an in-memory PDF.
+     * 
+     * @param inputStream Unencrypted input stream, reset once the conversion is done
+     * @return Input stream over the bytes of the generated PDF
+     * @throws Exception If loading the document, converting it or resetting the input stream fails
+     */
+    private static InputStream convertOfficeDocument(InputStream inputStream) throws Exception {
+        ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
+        XWPFDocument document = new XWPFDocument(inputStream);
+        org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
+        org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, pdfOutputStream, options);
+        inputStream.reset();
+        return new ByteArrayInputStream(pdfOutputStream.toByteArray());
+    }
+    
    /**
     * Save a file on the storage filesystem.
     * 
     * @param inputStream Unencrypted input stream
+     * @param pdfInputStream Unencrypted PDF input stream
     * @param file File to save
     * @param privateKey Private key used for encryption
     * @throws Exception
     */
-    public static void save(InputStream inputStream, File file, String privateKey) throws Exception {
+    public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception {
        Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey);
        Path path = DirectoryUtil.getStorageDirectory().resolve(file.getId());
        Files.copy(new CipherInputStream(inputStream, cipher), path);
+        inputStream.reset();
        
        // Generate file variations
-        inputStream.reset();
-        saveVariations(file, inputStream, cipher);
-        inputStream.reset();
+        saveVariations(file, inputStream, pdfInputStream, cipher);
    }

    /**
@ -144,20 +209,23 @@ public class FileUtil {
     * 
     * @param file File from database
     * @param inputStream Unencrypted input stream
+     * @param pdfInputStream Unencrypted PDF input stream
     * @param cipher Cipher to use for encryption
     * @throws Exception
     */
-    public static void saveVariations(File file, InputStream inputStream, Cipher cipher) throws Exception {
+    public static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception {
        BufferedImage image = null;
        if (ImageUtil.isImage(file.getMimeType())) {
            image = ImageIO.read(inputStream);
-        } else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
+            inputStream.reset();
+        } else if(pdfInputStream != null) {
            // Generate preview from the first page of the PDF
            PDDocument pdfDocument = null;
            try {
-                pdfDocument = PDDocument.load(inputStream);
+                pdfDocument = PDDocument.load(pdfInputStream);
                PDFRenderer renderer = new PDFRenderer(pdfDocument);
                image = renderer.renderImage(0);
+                pdfInputStream.reset();
            } finally {
                pdfDocument.close();
            }
--- a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
+++ b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
@ -18,4 +18,8 @@ public class MimeType {
    public static final String APPLICATION_ZIP = "application/zip";
    
    public static final String APPLICATION_PDF = "application/pdf";
+    
+    public static final String OPEN_DOCUMENT_TEXT = "application/vnd.oasis.opendocument.text";
+    
+    public static final String OFFICE_DOCUMENT = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
 }
--- a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
+++ b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
@ -3,6 +3,13 @@ package com.sismics.util.mime;
 import java.io.InputStream;
 import java.io.UnsupportedEncodingException;

+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.utils.IOUtils;
+
+import com.google.common.base.Charsets;
+import com.sismics.docs.core.model.jpa.File;
+
 /**
 * Utility to check MIME types.
 *
@ -77,8 +84,67 @@ public class MimeTypeUtil {
            return "ico";
        case MimeType.APPLICATION_PDF:
            return "pdf";
+        case MimeType.OPEN_DOCUMENT_TEXT:
+            return "odt";
+        case MimeType.OFFICE_DOCUMENT:
+            return "docx";
        default:
            return null;
        }
    }
+    
+    /**
+     * Refine the MIME type of files first detected as plain ZIP archives (ODT and DOCX documents).
+     * It's more costly than the simple header check, because the entries of the archive have to be read.
+     * The input stream is reset afterwards (on a best effort basis in case of error).
+     * 
+     * @param file File 
+     * @param inputStream Input stream, NOTE(review): closed along with the archive reader, so it presumably
+     *                    has to stay usable after close() (e.g. a ByteArrayInputStream) - confirm with callers
+     * @return ODT or DOCX MIME type if recognized, MIME type of the file otherwise
+     */
+    public static String guessOpenDocumentFormat(File file, InputStream inputStream) {
+        if (!MimeType.APPLICATION_ZIP.equals(file.getMimeType())) {
+            // open document formats are ZIP files
+            return file.getMimeType();
+        }
+        
+        String mimeType = file.getMimeType();
+        try (ZipArchiveInputStream archiveInputStream = new ZipArchiveInputStream(inputStream, Charsets.ISO_8859_1.name())) {
+            ArchiveEntry archiveEntry = archiveInputStream.getNextEntry();
+            while (archiveEntry != null) {
+                if (archiveEntry.getName().equals("mimetype")) {
+                    // Maybe it's an ODT file
+                    String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1);
+                    if (MimeType.OPEN_DOCUMENT_TEXT.equals(content.trim())) {
+                        mimeType = MimeType.OPEN_DOCUMENT_TEXT;
+                        break;
+                    }
+                } else if (archiveEntry.getName().equals("[Content_Types].xml")) {
+                    // Maybe it's a DOCX file
+                    String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1);
+                    if (content.contains(MimeType.OFFICE_DOCUMENT)) {
+                        mimeType = MimeType.OFFICE_DOCUMENT;
+                        break;
+                    }
+                }
+    
+                archiveEntry = archiveInputStream.getNextEntry();
+            }
+            
+            inputStream.reset();
+        } catch (Exception e) {
+            // In case of any error, just give up and keep the ZIP MIME type.
+            // The stream may have been partially consumed: rewind it on a best effort basis,
+            // otherwise a caller reading it afterwards would only get the remaining bytes.
+            try {
+                inputStream.reset();
+            } catch (Exception resetException) {
+                // The stream cannot be rewound, there is nothing more to do here
+            }
+            return file.getMimeType();
+        }
+        
+        return mimeType;
+    }
 }
--- a/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java
+++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java
@ -18,7 +18,6 @@ import com.google.common.io.ByteStreams;
 * @author bgamard
 */
 public class TestEncryptUtil {
-
    /**
     * Test private key.
     */
--- a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java
+++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java
@ -0,0 +1,57 @@
+package com.sismics.docs.core.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import org.apache.pdfbox.io.IOUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.google.common.io.Resources;
+import com.sismics.docs.core.model.jpa.File;
+import com.sismics.util.mime.MimeType;
+
+/**
+ * Test of the file entity utilities.
+ * 
+ * @author bgamard
+ */
+public class TestFileUtil {
+    /**
+     * Convert an ODT document to PDF and check the text extracted from it.
+     * 
+     * @throws Exception
+     */
+    @Test
+    public void extractContentOpenDocumentTextTest() throws Exception {
+        try (InputStream inputStream = Resources.getResource("file/document.odt").openStream();
+                InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
+            File file = new File();
+            file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
+            try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) {
+                // Pass the in-memory copy: the resource stream has already been fully read
+                String content = FileUtil.extractContent(null, file, bytesInputStream, pdfInputStream);
+                Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
+            }
+        }
+    }
+    
+    /**
+     * Convert a DOCX document to PDF and check the text extracted from it.
+     * 
+     * @throws Exception
+     */
+    @Test
+    public void extractContentOfficeDocumentTest() throws Exception {
+        try (InputStream inputStream = Resources.getResource("file/document.docx").openStream();
+                InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
+            File file = new File();
+            file.setMimeType(MimeType.OFFICE_DOCUMENT);
+            try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) {
+                // Pass the in-memory copy: the resource stream has already been fully read
+                String content = FileUtil.extractContent(null, file, bytesInputStream, pdfInputStream);
+                Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
+            }
+        }
+    }
+}
--- a/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java
+++ b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java
@ -0,0 +1,40 @@
+package com.sismics.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import org.apache.commons.compress.utils.IOUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.google.common.io.Resources;
+import com.sismics.docs.core.model.jpa.File;
+import com.sismics.util.mime.MimeType;
+import com.sismics.util.mime.MimeTypeUtil;
+
+/**
+ * Test of the utilities to check MIME types.
+ * 
+ * @author bgamard
+ */
+public class TestMimeTypeUtil {
+    /** Checks that ODT and DOCX files flagged as plain ZIP get their specific MIME type. */
+    @Test
+    public void guessOpenDocumentFormatTest() throws Exception {
+        // Detect ODT files
+        try (InputStream inputStream = Resources.getResource("file/document.odt").openStream();
+                InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
+            File file = new File();
+            file.setMimeType(MimeType.APPLICATION_ZIP);
+            Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream));
+        }
+        
+        // Detect DOCX files
+        try (InputStream inputStream = Resources.getResource("file/document.docx").openStream();
+                InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
+            File file = new File();
+            file.setMimeType(MimeType.APPLICATION_ZIP);
+            Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream));
+        }
+    }
+}
--- a/docs-core/src/test/resources/file/document.docx
+++ b/docs-core/src/test/resources/file/document.docx
--- a/docs-core/src/test/resources/file/document.odt
+++ b/docs-core/src/test/resources/file/document.odt
--- a/docs-parent/pom.xml
+++ b/docs-parent/pom.xml
@ -36,6 +36,7 @@
    <org.hibernate.hibernate.version>4.1.0.Final</org.hibernate.hibernate.version>
    <javax.servlet.javax.servlet-api.version>3.1.0</javax.servlet.javax.servlet-api.version>
    <com.levigo.jbig2.levigo-jbig2-imageio.version>1.6.3</com.levigo.jbig2.levigo-jbig2-imageio.version>
+    <fr.opensagres.xdocreport.version>1.0.5</fr.opensagres.xdocreport.version>

    <org.eclipse.jetty.jetty-server.version>9.2.13.v20150730</org.eclipse.jetty.jetty-server.version>
    <org.eclipse.jetty.jetty-webapp.version>9.2.13.v20150730</org.eclipse.jetty.jetty-webapp.version>
@ -367,6 +368,18 @@
        <version>${org.bouncycastle.bcprov-jdk15on.version}</version>
      </dependency>
      
+      <dependency>
+	      <groupId>fr.opensagres.xdocreport</groupId>
+	      <artifactId>org.odftoolkit.odfdom.converter.pdf</artifactId>
+	      <version>${fr.opensagres.xdocreport.version}</version>
+	    </dependency>
+	
+	    <dependency>
+	      <groupId>fr.opensagres.xdocreport</groupId>
+	      <artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
+	      <version>${fr.opensagres.xdocreport.version}</version>
+	    </dependency>
+      
      <!-- Used to read JBIG2 images. See https://github.com/sismics/docs/issues/38 -->
      <dependency>
        <groupId>com.levigo.jbig2</groupId>
--- a/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java
+++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/FileResource.java
@ -146,8 +146,14 @@ public class FileResource extends BaseResource {
            file.setUserId(principal.getId());
            String fileId = fileDao.create(file);
            
+            // Guess the mime type a second time, for open document format (first detected as simple ZIP file)
+            file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileInputStream));
+            
+            // Convert to PDF if necessary (for thumbnail and text extraction)
+            InputStream pdfIntputStream = FileUtil.convertToPdf(fileInputStream, file);
+            
            // Save the file
-            FileUtil.save(fileInputStream, file, user.getPrivateKey());
+            FileUtil.save(fileInputStream, pdfIntputStream, file, user.getPrivateKey());
            
            // Update the user quota
            user.setStorageCurrent(user.getStorageCurrent() + fileData.length);
@ -159,6 +165,7 @@ public class FileResource extends BaseResource {
                fileCreatedAsyncEvent.setDocument(document);
                fileCreatedAsyncEvent.setFile(file);
                fileCreatedAsyncEvent.setInputStream(fileInputStream);
+                fileCreatedAsyncEvent.setPdfInputStream(pdfIntputStream);
                AppContext.getInstance().getAsyncEventBus().post(fileCreatedAsyncEvent);
            }

--- a/docs-web/src/main/webapp/src/app/docs/filter/Filesize.js
+++ b/docs-web/src/main/webapp/src/app/docs/filter/Filesize.js
@ -0,0 +1,18 @@
+'use strict';
+
+/**
+ * Format a size in bytes as a rounded kB or MB string (decimal units), or '' if it is empty or not a number.
+ */
+angular.module('docs').filter('filesize', function() {
+  return function(text) {
+    var size = parseInt(text, 10);
+    if (!text || isNaN(size)) {
+      return '';
+    }
+
+    if (size >= 1000000) { // 1MB and above
+      return Math.round(size / 1000000) + 'MB';
+    }
+    return Math.round(size / 1000) + 'kB';
+  }
+});
--- a/docs-web/src/main/webapp/src/app/docs/filter/Newline.js
+++ b/docs-web/src/main/webapp/src/app/docs/filter/Newline.js
@ -1,7 +1,7 @@
 'use strict';

 /**
- * Filter converting new lines in <br />
+ * Filter converting new lines into <br />.
 */
 angular.module('docs').filter('newline', function() {
  return function(text) {
@ -10,4 +10,4 @@ angular.module('docs').filter('newline', function() {
    }
    return text.replace(/\n/g, '<br/>');
  }
-})
+});
--- a/docs-web/src/main/webapp/src/app/docs/filter/Shorten.js
+++ b/docs-web/src/main/webapp/src/app/docs/filter/Shorten.js
@ -10,4 +10,4 @@ angular.module('docs').filter('shorten', function() {
    }
    return text.substring(0, 1).toUpperCase();
  }
-})
+});
--- a/docs-web/src/main/webapp/src/app/share/filter/Filesize.js
+++ b/docs-web/src/main/webapp/src/app/share/filter/Filesize.js
@ -0,0 +1,18 @@
+'use strict';
+
+/**
+ * Format a size in bytes as a rounded kB or MB string (decimal units), or '' if it is empty or not a number.
+ */
+angular.module('share').filter('filesize', function() {
+  return function(text) {
+    var size = parseInt(text, 10);
+    if (!text || isNaN(size)) {
+      return '';
+    }
+
+    if (size >= 1000000) { // 1MB and above
+      return Math.round(size / 1000000) + 'MB';
+    }
+    return Math.round(size / 1000) + 'kB';
+  }
+});
--- a/docs-web/src/main/webapp/src/app/share/filter/Newline.js
+++ b/docs-web/src/main/webapp/src/app/share/filter/Newline.js
@ -1,7 +1,7 @@
 'use strict';

 /**
- * Filter converting new lines in <br />
+ * Filter converting new lines into <br />.
 */
 angular.module('share').filter('newline', function() {
  return function(text) {
@ -10,4 +10,4 @@ angular.module('share').filter('newline', function() {
    }
    return text.replace(/\n/g, '<br/>');
  }
-})
+});
--- a/docs-web/src/main/webapp/src/index.html
+++ b/docs-web/src/main/webapp/src/index.html
@ -63,6 +63,7 @@
    <script src="app/docs/service/Tag.js" type="text/javascript"></script>
    <script src="app/docs/filter/Newline.js" type="text/javascript"></script>
    <script src="app/docs/filter/Shorten.js" type="text/javascript"></script>
+    <script src="app/docs/filter/Filesize.js" type="text/javascript"></script>
    <script src="app/docs/directive/File.js" type="text/javascript"></script>
    <script src="app/docs/directive/SelectTag.js" type="text/javascript"></script>
    <script src="app/docs/directive/AuditLog.js" type="text/javascript"></script>
--- a/docs-web/src/main/webapp/src/partial/docs/document.default.html
+++ b/docs-web/src/main/webapp/src/partial/docs/document.default.html
@ -8,7 +8,7 @@
      <div class="col-xs-6 col-sm-4 col-md-3 col-lg-2 text-center" ng-repeat="file in files">
        <div class="thumbnail" ng-class="{ 'thumbnail-checked': file.checked }" ng-if="file.id">
          <a ng-click="openFile(file)">
-            <img class="thumbnail-file" ng-src="../api/file/{{ file.id }}/data?size=thumb" tooltip="{{ file.mimetype }}" tooltip-placement="top" />
+            <img class="thumbnail-file" ng-src="../api/file/{{ file.id }}/data?size=thumb" tooltip="{{ file.mimetype }} | {{ file.size | filesize }}" tooltip-placement="top" />
          </a>
          <div class="caption pointer" ng-click="file.checked = !file.checked">
            <div class="pull-left">
--- a/docs-web/src/main/webapp/src/partial/docs/document.edit.html
+++ b/docs-web/src/main/webapp/src/partial/docs/document.edit.html
@ -39,7 +39,8 @@
      <label class="col-sm-2 control-label" for="inputFiles">New files</label>
      <div class="col-sm-6">
        <file class="form-control" id="inputFiles" multiple="multiple" ng-model="newFiles"
-          accept="image/png,image/jpg,image/jpeg,image/gif,application/pdf" ng-disabled="fileIsUploading"></file>
+          accept="image/png,image/jpg,image/jpeg,image/gif,application/pdf,application/vnd.oasis.opendocument.text,application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+          ng-disabled="fileIsUploading"></file>
      </div>
      <div class="col-sm-4" ng-if="orphanFiles.length > 0">
        + {{ orphanFiles.length }} file{{ orphanFiles.length > 1 ? 's' : '' }}
--- a/docs-web/src/main/webapp/src/partial/docs/document.view.content.html
+++ b/docs-web/src/main/webapp/src/partial/docs/document.view.content.html
@ -6,7 +6,7 @@
    <div class="col-xs-6 col-sm-4 col-md-4 col-lg-3 text-center" ng-repeat="file in files">
      <div class="thumbnail" ng-if="file.id">
        <a ng-click="openFile(file)">
-          <img class="thumbnail-file" ng-src="../api/file/{{ file.id }}/data?size=thumb" tooltip="{{ file.mimetype }}" tooltip-placement="top" />
+          <img class="thumbnail-file" ng-src="../api/file/{{ file.id }}/data?size=thumb" tooltip="{{ file.mimetype }} | {{ file.size | filesize }}" tooltip-placement="top" />
        </a>
        <div class="caption" ng-show="document.writable">
          <div class="pull-left">
--- a/docs-web/src/main/webapp/src/partial/share/share.html
+++ b/docs-web/src/main/webapp/src/partial/share/share.html
@ -18,7 +18,9 @@
      <div class="col-xs-6 col-sm-4 col-md-3 col-lg-2 text-center" ng-repeat="file in files">
        <div class="thumbnail">
          <a ng-click="openFile(file)">
-            <img class="thumbnail-file" ng-src="../api/file/{{ file.id }}/data?size=thumb&share={{ $stateParams.shareId }}" tooltip="{{ file.mimetype }}" tooltip-placement="top" />
+            <img class="thumbnail-file"
+                 ng-src="../api/file/{{ file.id }}/data?size=thumb&share={{ $stateParams.shareId }}"
+                 tooltip="{{ file.mimetype }} | {{ file.size | filesize }}" tooltip-placement="top" />
          </a>
        </div>
      </div>
--- a/docs-web/src/main/webapp/src/share.html
+++ b/docs-web/src/main/webapp/src/share.html
@ -37,6 +37,7 @@
    <script src="app/share/controller/FileView.js" type="text/javascript"></script>
    <script src="app/share/controller/FileModalView.js" type="text/javascript"></script>
    <script src="app/share/filter/Newline.js" type="text/javascript"></script>
+    <script src="app/share/filter/Filesize.js" type="text/javascript"></script>
    <!-- endref -->
  </head>
  <body>
--- a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java
+++ b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java
@ -267,6 +267,124 @@ public class TestDocumentResource extends BaseJerseyTest {
        return json.getJsonArray("documents").size();
    }
    
+    /**
+     * Test ODT extraction: upload an ODT file, find it through full text search and fetch its thumbnail.
+     * 
+     * @throws Exception if any step of the test fails
+     */
+    @Test
+    public void testOdtExtraction() throws Exception {
+        // Create the document_odt user and log in
+        clientUtil.createUser("document_odt");
+        String documentOdtToken = clientUtil.login("document_odt");
+
+        // Create a document to attach the file to
+        long create1Date = new Date().getTime();
+        JsonObject json = target().path("/document").request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken)
+                .put(Entity.form(new Form()
+                        .param("title", "My super title document 1")
+                        .param("description", "My super description for document 1")
+                        .param("language", "eng")
+                        .param("create_date", Long.toString(create1Date))), JsonObject.class);
+        String document1Id = json.getString("id");
+        Assert.assertNotNull(document1Id);
+        
+        // Add an ODT file to the document
+        String file1Id = null;
+        try (InputStream is = Resources.getResource("file/document.odt").openStream()) {
+            StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.odt");
+            try (FormDataMultiPart multiPart = new FormDataMultiPart()) {
+                json = target()
+                        .register(MultiPartFeature.class)
+                        .path("/file").request()
+                        .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken)
+                        .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart),
+                                MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class);
+                file1Id = json.getString("id");
+                Assert.assertNotNull(file1Id);
+            }
+        }
+        
+        // Search documents by query in full content: exactly one document must match "ipsum"
+        json = target().path("/document/list")
+                .queryParam("search", "full:ipsum")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken)
+                .get(JsonObject.class);
+        Assert.assertTrue(json.getJsonArray("documents").size() == 1);
+        
+        // Get the file thumbnail data and check that it is a non-empty JPEG
+        Response response = target().path("/file/" + file1Id + "/data")
+                .queryParam("size", "thumb")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken)
+                .get();
+        InputStream is = (InputStream) response.getEntity();
+        byte[] fileBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
+        Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes));
+    }
+    
+    /**
+     * Test DOCX extraction: upload a DOCX file, find it through full text search and fetch its thumbnail.
+     * 
+     * @throws Exception if any step of the test fails
+     */
+    @Test
+    public void testDocxExtraction() throws Exception {
+        // Create the document_docx user and log in
+        clientUtil.createUser("document_docx");
+        String documentDocxToken = clientUtil.login("document_docx");
+
+        // Create a document to attach the file to
+        long create1Date = new Date().getTime();
+        JsonObject json = target().path("/document").request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken)
+                .put(Entity.form(new Form()
+                        .param("title", "My super title document 1")
+                        .param("description", "My super description for document 1")
+                        .param("language", "eng")
+                        .param("create_date", Long.toString(create1Date))), JsonObject.class);
+        String document1Id = json.getString("id");
+        Assert.assertNotNull(document1Id);
+        
+        // Add a DOCX file to the document
+        String file1Id = null;
+        try (InputStream is = Resources.getResource("file/document.docx").openStream()) {
+            StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.docx");
+            try (FormDataMultiPart multiPart = new FormDataMultiPart()) {
+                json = target()
+                        .register(MultiPartFeature.class)
+                        .path("/file").request()
+                        .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken)
+                        .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart),
+                                MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class);
+                file1Id = json.getString("id");
+                Assert.assertNotNull(file1Id);
+            }
+        }
+        
+        // Search documents by query in full content: exactly one document must match "dolor"
+        json = target().path("/document/list")
+                .queryParam("search", "full:dolor")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken)
+                .get(JsonObject.class);
+        Assert.assertTrue(json.getJsonArray("documents").size() == 1);
+        
+        // Get the file thumbnail data and check that it is a non-empty JPEG
+        Response response = target().path("/file/" + file1Id + "/data")
+                .queryParam("size", "thumb")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken)
+                .get();
+        InputStream is = (InputStream) response.getEntity();
+        byte[] fileBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
+        Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes));
+    }
+    
    /**
     * Test PDF extraction.
     * 
@ -274,14 +392,14 @@ public class TestDocumentResource extends BaseJerseyTest {
     */
    @Test
    public void testPdfExtraction() throws Exception {
-        // Login document2
-        clientUtil.createUser("document2");
-        String document2Token = clientUtil.login("document2");
+        // Login document_pdf
+        clientUtil.createUser("document_pdf");
+        String documentPdfToken = clientUtil.login("document_pdf");

        // Create a document
        long create1Date = new Date().getTime();
        JsonObject json = target().path("/document").request()
-                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token)
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken)
                .put(Entity.form(new Form()
                        .param("title", "My super title document 1")
                        .param("description", "My super description for document 1")
@ -298,7 +416,7 @@ public class TestDocumentResource extends BaseJerseyTest {
                json = target()
                        .register(MultiPartFeature.class)
                        .path("/file").request()
-                        .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token)
+                        .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken)
                        .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart),
                                MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class);
                file1Id = json.getString("id");
@ -310,7 +428,7 @@ public class TestDocumentResource extends BaseJerseyTest {
        json = target().path("/document/list")
                .queryParam("search", "full:vrandecic")
                .request()
-                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token)
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken)
                .get(JsonObject.class);
        Assert.assertTrue(json.getJsonArray("documents").size() == 1);
        
@ -318,7 +436,7 @@ public class TestDocumentResource extends BaseJerseyTest {
        Response response = target().path("/file/" + file1Id + "/data")
                .queryParam("size", "thumb")
                .request()
-                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document2Token)
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken)
                .get();
        InputStream is = (InputStream) response.getEntity();
        byte[] fileBytes = ByteStreams.toByteArray(is);
--- a/docs-web/src/test/resources/file/document.docx
+++ b/docs-web/src/test/resources/file/document.docx
--- a/docs-web/src/test/resources/file/document.odt
+++ b/docs-web/src/test/resources/file/document.odt