diff --git a/docs-core/pom.xml b/docs-core/pom.xml
index cf4d3719..3ad8e390 100644
--- a/docs-core/pom.xml
+++ b/docs-core/pom.xml
@@ -117,6 +117,16 @@
com.levigo.jbig2
levigo-jbig2-imageio
+
+
+ fr.opensagres.xdocreport
+ org.odftoolkit.odfdom.converter.pdf
+
+
+
+ fr.opensagres.xdocreport
+ org.apache.poi.xwpf.converter.pdf
+
diff --git a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java
index 28b7a92c..3aa3e310 100644
--- a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java
@@ -142,6 +142,7 @@ public class FileDao {
fileFromDb.setDocumentId(file.getDocumentId());
fileFromDb.setContent(file.getContent());
fileFromDb.setOrder(file.getOrder());
+ fileFromDb.setMimeType(file.getMimeType());
return file;
}
diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java
index 1b76cb0d..bda947ab 100644
--- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java
@@ -12,6 +12,7 @@ import com.sismics.docs.core.event.FileCreatedAsyncEvent;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.docs.core.util.TransactionUtil;
+import com.sismics.util.mime.MimeTypeUtil;
/**
* Listener on file created.
@@ -36,20 +37,23 @@ public class FileCreatedAsyncListener {
log.info("File created event: " + fileCreatedAsyncEvent.toString());
}
- // OCR the file
+ // Guess the mime type a second time, for open document format (first detected as simple ZIP file)
final File file = fileCreatedAsyncEvent.getFile();
+ file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileCreatedAsyncEvent.getInputStream()));
+
+ // Extract text content from the file
long startTime = System.currentTimeMillis();
final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, fileCreatedAsyncEvent.getInputStream());
fileCreatedAsyncEvent.getInputStream().close();
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
- // Store the OCR-ization result in the database
+ // Store the text content in the database
TransactionUtil.handle(new Runnable() {
@Override
public void run() {
FileDao fileDao = new FileDao();
if (fileDao.getById(file.getId()) == null) {
- // The file has been deleted since the OCR-ization started, ignore the result
+ // The file has been deleted since the text extraction started, ignore the result
return;
}
diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java
index d836e94b..951590c6 100644
--- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java
@@ -15,9 +15,13 @@ import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.imgscalr.Scalr;
import org.imgscalr.Scalr.Method;
import org.imgscalr.Scalr.Mode;
+import org.odftoolkit.odfdom.converter.pdf.PdfConverter;
+import org.odftoolkit.odfdom.converter.pdf.PdfOptions;
+import org.odftoolkit.odfdom.doc.OdfTextDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -53,6 +57,10 @@ public class FileUtil {
content = ocrFile(inputStream, document);
} else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
content = extractPdf(inputStream);
+ } else if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
+ content = extractOpenDocumentText(inputStream);
+ } else if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
+ content = extractOfficeDocument(inputStream);
}
return content;
@@ -120,6 +128,76 @@ public class FileUtil {
return content;
}
+    /**
+     * Extract text from an open document text file.
+     * The document is first converted to a temporary PDF file, then the text is
+     * read back from that PDF with extractPdf. The temporary file is removed before returning.
+     *
+     * @param inputStream Unencrypted input stream
+     * @return Content extracted, or null if the conversion or the extraction failed (the error is logged)
+     */
+    private static String extractOpenDocumentText(InputStream inputStream) {
+        String content = null;
+        Path tempFile = null;
+        try {
+            // Convert the ODT file to a temporary PDF file
+            tempFile = Files.createTempFile("sismicsdocs_", ".pdf");
+            try (OutputStream out = Files.newOutputStream(tempFile)) {
+                OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
+                PdfOptions options = PdfOptions.create();
+                PdfConverter.getInstance().convert(document, out, options);
+            }
+
+            // Extract content from the PDF file
+            try (InputStream pdfInputStream = Files.newInputStream(tempFile)) {
+                content = extractPdf(pdfInputStream);
+            }
+        } catch (Exception e) {
+            log.error("Error while extracting text from the ODT", e);
+        } finally {
+            // tempFile is still null when the temporary file creation itself failed:
+            // deleting it unguarded would throw a NullPointerException out of this method
+            if (tempFile != null) {
+                try {
+                    Files.deleteIfExists(tempFile); // Delete the temporary PDF file
+                } catch (IOException e) {
+                    log.warn("Unable to delete the temporary PDF file " + tempFile, e);
+                }
+            }
+        }
+        return content;
+    }
+
+    /**
+     * Extract text from an Office document.
+     * The document is first converted to a temporary PDF file, then the text is
+     * read back from that PDF with extractPdf. The temporary file is removed before returning.
+     *
+     * @param inputStream Unencrypted input stream
+     * @return Content extracted, or null if the conversion or the extraction failed (the error is logged)
+     */
+    private static String extractOfficeDocument(InputStream inputStream) {
+        String content = null;
+        Path tempFile = null;
+        try {
+            // Convert the DOCX file to a temporary PDF file
+            tempFile = Files.createTempFile("sismicsdocs_", ".pdf");
+            try (OutputStream out = Files.newOutputStream(tempFile)) {
+                XWPFDocument document = new XWPFDocument(inputStream);
+                // Fully qualified: the ODF converter classes of the same simple names are the imported ones
+                org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
+                org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, out, options);
+            }
+
+            // Extract content from the PDF file
+            try (InputStream pdfInputStream = Files.newInputStream(tempFile)) {
+                content = extractPdf(pdfInputStream);
+            }
+        } catch (Exception e) {
+            log.error("Error while extracting text from the DOCX", e);
+        } finally {
+            // tempFile is still null when the temporary file creation itself failed:
+            // deleting it unguarded would throw a NullPointerException out of this method
+            if (tempFile != null) {
+                try {
+                    Files.deleteIfExists(tempFile); // Delete the temporary PDF file
+                } catch (IOException e) {
+                    log.warn("Unable to delete the temporary PDF file " + tempFile, e);
+                }
+            }
+        }
+        return content;
+    }
+
/**
* Save a file on the storage filesystem.
*
@@ -162,6 +240,7 @@ public class FileUtil {
pdfDocument.close();
}
}
+ // TODO Generate thumbnails for DOCX/ODT documents (guess the MIME type earlier and build a PDF version now?)
if (image != null) {
// Generate thumbnails from image
diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
index 9e1d6916..e5821a62 100644
--- a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
+++ b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
@@ -18,4 +18,8 @@ public class MimeType {
public static final String APPLICATION_ZIP = "application/zip";
public static final String APPLICATION_PDF = "application/pdf";
+
+ public static final String OPEN_DOCUMENT_TEXT = "application/vnd.oasis.opendocument.text";
+
+ public static final String OFFICE_DOCUMENT = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
}
diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
index 76c49ba1..058f7f7d 100644
--- a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
+++ b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
@@ -3,6 +3,13 @@ package com.sismics.util.mime;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.utils.IOUtils;
+
+import com.google.common.base.Charsets;
+import com.sismics.docs.core.model.jpa.File;
+
/**
* Utility to check MIME types.
*
@@ -77,8 +84,59 @@ public class MimeTypeUtil {
return "ico";
case MimeType.APPLICATION_PDF:
return "pdf";
+ case MimeType.OPEN_DOCUMENT_TEXT:
+ return "odt";
+ case MimeType.OFFICE_DOCUMENT:
+ return "docx";
default:
return null;
}
}
+
+    /**
+     * Guess the MIME type of open document formats (docx and odt).
+     * It's more costly than the simple header check, but needed because these formats
+     * are plain ZIP archives on the outside: the actual type is only declared by entries
+     * stored inside the archive ("mimetype" for ODT, "[Content_Types].xml" for DOCX).
+     *
+     * The input stream is not closed by this method, and it is rewound with reset() once
+     * the inspection is over so that the caller can read it again.
+     *
+     * @param file File, only inspected if its current MIME type is ZIP
+     * @param inputStream Input stream on the file content
+     * @return Guessed MIME type, or the current MIME type of the file if it is not a ZIP,
+     *         if nothing more precise is found, or if reading or rewinding the stream fails
+     */
+    public static String guessOpenDocumentFormat(File file, InputStream inputStream) {
+        if (!MimeType.APPLICATION_ZIP.equals(file.getMimeType())) {
+            // open document formats are ZIP files
+            return file.getMimeType();
+        }
+
+        // Closing the archive reader also closes the stream it wraps. The caller keeps using
+        // its stream after this method, so give the reader a wrapper whose close() does nothing.
+        InputStream shieldedInputStream = new java.io.FilterInputStream(inputStream) {
+            @Override
+            public void close() {
+                // Intentionally empty: the caller owns the underlying stream
+            }
+        };
+
+        String mimeType = file.getMimeType();
+        try (ZipArchiveInputStream archiveInputStream = new ZipArchiveInputStream(shieldedInputStream, Charsets.ISO_8859_1.name())) {
+            ArchiveEntry archiveEntry = archiveInputStream.getNextEntry();
+            while (archiveEntry != null) {
+                if ("mimetype".equals(archiveEntry.getName())) {
+                    // Maybe it's an ODT file
+                    String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1);
+                    if (MimeType.OPEN_DOCUMENT_TEXT.equals(content.trim())) {
+                        mimeType = MimeType.OPEN_DOCUMENT_TEXT;
+                        break;
+                    }
+                } else if ("[Content_Types].xml".equals(archiveEntry.getName())) {
+                    // Maybe it's a DOCX file
+                    String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1);
+                    if (content.contains(MimeType.OFFICE_DOCUMENT)) {
+                        mimeType = MimeType.OFFICE_DOCUMENT;
+                        break;
+                    }
+                }
+
+                archiveEntry = archiveInputStream.getNextEntry();
+            }
+        } catch (Exception e) {
+            // In case of any error, just give up and keep the ZIP MIME type
+            mimeType = file.getMimeType();
+        }
+
+        try {
+            // Rewind on the error path as well, so the stream is never left half consumed
+            inputStream.reset();
+        } catch (Exception e) {
+            // The content cannot be read again, keep the ZIP MIME type
+            return file.getMimeType();
+        }
+
+        return mimeType;
+    }
}
diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java
index 2e161157..b6f54964 100644
--- a/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java
+++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java
@@ -18,7 +18,6 @@ import com.google.common.io.ByteStreams;
* @author bgamard
*/
public class TestEncryptUtil {
-
/**
* Test private key.
*/
diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java
new file mode 100644
index 00000000..c9ecdeeb
--- /dev/null
+++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java
@@ -0,0 +1,36 @@
+package com.sismics.docs.core.util;
+
+import java.io.InputStream;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+import com.google.common.io.Resources;
+import com.sismics.docs.core.model.jpa.File;
+import com.sismics.util.mime.MimeType;
+
+/**
+ * Tests of the text extraction done by the file utilities.
+ *
+ * @author bgamard
+ */
+public class TestFileUtil {
+    /**
+     * Text expected from both sample documents.
+     */
+    private static final String EXPECTED_CONTENT = "Lorem ipsum dolor sit amen.\r\n";
+
+    @Test
+    public void extractContentOpenDocumentTextTest() throws Exception {
+        Assert.assertEquals(EXPECTED_CONTENT, extractFromResource("file/document.odt", MimeType.OPEN_DOCUMENT_TEXT));
+    }
+
+    @Test
+    public void extractContentOfficeDocumentTest() throws Exception {
+        Assert.assertEquals(EXPECTED_CONTENT, extractFromResource("file/document.docx", MimeType.OFFICE_DOCUMENT));
+    }
+
+    /**
+     * Run the content extraction on a classpath resource.
+     *
+     * @param resourceName Name of the test resource
+     * @param mimeType MIME type set on the file entity
+     * @return Content extracted
+     */
+    private static String extractFromResource(String resourceName, String mimeType) throws Exception {
+        File file = new File();
+        file.setMimeType(mimeType);
+        try (InputStream inputStream = Resources.getResource(resourceName).openStream()) {
+            // No document is passed to the extraction, only the file and its content
+            return FileUtil.extractContent(null, file, inputStream);
+        }
+    }
+}
diff --git a/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java
new file mode 100644
index 00000000..b16e91c7
--- /dev/null
+++ b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java
@@ -0,0 +1,40 @@
+package com.sismics.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import org.apache.commons.compress.utils.IOUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.google.common.io.Resources;
+import com.sismics.docs.core.model.jpa.File;
+import com.sismics.util.mime.MimeType;
+import com.sismics.util.mime.MimeTypeUtil;
+
+/**
+ * Tests of the MIME type detection utilities.
+ *
+ * @author bgamard
+ */
+public class TestMimeTypeUtil {
+
+    @Test
+    public void guessOpenDocumentFormatTest() throws Exception {
+        // Detect ODT files
+        Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, guessFromZipResource("file/document.odt"));
+
+        // Detect DOCX files
+        Assert.assertEquals(MimeType.OFFICE_DOCUMENT, guessFromZipResource("file/document.docx"));
+    }
+
+    /**
+     * Guess the MIME type of a classpath resource first flagged as a simple ZIP file.
+     *
+     * @param resourceName Name of the test resource
+     * @return MIME type guessed
+     */
+    private static String guessFromZipResource(String resourceName) throws Exception {
+        // Load the resource in memory: the detection calls reset() on the stream it is given
+        byte[] data;
+        try (InputStream resourceStream = Resources.getResource(resourceName).openStream()) {
+            data = IOUtils.toByteArray(resourceStream);
+        }
+
+        File file = new File();
+        file.setMimeType(MimeType.APPLICATION_ZIP);
+        try (InputStream byteArrayInputStream = new ByteArrayInputStream(data)) {
+            return MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream);
+        }
+    }
+}
diff --git a/docs-core/src/test/resources/file/document.docx b/docs-core/src/test/resources/file/document.docx
new file mode 100644
index 00000000..fb1e6c2c
Binary files /dev/null and b/docs-core/src/test/resources/file/document.docx differ
diff --git a/docs-core/src/test/resources/file/document.odt b/docs-core/src/test/resources/file/document.odt
new file mode 100644
index 00000000..b7062de0
Binary files /dev/null and b/docs-core/src/test/resources/file/document.odt differ
diff --git a/docs-parent/pom.xml b/docs-parent/pom.xml
index 18d616e0..40da3594 100644
--- a/docs-parent/pom.xml
+++ b/docs-parent/pom.xml
@@ -36,6 +36,7 @@
4.1.0.Final
3.1.0
1.6.3
+ 1.0.5
9.2.13.v20150730
9.2.13.v20150730
@@ -367,6 +368,18 @@
${org.bouncycastle.bcprov-jdk15on.version}
+
+ fr.opensagres.xdocreport
+ org.odftoolkit.odfdom.converter.pdf
+ ${fr.opensagres.xdocreport.version}
+
+
+
+ fr.opensagres.xdocreport
+ org.apache.poi.xwpf.converter.pdf
+ ${fr.opensagres.xdocreport.version}
+
+
com.levigo.jbig2