diff --git a/docs-core/pom.xml b/docs-core/pom.xml index cf4d3719..3ad8e390 100644 --- a/docs-core/pom.xml +++ b/docs-core/pom.xml @@ -117,6 +117,16 @@ com.levigo.jbig2 levigo-jbig2-imageio + + + fr.opensagres.xdocreport + org.odftoolkit.odfdom.converter.pdf + + + + fr.opensagres.xdocreport + org.apache.poi.xwpf.converter.pdf + diff --git a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java index 28b7a92c..3aa3e310 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java +++ b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java @@ -142,6 +142,7 @@ public class FileDao { fileFromDb.setDocumentId(file.getDocumentId()); fileFromDb.setContent(file.getContent()); fileFromDb.setOrder(file.getOrder()); + fileFromDb.setMimeType(file.getMimeType()); return file; } diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java index 1b76cb0d..bda947ab 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java @@ -12,6 +12,7 @@ import com.sismics.docs.core.event.FileCreatedAsyncEvent; import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.util.FileUtil; import com.sismics.docs.core.util.TransactionUtil; +import com.sismics.util.mime.MimeTypeUtil; /** * Listener on file created. 
@@ -36,20 +37,23 @@ public class FileCreatedAsyncListener { log.info("File created event: " + fileCreatedAsyncEvent.toString()); } - // OCR the file + // Guess the mime type a second time, for open document format (first detected as simple ZIP file) final File file = fileCreatedAsyncEvent.getFile(); + file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileCreatedAsyncEvent.getInputStream())); + + // Extract text content from the file long startTime = System.currentTimeMillis(); final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, fileCreatedAsyncEvent.getInputStream()); fileCreatedAsyncEvent.getInputStream().close(); log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime)); - // Store the OCR-ization result in the database + // Store the text content in the database TransactionUtil.handle(new Runnable() { @Override public void run() { FileDao fileDao = new FileDao(); if (fileDao.getById(file.getId()) == null) { - // The file has been deleted since the OCR-ization started, ignore the result + // The file has been deleted since the text extraction started, ignore the result return; } diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index d836e94b..951590c6 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -15,9 +15,13 @@ import javax.imageio.ImageIO; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.imgscalr.Scalr; import org.imgscalr.Scalr.Method; import org.imgscalr.Scalr.Mode; +import org.odftoolkit.odfdom.converter.pdf.PdfConverter; +import org.odftoolkit.odfdom.converter.pdf.PdfOptions; +import 
org.odftoolkit.odfdom.doc.OdfTextDocument;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -53,6 +57,10 @@ public class FileUtil {
             content = ocrFile(inputStream, document);
         } else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
             content = extractPdf(inputStream);
+        } else if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
+            content = extractOpenDocumentText(inputStream);
+        } else if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
+            content = extractOfficeDocument(inputStream);
         }
         
         return content;
@@ -120,6 +128,89 @@ public class FileUtil {
         return content;
     }
     
+    /**
+     * Extract text from an open document text file.
+     * The document is converted to a temporary PDF file, from which the text is extracted.
+     * 
+     * @param inputStream Unencrypted input stream
+     * @return Content extracted, or null if an error occurred
+     */
+    private static String extractOpenDocumentText(InputStream inputStream) {
+        String content = null;
+        Path tempFile = null;
+        try {
+            // Convert the ODT file to a temporary PDF file
+            tempFile = Files.createTempFile("sismicsdocs_", ".pdf");
+            try (OutputStream out = Files.newOutputStream(tempFile)) {
+                OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
+                PdfOptions options = PdfOptions.create();
+                PdfConverter.getInstance().convert(document, out, options);
+            }
+            
+            // Extract content from the PDF file
+            try (InputStream pdfInputStream = Files.newInputStream(tempFile)) {
+                content = extractPdf(pdfInputStream);
+            }
+        } catch (Exception e) {
+            log.error("Error while extracting text from the ODT", e);
+        } finally {
+            // tempFile is still null here if its creation failed
+            deleteTemporaryFile(tempFile);
+        }
+        return content;
+    }
+    
+    /**
+     * Extract text from an Office document.
+     * The document is converted to a temporary PDF file, from which the text is extracted.
+     * 
+     * @param inputStream Unencrypted input stream
+     * @return Content extracted, or null if an error occurred
+     */
+    private static String extractOfficeDocument(InputStream inputStream) {
+        String content = null;
+        Path tempFile = null;
+        try {
+            // Convert the DOCX file to a temporary PDF file
+            tempFile = Files.createTempFile("sismicsdocs_", ".pdf");
+            try (OutputStream out = Files.newOutputStream(tempFile)) {
+                XWPFDocument document = new XWPFDocument(inputStream);
+                org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
+                org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, out, options);
+            }
+            
+            // Extract content from the PDF file
+            try (InputStream pdfInputStream = Files.newInputStream(tempFile)) {
+                content = extractPdf(pdfInputStream);
+            }
+        } catch (Exception e) {
+            log.error("Error while extracting text from the DOCX", e);
+        } finally {
+            // tempFile is still null here if its creation failed
+            deleteTemporaryFile(tempFile);
+        }
+        return content;
+    }
+    
+    /**
+     * Delete a temporary file, if it was created.
+     * A failure to delete is logged and never thrown, so that a cleanup done in a
+     * finally block cannot replace the result or the error of the extraction itself.
+     * 
+     * @param tempFile Temporary file to delete, null if it was never created
+     */
+    private static void deleteTemporaryFile(Path tempFile) {
+        if (tempFile == null) {
+            // The temporary file creation failed, there is nothing to delete
+            return;
+        }
+        try {
+            Files.deleteIfExists(tempFile);
+        } catch (IOException e) {
+            log.warn("Unable to delete the temporary file " + tempFile, e);
+        }
+    }
+    
     /**
      * Save a file on the storage filesystem.
      * 
@@ -162,6 +253,7 @@ public class FileUtil {
                     pdfDocument.close();
                 }
             }
+            // TODO Generate thumbnails for DOCX/ODT documents (guess the MIME type earlier and build a PDF version now?)
 
if (image != null) {
             // Generate thumbnails from image
diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
index 9e1d6916..e5821a62 100644
--- a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
+++ b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
@@ -18,4 +18,8 @@ public class MimeType {
     public static final String APPLICATION_ZIP = "application/zip";
     
     public static final String APPLICATION_PDF = "application/pdf";
+    
+    public static final String OPEN_DOCUMENT_TEXT = "application/vnd.oasis.opendocument.text";
+    
+    public static final String OFFICE_DOCUMENT = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
 }
diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
index 76c49ba1..058f7f7d 100644
--- a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
+++ b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
@@ -3,6 +3,13 @@ package com.sismics.util.mime;
 import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.utils.IOUtils;
+
+import com.google.common.base.Charsets;
+import com.sismics.docs.core.model.jpa.File;
+
 /**
  * Utility to check MIME types.
  * 
@@ -77,8 +84,79 @@ public class MimeTypeUtil {
                 return "ico";
             case MimeType.APPLICATION_PDF:
                 return "pdf";
+            case MimeType.OPEN_DOCUMENT_TEXT:
+                return "odt";
+            case MimeType.OFFICE_DOCUMENT:
+                return "docx";
             default:
                 return null;
         }
     }
+    
+    /**
+     * Guess the MIME type of open document formats (docx and odt).
+     * It's more costly than the simple header check, but needed because open document formats
+     * are simple ZIP files on the outside and much bigger on the inside.
+     * 
+     * The input stream is left open and rewound to the position it had when this method was
+     * called, so that the caller can read it again. A stream which doesn't support mark/reset
+     * is not read at all, and the current MIME type of the file is returned.
+     * 
+     * @param file File
+     * @param inputStream Input stream
+     * @return MIME type
+     */
+    public static String guessOpenDocumentFormat(File file, InputStream inputStream) {
+        if (!MimeType.APPLICATION_ZIP.equals(file.getMimeType())) {
+            // Open document formats are ZIP files
+            return file.getMimeType();
+        }
+        if (!inputStream.markSupported()) {
+            // The stream could not be rewound for the caller after being read, so don't consume it
+            return file.getMimeType();
+        }
+        
+        // Closing the archive stream closes the stream it wraps: shield the caller's stream from it
+        InputStream shieldedInputStream = new java.io.FilterInputStream(inputStream) {
+            @Override
+            public void close() {
+                // Left open on purpose, the caller still needs the stream
+            }
+        };
+        
+        String mimeType = file.getMimeType();
+        inputStream.mark(Integer.MAX_VALUE);
+        try {
+            try (ZipArchiveInputStream archiveInputStream = new ZipArchiveInputStream(shieldedInputStream, Charsets.ISO_8859_1.name())) {
+                ArchiveEntry archiveEntry = archiveInputStream.getNextEntry();
+                while (archiveEntry != null) {
+                    if (archiveEntry.getName().equals("mimetype")) {
+                        // Maybe it's an ODT file
+                        String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1);
+                        if (MimeType.OPEN_DOCUMENT_TEXT.equals(content.trim())) {
+                            mimeType = MimeType.OPEN_DOCUMENT_TEXT;
+                            break;
+                        }
+                    } else if (archiveEntry.getName().equals("[Content_Types].xml")) {
+                        // Maybe it's a DOCX file
+                        String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1);
+                        if (content.contains(MimeType.OFFICE_DOCUMENT)) {
+                            mimeType = MimeType.OFFICE_DOCUMENT;
+                            break;
+                        }
+                    }
+                    
+                    archiveEntry = archiveInputStream.getNextEntry();
+                }
+            } finally {
+                // Always rewind, even if the archive could not be read
+                inputStream.reset();
+            }
+        } catch (Exception e) {
+            // In case of any error, just give up and keep the ZIP MIME type
+            return file.getMimeType();
+        }
+        
+        return mimeType;
+    }
 }
diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java
index 2e161157..b6f54964 100644
--- a/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java
+++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestEncryptUtil.java
@@ -18,7 +18,6 @@ import com.google.common.io.ByteStreams;
  * @author bgamard
  */
 public class TestEncryptUtil {
-
     /**
      * Test private key.
*/
diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java
new file mode 100644
index 00000000..c9ecdeeb
--- /dev/null
+++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java
@@ -0,0 +1,51 @@
+package com.sismics.docs.core.util;
+
+import java.io.InputStream;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.google.common.io.Resources;
+import com.sismics.docs.core.model.jpa.File;
+import com.sismics.util.mime.MimeType;
+
+/**
+ * Test of the file entity utilities.
+ * 
+ * @author bgamard
+ */
+public class TestFileUtil {
+    /**
+     * Text contained in the test documents.
+     */
+    private static final String EXPECTED_CONTENT = "Lorem ipsum dolor sit amen.";
+    
+    @Test
+    public void extractContentOpenDocumentTextTest() throws Exception {
+        try (InputStream inputStream = Resources.getResource("file/document.odt").openStream()) {
+            File file = new File();
+            file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
+            assertExtractedContent(FileUtil.extractContent(null, file, inputStream));
+        }
+    }
+    
+    @Test
+    public void extractContentOfficeDocumentTest() throws Exception {
+        try (InputStream inputStream = Resources.getResource("file/document.docx").openStream()) {
+            File file = new File();
+            file.setMimeType(MimeType.OFFICE_DOCUMENT);
+            assertExtractedContent(FileUtil.extractContent(null, file, inputStream));
+        }
+    }
+    
+    /**
+     * Check the extracted content, ignoring the surrounding whitespace: the extracted text
+     * ends with a line separator, which was \r\n where these tests were first written.
+     * 
+     * @param content Content extracted from a test document
+     */
+    private static void assertExtractedContent(String content) {
+        Assert.assertNotNull(content);
+        Assert.assertEquals(EXPECTED_CONTENT, content.trim());
+    }
+}
diff --git a/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java
new file mode 100644
index 00000000..b16e91c7
--- /dev/null
+++ b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java
@@ -0,0 +1,46 @@
+package com.sismics.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import org.apache.commons.compress.utils.IOUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.google.common.io.Resources;
+import com.sismics.docs.core.model.jpa.File;
+import com.sismics.util.mime.MimeType;
+import com.sismics.util.mime.MimeTypeUtil;
+
+/**
+ * Test of the utilities to check MIME types.
+ * 
+ * @author bgamard
+ */
+public class TestMimeTypeUtil {
+
+    @Test
+    public void guessOpenDocumentFormatTest() throws Exception {
+        // Detect ODT files
+        try (InputStream inputStream = Resources.getResource("file/document.odt").openStream();
+                InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
+            File file = new File();
+            file.setMimeType(MimeType.APPLICATION_ZIP);
+            int available = byteArrayInputStream.available();
+            Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream));
+            // The stream must be rewound, so that it can be read again after the detection
+            Assert.assertEquals(available, byteArrayInputStream.available());
+        }
+
+        // Detect DOCX files
+        try (InputStream inputStream = Resources.getResource("file/document.docx").openStream();
+                InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
+            File file = new File();
+            file.setMimeType(MimeType.APPLICATION_ZIP);
+            int available = byteArrayInputStream.available();
+            Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream));
+            // The stream must be rewound, so that it can be read again after the detection
+            Assert.assertEquals(available, byteArrayInputStream.available());
+        }
+    }
+}
diff --git a/docs-core/src/test/resources/file/document.docx b/docs-core/src/test/resources/file/document.docx
new file mode 100644
index 00000000..fb1e6c2c
Binary files /dev/null and b/docs-core/src/test/resources/file/document.docx differ
diff --git a/docs-core/src/test/resources/file/document.odt b/docs-core/src/test/resources/file/document.odt
new file mode 100644
index 00000000..b7062de0
Binary files /dev/null and b/docs-core/src/test/resources/file/document.odt differ
diff --git a/docs-parent/pom.xml b/docs-parent/pom.xml
index 18d616e0..40da3594 100644
--- a/docs-parent/pom.xml
+++ b/docs-parent/pom.xml
@@ -36,6 +36,7 @@
 4.1.0.Final
 3.1.0
 1.6.3
+ 1.0.5
 9.2.13.v20150730
 9.2.13.v20150730
@@ -367,6 +368,18 @@
 ${org.bouncycastle.bcprov-jdk15on.version}
+
+ fr.opensagres.xdocreport
+ org.odftoolkit.odfdom.converter.pdf
+ ${fr.opensagres.xdocreport.version}
+
+
+ fr.opensagres.xdocreport
+
org.apache.poi.xwpf.converter.pdf + ${fr.opensagres.xdocreport.version} + + com.levigo.jbig2