mirror of
https://github.com/sismics/docs.git
synced 2024-11-22 14:07:55 +01:00
#53: Handle and extract text content from DOCX and ODT files
This commit is contained in:
parent
046984a447
commit
1a37d97a61
@ -117,6 +117,16 @@
|
|||||||
<groupId>com.levigo.jbig2</groupId>
|
<groupId>com.levigo.jbig2</groupId>
|
||||||
<artifactId>levigo-jbig2-imageio</artifactId>
|
<artifactId>levigo-jbig2-imageio</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>fr.opensagres.xdocreport</groupId>
|
||||||
|
<artifactId>org.odftoolkit.odfdom.converter.pdf</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>fr.opensagres.xdocreport</groupId>
|
||||||
|
<artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- OCR dependencies -->
|
<!-- OCR dependencies -->
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -142,6 +142,7 @@ public class FileDao {
|
|||||||
fileFromDb.setDocumentId(file.getDocumentId());
|
fileFromDb.setDocumentId(file.getDocumentId());
|
||||||
fileFromDb.setContent(file.getContent());
|
fileFromDb.setContent(file.getContent());
|
||||||
fileFromDb.setOrder(file.getOrder());
|
fileFromDb.setOrder(file.getOrder());
|
||||||
|
fileFromDb.setMimeType(file.getMimeType());
|
||||||
|
|
||||||
return file;
|
return file;
|
||||||
}
|
}
|
||||||
|
@ -12,6 +12,7 @@ import com.sismics.docs.core.event.FileCreatedAsyncEvent;
|
|||||||
import com.sismics.docs.core.model.jpa.File;
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
import com.sismics.docs.core.util.FileUtil;
|
import com.sismics.docs.core.util.FileUtil;
|
||||||
import com.sismics.docs.core.util.TransactionUtil;
|
import com.sismics.docs.core.util.TransactionUtil;
|
||||||
|
import com.sismics.util.mime.MimeTypeUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Listener on file created.
|
* Listener on file created.
|
||||||
@ -36,20 +37,23 @@ public class FileCreatedAsyncListener {
|
|||||||
log.info("File created event: " + fileCreatedAsyncEvent.toString());
|
log.info("File created event: " + fileCreatedAsyncEvent.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
// OCR the file
|
// Guess the mime type a second time, for open document format (first detected as simple ZIP file)
|
||||||
final File file = fileCreatedAsyncEvent.getFile();
|
final File file = fileCreatedAsyncEvent.getFile();
|
||||||
|
file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileCreatedAsyncEvent.getInputStream()));
|
||||||
|
|
||||||
|
// Extract text content from the file
|
||||||
long startTime = System.currentTimeMillis();
|
long startTime = System.currentTimeMillis();
|
||||||
final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, fileCreatedAsyncEvent.getInputStream());
|
final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, fileCreatedAsyncEvent.getInputStream());
|
||||||
fileCreatedAsyncEvent.getInputStream().close();
|
fileCreatedAsyncEvent.getInputStream().close();
|
||||||
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
|
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
|
||||||
|
|
||||||
// Store the OCR-ization result in the database
|
// Store the text content in the database
|
||||||
TransactionUtil.handle(new Runnable() {
|
TransactionUtil.handle(new Runnable() {
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
FileDao fileDao = new FileDao();
|
FileDao fileDao = new FileDao();
|
||||||
if (fileDao.getById(file.getId()) == null) {
|
if (fileDao.getById(file.getId()) == null) {
|
||||||
// The file has been deleted since the OCR-ization started, ignore the result
|
// The file has been deleted since the text extraction started, ignore the result
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,9 +15,13 @@ import javax.imageio.ImageIO;
|
|||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||||
import org.apache.pdfbox.text.PDFTextStripper;
|
import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
|
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||||
import org.imgscalr.Scalr;
|
import org.imgscalr.Scalr;
|
||||||
import org.imgscalr.Scalr.Method;
|
import org.imgscalr.Scalr.Method;
|
||||||
import org.imgscalr.Scalr.Mode;
|
import org.imgscalr.Scalr.Mode;
|
||||||
|
import org.odftoolkit.odfdom.converter.pdf.PdfConverter;
|
||||||
|
import org.odftoolkit.odfdom.converter.pdf.PdfOptions;
|
||||||
|
import org.odftoolkit.odfdom.doc.OdfTextDocument;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -53,6 +57,10 @@ public class FileUtil {
|
|||||||
content = ocrFile(inputStream, document);
|
content = ocrFile(inputStream, document);
|
||||||
} else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
|
} else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
|
||||||
content = extractPdf(inputStream);
|
content = extractPdf(inputStream);
|
||||||
|
} else if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
|
||||||
|
content = extractOpenDocumentText(inputStream);
|
||||||
|
} else if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
|
||||||
|
content = extractOfficeDocument(inputStream);
|
||||||
}
|
}
|
||||||
|
|
||||||
return content;
|
return content;
|
||||||
@ -120,6 +128,76 @@ public class FileUtil {
|
|||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract text from an open document text file.
|
||||||
|
*
|
||||||
|
* @param inputStream Unencrypted input stream
|
||||||
|
* @return Content extracted
|
||||||
|
*/
|
||||||
|
private static String extractOpenDocumentText(InputStream inputStream) {
|
||||||
|
String content = null;
|
||||||
|
Path tempFile = null;
|
||||||
|
try {
|
||||||
|
// Convert the ODT file to a temporary PDF file
|
||||||
|
tempFile = Files.createTempFile("sismicsdocs_", ".pdf");
|
||||||
|
try (OutputStream out = Files.newOutputStream(tempFile)) {
|
||||||
|
OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
|
||||||
|
PdfOptions options = PdfOptions.create();
|
||||||
|
PdfConverter.getInstance().convert(document, out, options);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract content from the PDF file
|
||||||
|
try (InputStream pdfInputStream = Files.newInputStream(tempFile)) {
|
||||||
|
content = extractPdf(pdfInputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("Error while extracting text from the ODT", e);
|
||||||
|
} finally {
|
||||||
|
try {
|
||||||
|
Files.delete(tempFile); // Delete the temporary PDF file
|
||||||
|
} catch (IOException e) {
|
||||||
|
// Should not happen
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return content;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract text from an Office document.
|
||||||
|
*
|
||||||
|
* @param inputStream Unencrypted input stream
|
||||||
|
* @return Content extracted
|
||||||
|
*/
|
||||||
|
private static String extractOfficeDocument(InputStream inputStream) {
|
||||||
|
String content = null;
|
||||||
|
Path tempFile = null;
|
||||||
|
try {
|
||||||
|
// Convert the DOCX file to a temporary PDF file
|
||||||
|
tempFile = Files.createTempFile("sismicsdocs_", ".pdf");
|
||||||
|
try (OutputStream out = Files.newOutputStream(tempFile)) {
|
||||||
|
XWPFDocument document = new XWPFDocument(inputStream);
|
||||||
|
org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
|
||||||
|
org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, out, options);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract content from the PDF file
|
||||||
|
try (InputStream pdfInputStream = Files.newInputStream(tempFile)) {
|
||||||
|
content = extractPdf(pdfInputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("Error while extracting text from the DOCX", e);
|
||||||
|
} finally {
|
||||||
|
try {
|
||||||
|
Files.delete(tempFile); // Delete the temporary PDF file
|
||||||
|
} catch (IOException e) {
|
||||||
|
// Should not happen
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return content;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Save a file on the storage filesystem.
|
* Save a file on the storage filesystem.
|
||||||
*
|
*
|
||||||
@ -162,6 +240,7 @@ public class FileUtil {
|
|||||||
pdfDocument.close();
|
pdfDocument.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// TODO Generate thumbnails for DOCX/ODT documents (guess the MIME type earlier and build a PDF version now?)
|
||||||
|
|
||||||
if (image != null) {
|
if (image != null) {
|
||||||
// Generate thumbnails from image
|
// Generate thumbnails from image
|
||||||
|
@ -18,4 +18,8 @@ public class MimeType {
|
|||||||
public static final String APPLICATION_ZIP = "application/zip";
|
public static final String APPLICATION_ZIP = "application/zip";
|
||||||
|
|
||||||
public static final String APPLICATION_PDF = "application/pdf";
|
public static final String APPLICATION_PDF = "application/pdf";
|
||||||
|
|
||||||
|
public static final String OPEN_DOCUMENT_TEXT = "application/vnd.oasis.opendocument.text";
|
||||||
|
|
||||||
|
public static final String OFFICE_DOCUMENT = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,13 @@ package com.sismics.util.mime;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
|
||||||
|
import org.apache.commons.compress.archivers.ArchiveEntry;
|
||||||
|
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
|
||||||
|
import org.apache.commons.compress.utils.IOUtils;
|
||||||
|
|
||||||
|
import com.google.common.base.Charsets;
|
||||||
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utility to check MIME types.
|
* Utility to check MIME types.
|
||||||
*
|
*
|
||||||
@ -77,8 +84,59 @@ public class MimeTypeUtil {
|
|||||||
return "ico";
|
return "ico";
|
||||||
case MimeType.APPLICATION_PDF:
|
case MimeType.APPLICATION_PDF:
|
||||||
return "pdf";
|
return "pdf";
|
||||||
|
case MimeType.OPEN_DOCUMENT_TEXT:
|
||||||
|
return "odt";
|
||||||
|
case MimeType.OFFICE_DOCUMENT:
|
||||||
|
return "docx";
|
||||||
default:
|
default:
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Guess the MIME type of open document formats (docx and odt).
|
||||||
|
* It's more costly than the simple header check, but needed because open document formats
|
||||||
|
* are simple ZIP files on the outside and much bigger on the inside.
|
||||||
|
*
|
||||||
|
* @param file File
|
||||||
|
* @param inputStream Input stream
|
||||||
|
* @return MIME type
|
||||||
|
*/
|
||||||
|
public static String guessOpenDocumentFormat(File file, InputStream inputStream) {
|
||||||
|
if (!MimeType.APPLICATION_ZIP.equals(file.getMimeType())) {
|
||||||
|
// open document formats are ZIP files
|
||||||
|
return file.getMimeType();
|
||||||
|
}
|
||||||
|
|
||||||
|
String mimeType = file.getMimeType();
|
||||||
|
try (ZipArchiveInputStream archiveInputStream = new ZipArchiveInputStream(inputStream, Charsets.ISO_8859_1.name())) {
|
||||||
|
ArchiveEntry archiveEntry = archiveInputStream.getNextEntry();
|
||||||
|
while (archiveEntry != null) {
|
||||||
|
if (archiveEntry.getName().equals("mimetype")) {
|
||||||
|
// Maybe it's an ODT file
|
||||||
|
String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1);
|
||||||
|
if (MimeType.OPEN_DOCUMENT_TEXT.equals(content.trim())) {
|
||||||
|
mimeType = MimeType.OPEN_DOCUMENT_TEXT;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else if (archiveEntry.getName().equals("[Content_Types].xml")) {
|
||||||
|
// Maybe it's a DOCX file
|
||||||
|
String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1);
|
||||||
|
if (content.contains(MimeType.OFFICE_DOCUMENT)) {
|
||||||
|
mimeType = MimeType.OFFICE_DOCUMENT;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
archiveEntry = archiveInputStream.getNextEntry();
|
||||||
|
}
|
||||||
|
|
||||||
|
inputStream.reset();
|
||||||
|
} catch (Exception e) {
|
||||||
|
// In case of any error, just give up and keep the ZIP MIME type
|
||||||
|
return file.getMimeType();
|
||||||
|
}
|
||||||
|
|
||||||
|
return mimeType;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,6 @@ import com.google.common.io.ByteStreams;
|
|||||||
* @author bgamard
|
* @author bgamard
|
||||||
*/
|
*/
|
||||||
public class TestEncryptUtil {
|
public class TestEncryptUtil {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test private key.
|
* Test private key.
|
||||||
*/
|
*/
|
||||||
|
@ -0,0 +1,36 @@
|
|||||||
|
package com.sismics.docs.core.util;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import junit.framework.Assert;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import com.google.common.io.Resources;
|
||||||
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
|
import com.sismics.util.mime.MimeType;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test of the file entity utilities.
|
||||||
|
*
|
||||||
|
* @author bgamard
|
||||||
|
*/
|
||||||
|
public class TestFileUtil {
|
||||||
|
@Test
|
||||||
|
public void extractContentOpenDocumentTextTest() throws Exception {
|
||||||
|
try (InputStream inputStream = Resources.getResource("file/document.odt").openStream()) {
|
||||||
|
File file = new File();
|
||||||
|
file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
|
||||||
|
Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void extractContentOfficeDocumentTest() throws Exception {
|
||||||
|
try (InputStream inputStream = Resources.getResource("file/document.docx").openStream()) {
|
||||||
|
File file = new File();
|
||||||
|
file.setMimeType(MimeType.OFFICE_DOCUMENT);
|
||||||
|
Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,40 @@
|
|||||||
|
package com.sismics.util;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import org.apache.commons.compress.utils.IOUtils;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import com.google.common.io.Resources;
|
||||||
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
|
import com.sismics.util.mime.MimeType;
|
||||||
|
import com.sismics.util.mime.MimeTypeUtil;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test of the utilities to check MIME types.
|
||||||
|
*
|
||||||
|
* @author bgamard
|
||||||
|
*/
|
||||||
|
public class TestMimeTypeUtil {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void guessOpenDocumentFormatTest() throws Exception {
|
||||||
|
// Detect ODT files
|
||||||
|
try (InputStream inputStream = Resources.getResource("file/document.odt").openStream();
|
||||||
|
InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
|
||||||
|
File file = new File();
|
||||||
|
file.setMimeType(MimeType.APPLICATION_ZIP);
|
||||||
|
Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Detect DOCX files
|
||||||
|
try (InputStream inputStream = Resources.getResource("file/document.docx").openStream();
|
||||||
|
InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
|
||||||
|
File file = new File();
|
||||||
|
file.setMimeType(MimeType.APPLICATION_ZIP);
|
||||||
|
Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
BIN
docs-core/src/test/resources/file/document.docx
Normal file
BIN
docs-core/src/test/resources/file/document.docx
Normal file
Binary file not shown.
BIN
docs-core/src/test/resources/file/document.odt
Normal file
BIN
docs-core/src/test/resources/file/document.odt
Normal file
Binary file not shown.
@ -36,6 +36,7 @@
|
|||||||
<org.hibernate.hibernate.version>4.1.0.Final</org.hibernate.hibernate.version>
|
<org.hibernate.hibernate.version>4.1.0.Final</org.hibernate.hibernate.version>
|
||||||
<javax.servlet.javax.servlet-api.version>3.1.0</javax.servlet.javax.servlet-api.version>
|
<javax.servlet.javax.servlet-api.version>3.1.0</javax.servlet.javax.servlet-api.version>
|
||||||
<com.levigo.jbig2.levigo-jbig2-imageio.version>1.6.3</com.levigo.jbig2.levigo-jbig2-imageio.version>
|
<com.levigo.jbig2.levigo-jbig2-imageio.version>1.6.3</com.levigo.jbig2.levigo-jbig2-imageio.version>
|
||||||
|
<fr.opensagres.xdocreport.version>1.0.5</fr.opensagres.xdocreport.version>
|
||||||
|
|
||||||
<org.eclipse.jetty.jetty-server.version>9.2.13.v20150730</org.eclipse.jetty.jetty-server.version>
|
<org.eclipse.jetty.jetty-server.version>9.2.13.v20150730</org.eclipse.jetty.jetty-server.version>
|
||||||
<org.eclipse.jetty.jetty-webapp.version>9.2.13.v20150730</org.eclipse.jetty.jetty-webapp.version>
|
<org.eclipse.jetty.jetty-webapp.version>9.2.13.v20150730</org.eclipse.jetty.jetty-webapp.version>
|
||||||
@ -367,6 +368,18 @@
|
|||||||
<version>${org.bouncycastle.bcprov-jdk15on.version}</version>
|
<version>${org.bouncycastle.bcprov-jdk15on.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>fr.opensagres.xdocreport</groupId>
|
||||||
|
<artifactId>org.odftoolkit.odfdom.converter.pdf</artifactId>
|
||||||
|
<version>${fr.opensagres.xdocreport.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>fr.opensagres.xdocreport</groupId>
|
||||||
|
<artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
|
||||||
|
<version>${fr.opensagres.xdocreport.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- Used to read JBIG2 images. See https://github.com/sismics/docs/issues/38 -->
|
<!-- Used to read JBIG2 images. See https://github.com/sismics/docs/issues/38 -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.levigo.jbig2</groupId>
|
<groupId>com.levigo.jbig2</groupId>
|
||||||
|
Loading…
Reference in New Issue
Block a user