diff --git a/docs-core/pom.xml b/docs-core/pom.xml index 0bf5cf36..3b6e8fc6 100644 --- a/docs-core/pom.xml +++ b/docs-core/pom.xml @@ -117,6 +117,11 @@ imgscalr-lib + + org.apache.pdfbox + pdfbox + + jna diff --git a/docs-core/src/main/java/com/sismics/docs/core/event/OcrFileAsyncEvent.java b/docs-core/src/main/java/com/sismics/docs/core/event/ExtractFileAsyncEvent.java similarity index 77% rename from docs-core/src/main/java/com/sismics/docs/core/event/OcrFileAsyncEvent.java rename to docs-core/src/main/java/com/sismics/docs/core/event/ExtractFileAsyncEvent.java index c0926b9a..28dd6faa 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/event/OcrFileAsyncEvent.java +++ b/docs-core/src/main/java/com/sismics/docs/core/event/ExtractFileAsyncEvent.java @@ -3,11 +3,11 @@ package com.sismics.docs.core.event; import com.google.common.base.Objects; /** - * OCR all files in database event. + * Extract file content event. * * @author bgamard */ -public class OcrFileAsyncEvent { +public class ExtractFileAsyncEvent { @Override public String toString() { return Objects.toStringHelper(this) diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/OcrFileAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/ExtractFileAsyncListener.java similarity index 62% rename from docs-core/src/main/java/com/sismics/docs/core/listener/async/OcrFileAsyncListener.java rename to docs-core/src/main/java/com/sismics/docs/core/listener/async/ExtractFileAsyncListener.java index 1324bf39..8b532560 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/OcrFileAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/ExtractFileAsyncListener.java @@ -9,33 +9,33 @@ import org.slf4j.LoggerFactory; import com.google.common.eventbus.Subscribe; import com.sismics.docs.core.dao.jpa.DocumentDao; import com.sismics.docs.core.dao.jpa.FileDao; -import com.sismics.docs.core.event.OcrFileAsyncEvent; +import com.sismics.docs.core.event.ExtractFileAsyncEvent; import com.sismics.docs.core.model.jpa.Document; import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.util.FileUtil; import com.sismics.docs.core.util.TransactionUtil; /** - * Listener on OCR all files in database. + * Listener on extract content from all files. * * @author bgamard */ -public class OcrFileAsyncListener { +public class ExtractFileAsyncListener { /** * Logger. */ - private static final Logger log = LoggerFactory.getLogger(OcrFileAsyncListener.class); + private static final Logger log = LoggerFactory.getLogger(ExtractFileAsyncListener.class); /** - * OCR all files. + * Extract content from all files. * - * @param ocrFileAsyncEvent OCR all files in database event + * @param extractFileAsyncEvent Extract file content event * @throws Exception */ @Subscribe - public void on(final OcrFileAsyncEvent ocrFileAsyncEvent) throws Exception { + public void on(final ExtractFileAsyncEvent extractFileAsyncEvent) throws Exception { if (log.isInfoEnabled()) { - log.info("OCR all files in database event: " + ocrFileAsyncEvent.toString()); + log.info("Extract file content event: " + extractFileAsyncEvent.toString()); } TransactionUtil.handle(new Runnable() { @@ -47,10 +47,9 @@ public class OcrFileAsyncListener { for (File file : fileList) { long startTime = System.currentTimeMillis(); Document document = documentDao.getById(file.getDocumentId()); - String content = FileUtil.ocrFile(document, file); - file.setContent(content); + file.setContent(FileUtil.extractContent(document, file)); TransactionUtil.commit(); - log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime)); + log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime)); } } }); diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java index f01a9daa..6ea1a214 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java @@ -39,7 +39,7 @@ public class FileCreatedAsyncListener { // OCR the file final File file = fileCreatedAsyncEvent.getFile(); long startTime = System.currentTimeMillis(); - final String content = FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), file); + final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file); log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime)); // Store the OCR-ization result in the database diff --git a/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java b/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java index f95ec2e5..cf997a4b 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java +++ b/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java @@ -16,7 +16,7 @@ import com.sismics.docs.core.listener.async.DocumentDeletedAsyncListener; import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener; import com.sismics.docs.core.listener.async.FileCreatedAsyncListener; import com.sismics.docs.core.listener.async.FileDeletedAsyncListener; -import com.sismics.docs.core.listener.async.OcrFileAsyncListener; +import com.sismics.docs.core.listener.async.ExtractFileAsyncListener; import com.sismics.docs.core.listener.async.RebuildIndexAsyncListener; import com.sismics.docs.core.listener.sync.DeadEventListener; import com.sismics.docs.core.model.jpa.Config; @@ -82,7 +82,7 @@ public class AppContext { asyncEventBus.register(new DocumentUpdatedAsyncListener()); asyncEventBus.register(new DocumentDeletedAsyncListener()); asyncEventBus.register(new RebuildIndexAsyncListener()); - asyncEventBus.register(new OcrFileAsyncListener()); + asyncEventBus.register(new ExtractFileAsyncListener()); } /** diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index fc0fc2ca..5ab00ae4 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -6,11 +6,15 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.List; import javax.imageio.ImageIO; import net.sourceforge.tess4j.Tesseract; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.util.PDFTextStripper; import org.imgscalr.Scalr; import org.imgscalr.Scalr.Method; import org.imgscalr.Scalr.Mode; @@ -20,6 +24,7 @@ import org.slf4j.LoggerFactory; import com.sismics.docs.core.model.jpa.Document; import com.sismics.docs.core.model.jpa.File; import com.sismics.util.ImageUtil; +import com.sismics.util.mime.MimeType; /** * File entity utilities. @@ -33,18 +38,32 @@ public class FileUtil { private static final Logger log = LoggerFactory.getLogger(FileUtil.class); /** - * OCR a file. + * Extract content from a file. + * + * @param document Document linked to the file + * @param file File to extract + * @return Content extract + */ + public static String extractContent(Document document, File file) { + String content = null; + + if (ImageUtil.isImage(file.getMimeType())) { + content = ocrFile(document, file); + } else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { + content = extractPdf(file); + } + + return content; + } + + /** + * Optical character recognition on a file. * * @param document Document linked to the file * @param file File to OCR - * @return OCR-ized content + * @return Content extracted */ - public static String ocrFile(Document document, final File file) { - if (!ImageUtil.isImage(file.getMimeType())) { - // The file is not OCR-izable - return null; - } - + private static String ocrFile(Document document, File file) { Tesseract instance = Tesseract.getInstance(); java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile(); String content = null; @@ -72,6 +91,35 @@ public class FileUtil { return content; } + /** + * Extract text from a PDF. + * + * @param file File to extract + * @return Content extracted + */ + private static String extractPdf(File file) { + String content = null; + PDDocument pdfDocument = null; + java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile(); + try { + PDFTextStripper stripper = new PDFTextStripper(); + pdfDocument = PDDocument.load(storedfile); + content = stripper.getText(pdfDocument); + } catch (IOException e) { + log.error("Error while extracting text from the PDF " + storedfile, e); + } finally { + if (pdfDocument != null) { + try { + pdfDocument.close(); + } catch (IOException e) { + // NOP + } + } + } + + return content; + } + /** * Save a file on the storage filesystem. * @@ -84,7 +132,12 @@ public class FileUtil { Files.copy(is, path); // Generate file variations - saveVariations(file, path.toFile()); + try { + saveVariations(file, path.toFile()); + } catch (IOException e) { + // Don't rethrow Exception from file variations generation + log.error("Error creating file variations", e); + } } /** @@ -95,8 +148,22 @@ public class FileUtil { * @throws IOException */ public static void saveVariations(File file, java.io.File originalFile) throws IOException { + BufferedImage image = null; if (ImageUtil.isImage(file.getMimeType())) { - BufferedImage image = ImageIO.read(originalFile); + image = ImageIO.read(originalFile); + } else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) { + // Generate preview from the first page of the PDF + PDDocument pdfDocument = PDDocument.load(originalFile); + @SuppressWarnings("unchecked") + List pageList = pdfDocument.getDocumentCatalog().getAllPages(); + if (pageList.size() > 0) { + PDPage page = pageList.get(0); + image = page.convertToImage(); + } + } + + if (image != null) { + // Generate thumbnails from image BufferedImage web = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 1280, Scalr.OP_ANTIALIAS); BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 256, Scalr.OP_ANTIALIAS); image.flush(); diff --git a/docs-parent/TODO b/docs-parent/TODO index a0c153ff..e69de29b 100644 --- a/docs-parent/TODO +++ b/docs-parent/TODO @@ -1,2 +0,0 @@ -- Extract text from PDF for indexing, see PDFBox (server) -- Make thumbnail of the first page of PDF, see PDFBox (server) \ No newline at end of file diff --git a/docs-parent/pom.xml b/docs-parent/pom.xml index a033fb5c..d8e29a15 100644 --- a/docs-parent/pom.xml +++ b/docs-parent/pom.xml @@ -15,7 +15,7 @@ 1.7 UTF-8 - + 1.5 2.6 2.1 @@ -62,6 +62,7 @@ 8.1.2.v20120308 1.0.1 1.7 + 1.8.2 @@ -436,24 +437,30 @@ ${org.imgscalr.imgscalr-lib.version} + + org.apache.pdfbox + pdfbox + ${org.apache.pdfbox.pdfbox.version} + + - jna - jna - 1.0 - - - - jai - imageio - 1.0 - - - - tess4j - tess4j - 1.0 - + jna + jna + 1.0 + + + + jai + imageio + 1.0 + + + + tess4j + tess4j + 1.0 + @@ -478,64 +485,64 @@ - org.apache.maven.plugins - maven-install-plugin - 2.3.1 - - - - install-jna - validate - - ${project.basedir}/lib/jna.jar - default - jna - jna - 1.0 - jar - true - - - install-file - - - - - install-jai-imageio - validate - - ${project.basedir}/lib/jai_imageio.jar - default - jai - imageio - 1.0 - jar - true - - - install-file - - - - - install-tess4j - validate - - ${project.basedir}/lib/tess4j.jar - default - tess4j - tess4j - 1.0 - jar - true - - - install-file - - - - - + org.apache.maven.plugins + maven-install-plugin + 2.3.1 + + + + install-jna + validate + + ${project.basedir}/lib/jna.jar + default + jna + jna + 1.0 + jar + true + + + install-file + + + + + install-jai-imageio + validate + + ${project.basedir}/lib/jai_imageio.jar + default + jai + imageio + 1.0 + jar + true + + + install-file + + + + + install-tess4j + validate + + ${project.basedir}/lib/tess4j.jar + default + tess4j + tess4j + 1.0 + jar + true + + + install-file + + + + + diff --git a/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java b/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java index b93c694c..5e90d110 100644 --- a/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java +++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java @@ -26,7 +26,7 @@ import com.sismics.docs.core.dao.jpa.DocumentDao; import com.sismics.docs.core.dao.jpa.FileDao; import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria; import com.sismics.docs.core.dao.jpa.dto.DocumentDto; -import com.sismics.docs.core.event.OcrFileAsyncEvent; +import com.sismics.docs.core.event.ExtractFileAsyncEvent; import com.sismics.docs.core.model.context.AppContext; import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.util.ConfigUtil; @@ -163,7 +163,7 @@ public class AppResource extends BaseResource { checkBaseFunction(BaseFunction.ADMIN); // Raise a OCR file event - AppContext.getInstance().getAsyncEventBus().post(new OcrFileAsyncEvent()); + AppContext.getInstance().getAsyncEventBus().post(new ExtractFileAsyncEvent()); JSONObject response = new JSONObject(); response.put("status", "ok"); diff --git a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java index b4e38286..614fd58c 100644 --- a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java +++ b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java @@ -15,6 +15,7 @@ import org.codehaus.jettison.json.JSONObject; import org.joda.time.format.DateTimeFormat; import org.junit.Test; +import com.google.common.io.ByteStreams; import com.sismics.docs.rest.filter.CookieAuthenticationFilter; import com.sun.jersey.api.client.ClientResponse; import com.sun.jersey.api.client.ClientResponse.Status; @@ -354,4 +355,67 @@ public class TestDocumentResource extends BaseJerseyTest { json = response.getEntity(JSONObject.class); Assert.assertEquals(Status.BAD_REQUEST, Status.fromStatusCode(response.getStatus())); } + + /** + * Test PDF extraction. + * + * @throws Exception + */ + @Test + public void testPdfExtraction() throws Exception { + // Login document2 + clientUtil.createUser("document2"); + String document2Token = clientUtil.login("document2"); + + // Create a document + WebResource documentResource = resource().path("/document"); + documentResource.addFilter(new CookieAuthenticationFilter(document2Token)); + MultivaluedMapImpl postParams = new MultivaluedMapImpl(); + postParams.add("title", "My super title document 1"); + postParams.add("description", "My super description for document 1"); + postParams.add("language", "eng"); + long create1Date = new Date().getTime(); + postParams.add("create_date", create1Date); + ClientResponse response = documentResource.put(ClientResponse.class, postParams); + Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); + JSONObject json = response.getEntity(JSONObject.class); + String document1Id = json.optString("id"); + Assert.assertNotNull(document1Id); + + // Add a PDF file + WebResource fileResource = resource().path("/file"); + fileResource.addFilter(new CookieAuthenticationFilter(document2Token)); + FormDataMultiPart form = new FormDataMultiPart(); + InputStream file = this.getClass().getResourceAsStream("/file/wikipedia.pdf"); + FormDataBodyPart fdp = new FormDataBodyPart("file", + new BufferedInputStream(file), + MediaType.APPLICATION_OCTET_STREAM_TYPE); + form.bodyPart(fdp); + form.field("id", document1Id); + response = fileResource.type(MediaType.MULTIPART_FORM_DATA).put(ClientResponse.class, form); + Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); + json = response.getEntity(JSONObject.class); + String file1Id = json.getString("id"); + + // Search documents by query in full content + documentResource = resource().path("/document/list"); + documentResource.addFilter(new CookieAuthenticationFilter(document2Token)); + MultivaluedMapImpl getParams = new MultivaluedMapImpl(); + getParams.putSingle("search", "full:vrandecic"); + response = documentResource.queryParams(getParams).get(ClientResponse.class); + json = response.getEntity(JSONObject.class); + Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); + Assert.assertTrue(json.getJSONArray("documents").length() == 1); + + // Get the file thumbnail data + fileResource = resource().path("/file/" + file1Id + "/data"); + fileResource.addFilter(new CookieAuthenticationFilter(document2Token)); + getParams = new MultivaluedMapImpl(); + getParams.putSingle("size", "thumb"); + response = fileResource.queryParams(getParams).get(ClientResponse.class); + Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); + InputStream is = response.getEntityInputStream(); + byte[] fileBytes = ByteStreams.toByteArray(is); + Assert.assertEquals(3457, fileBytes.length); + } } \ No newline at end of file diff --git a/docs-web/src/test/resources/file/wikipedia.pdf b/docs-web/src/test/resources/file/wikipedia.pdf new file mode 100644 index 00000000..08bc5206 Binary files /dev/null and b/docs-web/src/test/resources/file/wikipedia.pdf differ