From e2bd85da5ff4eb41c385b6a8129df665a2460c67 Mon Sep 17 00:00:00 2001 From: jendib Date: Sat, 17 Aug 2013 16:54:53 +0200 Subject: [PATCH] OCR all files in database asynchronously --- .../docs/core/event/OcrFileAsyncEvent.java | 16 +++++ .../async/FileCreatedAsyncListener.java | 25 +++++--- .../listener/async/OcrFileAsyncListener.java | 58 +++++++++++++++++++ .../async}/RebuildIndexAsyncListener.java | 3 +- .../docs/core/model/context/AppContext.java | 4 +- .../com/sismics/docs/core/util/FileUtil.java | 21 +++---- docs-parent/TODO | 2 + .../docs/rest/resource/AppResource.java | 15 ++--- docs-web/src/test/resources/log4j.properties | 2 +- 9 files changed, 113 insertions(+), 33 deletions(-) create mode 100644 docs-core/src/main/java/com/sismics/docs/core/event/OcrFileAsyncEvent.java create mode 100644 docs-core/src/main/java/com/sismics/docs/core/listener/async/OcrFileAsyncListener.java rename docs-core/src/main/java/com/sismics/docs/core/{event => listener/async}/RebuildIndexAsyncListener.java (94%) diff --git a/docs-core/src/main/java/com/sismics/docs/core/event/OcrFileAsyncEvent.java b/docs-core/src/main/java/com/sismics/docs/core/event/OcrFileAsyncEvent.java new file mode 100644 index 00000000..c0926b9a --- /dev/null +++ b/docs-core/src/main/java/com/sismics/docs/core/event/OcrFileAsyncEvent.java @@ -0,0 +1,16 @@ +package com.sismics.docs.core.event; + +import com.google.common.base.Objects; + +/** + * OCR all files in database event. 
+ * + * @author bgamard + */ +public class OcrFileAsyncEvent { + @Override + public String toString() { + return Objects.toStringHelper(this) + .toString(); + } +} diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java index 876ace67..f01a9daa 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java @@ -6,10 +6,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.eventbus.Subscribe; +import com.sismics.docs.core.dao.jpa.FileDao; import com.sismics.docs.core.dao.lucene.LuceneDao; import com.sismics.docs.core.event.FileCreatedAsyncEvent; +import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.util.FileUtil; -import com.sismics.util.ImageUtil; +import com.sismics.docs.core.util.TransactionUtil; /** * Listener on file created. 
@@ -34,12 +36,21 @@ public class FileCreatedAsyncListener { log.info("File created event: " + fileCreatedAsyncEvent.toString()); } - // OCR the file if it is an image - if (ImageUtil.isImage(fileCreatedAsyncEvent.getFile().getMimeType())) { - long startTime = System.currentTimeMillis(); - FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), fileCreatedAsyncEvent.getFile()); - log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime)); - } + // OCR the file + final File file = fileCreatedAsyncEvent.getFile(); + long startTime = System.currentTimeMillis(); + final String content = FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), file); + log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime)); + + // Store the OCR-ization result in the database + TransactionUtil.handle(new Runnable() { + @Override + public void run() { + FileDao fileDao = new FileDao(); + file.setContent(content); + fileDao.updateContent(file); + } + }); // Update Lucene index LuceneDao luceneDao = new LuceneDao(); diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/OcrFileAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/OcrFileAsyncListener.java new file mode 100644 index 00000000..1324bf39 --- /dev/null +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/OcrFileAsyncListener.java @@ -0,0 +1,58 @@ +package com.sismics.docs.core.listener.async; + +import java.text.MessageFormat; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.eventbus.Subscribe; +import com.sismics.docs.core.dao.jpa.DocumentDao; +import com.sismics.docs.core.dao.jpa.FileDao; +import com.sismics.docs.core.event.OcrFileAsyncEvent; +import com.sismics.docs.core.model.jpa.Document; +import com.sismics.docs.core.model.jpa.File; +import com.sismics.docs.core.util.FileUtil; +import com.sismics.docs.core.util.TransactionUtil; 
+ +/** + * Listener on OCR all files in database. + * + * @author bgamard + */ +public class OcrFileAsyncListener { + /** + * Logger. + */ + private static final Logger log = LoggerFactory.getLogger(OcrFileAsyncListener.class); + + /** + * OCR all files. + * + * @param ocrFileAsyncEvent OCR all files in database event + * @throws Exception + */ + @Subscribe + public void on(final OcrFileAsyncEvent ocrFileAsyncEvent) throws Exception { + if (log.isInfoEnabled()) { + log.info("OCR all files in database event: " + ocrFileAsyncEvent.toString()); + } + + TransactionUtil.handle(new Runnable() { + @Override + public void run() { + FileDao fileDao = new FileDao(); + DocumentDao documentDao = new DocumentDao(); + List<File> fileList = fileDao.findAll(); + for (File file : fileList) { + long startTime = System.currentTimeMillis(); + Document document = documentDao.getById(file.getDocumentId()); + String content = FileUtil.ocrFile(document, file); + file.setContent(content); + TransactionUtil.commit(); + log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime)); + } + } + }); + } +} diff --git a/docs-core/src/main/java/com/sismics/docs/core/event/RebuildIndexAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/RebuildIndexAsyncListener.java similarity index 94% rename from docs-core/src/main/java/com/sismics/docs/core/event/RebuildIndexAsyncListener.java rename to docs-core/src/main/java/com/sismics/docs/core/listener/async/RebuildIndexAsyncListener.java index c68d1924..9a7f4a53 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/event/RebuildIndexAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/RebuildIndexAsyncListener.java @@ -1,4 +1,4 @@ -package com.sismics.docs.core.event; +package com.sismics.docs.core.listener.async; import java.util.List; @@ -9,6 +9,7 @@ import com.google.common.eventbus.Subscribe; import com.sismics.docs.core.dao.jpa.DocumentDao; import 
com.sismics.docs.core.dao.jpa.FileDao; import com.sismics.docs.core.dao.lucene.LuceneDao; +import com.sismics.docs.core.event.RebuildIndexAsyncEvent; import com.sismics.docs.core.model.jpa.Document; import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.util.TransactionUtil; diff --git a/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java b/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java index 253074d2..5350a2a9 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java +++ b/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java @@ -13,12 +13,13 @@ import com.google.common.eventbus.AsyncEventBus; import com.google.common.eventbus.EventBus; import com.sismics.docs.core.constant.ConfigType; import com.sismics.docs.core.dao.jpa.ConfigDao; -import com.sismics.docs.core.event.RebuildIndexAsyncListener; import com.sismics.docs.core.listener.async.DocumentCreatedAsyncListener; import com.sismics.docs.core.listener.async.DocumentDeletedAsyncListener; import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener; import com.sismics.docs.core.listener.async.FileCreatedAsyncListener; import com.sismics.docs.core.listener.async.FileDeletedAsyncListener; +import com.sismics.docs.core.listener.async.OcrFileAsyncListener; +import com.sismics.docs.core.listener.async.RebuildIndexAsyncListener; import com.sismics.docs.core.listener.sync.DeadEventListener; import com.sismics.docs.core.model.jpa.Config; import com.sismics.docs.core.service.IndexingService; @@ -90,6 +91,7 @@ public class AppContext { asyncEventBus.register(new DocumentUpdatedAsyncListener()); asyncEventBus.register(new DocumentDeletedAsyncListener()); asyncEventBus.register(new RebuildIndexAsyncListener()); + asyncEventBus.register(new OcrFileAsyncListener()); } /** diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java 
b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index 683c3342..89e5c2d6 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -16,9 +16,9 @@ import org.imgscalr.Scalr.Mode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.sismics.docs.core.dao.jpa.FileDao; import com.sismics.docs.core.model.jpa.Document; import com.sismics.docs.core.model.jpa.File; +import com.sismics.util.ImageUtil; /** * File entity utilities. @@ -36,8 +36,14 @@ public class FileUtil { * * @param document Document linked to the file * @param file File to OCR + * @return OCR-ized content */ - public static void ocrFile(Document document, final File file) { + public static String ocrFile(Document document, final File file) { + if (!ImageUtil.isImage(file.getMimeType())) { + // The file is not OCR-izable + return null; + } + Tesseract instance = Tesseract.getInstance(); java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile(); String content = null; @@ -63,15 +69,6 @@ public class FileUtil { log.error("Error while OCR-izing the file " + storedfile, e); } - file.setContent(content); - - // Store the OCR-ization result in the database - TransactionUtil.handle(new Runnable() { - @Override - public void run() { - FileDao fileDao = new FileDao(); - fileDao.updateContent(file); - } - }); + return content; } } diff --git a/docs-parent/TODO b/docs-parent/TODO index e69de29b..fe7beb2e 100644 --- a/docs-parent/TODO +++ b/docs-parent/TODO @@ -0,0 +1,2 @@ +- New image rescale between thumbnail and original (client/server) +- Batch to regenerate all thumbnails (server) \ No newline at end of file diff --git a/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java b/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java index 1db29d34..05749db2 100644 --- 
a/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java +++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java @@ -19,14 +19,11 @@ import org.codehaus.jettison.json.JSONException; import org.codehaus.jettison.json.JSONObject; import com.sismics.docs.core.dao.jpa.DocumentDao; -import com.sismics.docs.core.dao.jpa.FileDao; import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria; import com.sismics.docs.core.dao.jpa.dto.DocumentDto; +import com.sismics.docs.core.event.OcrFileAsyncEvent; import com.sismics.docs.core.model.context.AppContext; -import com.sismics.docs.core.model.jpa.Document; -import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.util.ConfigUtil; -import com.sismics.docs.core.util.FileUtil; import com.sismics.docs.core.util.jpa.PaginatedList; import com.sismics.docs.core.util.jpa.PaginatedLists; import com.sismics.docs.core.util.jpa.SortCriteria; @@ -157,13 +154,9 @@ public class AppResource extends BaseResource { } checkBaseFunction(BaseFunction.ADMIN); - FileDao fileDao = new FileDao(); - DocumentDao documentDao = new DocumentDao(); - List<File> fileList = fileDao.findAll(); - for (File file : fileList) { - Document document = documentDao.getById(file.getDocumentId()); - FileUtil.ocrFile(document, file); - } + // Raise an OCR file event + OcrFileAsyncEvent ocrFileAsyncEvent = new OcrFileAsyncEvent(); + AppContext.getInstance().getAsyncEventBus().post(ocrFileAsyncEvent); JSONObject response = new JSONObject(); response.put("status", "ok"); diff --git a/docs-web/src/test/resources/log4j.properties b/docs-web/src/test/resources/log4j.properties index c4c13395..2f80816a 100644 --- a/docs-web/src/test/resources/log4j.properties +++ b/docs-web/src/test/resources/log4j.properties @@ -6,4 +6,4 @@ log4j.appender.MEMORY=com.sismics.util.log4j.MemoryAppender log4j.appender.MEMORY.size=1000 log4j.logger.com.sismics=DEBUG -log4j.logger.org.hibernate=ERROR \ No newline at end of file 
+log4j.logger.org.hibernate=INFO \ No newline at end of file