mirror of
https://github.com/sismics/docs.git
synced 2024-11-22 05:57:57 +01:00
OCR all files in database asynchronously
This commit is contained in:
parent
234eaf047e
commit
e2bd85da5f
@ -0,0 +1,16 @@
|
|||||||
|
package com.sismics.docs.core.event;
|
||||||
|
|
||||||
|
import com.google.common.base.Objects;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* OCR all files in database event.
|
||||||
|
*
|
||||||
|
* @author bgamard
|
||||||
|
*/
|
||||||
|
public class OcrFileAsyncEvent {
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return Objects.toStringHelper(this)
|
||||||
|
.toString();
|
||||||
|
}
|
||||||
|
}
|
@ -6,10 +6,12 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.google.common.eventbus.Subscribe;
|
import com.google.common.eventbus.Subscribe;
|
||||||
|
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||||
import com.sismics.docs.core.dao.lucene.LuceneDao;
|
import com.sismics.docs.core.dao.lucene.LuceneDao;
|
||||||
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
|
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
|
||||||
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
import com.sismics.docs.core.util.FileUtil;
|
import com.sismics.docs.core.util.FileUtil;
|
||||||
import com.sismics.util.ImageUtil;
|
import com.sismics.docs.core.util.TransactionUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Listener on file created.
|
* Listener on file created.
|
||||||
@ -34,12 +36,21 @@ public class FileCreatedAsyncListener {
|
|||||||
log.info("File created event: " + fileCreatedAsyncEvent.toString());
|
log.info("File created event: " + fileCreatedAsyncEvent.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
// OCR the file if it is an image
|
// OCR the file
|
||||||
if (ImageUtil.isImage(fileCreatedAsyncEvent.getFile().getMimeType())) {
|
final File file = fileCreatedAsyncEvent.getFile();
|
||||||
long startTime = System.currentTimeMillis();
|
long startTime = System.currentTimeMillis();
|
||||||
FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), fileCreatedAsyncEvent.getFile());
|
final String content = FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), file);
|
||||||
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
|
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
|
||||||
}
|
|
||||||
|
// Store the OCR-ization result in the database
|
||||||
|
TransactionUtil.handle(new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
FileDao fileDao = new FileDao();
|
||||||
|
file.setContent(content);
|
||||||
|
fileDao.updateContent(file);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
// Update Lucene index
|
// Update Lucene index
|
||||||
LuceneDao luceneDao = new LuceneDao();
|
LuceneDao luceneDao = new LuceneDao();
|
||||||
|
@ -0,0 +1,58 @@
|
|||||||
|
package com.sismics.docs.core.listener.async;
|
||||||
|
|
||||||
|
import java.text.MessageFormat;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.common.eventbus.Subscribe;
|
||||||
|
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
||||||
|
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||||
|
import com.sismics.docs.core.event.OcrFileAsyncEvent;
|
||||||
|
import com.sismics.docs.core.model.jpa.Document;
|
||||||
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
|
import com.sismics.docs.core.util.FileUtil;
|
||||||
|
import com.sismics.docs.core.util.TransactionUtil;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Listener on OCR all files in database.
|
||||||
|
*
|
||||||
|
* @author bgamard
|
||||||
|
*/
|
||||||
|
public class OcrFileAsyncListener {
|
||||||
|
/**
|
||||||
|
* Logger.
|
||||||
|
*/
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(OcrFileAsyncListener.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* OCR all files.
|
||||||
|
*
|
||||||
|
* @param ocrFileAsyncEvent OCR all files in database event
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
@Subscribe
|
||||||
|
public void on(final OcrFileAsyncEvent ocrFileAsyncEvent) throws Exception {
|
||||||
|
if (log.isInfoEnabled()) {
|
||||||
|
log.info("OCR all files in database event: " + ocrFileAsyncEvent.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
TransactionUtil.handle(new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
FileDao fileDao = new FileDao();
|
||||||
|
DocumentDao documentDao = new DocumentDao();
|
||||||
|
List<File> fileList = fileDao.findAll();
|
||||||
|
for (File file : fileList) {
|
||||||
|
long startTime = System.currentTimeMillis();
|
||||||
|
Document document = documentDao.getById(file.getDocumentId());
|
||||||
|
String content = FileUtil.ocrFile(document, file);
|
||||||
|
file.setContent(content);
|
||||||
|
TransactionUtil.commit();
|
||||||
|
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
@ -1,4 +1,4 @@
|
|||||||
package com.sismics.docs.core.event;
|
package com.sismics.docs.core.listener.async;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@ -9,6 +9,7 @@ import com.google.common.eventbus.Subscribe;
|
|||||||
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
||||||
import com.sismics.docs.core.dao.jpa.FileDao;
|
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||||
import com.sismics.docs.core.dao.lucene.LuceneDao;
|
import com.sismics.docs.core.dao.lucene.LuceneDao;
|
||||||
|
import com.sismics.docs.core.event.RebuildIndexAsyncEvent;
|
||||||
import com.sismics.docs.core.model.jpa.Document;
|
import com.sismics.docs.core.model.jpa.Document;
|
||||||
import com.sismics.docs.core.model.jpa.File;
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
import com.sismics.docs.core.util.TransactionUtil;
|
import com.sismics.docs.core.util.TransactionUtil;
|
@ -13,12 +13,13 @@ import com.google.common.eventbus.AsyncEventBus;
|
|||||||
import com.google.common.eventbus.EventBus;
|
import com.google.common.eventbus.EventBus;
|
||||||
import com.sismics.docs.core.constant.ConfigType;
|
import com.sismics.docs.core.constant.ConfigType;
|
||||||
import com.sismics.docs.core.dao.jpa.ConfigDao;
|
import com.sismics.docs.core.dao.jpa.ConfigDao;
|
||||||
import com.sismics.docs.core.event.RebuildIndexAsyncListener;
|
import com.sismics.docs.core.event.OcrFileAsyncEvent;
|
||||||
import com.sismics.docs.core.listener.async.DocumentCreatedAsyncListener;
|
import com.sismics.docs.core.listener.async.DocumentCreatedAsyncListener;
|
||||||
import com.sismics.docs.core.listener.async.DocumentDeletedAsyncListener;
|
import com.sismics.docs.core.listener.async.DocumentDeletedAsyncListener;
|
||||||
import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener;
|
import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener;
|
||||||
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
|
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
|
||||||
import com.sismics.docs.core.listener.async.FileDeletedAsyncListener;
|
import com.sismics.docs.core.listener.async.FileDeletedAsyncListener;
|
||||||
|
import com.sismics.docs.core.listener.async.RebuildIndexAsyncListener;
|
||||||
import com.sismics.docs.core.listener.sync.DeadEventListener;
|
import com.sismics.docs.core.listener.sync.DeadEventListener;
|
||||||
import com.sismics.docs.core.model.jpa.Config;
|
import com.sismics.docs.core.model.jpa.Config;
|
||||||
import com.sismics.docs.core.service.IndexingService;
|
import com.sismics.docs.core.service.IndexingService;
|
||||||
@ -90,6 +91,7 @@ public class AppContext {
|
|||||||
asyncEventBus.register(new DocumentUpdatedAsyncListener());
|
asyncEventBus.register(new DocumentUpdatedAsyncListener());
|
||||||
asyncEventBus.register(new DocumentDeletedAsyncListener());
|
asyncEventBus.register(new DocumentDeletedAsyncListener());
|
||||||
asyncEventBus.register(new RebuildIndexAsyncListener());
|
asyncEventBus.register(new RebuildIndexAsyncListener());
|
||||||
|
// Register the OCR listener (not the event payload): only objects exposing
// @Subscribe methods receive events posted on the bus.
asyncEventBus.register(new com.sismics.docs.core.listener.async.OcrFileAsyncListener());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -16,9 +16,9 @@ import org.imgscalr.Scalr.Mode;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.sismics.docs.core.dao.jpa.FileDao;
|
|
||||||
import com.sismics.docs.core.model.jpa.Document;
|
import com.sismics.docs.core.model.jpa.Document;
|
||||||
import com.sismics.docs.core.model.jpa.File;
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
|
import com.sismics.util.ImageUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* File entity utilities.
|
* File entity utilities.
|
||||||
@ -36,8 +36,14 @@ public class FileUtil {
|
|||||||
*
|
*
|
||||||
* @param document Document linked to the file
|
* @param document Document linked to the file
|
||||||
* @param file File to OCR
|
* @param file File to OCR
|
||||||
|
* @return OCR-ized content
|
||||||
*/
|
*/
|
||||||
public static void ocrFile(Document document, final File file) {
|
public static String ocrFile(Document document, final File file) {
|
||||||
|
if (!ImageUtil.isImage(file.getMimeType())) {
|
||||||
|
// The file is not OCR-izable
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
Tesseract instance = Tesseract.getInstance();
|
Tesseract instance = Tesseract.getInstance();
|
||||||
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
|
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
|
||||||
String content = null;
|
String content = null;
|
||||||
@ -63,15 +69,6 @@ public class FileUtil {
|
|||||||
log.error("Error while OCR-izing the file " + storedfile, e);
|
log.error("Error while OCR-izing the file " + storedfile, e);
|
||||||
}
|
}
|
||||||
|
|
||||||
file.setContent(content);
|
return content;
|
||||||
|
|
||||||
// Store the OCR-ization result in the database
|
|
||||||
TransactionUtil.handle(new Runnable() {
|
|
||||||
@Override
|
|
||||||
public void run() {
|
|
||||||
FileDao fileDao = new FileDao();
|
|
||||||
fileDao.updateContent(file);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,2 @@
|
|||||||
|
- New image rescale between thumbnail and original (client/server)
|
||||||
|
- Batch to regenerate all thumbnails (server)
|
@ -19,14 +19,11 @@ import org.codehaus.jettison.json.JSONException;
|
|||||||
import org.codehaus.jettison.json.JSONObject;
|
import org.codehaus.jettison.json.JSONObject;
|
||||||
|
|
||||||
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
||||||
import com.sismics.docs.core.dao.jpa.FileDao;
|
|
||||||
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
|
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
|
||||||
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
||||||
|
import com.sismics.docs.core.event.OcrFileAsyncEvent;
|
||||||
import com.sismics.docs.core.model.context.AppContext;
|
import com.sismics.docs.core.model.context.AppContext;
|
||||||
import com.sismics.docs.core.model.jpa.Document;
|
|
||||||
import com.sismics.docs.core.model.jpa.File;
|
|
||||||
import com.sismics.docs.core.util.ConfigUtil;
|
import com.sismics.docs.core.util.ConfigUtil;
|
||||||
import com.sismics.docs.core.util.FileUtil;
|
|
||||||
import com.sismics.docs.core.util.jpa.PaginatedList;
|
import com.sismics.docs.core.util.jpa.PaginatedList;
|
||||||
import com.sismics.docs.core.util.jpa.PaginatedLists;
|
import com.sismics.docs.core.util.jpa.PaginatedLists;
|
||||||
import com.sismics.docs.core.util.jpa.SortCriteria;
|
import com.sismics.docs.core.util.jpa.SortCriteria;
|
||||||
@ -157,13 +154,9 @@ public class AppResource extends BaseResource {
|
|||||||
}
|
}
|
||||||
checkBaseFunction(BaseFunction.ADMIN);
|
checkBaseFunction(BaseFunction.ADMIN);
|
||||||
|
|
||||||
FileDao fileDao = new FileDao();
|
// Raise an OCR file event
|
||||||
DocumentDao documentDao = new DocumentDao();
|
OcrFileAsyncEvent ocrFileAsyncEvent = new OcrFileAsyncEvent();
|
||||||
List<File> fileList = fileDao.findAll();
|
AppContext.getInstance().getAsyncEventBus().post(ocrFileAsyncEvent);
|
||||||
for (File file : fileList) {
|
|
||||||
Document document = documentDao.getById(file.getDocumentId());
|
|
||||||
FileUtil.ocrFile(document, file);
|
|
||||||
}
|
|
||||||
|
|
||||||
JSONObject response = new JSONObject();
|
JSONObject response = new JSONObject();
|
||||||
response.put("status", "ok");
|
response.put("status", "ok");
|
||||||
|
@ -6,4 +6,4 @@ log4j.appender.MEMORY=com.sismics.util.log4j.MemoryAppender
|
|||||||
log4j.appender.MEMORY.size=1000
|
log4j.appender.MEMORY.size=1000
|
||||||
|
|
||||||
log4j.logger.com.sismics=DEBUG
|
log4j.logger.com.sismics=DEBUG
|
||||||
log4j.logger.org.hibernate=ERROR
|
log4j.logger.org.hibernate=INFO
|
Loading…
Reference in New Issue
Block a user