Index files OCR-ized content and documents, search on index fields

This commit is contained in:
jendib 2013-08-17 14:16:55 +02:00
parent 5507d4ca57
commit 7ed976b27a
23 changed files with 695 additions and 91 deletions

View File

@ -1,8 +1,23 @@
package com.sismics.docs.core.dao.jpa;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import javax.persistence.EntityManager;
import javax.persistence.NoResultException;
import javax.persistence.Query;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.dao.lucene.LuceneDao;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.util.jpa.PaginatedList;
import com.sismics.docs.core.util.jpa.PaginatedLists;
@ -10,12 +25,6 @@ import com.sismics.docs.core.util.jpa.QueryParam;
import com.sismics.docs.core.util.jpa.SortCriteria;
import com.sismics.util.context.ThreadLocalContext;
import javax.persistence.EntityManager;
import javax.persistence.NoResultException;
import javax.persistence.Query;
import java.sql.Timestamp;
import java.util.*;
/**
* Document DAO.
*
@ -40,6 +49,18 @@ public class DocumentDao {
return document.getId();
}
/**
 * Returns every document that has not been marked as deleted.
 *
 * @return List of documents whose delete date is null
 */
@SuppressWarnings("unchecked")
public List<Document> findAll() {
    // The JPQL filter keeps only rows that carry no delete date
    return ThreadLocalContext.get()
            .getEntityManager()
            .createQuery("select d from Document d where d.deleteDate is null")
            .getResultList();
}
/**
* Returns an active document.
*
@ -118,8 +139,9 @@ public class DocumentDao {
* @param paginatedList List of documents (updated by side effects)
* @param criteria Search criteria
* @return List of document
* @throws Exception
*/
public void findByCriteria(PaginatedList<DocumentDto> paginatedList, DocumentCriteria criteria, SortCriteria sortCriteria) {
public void findByCriteria(PaginatedList<DocumentDto> paginatedList, DocumentCriteria criteria, SortCriteria sortCriteria) throws Exception {
Map<String, Object> parameterMap = new HashMap<String, Object>();
List<String> criteriaList = new ArrayList<String>();
@ -133,9 +155,15 @@ public class DocumentDao {
criteriaList.add("d.DOC_IDUSER_C = :userId");
parameterMap.put("userId", criteria.getUserId());
}
if (criteria.getSearch() != null) {
criteriaList.add("(d.DOC_TITLE_C LIKE :search OR d.DOC_DESCRIPTION_C LIKE :search OR f.FIL_CONTENT_C LIKE :search)");
parameterMap.put("search", "%" + criteria.getSearch() + "%");
if (!Strings.isNullOrEmpty(criteria.getSearch())) {
LuceneDao luceneDao = new LuceneDao();
Set<String> documentIdList = luceneDao.search(criteria.getUserId(), criteria.getSearch());
if (documentIdList.size() == 0) {
// If the search doesn't find any document, the request should return nothing
documentIdList.add(UUID.randomUUID().toString());
}
criteriaList.add("d.DOC_ID_C in :documentIdList");
parameterMap.put("documentIdList", documentIdList);
}
if (criteria.getCreateDateMin() != null) {
criteriaList.add("d.DOC_CREATEDATE_D >= :createDateMin");

View File

@ -1,8 +1,10 @@
package com.sismics.docs.core.dao.lucene;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.document.Field;
@ -18,12 +20,12 @@ import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Version;
import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.LuceneUtil;
import com.sismics.docs.core.util.LuceneUtil.LuceneRunnable;
@ -40,55 +42,102 @@ public class LuceneDao {
*
* @param fileList
*/
public void rebuildIndex(final List<File> fileList) {
public void rebuildIndex(final List<Document> documentList, final List<File> fileList) {
LuceneUtil.handle(new LuceneRunnable() {
@Override
public void run(IndexWriter indexWriter) throws Exception {
// Empty index
indexWriter.deleteAll();
// Add all files
for (File file : fileList) {
org.apache.lucene.document.Document document = getDocumentFromFile(file);
indexWriter.addDocument(document);
// Add all documents
Map<String, Document> documentMap = new HashMap<>();
for (Document document : documentList) {
org.apache.lucene.document.Document luceneDocument = getDocumentFromDocument(document);
indexWriter.addDocument(luceneDocument);
documentMap.put(document.getId(), document);
}
}
});
}
/**
* Add files to the index.
*
* @param fileList
*/
public void create(final List<File> fileList) {
LuceneUtil.handle(new LuceneRunnable() {
@Override
public void run(IndexWriter indexWriter) throws Exception {
// Add all files
for (File file : fileList) {
org.apache.lucene.document.Document document = getDocumentFromFile(file);
indexWriter.addDocument(document);
org.apache.lucene.document.Document luceneDocument = getDocumentFromFile(file, documentMap.get(file.getDocumentId()));
indexWriter.addDocument(luceneDocument);
}
}
});
}
/**
* Update index.
* Add document to the index.
*
* @param fileList File list
* @param document Document to add
*/
public void update(final List<File> fileList) {
public void createDocument(final Document document) {
LuceneUtil.handle(new LuceneRunnable() {
@Override
public void run(IndexWriter indexWriter) throws Exception {
// Update all files
for (File file : fileList) {
org.apache.lucene.document.Document document = getDocumentFromFile(file);
indexWriter.updateDocument(new Term("id", file.getId()), document);
}
org.apache.lucene.document.Document luceneDocument = getDocumentFromDocument(document);
indexWriter.addDocument(luceneDocument);
}
});
}
/**
 * Index a file.
 *
 * @param file File to add to the index
 * @param document Document the file is linked to
 */
public void createFile(final File file, final Document document) {
    LuceneUtil.handle(new LuceneRunnable() {
        @Override
        public void run(IndexWriter indexWriter) throws Exception {
            // Build the index entry for the file and append it to the index
            indexWriter.addDocument(getDocumentFromFile(file, document));
        }
    });
}
/**
 * Replace the index entry of a document.
 *
 * @param document Updated document
 */
public void updateDocument(final Document document) {
    LuceneUtil.handle(new LuceneRunnable() {
        @Override
        public void run(IndexWriter indexWriter) throws Exception {
            // Build the fresh entry first, then swap it in for the entry matching the document ID
            final org.apache.lucene.document.Document indexEntry = getDocumentFromDocument(document);
            final Term idTerm = new Term("id", document.getId());
            indexWriter.updateDocument(idTerm, indexEntry);
        }
    });
}
/**
 * Replace the index entry of a file.
 *
 * @param file Updated file
 * @param document Document the file is linked to
 */
public void updateFile(final File file, final Document document) {
    LuceneUtil.handle(new LuceneRunnable() {
        @Override
        public void run(IndexWriter indexWriter) throws Exception {
            // Build the fresh entry first, then swap it in for the entry matching the file ID
            final org.apache.lucene.document.Document indexEntry = getDocumentFromFile(file, document);
            final Term idTerm = new Term("id", file.getId());
            indexWriter.updateDocument(idTerm, indexEntry);
        }
    });
}
/**
 * Delete an entry from the index, along with any file entries attached to it.
 *
 * Entries are matched on their "id" field. File entries additionally carry a
 * "document_id" field, so when the given ID is a document ID the files linked
 * to that document are removed too instead of being left orphaned in the index.
 * When the given ID is a file ID, the "document_id" term is not expected to
 * match anything (assumes file and document IDs never collide).
 *
 * @param id ID of the document (or file) entry to delete
 */
public void deleteDocument(final String id) {
    LuceneUtil.handle(new LuceneRunnable() {
        @Override
        public void run(IndexWriter indexWriter) throws Exception {
            // Deletes every entry matching either term
            indexWriter.deleteDocuments(new Term("id", id), new Term("document_id", id));
        }
    });
}
@ -96,58 +145,87 @@ public class LuceneDao {
/**
* Search files.
*
* @param paginatedList
* @param feedList
* @param searchQuery
* @return List of file IDs
* @param userId User ID to filter on
* @param searchQuery Search query
* @return List of document IDs
* @throws Exception
*/
public Set<String> search(String userId, String searchQuery, int limit) throws Exception {
public Set<String> search(String userId, String searchQuery) throws Exception {
// Escape query and add quotes so QueryParser generate a PhraseQuery
searchQuery = "\"" + QueryParserUtil.escape(searchQuery) + "\"";
// Build search query
StandardQueryParser qpHelper = new StandardQueryParser(new DocsStandardAnalyzer(Version.LUCENE_42));
qpHelper.setPhraseSlop(100000); // PhraseQuery add terms
Query contentQuery = qpHelper.parse(searchQuery, "content");
// Search on file content
// Search on documents and files
BooleanQuery query = new BooleanQuery();
query.add(contentQuery, Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "content"), Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "title"), Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "description"), Occur.SHOULD);
// Filter on provided user ID
List<Term> terms = new ArrayList<Term>();
terms.add(new Term("user_id", userId));
TermsFilter feedsFilter = new TermsFilter(terms);
if (userId != null) {
terms.add(new Term("user_id", userId));
}
TermsFilter userFilter = new TermsFilter(terms);
// Search
IndexReader reader = DirectoryReader.open(AppContext.getInstance().getLuceneDirectory());
IndexSearcher searcher = new IndexSearcher(reader);
TopDocs topDocs = searcher.search(query, feedsFilter, limit);
TopDocs topDocs = searcher.search(query, userFilter, Integer.MAX_VALUE);
ScoreDoc[] docs = topDocs.scoreDocs;
// Extract file IDs
Set<String> fileIdList = new HashSet<String>();
// Extract document IDs
Set<String> documentIdList = new HashSet<String>();
for (int i = 0; i < docs.length; i++) {
String id = searcher.doc(docs[i].doc).get("id");
fileIdList.add(id);
org.apache.lucene.document.Document document = searcher.doc(docs[i].doc);
String type = document.get("type");
String documentId = null;
if (type.equals("document")) {
documentId = document.get("id");
} else if (type.equals("file")) {
documentId = document.get("document_id");
}
documentIdList.add(documentId);
}
return fileIdList;
return documentIdList;
}
/**
 * Build Lucene document from database document.
 *
 * The ID, user ID and a constant "document" type marker are stored so they can
 * be read back from search hits. Title and description are indexed for
 * full-text search but not stored; each of them is skipped when null, because
 * a Lucene field cannot be created with a null value.
 *
 * @param document Document
 * @return Lucene document describing the database document
 */
private org.apache.lucene.document.Document getDocumentFromDocument(Document document) {
    org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document();
    luceneDocument.add(new StringField("id", document.getId(), Field.Store.YES));
    luceneDocument.add(new StringField("user_id", document.getUserId(), Field.Store.YES));
    luceneDocument.add(new StringField("type", "document", Field.Store.YES));

    // Optional text fields: only index what is actually present
    String title = document.getTitle();
    if (title != null) {
        luceneDocument.add(new TextField("title", title, Field.Store.NO));
    }
    String description = document.getDescription();
    if (description != null) {
        luceneDocument.add(new TextField("description", description, Field.Store.NO));
    }

    return luceneDocument;
}
/**
* Build Lucene document from file.
*
* @param file File
* @param document Document linked to the file
* @return Document
*/
private org.apache.lucene.document.Document getDocumentFromFile(File file) {
// Building document
org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
document.add(new StringField("id", file.getId(), Field.Store.YES));
document.add(new TextField("content", file.getContent(), Field.Store.NO));
private org.apache.lucene.document.Document getDocumentFromFile(File file, Document document) {
org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document();
luceneDocument.add(new StringField("id", file.getId(), Field.Store.YES));
luceneDocument.add(new StringField("user_id", document.getUserId(), Field.Store.YES));
luceneDocument.add(new StringField("type", "file", Field.Store.YES));
luceneDocument.add(new StringField("document_id", file.getDocumentId(), Field.Store.YES));
luceneDocument.add(new TextField("content", file.getContent(), Field.Store.NO));
return document;
return luceneDocument;
}
}

View File

@ -0,0 +1,41 @@
package com.sismics.docs.core.event;
import com.google.common.base.Objects;
import com.sismics.docs.core.model.jpa.Document;
/**
 * Event carrying a document that has been created.
 *
 * @author bgamard
 */
public class DocumentCreatedAsyncEvent {
    /**
     * The created document.
     */
    private Document document;

    /**
     * Returns the created document.
     *
     * @return the document
     */
    public Document getDocument() {
        return this.document;
    }

    /**
     * Sets the created document.
     *
     * @param document document
     */
    public void setDocument(Document document) {
        this.document = document;
    }

    @Override
    public String toString() {
        return Objects.toStringHelper(this).add("document", this.document).toString();
    }
}

View File

@ -0,0 +1,41 @@
package com.sismics.docs.core.event;
import com.google.common.base.Objects;
import com.sismics.docs.core.model.jpa.Document;
/**
 * Event carrying a document that has been deleted.
 *
 * @author bgamard
 */
public class DocumentDeletedAsyncEvent {
    /**
     * The deleted document.
     */
    private Document document;

    /**
     * Returns the deleted document.
     *
     * @return the document
     */
    public Document getDocument() {
        return this.document;
    }

    /**
     * Sets the deleted document.
     *
     * @param document document
     */
    public void setDocument(Document document) {
        this.document = document;
    }

    @Override
    public String toString() {
        return Objects.toStringHelper(this).add("document", this.document).toString();
    }
}

View File

@ -0,0 +1,41 @@
package com.sismics.docs.core.event;
import com.google.common.base.Objects;
import com.sismics.docs.core.model.jpa.Document;
/**
 * Event carrying a document that has been updated.
 *
 * @author bgamard
 */
public class DocumentUpdatedAsyncEvent {
    /**
     * The updated document.
     */
    private Document document;

    /**
     * Returns the updated document.
     *
     * @return the document
     */
    public Document getDocument() {
        return this.document;
    }

    /**
     * Sets the updated document.
     *
     * @param document document
     */
    public void setDocument(Document document) {
        this.document = document;
    }

    @Override
    public String toString() {
        return Objects.toStringHelper(this).add("document", this.document).toString();
    }
}

View File

@ -0,0 +1,41 @@
package com.sismics.docs.core.event;
import com.google.common.base.Objects;
import com.sismics.docs.core.model.jpa.File;
/**
 * Event carrying a file that has been deleted.
 *
 * @author bgamard
 */
public class FileDeletedAsyncEvent {
    /**
     * The deleted file.
     */
    private File file;

    /**
     * Returns the deleted file.
     *
     * @return the file
     */
    public File getFile() {
        return this.file;
    }

    /**
     * Sets the deleted file.
     *
     * @param file file
     */
    public void setFile(File file) {
        this.file = file;
    }

    @Override
    public String toString() {
        return Objects.toStringHelper(this).add("file", this.file).toString();
    }
}

View File

@ -0,0 +1,16 @@
package com.sismics.docs.core.event;
import com.google.common.base.Objects;
/**
 * Event requesting a rebuild of the index. It carries no data.
 *
 * @author bgamard
 */
public class RebuildIndexAsyncEvent {
    @Override
    public String toString() {
        // No fields to report: the helper is used without any added value
        return Objects.toStringHelper(this).toString();
    }
}

View File

@ -0,0 +1,57 @@
package com.sismics.docs.core.event;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.dao.jpa.DocumentDao;
import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.dao.lucene.LuceneDao;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.TransactionUtil;
/**
 * Listener that rebuilds the Lucene index when a rebuild event is received.
 *
 * @author bgamard
 */
public class RebuildIndexAsyncListener {
    /**
     * Logger.
     */
    private static final Logger log = LoggerFactory.getLogger(RebuildIndexAsyncListener.class);

    /**
     * Rebuild the Lucene index from all documents and files returned by the DAOs.
     *
     * @param rebuildIndexAsyncEvent Index rebuild event
     * @throws Exception on any failure raised while handling the event
     */
    @Subscribe
    public void onArticleCreated(final RebuildIndexAsyncEvent rebuildIndexAsyncEvent) throws Exception {
        if (log.isInfoEnabled()) {
            final String message = "Rebuild index event: " + rebuildIndexAsyncEvent.toString();
            log.info(message);
        }

        // Load everything and rebuild inside the runnable handed to TransactionUtil
        TransactionUtil.handle(new Runnable() {
            @Override
            public void run() {
                final List<Document> documents = new DocumentDao().findAll();
                final List<File> files = new FileDao().findAll();
                new LuceneDao().rebuildIndex(documents, files);
            }
        });
    }
}

View File

@ -0,0 +1,37 @@
package com.sismics.docs.core.listener.async;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.dao.lucene.LuceneDao;
import com.sismics.docs.core.event.DocumentCreatedAsyncEvent;
/**
 * Listener that indexes a document once it has been created.
 *
 * @author bgamard
 */
public class DocumentCreatedAsyncListener {
    /**
     * Logger.
     */
    private static final Logger log = LoggerFactory.getLogger(DocumentCreatedAsyncListener.class);

    /**
     * Add the created document to the Lucene index.
     *
     * @param documentCreatedAsyncEvent Document created event
     * @throws Exception on any failure raised while handling the event
     */
    @Subscribe
    public void on(final DocumentCreatedAsyncEvent documentCreatedAsyncEvent) throws Exception {
        if (log.isInfoEnabled()) {
            final String message = "Document created event: " + documentCreatedAsyncEvent.toString();
            log.info(message);
        }

        // Update Lucene index
        new LuceneDao().createDocument(documentCreatedAsyncEvent.getDocument());
    }
}

View File

@ -0,0 +1,37 @@
package com.sismics.docs.core.listener.async;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.dao.lucene.LuceneDao;
import com.sismics.docs.core.event.DocumentDeletedAsyncEvent;
/**
 * Listener that removes a document from the index once it has been deleted.
 *
 * @author bgamard
 */
public class DocumentDeletedAsyncListener {
    /**
     * Logger.
     */
    private static final Logger log = LoggerFactory.getLogger(DocumentDeletedAsyncListener.class);

    /**
     * Remove the deleted document from the Lucene index, using its ID.
     *
     * @param documentDeletedAsyncEvent Document deleted event
     * @throws Exception on any failure raised while handling the event
     */
    @Subscribe
    public void on(final DocumentDeletedAsyncEvent documentDeletedAsyncEvent) throws Exception {
        if (log.isInfoEnabled()) {
            final String message = "Document deleted event: " + documentDeletedAsyncEvent.toString();
            log.info(message);
        }

        // Update Lucene index
        new LuceneDao().deleteDocument(documentDeletedAsyncEvent.getDocument().getId());
    }
}

View File

@ -0,0 +1,37 @@
package com.sismics.docs.core.listener.async;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.dao.lucene.LuceneDao;
import com.sismics.docs.core.event.DocumentUpdatedAsyncEvent;
/**
 * Listener that re-indexes a document once it has been updated.
 *
 * @author bgamard
 */
public class DocumentUpdatedAsyncListener {
    /**
     * Logger.
     */
    private static final Logger log = LoggerFactory.getLogger(DocumentUpdatedAsyncListener.class);

    /**
     * Refresh the Lucene index entry of the updated document.
     *
     * @param documentUpdatedAsyncEvent Document updated event
     * @throws Exception on any failure raised while handling the event
     */
    @Subscribe
    public void on(final DocumentUpdatedAsyncEvent documentUpdatedAsyncEvent) throws Exception {
        if (log.isInfoEnabled()) {
            final String message = "Document updated event: " + documentUpdatedAsyncEvent.toString();
            log.info(message);
        }

        // Update Lucene index
        new LuceneDao().updateDocument(documentUpdatedAsyncEvent.getDocument());
    }
}

View File

@ -6,12 +6,13 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.dao.lucene.LuceneDao;
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.util.ImageUtil;
/**
* Listener on new file.
* Listener on file created.
*
* @author bgamard
*/
@ -22,13 +23,13 @@ public class FileCreatedAsyncListener {
private static final Logger log = LoggerFactory.getLogger(FileCreatedAsyncListener.class);
/**
* Process new file.
* File created.
*
* @param fileCreatedAsyncEvent New file created event
* @param fileCreatedAsyncEvent File created event
* @throws Exception
*/
@Subscribe
public void onFileCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
public void on(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
if (log.isInfoEnabled()) {
log.info("File created event: " + fileCreatedAsyncEvent.toString());
}
@ -39,5 +40,9 @@ public class FileCreatedAsyncListener {
FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), fileCreatedAsyncEvent.getFile());
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
}
// Update Lucene index
LuceneDao luceneDao = new LuceneDao();
luceneDao.createFile(fileCreatedAsyncEvent.getFile(), fileCreatedAsyncEvent.getDocument());
}
}

View File

@ -0,0 +1,39 @@
package com.sismics.docs.core.listener.async;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.dao.lucene.LuceneDao;
import com.sismics.docs.core.event.FileDeletedAsyncEvent;
/**
 * Listener that removes a file from the index once it has been deleted.
 *
 * @author bgamard
 */
public class FileDeletedAsyncListener {
    /**
     * Logger.
     */
    private static final Logger log = LoggerFactory.getLogger(FileDeletedAsyncListener.class);

    /**
     * Remove the deleted file from the Lucene index, using its ID.
     *
     * @param fileDeletedAsyncEvent File deleted event
     * @throws Exception on any failure raised while handling the event
     */
    @Subscribe
    public void on(final FileDeletedAsyncEvent fileDeletedAsyncEvent) throws Exception {
        if (log.isInfoEnabled()) {
            final String message = "File deleted event: " + fileDeletedAsyncEvent.toString();
            log.info(message);
        }

        // TODO Delete the file from storage

        // Update Lucene index
        new LuceneDao().deleteDocument(fileDeletedAsyncEvent.getFile().getId());
    }
}

View File

@ -13,7 +13,12 @@ import com.google.common.eventbus.AsyncEventBus;
import com.google.common.eventbus.EventBus;
import com.sismics.docs.core.constant.ConfigType;
import com.sismics.docs.core.dao.jpa.ConfigDao;
import com.sismics.docs.core.event.RebuildIndexAsyncListener;
import com.sismics.docs.core.listener.async.DocumentCreatedAsyncListener;
import com.sismics.docs.core.listener.async.DocumentDeletedAsyncListener;
import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener;
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
import com.sismics.docs.core.listener.async.FileDeletedAsyncListener;
import com.sismics.docs.core.listener.sync.DeadEventListener;
import com.sismics.docs.core.model.jpa.Config;
import com.sismics.docs.core.service.IndexingService;
@ -80,6 +85,11 @@ public class AppContext {
asyncEventBus = newAsyncEventBus();
asyncEventBus.register(new FileCreatedAsyncListener());
asyncEventBus.register(new FileDeletedAsyncListener());
asyncEventBus.register(new DocumentCreatedAsyncListener());
asyncEventBus.register(new DocumentUpdatedAsyncListener());
asyncEventBus.register(new DocumentDeletedAsyncListener());
asyncEventBus.register(new RebuildIndexAsyncListener());
}
/**

View File

@ -1,10 +1,9 @@
package com.sismics.docs.core.service;
import com.google.common.util.concurrent.AbstractScheduledService;
import com.sismics.docs.core.constant.Constants;
import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.util.DirectoryUtil;
import com.sismics.docs.core.util.TransactionUtil;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
@ -12,9 +11,12 @@ import org.apache.lucene.store.SimpleFSLockFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import com.google.common.util.concurrent.AbstractScheduledService;
import com.sismics.docs.core.constant.Constants;
import com.sismics.docs.core.event.RebuildIndexAsyncEvent;
import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.util.DirectoryUtil;
import com.sismics.docs.core.util.TransactionUtil;
/**
* Indexing service.
@ -85,6 +87,16 @@ public class IndexingService extends AbstractScheduledService {
return Scheduler.newFixedDelaySchedule(0, 1, TimeUnit.HOURS);
}
/**
 * Request a full rebuild of the Lucene index.
 *
 * This method only posts a {@link RebuildIndexAsyncEvent} on the application's
 * async event bus; the rebuild itself is done by whichever listener handles
 * that event.
 *
 * @throws Exception on any failure raised while posting the event
 */
public void rebuildIndex() throws Exception {
    AppContext.getInstance().getAsyncEventBus().post(new RebuildIndexAsyncEvent());
}
/**
* Getter of directory.
*

View File

@ -1,6 +1,8 @@
package com.sismics.docs.core.util;
import java.awt.color.ColorSpace;
import java.awt.image.BufferedImage;
import java.awt.image.ColorConvertOp;
import java.io.IOException;
import java.nio.file.Paths;
@ -46,12 +48,11 @@ public class FileUtil {
log.error("Error reading the image " + storedfile, e);
}
// Upscale the image if it is too small
if (image.getWidth() < 2500 || image.getHeight() < 2500) {
BufferedImage resizedImage = Scalr.resize(image, Method.AUTOMATIC, Mode.AUTOMATIC, 3500);
image.flush();
image = resizedImage;
}
// Upscale and grayscale the image
BufferedImage resizedImage = Scalr.resize(image, Method.AUTOMATIC, Mode.AUTOMATIC, 3500,
new ColorConvertOp(ColorSpace.getInstance(ColorSpace.CS_GRAY), null));
image.flush();
image = resizedImage;
// OCR the file
try {

View File

@ -1,4 +0,0 @@
- Index title and description (server)
- Use Lucene for title and description searching (server)
- Index OCR-ized content (server)
- Batch to rebuild Lucene index (server)

View File

@ -22,6 +22,7 @@ import com.sismics.docs.core.dao.jpa.DocumentDao;
import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.ConfigUtil;
@ -68,7 +69,11 @@ public class AppResource extends BaseResource {
SortCriteria sortCriteria = new SortCriteria(0, true);
DocumentCriteria documentCriteria = new DocumentCriteria();
documentCriteria.setUserId(principal.getId());
documentDao.findByCriteria(paginatedList, documentCriteria, sortCriteria);
try {
documentDao.findByCriteria(paginatedList, documentCriteria, sortCriteria);
} catch (Exception e) {
throw new ServerException("SearchError", "Error searching in documents", e);
}
response.put("document_count", paginatedList.getResultCount());
// General data
@ -146,7 +151,7 @@ public class AppResource extends BaseResource {
@POST
@Path("batch/ocr")
@Produces(MediaType.APPLICATION_JSON)
public Response batchReindex() throws JSONException {
public Response batchOcr() throws JSONException {
if (!authenticate()) {
throw new ForbiddenClientException();
}
@ -164,4 +169,29 @@ public class AppResource extends BaseResource {
response.put("status", "ok");
return Response.ok().entity(response).build();
}
/**
 * Destroy and rebuild Lucene index.
 *
 * The caller must be authenticated and pass the {@code BaseFunction.ADMIN} check.
 * Any exception thrown while triggering the rebuild is reported as an
 * "IndexingError" server exception.
 *
 * @return Response with a JSON body containing {@code "status": "ok"}
 * @throws JSONException
 */
@POST
@Path("batch/reindex")
@Produces(MediaType.APPLICATION_JSON)
public Response batchReindex() throws JSONException {
    if (!authenticate()) {
        throw new ForbiddenClientException();
    }
    checkBaseFunction(BaseFunction.ADMIN);

    // Trigger the rebuild through the indexing service
    try {
        AppContext.getInstance().getIndexingService().rebuildIndex();
    } catch (Exception e) {
        throw new ServerException("IndexingError", "Error rebuilding index", e);
    }

    final JSONObject status = new JSONObject();
    status.put("status", "ok");
    return Response.ok().entity(status).build();
}
}

View File

@ -38,6 +38,10 @@ import com.sismics.docs.core.dao.jpa.TagDao;
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.dao.jpa.dto.TagDto;
import com.sismics.docs.core.event.DocumentCreatedAsyncEvent;
import com.sismics.docs.core.event.DocumentDeletedAsyncEvent;
import com.sismics.docs.core.event.DocumentUpdatedAsyncEvent;
import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.Share;
import com.sismics.docs.core.model.jpa.Tag;
@ -46,6 +50,7 @@ import com.sismics.docs.core.util.jpa.PaginatedLists;
import com.sismics.docs.core.util.jpa.SortCriteria;
import com.sismics.rest.exception.ClientException;
import com.sismics.rest.exception.ForbiddenClientException;
import com.sismics.rest.exception.ServerException;
import com.sismics.rest.util.ValidationUtil;
/**
@ -148,7 +153,11 @@ public class DocumentResource extends BaseResource {
SortCriteria sortCriteria = new SortCriteria(sortColumn, asc);
DocumentCriteria documentCriteria = parseSearchQuery(search);
documentCriteria.setUserId(principal.getId());
documentDao.findByCriteria(paginatedList, documentCriteria, sortCriteria);
try {
documentDao.findByCriteria(paginatedList, documentCriteria, sortCriteria);
} catch (Exception e) {
throw new ServerException("SearchError", "Error searching in documents", e);
}
for (DocumentDto documentDto : paginatedList.getResultList()) {
JSONObject document = new JSONObject();
@ -296,6 +305,11 @@ public class DocumentResource extends BaseResource {
// Update tags
updateTagList(documentId, tagList);
// Raise a document created event
DocumentCreatedAsyncEvent documentCreatedAsyncEvent = new DocumentCreatedAsyncEvent();
documentCreatedAsyncEvent.setDocument(document);
AppContext.getInstance().getAsyncEventBus().post(documentCreatedAsyncEvent);
JSONObject response = new JSONObject();
response.put("id", documentId);
return Response.ok().entity(response).build();
@ -358,6 +372,11 @@ public class DocumentResource extends BaseResource {
// Update tags
updateTagList(id, tagList);
// Raise a document updated event
DocumentUpdatedAsyncEvent documentUpdatedAsyncEvent = new DocumentUpdatedAsyncEvent();
documentUpdatedAsyncEvent.setDocument(document);
AppContext.getInstance().getAsyncEventBus().post(documentUpdatedAsyncEvent);
// Always return ok
JSONObject response = new JSONObject();
response.put("id", id);
@ -415,6 +434,11 @@ public class DocumentResource extends BaseResource {
throw new ClientException("DocumentNotFound", MessageFormat.format("Document not found: {0}", id));
}
// Raise a document deleted event
DocumentDeletedAsyncEvent documentDeletedAsyncEvent = new DocumentDeletedAsyncEvent();
documentDeletedAsyncEvent.setDocument(document);
AppContext.getInstance().getAsyncEventBus().post(documentDeletedAsyncEvent);
// Delete the document
documentDao.delete(document.getId());

View File

@ -30,6 +30,7 @@ import com.sismics.docs.core.dao.jpa.DocumentDao;
import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.dao.jpa.ShareDao;
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
import com.sismics.docs.core.event.FileDeletedAsyncEvent;
import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
@ -247,8 +248,12 @@ public class FileResource extends BaseResource {
throw new ClientException("FileNotFound", MessageFormat.format("File not found: {0}", id));
}
// Raise a new file deleted event
FileDeletedAsyncEvent fileDeletedAsyncEvent = new FileDeletedAsyncEvent();
fileDeletedAsyncEvent.setFile(file);
AppContext.getInstance().getAsyncEventBus().post(fileDeletedAsyncEvent);
// Delete the file
// TODO Delete the file from storage too
fileDao.delete(file.getId());
// Always return ok

View File

@ -49,6 +49,12 @@ public class TestAppResource extends BaseJerseyTest {
response = appResource.post(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
json = response.getEntity(JSONObject.class);
// Rebuild Lucene index
appResource = resource().path("/app/batch/reindex");
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
response = appResource.post(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
}
/**

View File

@ -54,7 +54,7 @@ public class TestDocumentResource extends BaseJerseyTest {
WebResource documentResource = resource().path("/document");
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
postParams = new MultivaluedMapImpl();
postParams.add("title", "My super document 1");
postParams.add("title", "My super title document 1");
postParams.add("description", "My super description for document 1");
postParams.add("tags", tag1Id);
postParams.add("language", "eng");
@ -121,6 +121,28 @@ public class TestDocumentResource extends BaseJerseyTest {
Assert.assertEquals(document1Id, documents.getJSONObject(0).getString("id"));
Assert.assertEquals(create1Date, documents.getJSONObject(0).getLong("create_date"));
// Search documents by query
documentResource = resource().path("/document/list");
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
getParams = new MultivaluedMapImpl();
getParams.putSingle("search", "title");
response = documentResource.queryParams(getParams).get(ClientResponse.class);
json = response.getEntity(JSONObject.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
documents = json.getJSONArray("documents");
Assert.assertTrue(documents.length() == 1);
// Search documents by query
documentResource = resource().path("/document/list");
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
getParams = new MultivaluedMapImpl();
getParams.putSingle("search", "description");
response = documentResource.queryParams(getParams).get(ClientResponse.class);
json = response.getEntity(JSONObject.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
documents = json.getJSONArray("documents");
Assert.assertTrue(documents.length() == 1);
// Search documents by date
documentResource = resource().path("/document/list");
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));

View File

@ -6,4 +6,4 @@ log4j.appender.MEMORY=com.sismics.util.log4j.MemoryAppender
log4j.appender.MEMORY.size=1000
log4j.logger.com.sismics=DEBUG
log4j.logger.org.hibernate.internal.util.EntityPrinter=INFO
log4j.logger.org.hibernate=ERROR