mirror of
https://github.com/sismics/docs.git
synced 2024-11-21 21:47:57 +01:00
Document language (server), OCR files and store result in database
This commit is contained in:
parent
70a86dc86f
commit
1f1f02ed41
@ -117,6 +117,22 @@
|
||||
<artifactId>imgscalr-lib</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- OCR dependencies -->
|
||||
<dependency>
|
||||
<groupId>jna</groupId>
|
||||
<artifactId>jna</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>jai</groupId>
|
||||
<artifactId>imageio</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>tess4j</groupId>
|
||||
<artifactId>tess4j</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- Test dependencies -->
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
|
@ -1,5 +1,9 @@
|
||||
package com.sismics.docs.core.constant;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
/**
|
||||
* Application constants.
|
||||
*
|
||||
@ -40,4 +44,9 @@ public class Constants {
|
||||
* Default generic user role.
|
||||
*/
|
||||
public static final String DEFAULT_USER_ROLE = "user";
|
||||
|
||||
/**
|
||||
* Supported document languages.
|
||||
*/
|
||||
public static final List<String> SUPPORTED_LANGUAGES = Lists.newArrayList("eng", "fra");
|
||||
}
|
||||
|
@ -123,7 +123,7 @@ public class DocumentDao {
|
||||
Map<String, Object> parameterMap = new HashMap<String, Object>();
|
||||
List<String> criteriaList = new ArrayList<String>();
|
||||
|
||||
StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, s.SHA_ID_C is not null c4 ");
|
||||
StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, d.DOC_LANGUAGE_C c4, s.SHA_ID_C is not null c5 ");
|
||||
sb.append(" from T_DOCUMENT d ");
|
||||
sb.append(" left join T_SHARE s on s.SHA_IDDOCUMENT_C = d.DOC_ID_C and s.SHA_DELETEDATE_D is null ");
|
||||
|
||||
@ -156,6 +156,10 @@ public class DocumentDao {
|
||||
if (criteria.getShared() != null && criteria.getShared()) {
|
||||
criteriaList.add("s.SHA_ID_C is not null");
|
||||
}
|
||||
if (criteria.getLanguage() != null) {
|
||||
criteriaList.add("d.DOC_LANGUAGE_C = :language");
|
||||
parameterMap.put("language", criteria.getLanguage());
|
||||
}
|
||||
|
||||
criteriaList.add("d.DOC_DELETEDATE_D is null");
|
||||
|
||||
@ -177,6 +181,7 @@ public class DocumentDao {
|
||||
documentDto.setTitle((String) o[i++]);
|
||||
documentDto.setDescription((String) o[i++]);
|
||||
documentDto.setCreateTimestamp(((Timestamp) o[i++]).getTime());
|
||||
documentDto.setLanguage((String) o[i++]);
|
||||
documentDto.setShared((Boolean) o[i++]);
|
||||
documentDtoList.add(documentDto);
|
||||
}
|
||||
|
@ -1,14 +1,15 @@
|
||||
package com.sismics.docs.core.dao.jpa;
|
||||
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.util.context.ThreadLocalContext;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import javax.persistence.EntityManager;
|
||||
import javax.persistence.NoResultException;
|
||||
import javax.persistence.Query;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.util.context.ThreadLocalContext;
|
||||
|
||||
/**
|
||||
* File DAO.
|
||||
@ -66,6 +67,26 @@ public class FileDao {
|
||||
fileDb.setDeleteDate(dateNow);
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the content of a file.
|
||||
*
|
||||
* @param file File to update
|
||||
* @return Updated file
|
||||
*/
|
||||
public File updateContent(File file) {
|
||||
EntityManager em = ThreadLocalContext.get().getEntityManager();
|
||||
|
||||
// Get the file
|
||||
Query q = em.createQuery("select f from File f where f.id = :id and f.deleteDate is null");
|
||||
q.setParameter("id", file.getId());
|
||||
File fileFromDb = (File) q.getSingleResult();
|
||||
|
||||
// Update the user
|
||||
fileFromDb.setContent(file.getContent());
|
||||
|
||||
return file;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a file by its ID.
|
||||
*
|
||||
|
@ -40,6 +40,11 @@ public class DocumentCriteria {
|
||||
*/
|
||||
private Boolean shared;
|
||||
|
||||
/**
|
||||
* Language.
|
||||
*/
|
||||
private String language;
|
||||
|
||||
/**
|
||||
* Getter of userId.
|
||||
*
|
||||
@ -147,4 +152,22 @@ public class DocumentCriteria {
|
||||
public void setShared(Boolean shared) {
|
||||
this.shared = shared;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of language.
|
||||
*
|
||||
* @return the language
|
||||
*/
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of language.
|
||||
*
|
||||
* @param language language
|
||||
*/
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
}
|
||||
|
@ -24,6 +24,11 @@ public class DocumentDto {
|
||||
*/
|
||||
private String description;
|
||||
|
||||
/**
|
||||
* Language.
|
||||
*/
|
||||
private String language;
|
||||
|
||||
/**
|
||||
* Creation date.
|
||||
*/
|
||||
@ -123,4 +128,22 @@ public class DocumentDto {
|
||||
public void setShared(Boolean shared) {
|
||||
this.shared = shared;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of language.
|
||||
*
|
||||
* @return the language
|
||||
*/
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of language.
|
||||
*
|
||||
* @param language language
|
||||
*/
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,151 @@
|
||||
package com.sismics.docs.core.dao.lucene;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.TermsFilter;
|
||||
import org.apache.lucene.queryparser.flexible.standard.QueryParserUtil;
|
||||
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import com.sismics.docs.core.model.context.AppContext;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.docs.core.util.LuceneUtil;
|
||||
import com.sismics.docs.core.util.LuceneUtil.LuceneRunnable;
|
||||
|
||||
/**
|
||||
* Lucene DAO.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class LuceneDao {
|
||||
|
||||
/**
|
||||
* Destroy and rebuild index.
|
||||
*
|
||||
* @param fileList
|
||||
*/
|
||||
public void rebuildIndex(final List<File> fileList) {
|
||||
LuceneUtil.handle(new LuceneRunnable() {
|
||||
@Override
|
||||
public void run(IndexWriter indexWriter) throws Exception {
|
||||
// Empty index
|
||||
indexWriter.deleteAll();
|
||||
|
||||
// Add all files
|
||||
for (File file : fileList) {
|
||||
org.apache.lucene.document.Document document = getDocumentFromFile(file);
|
||||
indexWriter.addDocument(document);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Add files to the index.
|
||||
*
|
||||
* @param fileList
|
||||
*/
|
||||
public void create(final List<File> fileList) {
|
||||
LuceneUtil.handle(new LuceneRunnable() {
|
||||
@Override
|
||||
public void run(IndexWriter indexWriter) throws Exception {
|
||||
// Add all files
|
||||
for (File file : fileList) {
|
||||
org.apache.lucene.document.Document document = getDocumentFromFile(file);
|
||||
indexWriter.addDocument(document);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Update index.
|
||||
*
|
||||
* @param fileList File list
|
||||
*/
|
||||
public void update(final List<File> fileList) {
|
||||
LuceneUtil.handle(new LuceneRunnable() {
|
||||
@Override
|
||||
public void run(IndexWriter indexWriter) throws Exception {
|
||||
// Update all files
|
||||
for (File file : fileList) {
|
||||
org.apache.lucene.document.Document document = getDocumentFromFile(file);
|
||||
indexWriter.updateDocument(new Term("id", file.getId()), document);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Search files.
|
||||
*
|
||||
* @param paginatedList
|
||||
* @param feedList
|
||||
* @param searchQuery
|
||||
* @return List of file IDs
|
||||
* @throws Exception
|
||||
*/
|
||||
public List<String> search(String userId, String searchQuery, int limit) throws Exception {
|
||||
// Escape query and add quotes so QueryParser generate a PhraseQuery
|
||||
searchQuery = "\"" + QueryParserUtil.escape(searchQuery) + "\"";
|
||||
|
||||
// Build search query
|
||||
StandardQueryParser qpHelper = new StandardQueryParser(new DocsStandardAnalyzer(Version.LUCENE_42));
|
||||
qpHelper.setPhraseSlop(100000); // PhraseQuery add terms
|
||||
Query contentQuery = qpHelper.parse(searchQuery, "content");
|
||||
|
||||
// Search on file content
|
||||
BooleanQuery query = new BooleanQuery();
|
||||
query.add(contentQuery, Occur.SHOULD);
|
||||
|
||||
// Filter on provided user ID
|
||||
List<Term> terms = new ArrayList<Term>();
|
||||
terms.add(new Term("user_id", userId));
|
||||
TermsFilter feedsFilter = new TermsFilter(terms);
|
||||
|
||||
// Search
|
||||
IndexReader reader = DirectoryReader.open(AppContext.getInstance().getLuceneDirectory());
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
TopDocs topDocs = searcher.search(query, feedsFilter, limit);
|
||||
ScoreDoc[] docs = topDocs.scoreDocs;
|
||||
|
||||
// Extract file IDs
|
||||
List<String> fileIdList = new ArrayList<String>();
|
||||
for (int i = 0; i < docs.length; i++) {
|
||||
String id = searcher.doc(docs[i].doc).get("id");
|
||||
fileIdList.add(id);
|
||||
}
|
||||
|
||||
return fileIdList;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build Lucene document from file.
|
||||
*
|
||||
* @param file File
|
||||
* @return Document
|
||||
*/
|
||||
private org.apache.lucene.document.Document getDocumentFromFile(File file) {
|
||||
// Building document
|
||||
org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
|
||||
document.add(new StringField("id", file.getId(), Field.Store.YES));
|
||||
document.add(new TextField("content", file.getContent(), Field.Store.NO));
|
||||
|
||||
return document;
|
||||
}
|
||||
}
|
@ -0,0 +1,66 @@
|
||||
package com.sismics.docs.core.event;
|
||||
|
||||
import com.google.common.base.Objects;
|
||||
import com.sismics.docs.core.model.jpa.Document;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
|
||||
/**
|
||||
* New file created event.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class FileCreatedAsyncEvent {
|
||||
/**
|
||||
* Created file.
|
||||
*/
|
||||
private File file;
|
||||
|
||||
/**
|
||||
* Document linked to the file.
|
||||
*/
|
||||
private Document document;
|
||||
|
||||
/**
|
||||
* Getter of file.
|
||||
*
|
||||
* @return the file
|
||||
*/
|
||||
public File getFile() {
|
||||
return file;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of file.
|
||||
*
|
||||
* @param file file
|
||||
*/
|
||||
public void setFile(File file) {
|
||||
this.file = file;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of document.
|
||||
*
|
||||
* @return the document
|
||||
*/
|
||||
public Document getDocument() {
|
||||
return document;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of document.
|
||||
*
|
||||
* @param document document
|
||||
*/
|
||||
public void setDocument(Document document) {
|
||||
this.document = document;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return Objects.toStringHelper(this)
|
||||
.add("file", file)
|
||||
.add("document", document)
|
||||
.toString();
|
||||
}
|
||||
}
|
@ -0,0 +1,43 @@
|
||||
package com.sismics.docs.core.listener.async;
|
||||
|
||||
import java.text.MessageFormat;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.eventbus.Subscribe;
|
||||
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
|
||||
import com.sismics.docs.core.util.FileUtil;
|
||||
import com.sismics.util.ImageUtil;
|
||||
|
||||
/**
|
||||
* Listener on new file.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class FileCreatedAsyncListener {
|
||||
/**
|
||||
* Logger.
|
||||
*/
|
||||
private static final Logger log = LoggerFactory.getLogger(FileCreatedAsyncListener.class);
|
||||
|
||||
/**
|
||||
* Process new file.
|
||||
*
|
||||
* @param fileCreatedAsyncEvent New file created event
|
||||
* @throws Exception
|
||||
*/
|
||||
@Subscribe
|
||||
public void onArticleCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
|
||||
if (log.isInfoEnabled()) {
|
||||
log.info("File created event: " + fileCreatedAsyncEvent.toString());
|
||||
}
|
||||
|
||||
// OCR the file if it is an image
|
||||
if (ImageUtil.isImage(fileCreatedAsyncEvent.getFile().getMimeType())) {
|
||||
long startTime = System.currentTimeMillis();
|
||||
FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), fileCreatedAsyncEvent.getFile());
|
||||
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +1,5 @@
|
||||
package com.sismics.docs.core.model.context;
|
||||
|
||||
import com.google.common.eventbus.AsyncEventBus;
|
||||
import com.google.common.eventbus.EventBus;
|
||||
import com.sismics.docs.core.constant.ConfigType;
|
||||
import com.sismics.docs.core.dao.jpa.ConfigDao;
|
||||
import com.sismics.docs.core.listener.sync.DeadEventListener;
|
||||
import com.sismics.docs.core.model.jpa.Config;
|
||||
import com.sismics.docs.core.service.IndexingService;
|
||||
import com.sismics.util.EnvironmentUtil;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
@ -17,6 +7,18 @@ import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.ThreadPoolExecutor;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
import com.google.common.eventbus.AsyncEventBus;
|
||||
import com.google.common.eventbus.EventBus;
|
||||
import com.sismics.docs.core.constant.ConfigType;
|
||||
import com.sismics.docs.core.dao.jpa.ConfigDao;
|
||||
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
|
||||
import com.sismics.docs.core.listener.sync.DeadEventListener;
|
||||
import com.sismics.docs.core.model.jpa.Config;
|
||||
import com.sismics.docs.core.service.IndexingService;
|
||||
import com.sismics.util.EnvironmentUtil;
|
||||
|
||||
/**
|
||||
* Global application context.
|
||||
*
|
||||
@ -77,6 +79,7 @@ public class AppContext {
|
||||
asyncExecutorList = new ArrayList<ExecutorService>();
|
||||
|
||||
asyncEventBus = newAsyncEventBus();
|
||||
asyncEventBus.register(new FileCreatedAsyncListener());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -29,6 +29,12 @@ public class Document {
|
||||
@Column(name = "DOC_IDUSER_C", nullable = false, length = 36)
|
||||
private String userId;
|
||||
|
||||
/**
|
||||
* Language (three-letter ISO 639-2 code, e.g. "eng" or "fra").
|
||||
*/
|
||||
@Column(name = "DOC_LANGUAGE_C", nullable = false, length = 3)
|
||||
private String language;
|
||||
|
||||
/**
|
||||
* Title.
|
||||
*/
|
||||
@ -71,6 +77,24 @@ public class Document {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of language.
|
||||
*
|
||||
* @return the language
|
||||
*/
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of language.
|
||||
*
|
||||
* @param language language
|
||||
*/
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of userId.
|
||||
*
|
||||
|
@ -5,6 +5,7 @@ import com.google.common.base.Objects;
|
||||
import javax.persistence.Column;
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.Id;
|
||||
import javax.persistence.Lob;
|
||||
import javax.persistence.Table;
|
||||
import java.util.Date;
|
||||
|
||||
@ -30,11 +31,18 @@ public class File {
|
||||
private String documentId;
|
||||
|
||||
/**
|
||||
* Document ID.
|
||||
* MIME type.
|
||||
*/
|
||||
@Column(name = "FIL_MIMETYPE_C", length = 100)
|
||||
private String mimeType;
|
||||
|
||||
/**
|
||||
* OCR-ized content.
|
||||
*/
|
||||
@Lob
|
||||
@Column(name = "FIL_CONTENT_C")
|
||||
private String content;
|
||||
|
||||
/**
|
||||
* Creation date.
|
||||
*/
|
||||
@ -143,6 +151,24 @@ public class File {
|
||||
this.deleteDate = deleteDate;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of content.
|
||||
*
|
||||
* @return the content
|
||||
*/
|
||||
public String getContent() {
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of content.
|
||||
*
|
||||
* @param content content
|
||||
*/
|
||||
public void setContent(String content) {
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of order.
|
||||
*
|
||||
|
@ -0,0 +1,75 @@
|
||||
package com.sismics.docs.core.util;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import net.sourceforge.tess4j.Tesseract;
|
||||
|
||||
import org.imgscalr.Scalr;
|
||||
import org.imgscalr.Scalr.Method;
|
||||
import org.imgscalr.Scalr.Mode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||
import com.sismics.docs.core.model.jpa.Document;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
|
||||
/**
|
||||
* File entity utilities.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class FileUtil {
|
||||
/**
|
||||
* Logger.
|
||||
*/
|
||||
private static final Logger log = LoggerFactory.getLogger(FileUtil.class);
|
||||
|
||||
/**
|
||||
* OCR a file.
|
||||
*
|
||||
* @param document Document linked to the file
|
||||
* @param file File to OCR
|
||||
*/
|
||||
public static void ocrFile(Document document, final File file) {
|
||||
Tesseract instance = Tesseract.getInstance();
|
||||
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
|
||||
String content = null;
|
||||
BufferedImage image = null;
|
||||
try {
|
||||
image = ImageIO.read(storedfile);
|
||||
} catch (IOException e) {
|
||||
log.error("Error reading the image " + storedfile, e);
|
||||
}
|
||||
|
||||
// Upscale the image if it is too small
|
||||
if (image.getWidth() < 2500 || image.getHeight() < 2500) {
|
||||
BufferedImage resizedImage = Scalr.resize(image, Method.AUTOMATIC, Mode.AUTOMATIC, 3500);
|
||||
image.flush();
|
||||
image = resizedImage;
|
||||
}
|
||||
|
||||
// OCR the file
|
||||
try {
|
||||
instance.setLanguage(document.getLanguage());
|
||||
content = instance.doOCR(image);
|
||||
} catch (Exception e) {
|
||||
log.error("Error while OCR-izing the file " + storedfile, e);
|
||||
}
|
||||
|
||||
file.setContent(content);
|
||||
|
||||
// Store the OCR-ization result in the database
|
||||
TransactionUtil.handle(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
FileDao fileDao = new FileDao();
|
||||
fileDao.updateContent(file);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
@ -1 +1 @@
|
||||
db.version=4
|
||||
db.version=5
|
@ -0,0 +1,3 @@
|
||||
alter table T_FILE add column FIL_CONTENT_C LONGVARCHAR;
|
||||
alter table T_DOCUMENT add column DOC_LANGUAGE_C varchar(3) default 'fra' not null;
|
||||
update T_CONFIG set CFG_VALUE_C='5' where CFG_ID_C='DB_VERSION';
|
@ -0,0 +1,7 @@
|
||||
- Add language on document (client)
|
||||
- Index title and description (server)
|
||||
- Use Lucene for title and description searching (server)
|
||||
- Index OCR-ized content (server)
|
||||
- Search in OCR-ized files (server)
|
||||
- Batch to OCR all documents (server)
|
||||
- Batch to rebuild Lucene index (server)
|
BIN
docs-parent/lib/jai_imageio.jar
Normal file
BIN
docs-parent/lib/jai_imageio.jar
Normal file
Binary file not shown.
BIN
docs-parent/lib/jna.jar
Normal file
BIN
docs-parent/lib/jna.jar
Normal file
Binary file not shown.
BIN
docs-parent/lib/tess4j.jar
Normal file
BIN
docs-parent/lib/tess4j.jar
Normal file
Binary file not shown.
@ -159,6 +159,7 @@
|
||||
<artifactId>osxappbundle-maven-plugin</artifactId>
|
||||
<version>${org.codehaus.mojo.osxappbundle-maven-plugin.version}</version>
|
||||
</plugin>
|
||||
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
@ -434,6 +435,25 @@
|
||||
<artifactId>imgscalr-lib</artifactId>
|
||||
<version>${org.imgscalr.imgscalr-lib.version}</version>
|
||||
</dependency>
|
||||
|
||||
<!-- OCR dependencies -->
|
||||
<dependency>
|
||||
<groupId>jna</groupId>
|
||||
<artifactId>jna</artifactId>
|
||||
<version>1.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>jai</groupId>
|
||||
<artifactId>imageio</artifactId>
|
||||
<version>1.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>tess4j</groupId>
|
||||
<artifactId>tess4j</artifactId>
|
||||
<version>1.0</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</dependencyManagement>
|
||||
|
||||
@ -450,4 +470,74 @@
|
||||
</releases>
|
||||
</pluginRepository>
|
||||
</pluginRepositories>
|
||||
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>init</id>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-install-plugin</artifactId>
|
||||
<version>2.3.1</version>
|
||||
<executions>
|
||||
|
||||
<execution>
|
||||
<id>install-jna</id>
|
||||
<phase>validate</phase>
|
||||
<configuration>
|
||||
<file>${project.basedir}/lib/jna.jar</file>
|
||||
<repositoryLayout>default</repositoryLayout>
|
||||
<groupId>jna</groupId>
|
||||
<artifactId>jna</artifactId>
|
||||
<version>1.0</version>
|
||||
<packaging>jar</packaging>
|
||||
<generatePom>true</generatePom>
|
||||
</configuration>
|
||||
<goals>
|
||||
<goal>install-file</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
|
||||
<execution>
|
||||
<id>install-jai-imageio</id>
|
||||
<phase>validate</phase>
|
||||
<configuration>
|
||||
<file>${project.basedir}/lib/jai_imageio.jar</file>
|
||||
<repositoryLayout>default</repositoryLayout>
|
||||
<groupId>jai</groupId>
|
||||
<artifactId>imageio</artifactId>
|
||||
<version>1.0</version>
|
||||
<packaging>jar</packaging>
|
||||
<generatePom>true</generatePom>
|
||||
</configuration>
|
||||
<goals>
|
||||
<goal>install-file</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
|
||||
<execution>
|
||||
<id>install-tess4j</id>
|
||||
<phase>validate</phase>
|
||||
<configuration>
|
||||
<file>${project.basedir}/lib/tess4j.jar</file>
|
||||
<repositoryLayout>default</repositoryLayout>
|
||||
<groupId>tess4j</groupId>
|
||||
<artifactId>tess4j</artifactId>
|
||||
<version>1.0</version>
|
||||
<packaging>jar</packaging>
|
||||
<generatePom>true</generatePom>
|
||||
</configuration>
|
||||
<goals>
|
||||
<goal>install-file</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
</profiles>
|
||||
</project>
|
||||
|
@ -1,3 +1,3 @@
|
||||
api.current_version=${project.version}
|
||||
api.min_version=1.0
|
||||
db.version=4
|
||||
db.version=5
|
@ -31,6 +31,7 @@ import org.joda.time.format.DateTimeFormatterBuilder;
|
||||
import org.joda.time.format.DateTimeParser;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import com.sismics.docs.core.constant.Constants;
|
||||
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
||||
import com.sismics.docs.core.dao.jpa.ShareDao;
|
||||
import com.sismics.docs.core.dao.jpa.TagDao;
|
||||
@ -155,6 +156,7 @@ public class DocumentResource extends BaseResource {
|
||||
document.put("description", documentDto.getDescription());
|
||||
document.put("create_date", documentDto.getCreateTimestamp());
|
||||
document.put("shared", documentDto.getShared());
|
||||
document.put("language", documentDto.getLanguage());
|
||||
|
||||
// Get tags
|
||||
List<TagDto> tagDtoList = tagDao.getByDocumentId(documentDto.getId());
|
||||
@ -178,7 +180,7 @@ public class DocumentResource extends BaseResource {
|
||||
|
||||
/**
|
||||
* Parse a query according to the specified syntax, eg.:
|
||||
* tag:assurance tag:other before:2012 after:2011-09 shared:yes thing
|
||||
* tag:assurance tag:other before:2012 after:2011-09 shared:yes lang:fra thing
|
||||
*
|
||||
* @param search Search query
|
||||
* @return DocumentCriteria
|
||||
@ -233,6 +235,11 @@ public class DocumentResource extends BaseResource {
|
||||
if (params[1].equals("yes")) {
|
||||
documentCriteria.setShared(true);
|
||||
}
|
||||
} else if (params[0].equals("lang")) {
|
||||
// New language criteria
|
||||
if (Constants.SUPPORTED_LANGUAGES.contains(params[1])) {
|
||||
documentCriteria.setLanguage(params[1]);
|
||||
}
|
||||
} else {
|
||||
query.append(criteria);
|
||||
}
|
||||
@ -256,6 +263,7 @@ public class DocumentResource extends BaseResource {
|
||||
@FormParam("title") String title,
|
||||
@FormParam("description") String description,
|
||||
@FormParam("tags") List<String> tagList,
|
||||
@FormParam("language") String language,
|
||||
@FormParam("create_date") String createDateStr) throws JSONException {
|
||||
if (!authenticate()) {
|
||||
throw new ForbiddenClientException();
|
||||
@ -263,8 +271,12 @@ public class DocumentResource extends BaseResource {
|
||||
|
||||
// Validate input data
|
||||
title = ValidationUtil.validateLength(title, "title", 1, 100, false);
|
||||
language = ValidationUtil.validateLength(language, "language", 3, 3, false);
|
||||
description = ValidationUtil.validateLength(description, "description", 0, 4000, true);
|
||||
Date createDate = ValidationUtil.validateDate(createDateStr, "create_date", true);
|
||||
if (!Constants.SUPPORTED_LANGUAGES.contains(language)) {
|
||||
throw new ClientException("ValidationError", MessageFormat.format("{0} is not a supported language", language));
|
||||
}
|
||||
|
||||
// Create the document
|
||||
DocumentDao documentDao = new DocumentDao();
|
||||
@ -272,6 +284,7 @@ public class DocumentResource extends BaseResource {
|
||||
document.setUserId(principal.getId());
|
||||
document.setTitle(title);
|
||||
document.setDescription(description);
|
||||
document.setLanguage(language);
|
||||
if (createDate == null) {
|
||||
document.setCreateDate(new Date());
|
||||
} else {
|
||||
@ -303,6 +316,7 @@ public class DocumentResource extends BaseResource {
|
||||
@FormParam("title") String title,
|
||||
@FormParam("description") String description,
|
||||
@FormParam("tags") List<String> tagList,
|
||||
@FormParam("language") String language,
|
||||
@FormParam("create_date") String createDateStr) throws JSONException {
|
||||
if (!authenticate()) {
|
||||
throw new ForbiddenClientException();
|
||||
@ -310,8 +324,12 @@ public class DocumentResource extends BaseResource {
|
||||
|
||||
// Validate input data
|
||||
title = ValidationUtil.validateLength(title, "title", 1, 100, true);
|
||||
language = ValidationUtil.validateLength(language, "language", 3, 3, true);
|
||||
description = ValidationUtil.validateLength(description, "description", 0, 4000, true);
|
||||
Date createDate = ValidationUtil.validateDate(createDateStr, "create_date", true);
|
||||
if (language != null && !Constants.SUPPORTED_LANGUAGES.contains(language)) {
|
||||
throw new ClientException("ValidationError", MessageFormat.format("{0} is not a supported language", language));
|
||||
}
|
||||
|
||||
// Get the document
|
||||
DocumentDao documentDao = new DocumentDao();
|
||||
@ -332,6 +350,9 @@ public class DocumentResource extends BaseResource {
|
||||
if (createDate != null) {
|
||||
document.setCreateDate(createDate);
|
||||
}
|
||||
if (language != null) {
|
||||
document.setLanguage(language);
|
||||
}
|
||||
|
||||
// Update tags
|
||||
updateTagList(id, tagList);
|
||||
|
@ -29,6 +29,8 @@ import org.codehaus.jettison.json.JSONObject;
|
||||
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
||||
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||
import com.sismics.docs.core.dao.jpa.ShareDao;
|
||||
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
|
||||
import com.sismics.docs.core.model.context.AppContext;
|
||||
import com.sismics.docs.core.model.jpa.Document;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.docs.core.util.DirectoryUtil;
|
||||
@ -110,6 +112,12 @@ public class FileResource extends BaseResource {
|
||||
|
||||
// Save the file
|
||||
FileUtil.save(is, file);
|
||||
|
||||
// Raise a new file created event
|
||||
FileCreatedAsyncEvent fileCreatedAsyncEvent = new FileCreatedAsyncEvent();
|
||||
fileCreatedAsyncEvent.setDocument(document);
|
||||
fileCreatedAsyncEvent.setFile(file);
|
||||
AppContext.getInstance().getAsyncEventBus().post(fileCreatedAsyncEvent);
|
||||
|
||||
// Always return ok
|
||||
JSONObject response = new JSONObject();
|
||||
|
@ -1,3 +1,3 @@
|
||||
api.current_version=${project.version}
|
||||
api.min_version=1.0
|
||||
db.version=4
|
||||
db.version=5
|
@ -50,6 +50,7 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
postParams.add("title", "My super document 1");
|
||||
postParams.add("description", "My super description for document 1");
|
||||
postParams.add("tags", tag1Id);
|
||||
postParams.add("language", "eng");
|
||||
long create1Date = new Date().getTime();
|
||||
postParams.add("create_date", create1Date);
|
||||
response = documentResource.put(ClientResponse.class, postParams);
|
||||
@ -80,6 +81,7 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
JSONArray tags = documents.getJSONObject(0).getJSONArray("tags");
|
||||
Assert.assertTrue(documents.length() == 1);
|
||||
Assert.assertEquals(document1Id, documents.getJSONObject(0).getString("id"));
|
||||
Assert.assertEquals("eng", documents.getJSONObject(0).getString("language"));
|
||||
Assert.assertEquals(1, tags.length());
|
||||
Assert.assertEquals(tag1Id, tags.getJSONObject(0).getString("id"));
|
||||
Assert.assertEquals("SuperTag", tags.getJSONObject(0).getString("name"));
|
||||
@ -135,18 +137,30 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
Assert.assertEquals(document1Id, documents.getJSONObject(0).getString("id"));
|
||||
Assert.assertEquals(true, documents.getJSONObject(0).getBoolean("shared"));
|
||||
|
||||
// Search documents with multiple criteria
|
||||
// Search documents by language
|
||||
documentResource = resource().path("/document/list");
|
||||
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||
getParams = new MultivaluedMapImpl();
|
||||
getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes for");
|
||||
getParams.putSingle("search", "lang:eng");
|
||||
response = documentResource.queryParams(getParams).get(ClientResponse.class);
|
||||
json = response.getEntity(JSONObject.class);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
documents = json.getJSONArray("documents");
|
||||
Assert.assertTrue(documents.length() == 1);
|
||||
Assert.assertEquals(document1Id, documents.getJSONObject(0).getString("id"));
|
||||
Assert.assertEquals("eng", documents.getJSONObject(0).getString("language"));
|
||||
|
||||
// Search documents with multiple criteria
|
||||
documentResource = resource().path("/document/list");
|
||||
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||
getParams = new MultivaluedMapImpl();
|
||||
getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng for");
|
||||
response = documentResource.queryParams(getParams).get(ClientResponse.class);
|
||||
json = response.getEntity(JSONObject.class);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
documents = json.getJSONArray("documents");
|
||||
Assert.assertTrue(documents.length() == 1);
|
||||
Assert.assertEquals(document1Id, documents.getJSONObject(0).getString("id"));
|
||||
Assert.assertEquals(true, documents.getJSONObject(0).getBoolean("shared"));
|
||||
|
||||
// Search documents (nothing)
|
||||
documentResource = resource().path("/document/list");
|
||||
@ -181,6 +195,17 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
documents = json.getJSONArray("documents");
|
||||
Assert.assertTrue(documents.length() == 0);
|
||||
|
||||
// Search documents (nothing)
|
||||
documentResource = resource().path("/document/list");
|
||||
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||
getParams = new MultivaluedMapImpl();
|
||||
getParams.putSingle("search", "lang:fra");
|
||||
response = documentResource.queryParams(getParams).get(ClientResponse.class);
|
||||
json = response.getEntity(JSONObject.class);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
documents = json.getJSONArray("documents");
|
||||
Assert.assertTrue(documents.length() == 0);
|
||||
|
||||
// Get a document
|
||||
documentResource = resource().path("/document/" + document1Id);
|
||||
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||
|
@ -42,6 +42,7 @@ public class TestFileResource extends BaseJerseyTest {
|
||||
documentResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken));
|
||||
MultivaluedMapImpl postParams = new MultivaluedMapImpl();
|
||||
postParams.add("title", "File test document 1");
|
||||
postParams.add("language", "eng");
|
||||
ClientResponse response = documentResource.put(ClientResponse.class, postParams);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
JSONObject json = response.getEntity(JSONObject.class);
|
||||
@ -52,7 +53,7 @@ public class TestFileResource extends BaseJerseyTest {
|
||||
WebResource fileResource = resource().path("/file");
|
||||
fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken));
|
||||
FormDataMultiPart form = new FormDataMultiPart();
|
||||
InputStream file = this.getClass().getResourceAsStream("/file/PIA00452.jpg");
|
||||
InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png");
|
||||
FormDataBodyPart fdp = new FormDataBodyPart("file",
|
||||
new BufferedInputStream(file),
|
||||
MediaType.APPLICATION_OCTET_STREAM_TYPE);
|
||||
@ -87,7 +88,7 @@ public class TestFileResource extends BaseJerseyTest {
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
InputStream is = response.getEntityInputStream();
|
||||
byte[] fileBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertEquals(163510, fileBytes.length);
|
||||
Assert.assertEquals(292641, fileBytes.length);
|
||||
|
||||
// Get the thumbnail data
|
||||
fileResource = resource().path("/file/" + file1Id + "/data");
|
||||
@ -98,7 +99,7 @@ public class TestFileResource extends BaseJerseyTest {
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
is = response.getEntityInputStream();
|
||||
fileBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertEquals(41935, fileBytes.length);
|
||||
Assert.assertEquals(34050, fileBytes.length);
|
||||
|
||||
// Get all files from a document
|
||||
fileResource = resource().path("/file/list");
|
||||
|
@ -42,6 +42,7 @@ public class TestShareResource extends BaseJerseyTest {
|
||||
documentResource.addFilter(new CookieAuthenticationFilter(share1AuthenticationToken));
|
||||
MultivaluedMapImpl postParams = new MultivaluedMapImpl();
|
||||
postParams.add("title", "File test document 1");
|
||||
postParams.add("language", "eng");
|
||||
ClientResponse response = documentResource.put(ClientResponse.class, postParams);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
JSONObject json = response.getEntity(JSONObject.class);
|
||||
|
@ -66,6 +66,7 @@ public class TestTagResource extends BaseJerseyTest {
|
||||
postParams = new MultivaluedMapImpl();
|
||||
postParams.add("title", "My super document 1");
|
||||
postParams.add("tags", tag3Id);
|
||||
postParams.add("language", "eng");
|
||||
response = documentResource.put(ClientResponse.class, postParams);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
json = response.getEntity(JSONObject.class);
|
||||
@ -76,6 +77,7 @@ public class TestTagResource extends BaseJerseyTest {
|
||||
postParams = new MultivaluedMapImpl();
|
||||
postParams.add("title", "My super document 1");
|
||||
postParams.add("tags", tag4Id);
|
||||
postParams.add("language", "eng");
|
||||
response = documentResource.put(ClientResponse.class, postParams);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
json = response.getEntity(JSONObject.class);
|
||||
|
BIN
docs-web/src/test/resources/file/Einstein-Roosevelt-letter.png
Normal file
BIN
docs-web/src/test/resources/file/Einstein-Roosevelt-letter.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 286 KiB |
Loading…
Reference in New Issue
Block a user