Search in OCR content, batch to OCR all files

This commit is contained in:
jendib 2013-08-17 00:36:36 +02:00
parent 1f1f02ed41
commit 82682600df
8 changed files with 101 additions and 28 deletions

View File

@ -126,6 +126,7 @@ public class DocumentDao {
StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, d.DOC_LANGUAGE_C c4, s.SHA_ID_C is not null c5 ");
sb.append(" from T_DOCUMENT d ");
sb.append(" left join T_SHARE s on s.SHA_IDDOCUMENT_C = d.DOC_ID_C and s.SHA_DELETEDATE_D is null ");
sb.append(" left join T_FILE f on f.FIL_IDDOC_C = d.DOC_ID_C and f.FIL_DELETEDATE_D is null ");
// Adds search criteria
if (criteria.getUserId() != null) {
@ -133,7 +134,7 @@ public class DocumentDao {
parameterMap.put("userId", criteria.getUserId());
}
if (criteria.getSearch() != null) {
criteriaList.add("(d.DOC_TITLE_C LIKE :search OR d.DOC_DESCRIPTION_C LIKE :search)");
criteriaList.add("(d.DOC_TITLE_C LIKE :search OR d.DOC_DESCRIPTION_C LIKE :search OR f.FIL_CONTENT_C LIKE :search)");
parameterMap.put("search", "%" + criteria.getSearch() + "%");
}
if (criteria.getCreateDateMin() != null) {

View File

@ -36,6 +36,18 @@ public class FileDao {
return file.getId();
}
/**
 * Fetches every file whose delete date is not set.
 *
 * @return List of files matching the query (no ordering is requested)
 */
@SuppressWarnings("unchecked")
public List<File> findAll() {
    // The JPQL filter keeps only rows with a null deleteDate; the untyped
    // Query result is what makes the unchecked suppression necessary.
    return ThreadLocalContext.get()
            .getEntityManager()
            .createQuery("select f from File f where f.deleteDate is null")
            .getResultList();
}
/**
* Returns an active file.
*

View File

@ -28,7 +28,7 @@ public class FileCreatedAsyncListener {
* @throws Exception
*/
@Subscribe
public void onArticleCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
public void onFileCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
if (log.isInfoEnabled()) {
log.info("File created event: " + fileCreatedAsyncEvent.toString());
}

View File

@ -2,6 +2,4 @@
- Index title and description (server)
- Use Lucene for title and description searching (server)
- Index OCR-ized content (server)
- Search in OCR-ized files (server)
- Batch to OCR all documents (server)
- Batch to rebuild Lucene index (server)

View File

@ -1,9 +1,31 @@
package com.sismics.docs.rest.resource;
import java.util.ArrayList;
import java.util.List;
import java.util.ResourceBundle;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Appender;
import org.apache.log4j.Logger;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import com.sismics.docs.core.dao.jpa.DocumentDao;
import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.ConfigUtil;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.docs.core.util.jpa.PaginatedList;
import com.sismics.docs.core.util.jpa.PaginatedLists;
import com.sismics.docs.core.util.jpa.SortCriteria;
@ -13,21 +35,6 @@ import com.sismics.rest.exception.ServerException;
import com.sismics.util.log4j.LogCriteria;
import com.sismics.util.log4j.LogEntry;
import com.sismics.util.log4j.MemoryAppender;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Appender;
import org.apache.log4j.Logger;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import javax.ws.rs.GET;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import java.util.ArrayList;
import java.util.List;
import java.util.ResourceBundle;
/**
* General app REST resource.
@ -129,4 +136,32 @@ public class AppResource extends BaseResource {
return Response.ok().entity(response).build();
}
/**
 * Runs OCR again on every file returned by {@link FileDao#findAll()}.
 * Each file is handed to {@link FileUtil#ocrFile} together with the document
 * looked up from its document ID.
 *
 * @return Response carrying a JSON object with "status" set to "ok"
 * @throws JSONException If the JSON response cannot be built
 */
@POST
@Path("batch/ocr")
@Produces(MediaType.APPLICATION_JSON)
public Response batchReindex() throws JSONException {
    // Reject unauthenticated callers before doing anything else
    if (!authenticate()) {
        throw new ForbiddenClientException();
    }
    // NOTE(review): presumably restricts the batch to administrators; confirm in BaseResource
    checkBaseFunction(BaseFunction.ADMIN);

    final FileDao files = new FileDao();
    final DocumentDao documents = new DocumentDao();
    for (File current : files.findAll()) {
        // Resolve the owning document, then OCR the file in its context
        Document owner = documents.getById(current.getDocumentId());
        FileUtil.ocrFile(owner, current);
    }

    // Always answer with a plain status acknowledgement
    JSONObject json = new JSONObject();
    json.put("status", "ok");
    return Response.ok().entity(json).build();
}
}

View File

@ -23,13 +23,12 @@ public class TestAppResource extends BaseJerseyTest {
*/
@Test
public void testAppResource() throws JSONException {
// Login app1
clientUtil.createUser("app1");
String app1Token = clientUtil.login("app1");
// Login admin
String adminAuthenticationToken = clientUtil.login("admin", "admin", false);
// Check the application info
WebResource appResource = resource().path("/app");
appResource.addFilter(new CookieAuthenticationFilter(app1Token));
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
ClientResponse response = appResource.get(ClientResponse.class);
response = appResource.get(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
@ -43,6 +42,13 @@ public class TestAppResource extends BaseJerseyTest {
Long totalMemory = json.getLong("total_memory");
Assert.assertTrue(totalMemory > 0 && totalMemory > freeMemory);
Assert.assertEquals(0, json.getInt("document_count"));
// OCR-ize all files
appResource = resource().path("/app/batch/ocr");
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
response = appResource.post(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
json = response.getEntity(JSONObject.class);
}
/**

View File

@ -6,14 +6,21 @@ import com.sun.jersey.api.client.ClientResponse;
import com.sun.jersey.api.client.ClientResponse.Status;
import com.sun.jersey.api.client.WebResource;
import com.sun.jersey.core.util.MultivaluedMapImpl;
import com.sun.jersey.multipart.FormDataBodyPart;
import com.sun.jersey.multipart.FormDataMultiPart;
import junit.framework.Assert;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import org.junit.Test;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.Date;
import javax.ws.rs.core.MediaType;
/**
* Exhaustive test of the document resource.
*
@ -59,6 +66,20 @@ public class TestDocumentResource extends BaseJerseyTest {
String document1Id = json.optString("id");
Assert.assertNotNull(document1Id);
// Add a file
WebResource fileResource = resource().path("/file");
fileResource.addFilter(new CookieAuthenticationFilter(document1Token));
FormDataMultiPart form = new FormDataMultiPart();
InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png");
FormDataBodyPart fdp = new FormDataBodyPart("file",
new BufferedInputStream(file),
MediaType.APPLICATION_OCTET_STREAM_TYPE);
form.bodyPart(fdp);
form.field("id", document1Id);
response = fileResource.type(MediaType.MULTIPART_FORM_DATA).put(ClientResponse.class, form);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
json = response.getEntity(JSONObject.class);
// Share this document
WebResource fileShareResource = resource().path("/share");
fileShareResource.addFilter(new CookieAuthenticationFilter(document1Token));
@ -91,7 +112,7 @@ public class TestDocumentResource extends BaseJerseyTest {
documentResource = resource().path("/document/list");
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
getParams = new MultivaluedMapImpl();
getParams.putSingle("search", "Sup");
getParams.putSingle("search", "uranium");
response = documentResource.queryParams(getParams).get(ClientResponse.class);
json = response.getEntity(JSONObject.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
@ -154,7 +175,7 @@ public class TestDocumentResource extends BaseJerseyTest {
documentResource = resource().path("/document/list");
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
getParams = new MultivaluedMapImpl();
getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng for");
getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng uranium");
response = documentResource.queryParams(getParams).get(ClientResponse.class);
json = response.getEntity(JSONObject.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));

View File

@ -53,7 +53,7 @@ public class TestFileResource extends BaseJerseyTest {
WebResource fileResource = resource().path("/file");
fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken));
FormDataMultiPart form = new FormDataMultiPart();
InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png");
InputStream file = this.getClass().getResourceAsStream("/file/PIA00452.jpg");
FormDataBodyPart fdp = new FormDataBodyPart("file",
new BufferedInputStream(file),
MediaType.APPLICATION_OCTET_STREAM_TYPE);
@ -88,7 +88,7 @@ public class TestFileResource extends BaseJerseyTest {
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
InputStream is = response.getEntityInputStream();
byte[] fileBytes = ByteStreams.toByteArray(is);
Assert.assertEquals(292641, fileBytes.length);
Assert.assertEquals(163510, fileBytes.length);
// Get the thumbnail data
fileResource = resource().path("/file/" + file1Id + "/data");
@ -99,7 +99,7 @@ public class TestFileResource extends BaseJerseyTest {
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
is = response.getEntityInputStream();
fileBytes = ByteStreams.toByteArray(is);
Assert.assertEquals(34050, fileBytes.length);
Assert.assertEquals(41935, fileBytes.length);
// Get all files from a document
fileResource = resource().path("/file/list");