Search in OCR content, batch to OCR all files

This commit is contained in:
jendib 2013-08-17 00:36:36 +02:00
parent 1f1f02ed41
commit 82682600df
8 changed files with 101 additions and 28 deletions

View File

@ -126,6 +126,7 @@ public class DocumentDao {
StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, d.DOC_LANGUAGE_C c4, s.SHA_ID_C is not null c5 "); StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, d.DOC_LANGUAGE_C c4, s.SHA_ID_C is not null c5 ");
sb.append(" from T_DOCUMENT d "); sb.append(" from T_DOCUMENT d ");
sb.append(" left join T_SHARE s on s.SHA_IDDOCUMENT_C = d.DOC_ID_C and s.SHA_DELETEDATE_D is null "); sb.append(" left join T_SHARE s on s.SHA_IDDOCUMENT_C = d.DOC_ID_C and s.SHA_DELETEDATE_D is null ");
sb.append(" left join T_FILE f on f.FIL_IDDOC_C = d.DOC_ID_C and f.FIL_DELETEDATE_D is null ");
// Adds search criteria // Adds search criteria
if (criteria.getUserId() != null) { if (criteria.getUserId() != null) {
@ -133,7 +134,7 @@ public class DocumentDao {
parameterMap.put("userId", criteria.getUserId()); parameterMap.put("userId", criteria.getUserId());
} }
if (criteria.getSearch() != null) { if (criteria.getSearch() != null) {
criteriaList.add("(d.DOC_TITLE_C LIKE :search OR d.DOC_DESCRIPTION_C LIKE :search)"); criteriaList.add("(d.DOC_TITLE_C LIKE :search OR d.DOC_DESCRIPTION_C LIKE :search OR f.FIL_CONTENT_C LIKE :search)");
parameterMap.put("search", "%" + criteria.getSearch() + "%"); parameterMap.put("search", "%" + criteria.getSearch() + "%");
} }
if (criteria.getCreateDateMin() != null) { if (criteria.getCreateDateMin() != null) {

View File

@ -36,6 +36,18 @@ public class FileDao {
return file.getId(); return file.getId();
} }
/**
 * Returns every file whose delete date is not set.
 *
 * @return List of the File entities matched by the query
 */
@SuppressWarnings("unchecked")
public List<File> findAll() {
    // The query keeps only the rows where deleteDate is null
    final String activeFilesQuery = "select f from File f where f.deleteDate is null";
    EntityManager entityManager = ThreadLocalContext.get().getEntityManager();
    return entityManager.createQuery(activeFilesQuery).getResultList();
}
/** /**
* Returns an active file. * Returns an active file.
* *

View File

@ -28,7 +28,7 @@ public class FileCreatedAsyncListener {
* @throws Exception * @throws Exception
*/ */
@Subscribe @Subscribe
public void onArticleCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception { public void onFileCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
if (log.isInfoEnabled()) { if (log.isInfoEnabled()) {
log.info("File created event: " + fileCreatedAsyncEvent.toString()); log.info("File created event: " + fileCreatedAsyncEvent.toString());
} }

View File

@ -2,6 +2,4 @@
- Index title and description (server) - Index title and description (server)
- Use Lucene for title and description searching (server) - Use Lucene for title and description searching (server)
- Index OCR-ized content (server) - Index OCR-ized content (server)
- Search in OCR-ized files (server)
- Batch to OCR all documents (server)
- Batch to rebuild Lucene index (server) - Batch to rebuild Lucene index (server)

View File

@ -1,9 +1,31 @@
package com.sismics.docs.rest.resource; package com.sismics.docs.rest.resource;
import java.util.ArrayList;
import java.util.List;
import java.util.ResourceBundle;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Appender;
import org.apache.log4j.Logger;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import com.sismics.docs.core.dao.jpa.DocumentDao; import com.sismics.docs.core.dao.jpa.DocumentDao;
import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria; import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto; import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.ConfigUtil; import com.sismics.docs.core.util.ConfigUtil;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.docs.core.util.jpa.PaginatedList; import com.sismics.docs.core.util.jpa.PaginatedList;
import com.sismics.docs.core.util.jpa.PaginatedLists; import com.sismics.docs.core.util.jpa.PaginatedLists;
import com.sismics.docs.core.util.jpa.SortCriteria; import com.sismics.docs.core.util.jpa.SortCriteria;
@ -13,21 +35,6 @@ import com.sismics.rest.exception.ServerException;
import com.sismics.util.log4j.LogCriteria; import com.sismics.util.log4j.LogCriteria;
import com.sismics.util.log4j.LogEntry; import com.sismics.util.log4j.LogEntry;
import com.sismics.util.log4j.MemoryAppender; import com.sismics.util.log4j.MemoryAppender;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Appender;
import org.apache.log4j.Logger;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import javax.ws.rs.GET;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import java.util.ArrayList;
import java.util.List;
import java.util.ResourceBundle;
/** /**
* General app REST resource. * General app REST resource.
@ -129,4 +136,32 @@ public class AppResource extends BaseResource {
return Response.ok().entity(response).build(); return Response.ok().entity(response).build();
} }
/**
 * Runs OCR again on every file returned by FileDao#findAll().
 * The caller must be authenticated and pass the ADMIN base function check.
 *
 * @return Response carrying a JSON object with "status" set to "ok"
 * @throws JSONException If the JSON response cannot be built
 */
@POST
@Path("batch/ocr")
@Produces(MediaType.APPLICATION_JSON)
public Response batchReindex() throws JSONException {
    // Reject unauthenticated callers, then require the admin function
    if (!authenticate()) {
        throw new ForbiddenClientException();
    }
    checkBaseFunction(BaseFunction.ADMIN);

    // Hand each file to the OCR utility together with the document it references
    FileDao fileDao = new FileDao();
    DocumentDao documentDao = new DocumentDao();
    for (File currentFile : fileDao.findAll()) {
        FileUtil.ocrFile(documentDao.getById(currentFile.getDocumentId()), currentFile);
    }

    JSONObject result = new JSONObject();
    result.put("status", "ok");
    return Response.ok().entity(result).build();
}
} }

View File

@ -23,13 +23,12 @@ public class TestAppResource extends BaseJerseyTest {
*/ */
@Test @Test
public void testAppResource() throws JSONException { public void testAppResource() throws JSONException {
// Login app1 // Login admin
clientUtil.createUser("app1"); String adminAuthenticationToken = clientUtil.login("admin", "admin", false);
String app1Token = clientUtil.login("app1");
// Check the application info // Check the application info
WebResource appResource = resource().path("/app"); WebResource appResource = resource().path("/app");
appResource.addFilter(new CookieAuthenticationFilter(app1Token)); appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
ClientResponse response = appResource.get(ClientResponse.class); ClientResponse response = appResource.get(ClientResponse.class);
response = appResource.get(ClientResponse.class); response = appResource.get(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
@ -43,6 +42,13 @@ public class TestAppResource extends BaseJerseyTest {
Long totalMemory = json.getLong("total_memory"); Long totalMemory = json.getLong("total_memory");
Assert.assertTrue(totalMemory > 0 && totalMemory > freeMemory); Assert.assertTrue(totalMemory > 0 && totalMemory > freeMemory);
Assert.assertEquals(0, json.getInt("document_count")); Assert.assertEquals(0, json.getInt("document_count"));
// OCR-ize all files
appResource = resource().path("/app/batch/ocr");
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
response = appResource.post(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
json = response.getEntity(JSONObject.class);
} }
/** /**

View File

@ -6,14 +6,21 @@ import com.sun.jersey.api.client.ClientResponse;
import com.sun.jersey.api.client.ClientResponse.Status; import com.sun.jersey.api.client.ClientResponse.Status;
import com.sun.jersey.api.client.WebResource; import com.sun.jersey.api.client.WebResource;
import com.sun.jersey.core.util.MultivaluedMapImpl; import com.sun.jersey.core.util.MultivaluedMapImpl;
import com.sun.jersey.multipart.FormDataBodyPart;
import com.sun.jersey.multipart.FormDataMultiPart;
import junit.framework.Assert; import junit.framework.Assert;
import org.codehaus.jettison.json.JSONArray; import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException; import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject; import org.codehaus.jettison.json.JSONObject;
import org.junit.Test; import org.junit.Test;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.Date; import java.util.Date;
import javax.ws.rs.core.MediaType;
/** /**
* Exhaustive test of the document resource. * Exhaustive test of the document resource.
* *
@ -59,6 +66,20 @@ public class TestDocumentResource extends BaseJerseyTest {
String document1Id = json.optString("id"); String document1Id = json.optString("id");
Assert.assertNotNull(document1Id); Assert.assertNotNull(document1Id);
// Add a file
WebResource fileResource = resource().path("/file");
fileResource.addFilter(new CookieAuthenticationFilter(document1Token));
FormDataMultiPart form = new FormDataMultiPart();
InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png");
FormDataBodyPart fdp = new FormDataBodyPart("file",
new BufferedInputStream(file),
MediaType.APPLICATION_OCTET_STREAM_TYPE);
form.bodyPart(fdp);
form.field("id", document1Id);
response = fileResource.type(MediaType.MULTIPART_FORM_DATA).put(ClientResponse.class, form);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
json = response.getEntity(JSONObject.class);
// Share this document // Share this document
WebResource fileShareResource = resource().path("/share"); WebResource fileShareResource = resource().path("/share");
fileShareResource.addFilter(new CookieAuthenticationFilter(document1Token)); fileShareResource.addFilter(new CookieAuthenticationFilter(document1Token));
@ -91,7 +112,7 @@ public class TestDocumentResource extends BaseJerseyTest {
documentResource = resource().path("/document/list"); documentResource = resource().path("/document/list");
documentResource.addFilter(new CookieAuthenticationFilter(document1Token)); documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
getParams = new MultivaluedMapImpl(); getParams = new MultivaluedMapImpl();
getParams.putSingle("search", "Sup"); getParams.putSingle("search", "uranium");
response = documentResource.queryParams(getParams).get(ClientResponse.class); response = documentResource.queryParams(getParams).get(ClientResponse.class);
json = response.getEntity(JSONObject.class); json = response.getEntity(JSONObject.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
@ -154,7 +175,7 @@ public class TestDocumentResource extends BaseJerseyTest {
documentResource = resource().path("/document/list"); documentResource = resource().path("/document/list");
documentResource.addFilter(new CookieAuthenticationFilter(document1Token)); documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
getParams = new MultivaluedMapImpl(); getParams = new MultivaluedMapImpl();
getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng for"); getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng uranium");
response = documentResource.queryParams(getParams).get(ClientResponse.class); response = documentResource.queryParams(getParams).get(ClientResponse.class);
json = response.getEntity(JSONObject.class); json = response.getEntity(JSONObject.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));

View File

@ -53,7 +53,7 @@ public class TestFileResource extends BaseJerseyTest {
WebResource fileResource = resource().path("/file"); WebResource fileResource = resource().path("/file");
fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken)); fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken));
FormDataMultiPart form = new FormDataMultiPart(); FormDataMultiPart form = new FormDataMultiPart();
InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png"); InputStream file = this.getClass().getResourceAsStream("/file/PIA00452.jpg");
FormDataBodyPart fdp = new FormDataBodyPart("file", FormDataBodyPart fdp = new FormDataBodyPart("file",
new BufferedInputStream(file), new BufferedInputStream(file),
MediaType.APPLICATION_OCTET_STREAM_TYPE); MediaType.APPLICATION_OCTET_STREAM_TYPE);
@ -88,7 +88,7 @@ public class TestFileResource extends BaseJerseyTest {
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
InputStream is = response.getEntityInputStream(); InputStream is = response.getEntityInputStream();
byte[] fileBytes = ByteStreams.toByteArray(is); byte[] fileBytes = ByteStreams.toByteArray(is);
Assert.assertEquals(292641, fileBytes.length); Assert.assertEquals(163510, fileBytes.length);
// Get the thumbnail data // Get the thumbnail data
fileResource = resource().path("/file/" + file1Id + "/data"); fileResource = resource().path("/file/" + file1Id + "/data");
@ -99,7 +99,7 @@ public class TestFileResource extends BaseJerseyTest {
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
is = response.getEntityInputStream(); is = response.getEntityInputStream();
fileBytes = ByteStreams.toByteArray(is); fileBytes = ByteStreams.toByteArray(is);
Assert.assertEquals(34050, fileBytes.length); Assert.assertEquals(41935, fileBytes.length);
// Get all files from a document // Get all files from a document
fileResource = resource().path("/file/list"); fileResource = resource().path("/file/list");