From 82682600df535783203ef4fe5f4ac168d5c77163 Mon Sep 17 00:00:00 2001 From: jendib Date: Sat, 17 Aug 2013 00:36:36 +0200 Subject: [PATCH] Search in OCR content, batch to OCR all files --- .../docs/core/dao/jpa/DocumentDao.java | 3 +- .../sismics/docs/core/dao/jpa/FileDao.java | 12 ++++ .../async/FileCreatedAsyncListener.java | 2 +- docs-parent/TODO | 2 - .../docs/rest/resource/AppResource.java | 65 ++++++++++++++----- .../sismics/docs/rest/TestAppResource.java | 14 ++-- .../docs/rest/TestDocumentResource.java | 25 ++++++- .../sismics/docs/rest/TestFileResource.java | 6 +- 8 files changed, 101 insertions(+), 28 deletions(-) diff --git a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/DocumentDao.java b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/DocumentDao.java index 90249978..443bfaa8 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/DocumentDao.java +++ b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/DocumentDao.java @@ -126,6 +126,7 @@ public class DocumentDao { StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, d.DOC_LANGUAGE_C c4, s.SHA_ID_C is not null c5 "); sb.append(" from T_DOCUMENT d "); sb.append(" left join T_SHARE s on s.SHA_IDDOCUMENT_C = d.DOC_ID_C and s.SHA_DELETEDATE_D is null "); + sb.append(" left join T_FILE f on f.FIL_IDDOC_C = d.DOC_ID_C and f.FIL_DELETEDATE_D is null "); // Adds search criteria if (criteria.getUserId() != null) { @@ -133,7 +134,7 @@ public class DocumentDao { parameterMap.put("userId", criteria.getUserId()); } if (criteria.getSearch() != null) { - criteriaList.add("(d.DOC_TITLE_C LIKE :search OR d.DOC_DESCRIPTION_C LIKE :search)"); + criteriaList.add("(d.DOC_TITLE_C LIKE :search OR d.DOC_DESCRIPTION_C LIKE :search OR f.FIL_CONTENT_C LIKE :search)"); parameterMap.put("search", "%" + criteria.getSearch() + "%"); } if (criteria.getCreateDateMin() != null) { diff --git a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java index 0eee780f..1aa007a5 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java +++ b/docs-core/src/main/java/com/sismics/docs/core/dao/jpa/FileDao.java @@ -36,6 +36,18 @@ public class FileDao { return file.getId(); } + /** + * Returns the list of all files. + * + * @return List of files + */ + @SuppressWarnings("unchecked") + public List findAll() { + EntityManager em = ThreadLocalContext.get().getEntityManager(); + Query q = em.createQuery("select f from File f where f.deleteDate is null"); + return q.getResultList(); + } + /** * Returns an active file. * diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java index c25e7d37..2d6be9f8 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java @@ -28,7 +28,7 @@ public class FileCreatedAsyncListener { * @throws Exception */ @Subscribe - public void onArticleCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception { + public void onFileCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception { if (log.isInfoEnabled()) { log.info("File created event: " + fileCreatedAsyncEvent.toString()); } diff --git a/docs-parent/TODO b/docs-parent/TODO index f6d835e9..fddeffdb 100644 --- a/docs-parent/TODO +++ b/docs-parent/TODO @@ -2,6 +2,4 @@ - Index title and description (server) - Use Lucene for title and description searching (server) - Index OCR-ized content (server) -- Search in OCR-ized files (server) -- Batch to OCR all documents (server) - Batch to rebuild Lucene index (server) \ No newline at end of file diff --git a/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java b/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java index 5a2bdc7a..da1ba190 100644 --- a/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java +++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java @@ -1,9 +1,31 @@ package com.sismics.docs.rest.resource; +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import javax.ws.rs.GET; +import javax.ws.rs.POST; +import javax.ws.rs.Path; +import javax.ws.rs.Produces; +import javax.ws.rs.QueryParam; +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; + +import org.apache.commons.lang.StringUtils; +import org.apache.log4j.Appender; +import org.apache.log4j.Logger; +import org.codehaus.jettison.json.JSONException; +import org.codehaus.jettison.json.JSONObject; + import com.sismics.docs.core.dao.jpa.DocumentDao; +import com.sismics.docs.core.dao.jpa.FileDao; import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria; import com.sismics.docs.core.dao.jpa.dto.DocumentDto; +import com.sismics.docs.core.model.jpa.Document; +import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.util.ConfigUtil; +import com.sismics.docs.core.util.FileUtil; import com.sismics.docs.core.util.jpa.PaginatedList; import com.sismics.docs.core.util.jpa.PaginatedLists; import com.sismics.docs.core.util.jpa.SortCriteria; @@ -13,21 +35,6 @@ import com.sismics.rest.exception.ServerException; import com.sismics.util.log4j.LogCriteria; import com.sismics.util.log4j.LogEntry; import com.sismics.util.log4j.MemoryAppender; -import org.apache.commons.lang.StringUtils; -import org.apache.log4j.Appender; -import org.apache.log4j.Logger; -import org.codehaus.jettison.json.JSONException; -import org.codehaus.jettison.json.JSONObject; - -import javax.ws.rs.GET; -import javax.ws.rs.Path; -import javax.ws.rs.Produces; -import javax.ws.rs.QueryParam; -import javax.ws.rs.core.MediaType; -import javax.ws.rs.core.Response; -import java.util.ArrayList; -import java.util.List; -import java.util.ResourceBundle; /** * General app REST resource. @@ -129,4 +136,32 @@ public class AppResource extends BaseResource { return Response.ok().entity(response).build(); } + + /** + * OCR-ize all files again. + * + * @return Response + * @throws JSONException + */ + @POST + @Path("batch/ocr") + @Produces(MediaType.APPLICATION_JSON) + public Response batchReindex() throws JSONException { + if (!authenticate()) { + throw new ForbiddenClientException(); + } + checkBaseFunction(BaseFunction.ADMIN); + + FileDao fileDao = new FileDao(); + DocumentDao documentDao = new DocumentDao(); + List fileList = fileDao.findAll(); + for (File file : fileList) { + Document document = documentDao.getById(file.getDocumentId()); + FileUtil.ocrFile(document, file); + } + + JSONObject response = new JSONObject(); + response.put("status", "ok"); + return Response.ok().entity(response).build(); + } } diff --git a/docs-web/src/test/java/com/sismics/docs/rest/TestAppResource.java b/docs-web/src/test/java/com/sismics/docs/rest/TestAppResource.java index b35f340e..c9470a78 100644 --- a/docs-web/src/test/java/com/sismics/docs/rest/TestAppResource.java +++ b/docs-web/src/test/java/com/sismics/docs/rest/TestAppResource.java @@ -23,13 +23,12 @@ public class TestAppResource extends BaseJerseyTest { */ @Test public void testAppResource() throws JSONException { - // Login app1 - clientUtil.createUser("app1"); - String app1Token = clientUtil.login("app1"); + // Login admin + String adminAuthenticationToken = clientUtil.login("admin", "admin", false); // Check the application info WebResource appResource = resource().path("/app"); - appResource.addFilter(new CookieAuthenticationFilter(app1Token)); + appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken)); ClientResponse response = appResource.get(ClientResponse.class); response = appResource.get(ClientResponse.class); Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); @@ -43,6 +42,13 @@ public class TestAppResource extends BaseJerseyTest { Long totalMemory = json.getLong("total_memory"); Assert.assertTrue(totalMemory > 0 && totalMemory > freeMemory); Assert.assertEquals(0, json.getInt("document_count")); + + // OCR-ize all files + appResource = resource().path("/app/batch/ocr"); + appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken)); + response = appResource.post(ClientResponse.class); + Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); + json = response.getEntity(JSONObject.class); } /** diff --git a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java index d718ef31..b0f0a3d2 100644 --- a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java +++ b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java @@ -6,14 +6,21 @@ import com.sun.jersey.api.client.ClientResponse; import com.sun.jersey.api.client.ClientResponse.Status; import com.sun.jersey.api.client.WebResource; import com.sun.jersey.core.util.MultivaluedMapImpl; +import com.sun.jersey.multipart.FormDataBodyPart; +import com.sun.jersey.multipart.FormDataMultiPart; + import junit.framework.Assert; import org.codehaus.jettison.json.JSONArray; import org.codehaus.jettison.json.JSONException; import org.codehaus.jettison.json.JSONObject; import org.junit.Test; +import java.io.BufferedInputStream; +import java.io.InputStream; import java.util.Date; +import javax.ws.rs.core.MediaType; + /** * Exhaustive test of the document resource. * @@ -59,6 +66,20 @@ public class TestDocumentResource extends BaseJerseyTest { String document1Id = json.optString("id"); Assert.assertNotNull(document1Id); + // Add a file + WebResource fileResource = resource().path("/file"); + fileResource.addFilter(new CookieAuthenticationFilter(document1Token)); + FormDataMultiPart form = new FormDataMultiPart(); + InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png"); + FormDataBodyPart fdp = new FormDataBodyPart("file", + new BufferedInputStream(file), + MediaType.APPLICATION_OCTET_STREAM_TYPE); + form.bodyPart(fdp); + form.field("id", document1Id); + response = fileResource.type(MediaType.MULTIPART_FORM_DATA).put(ClientResponse.class, form); + Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); + json = response.getEntity(JSONObject.class); + // Share this document WebResource fileShareResource = resource().path("/share"); fileShareResource.addFilter(new CookieAuthenticationFilter(document1Token)); @@ -91,7 +112,7 @@ public class TestDocumentResource extends BaseJerseyTest { documentResource = resource().path("/document/list"); documentResource.addFilter(new CookieAuthenticationFilter(document1Token)); getParams = new MultivaluedMapImpl(); - getParams.putSingle("search", "Sup"); + getParams.putSingle("search", "uranium"); response = documentResource.queryParams(getParams).get(ClientResponse.class); json = response.getEntity(JSONObject.class); Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); @@ -154,7 +175,7 @@ public class TestDocumentResource extends BaseJerseyTest { documentResource = resource().path("/document/list"); documentResource.addFilter(new CookieAuthenticationFilter(document1Token)); getParams = new MultivaluedMapImpl(); - getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng for"); + getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng uranium"); response = documentResource.queryParams(getParams).get(ClientResponse.class); json = response.getEntity(JSONObject.class); Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); diff --git a/docs-web/src/test/java/com/sismics/docs/rest/TestFileResource.java b/docs-web/src/test/java/com/sismics/docs/rest/TestFileResource.java index 1a70f0b2..fe6dc25a 100644 --- a/docs-web/src/test/java/com/sismics/docs/rest/TestFileResource.java +++ b/docs-web/src/test/java/com/sismics/docs/rest/TestFileResource.java @@ -53,7 +53,7 @@ public class TestFileResource extends BaseJerseyTest { WebResource fileResource = resource().path("/file"); fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken)); FormDataMultiPart form = new FormDataMultiPart(); - InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png"); + InputStream file = this.getClass().getResourceAsStream("/file/PIA00452.jpg"); FormDataBodyPart fdp = new FormDataBodyPart("file", new BufferedInputStream(file), MediaType.APPLICATION_OCTET_STREAM_TYPE); @@ -88,7 +88,7 @@ public class TestFileResource extends BaseJerseyTest { Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); InputStream is = response.getEntityInputStream(); byte[] fileBytes = ByteStreams.toByteArray(is); - Assert.assertEquals(292641, fileBytes.length); + Assert.assertEquals(163510, fileBytes.length); // Get the thumbnail data fileResource = resource().path("/file/" + file1Id + "/data"); @@ -99,7 +99,7 @@ public class TestFileResource extends BaseJerseyTest { Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); is = response.getEntityInputStream(); fileBytes = ByteStreams.toByteArray(is); - Assert.assertEquals(34050, fileBytes.length); + Assert.assertEquals(41935, fileBytes.length); // Get all files from a document fileResource = resource().path("/file/list");