mirror of
https://github.com/sismics/docs.git
synced 2024-12-22 11:23:48 +01:00
Search in OCR content, batch to OCR all files
This commit is contained in:
parent
1f1f02ed41
commit
82682600df
@ -126,6 +126,7 @@ public class DocumentDao {
|
||||
StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, d.DOC_LANGUAGE_C c4, s.SHA_ID_C is not null c5 ");
|
||||
sb.append(" from T_DOCUMENT d ");
|
||||
sb.append(" left join T_SHARE s on s.SHA_IDDOCUMENT_C = d.DOC_ID_C and s.SHA_DELETEDATE_D is null ");
|
||||
sb.append(" left join T_FILE f on f.FIL_IDDOC_C = d.DOC_ID_C and f.FIL_DELETEDATE_D is null ");
|
||||
|
||||
// Adds search criteria
|
||||
if (criteria.getUserId() != null) {
|
||||
@ -133,7 +134,7 @@ public class DocumentDao {
|
||||
parameterMap.put("userId", criteria.getUserId());
|
||||
}
|
||||
if (criteria.getSearch() != null) {
|
||||
criteriaList.add("(d.DOC_TITLE_C LIKE :search OR d.DOC_DESCRIPTION_C LIKE :search)");
|
||||
criteriaList.add("(d.DOC_TITLE_C LIKE :search OR d.DOC_DESCRIPTION_C LIKE :search OR f.FIL_CONTENT_C LIKE :search)");
|
||||
parameterMap.put("search", "%" + criteria.getSearch() + "%");
|
||||
}
|
||||
if (criteria.getCreateDateMin() != null) {
|
||||
|
@ -36,6 +36,18 @@ public class FileDao {
|
||||
return file.getId();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the list of all files.
|
||||
*
|
||||
* @return List of files
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public List<File> findAll() {
|
||||
EntityManager em = ThreadLocalContext.get().getEntityManager();
|
||||
Query q = em.createQuery("select f from File f where f.deleteDate is null");
|
||||
return q.getResultList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an active file.
|
||||
*
|
||||
|
@ -28,7 +28,7 @@ public class FileCreatedAsyncListener {
|
||||
* @throws Exception
|
||||
*/
|
||||
@Subscribe
|
||||
public void onArticleCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
|
||||
public void onFileCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
|
||||
if (log.isInfoEnabled()) {
|
||||
log.info("File created event: " + fileCreatedAsyncEvent.toString());
|
||||
}
|
||||
|
@ -2,6 +2,4 @@
|
||||
- Index title and description (server)
|
||||
- Use Lucene for title and description searching (server)
|
||||
- Index OCR-ized content (server)
|
||||
- Search in OCR-ized files (server)
|
||||
- Batch to OCR all documents (server)
|
||||
- Batch to rebuild Lucene index (server)
|
@ -1,9 +1,31 @@
|
||||
package com.sismics.docs.rest.resource;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.ResourceBundle;
|
||||
|
||||
import javax.ws.rs.GET;
|
||||
import javax.ws.rs.POST;
|
||||
import javax.ws.rs.Path;
|
||||
import javax.ws.rs.Produces;
|
||||
import javax.ws.rs.QueryParam;
|
||||
import javax.ws.rs.core.MediaType;
|
||||
import javax.ws.rs.core.Response;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.log4j.Appender;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.codehaus.jettison.json.JSONException;
|
||||
import org.codehaus.jettison.json.JSONObject;
|
||||
|
||||
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
||||
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
|
||||
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
||||
import com.sismics.docs.core.model.jpa.Document;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.docs.core.util.ConfigUtil;
|
||||
import com.sismics.docs.core.util.FileUtil;
|
||||
import com.sismics.docs.core.util.jpa.PaginatedList;
|
||||
import com.sismics.docs.core.util.jpa.PaginatedLists;
|
||||
import com.sismics.docs.core.util.jpa.SortCriteria;
|
||||
@ -13,21 +35,6 @@ import com.sismics.rest.exception.ServerException;
|
||||
import com.sismics.util.log4j.LogCriteria;
|
||||
import com.sismics.util.log4j.LogEntry;
|
||||
import com.sismics.util.log4j.MemoryAppender;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.log4j.Appender;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.codehaus.jettison.json.JSONException;
|
||||
import org.codehaus.jettison.json.JSONObject;
|
||||
|
||||
import javax.ws.rs.GET;
|
||||
import javax.ws.rs.Path;
|
||||
import javax.ws.rs.Produces;
|
||||
import javax.ws.rs.QueryParam;
|
||||
import javax.ws.rs.core.MediaType;
|
||||
import javax.ws.rs.core.Response;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.ResourceBundle;
|
||||
|
||||
/**
|
||||
* General app REST resource.
|
||||
@ -129,4 +136,32 @@ public class AppResource extends BaseResource {
|
||||
|
||||
return Response.ok().entity(response).build();
|
||||
}
|
||||
|
||||
/**
|
||||
* OCR-ize all files again.
|
||||
*
|
||||
* @return Response
|
||||
* @throws JSONException
|
||||
*/
|
||||
@POST
|
||||
@Path("batch/ocr")
|
||||
@Produces(MediaType.APPLICATION_JSON)
|
||||
public Response batchReindex() throws JSONException {
|
||||
if (!authenticate()) {
|
||||
throw new ForbiddenClientException();
|
||||
}
|
||||
checkBaseFunction(BaseFunction.ADMIN);
|
||||
|
||||
FileDao fileDao = new FileDao();
|
||||
DocumentDao documentDao = new DocumentDao();
|
||||
List<File> fileList = fileDao.findAll();
|
||||
for (File file : fileList) {
|
||||
Document document = documentDao.getById(file.getDocumentId());
|
||||
FileUtil.ocrFile(document, file);
|
||||
}
|
||||
|
||||
JSONObject response = new JSONObject();
|
||||
response.put("status", "ok");
|
||||
return Response.ok().entity(response).build();
|
||||
}
|
||||
}
|
||||
|
@ -23,13 +23,12 @@ public class TestAppResource extends BaseJerseyTest {
|
||||
*/
|
||||
@Test
|
||||
public void testAppResource() throws JSONException {
|
||||
// Login app1
|
||||
clientUtil.createUser("app1");
|
||||
String app1Token = clientUtil.login("app1");
|
||||
// Login admin
|
||||
String adminAuthenticationToken = clientUtil.login("admin", "admin", false);
|
||||
|
||||
// Check the application info
|
||||
WebResource appResource = resource().path("/app");
|
||||
appResource.addFilter(new CookieAuthenticationFilter(app1Token));
|
||||
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
|
||||
ClientResponse response = appResource.get(ClientResponse.class);
|
||||
response = appResource.get(ClientResponse.class);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
@ -43,6 +42,13 @@ public class TestAppResource extends BaseJerseyTest {
|
||||
Long totalMemory = json.getLong("total_memory");
|
||||
Assert.assertTrue(totalMemory > 0 && totalMemory > freeMemory);
|
||||
Assert.assertEquals(0, json.getInt("document_count"));
|
||||
|
||||
// OCR-ize all files
|
||||
appResource = resource().path("/app/batch/ocr");
|
||||
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
|
||||
response = appResource.post(ClientResponse.class);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
json = response.getEntity(JSONObject.class);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -6,14 +6,21 @@ import com.sun.jersey.api.client.ClientResponse;
|
||||
import com.sun.jersey.api.client.ClientResponse.Status;
|
||||
import com.sun.jersey.api.client.WebResource;
|
||||
import com.sun.jersey.core.util.MultivaluedMapImpl;
|
||||
import com.sun.jersey.multipart.FormDataBodyPart;
|
||||
import com.sun.jersey.multipart.FormDataMultiPart;
|
||||
|
||||
import junit.framework.Assert;
|
||||
import org.codehaus.jettison.json.JSONArray;
|
||||
import org.codehaus.jettison.json.JSONException;
|
||||
import org.codehaus.jettison.json.JSONObject;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.util.Date;
|
||||
|
||||
import javax.ws.rs.core.MediaType;
|
||||
|
||||
/**
|
||||
* Exhaustive test of the document resource.
|
||||
*
|
||||
@ -59,6 +66,20 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
String document1Id = json.optString("id");
|
||||
Assert.assertNotNull(document1Id);
|
||||
|
||||
// Add a file
|
||||
WebResource fileResource = resource().path("/file");
|
||||
fileResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||
FormDataMultiPart form = new FormDataMultiPart();
|
||||
InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png");
|
||||
FormDataBodyPart fdp = new FormDataBodyPart("file",
|
||||
new BufferedInputStream(file),
|
||||
MediaType.APPLICATION_OCTET_STREAM_TYPE);
|
||||
form.bodyPart(fdp);
|
||||
form.field("id", document1Id);
|
||||
response = fileResource.type(MediaType.MULTIPART_FORM_DATA).put(ClientResponse.class, form);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
json = response.getEntity(JSONObject.class);
|
||||
|
||||
// Share this document
|
||||
WebResource fileShareResource = resource().path("/share");
|
||||
fileShareResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||
@ -91,7 +112,7 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
documentResource = resource().path("/document/list");
|
||||
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||
getParams = new MultivaluedMapImpl();
|
||||
getParams.putSingle("search", "Sup");
|
||||
getParams.putSingle("search", "uranium");
|
||||
response = documentResource.queryParams(getParams).get(ClientResponse.class);
|
||||
json = response.getEntity(JSONObject.class);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
@ -154,7 +175,7 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
documentResource = resource().path("/document/list");
|
||||
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||
getParams = new MultivaluedMapImpl();
|
||||
getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng for");
|
||||
getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng uranium");
|
||||
response = documentResource.queryParams(getParams).get(ClientResponse.class);
|
||||
json = response.getEntity(JSONObject.class);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
|
@ -53,7 +53,7 @@ public class TestFileResource extends BaseJerseyTest {
|
||||
WebResource fileResource = resource().path("/file");
|
||||
fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken));
|
||||
FormDataMultiPart form = new FormDataMultiPart();
|
||||
InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png");
|
||||
InputStream file = this.getClass().getResourceAsStream("/file/PIA00452.jpg");
|
||||
FormDataBodyPart fdp = new FormDataBodyPart("file",
|
||||
new BufferedInputStream(file),
|
||||
MediaType.APPLICATION_OCTET_STREAM_TYPE);
|
||||
@ -88,7 +88,7 @@ public class TestFileResource extends BaseJerseyTest {
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
InputStream is = response.getEntityInputStream();
|
||||
byte[] fileBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertEquals(292641, fileBytes.length);
|
||||
Assert.assertEquals(163510, fileBytes.length);
|
||||
|
||||
// Get the thumbnail data
|
||||
fileResource = resource().path("/file/" + file1Id + "/data");
|
||||
@ -99,7 +99,7 @@ public class TestFileResource extends BaseJerseyTest {
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
is = response.getEntityInputStream();
|
||||
fileBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertEquals(34050, fileBytes.length);
|
||||
Assert.assertEquals(41935, fileBytes.length);
|
||||
|
||||
// Get all files from a document
|
||||
fileResource = resource().path("/file/list");
|
||||
|
Loading…
Reference in New Issue
Block a user