mirror of
https://github.com/sismics/docs.git
synced 2024-11-21 21:47:57 +01:00
Search in OCR content, batch to OCR all files
This commit is contained in:
parent
1f1f02ed41
commit
82682600df
@ -126,6 +126,7 @@ public class DocumentDao {
|
|||||||
StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, d.DOC_LANGUAGE_C c4, s.SHA_ID_C is not null c5 ");
|
StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, d.DOC_LANGUAGE_C c4, s.SHA_ID_C is not null c5 ");
|
||||||
sb.append(" from T_DOCUMENT d ");
|
sb.append(" from T_DOCUMENT d ");
|
||||||
sb.append(" left join T_SHARE s on s.SHA_IDDOCUMENT_C = d.DOC_ID_C and s.SHA_DELETEDATE_D is null ");
|
sb.append(" left join T_SHARE s on s.SHA_IDDOCUMENT_C = d.DOC_ID_C and s.SHA_DELETEDATE_D is null ");
|
||||||
|
sb.append(" left join T_FILE f on f.FIL_IDDOC_C = d.DOC_ID_C and f.FIL_DELETEDATE_D is null ");
|
||||||
|
|
||||||
// Adds search criteria
|
// Adds search criteria
|
||||||
if (criteria.getUserId() != null) {
|
if (criteria.getUserId() != null) {
|
||||||
@ -133,7 +134,7 @@ public class DocumentDao {
|
|||||||
parameterMap.put("userId", criteria.getUserId());
|
parameterMap.put("userId", criteria.getUserId());
|
||||||
}
|
}
|
||||||
if (criteria.getSearch() != null) {
|
if (criteria.getSearch() != null) {
|
||||||
criteriaList.add("(d.DOC_TITLE_C LIKE :search OR d.DOC_DESCRIPTION_C LIKE :search)");
|
criteriaList.add("(d.DOC_TITLE_C LIKE :search OR d.DOC_DESCRIPTION_C LIKE :search OR f.FIL_CONTENT_C LIKE :search)");
|
||||||
parameterMap.put("search", "%" + criteria.getSearch() + "%");
|
parameterMap.put("search", "%" + criteria.getSearch() + "%");
|
||||||
}
|
}
|
||||||
if (criteria.getCreateDateMin() != null) {
|
if (criteria.getCreateDateMin() != null) {
|
||||||
|
@ -36,6 +36,18 @@ public class FileDao {
|
|||||||
return file.getId();
|
return file.getId();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the list of all files.
|
||||||
|
*
|
||||||
|
* @return List of files
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public List<File> findAll() {
|
||||||
|
EntityManager em = ThreadLocalContext.get().getEntityManager();
|
||||||
|
Query q = em.createQuery("select f from File f where f.deleteDate is null");
|
||||||
|
return q.getResultList();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns an active file.
|
* Returns an active file.
|
||||||
*
|
*
|
||||||
|
@ -28,7 +28,7 @@ public class FileCreatedAsyncListener {
|
|||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
@Subscribe
|
@Subscribe
|
||||||
public void onArticleCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
|
public void onFileCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
|
||||||
if (log.isInfoEnabled()) {
|
if (log.isInfoEnabled()) {
|
||||||
log.info("File created event: " + fileCreatedAsyncEvent.toString());
|
log.info("File created event: " + fileCreatedAsyncEvent.toString());
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,4 @@
|
|||||||
- Index title and description (server)
|
- Index title and description (server)
|
||||||
- Use Lucene for title and description searching (server)
|
- Use Lucene for title and description searching (server)
|
||||||
- Index OCR-ized content (server)
|
- Index OCR-ized content (server)
|
||||||
- Search in OCR-ized files (server)
|
|
||||||
- Batch to OCR all documents (server)
|
|
||||||
- Batch to rebuild Lucene index (server)
|
- Batch to rebuild Lucene index (server)
|
@ -1,9 +1,31 @@
|
|||||||
package com.sismics.docs.rest.resource;
|
package com.sismics.docs.rest.resource;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ResourceBundle;
|
||||||
|
|
||||||
|
import javax.ws.rs.GET;
|
||||||
|
import javax.ws.rs.POST;
|
||||||
|
import javax.ws.rs.Path;
|
||||||
|
import javax.ws.rs.Produces;
|
||||||
|
import javax.ws.rs.QueryParam;
|
||||||
|
import javax.ws.rs.core.MediaType;
|
||||||
|
import javax.ws.rs.core.Response;
|
||||||
|
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
import org.apache.log4j.Appender;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.codehaus.jettison.json.JSONException;
|
||||||
|
import org.codehaus.jettison.json.JSONObject;
|
||||||
|
|
||||||
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
||||||
|
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||||
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
|
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
|
||||||
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
||||||
|
import com.sismics.docs.core.model.jpa.Document;
|
||||||
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
import com.sismics.docs.core.util.ConfigUtil;
|
import com.sismics.docs.core.util.ConfigUtil;
|
||||||
|
import com.sismics.docs.core.util.FileUtil;
|
||||||
import com.sismics.docs.core.util.jpa.PaginatedList;
|
import com.sismics.docs.core.util.jpa.PaginatedList;
|
||||||
import com.sismics.docs.core.util.jpa.PaginatedLists;
|
import com.sismics.docs.core.util.jpa.PaginatedLists;
|
||||||
import com.sismics.docs.core.util.jpa.SortCriteria;
|
import com.sismics.docs.core.util.jpa.SortCriteria;
|
||||||
@ -13,21 +35,6 @@ import com.sismics.rest.exception.ServerException;
|
|||||||
import com.sismics.util.log4j.LogCriteria;
|
import com.sismics.util.log4j.LogCriteria;
|
||||||
import com.sismics.util.log4j.LogEntry;
|
import com.sismics.util.log4j.LogEntry;
|
||||||
import com.sismics.util.log4j.MemoryAppender;
|
import com.sismics.util.log4j.MemoryAppender;
|
||||||
import org.apache.commons.lang.StringUtils;
|
|
||||||
import org.apache.log4j.Appender;
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.codehaus.jettison.json.JSONException;
|
|
||||||
import org.codehaus.jettison.json.JSONObject;
|
|
||||||
|
|
||||||
import javax.ws.rs.GET;
|
|
||||||
import javax.ws.rs.Path;
|
|
||||||
import javax.ws.rs.Produces;
|
|
||||||
import javax.ws.rs.QueryParam;
|
|
||||||
import javax.ws.rs.core.MediaType;
|
|
||||||
import javax.ws.rs.core.Response;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.ResourceBundle;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* General app REST resource.
|
* General app REST resource.
|
||||||
@ -129,4 +136,32 @@ public class AppResource extends BaseResource {
|
|||||||
|
|
||||||
return Response.ok().entity(response).build();
|
return Response.ok().entity(response).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* OCR-ize all files again.
|
||||||
|
*
|
||||||
|
* @return Response
|
||||||
|
* @throws JSONException
|
||||||
|
*/
|
||||||
|
@POST
|
||||||
|
@Path("batch/ocr")
|
||||||
|
@Produces(MediaType.APPLICATION_JSON)
|
||||||
|
public Response batchReindex() throws JSONException {
|
||||||
|
if (!authenticate()) {
|
||||||
|
throw new ForbiddenClientException();
|
||||||
|
}
|
||||||
|
checkBaseFunction(BaseFunction.ADMIN);
|
||||||
|
|
||||||
|
FileDao fileDao = new FileDao();
|
||||||
|
DocumentDao documentDao = new DocumentDao();
|
||||||
|
List<File> fileList = fileDao.findAll();
|
||||||
|
for (File file : fileList) {
|
||||||
|
Document document = documentDao.getById(file.getDocumentId());
|
||||||
|
FileUtil.ocrFile(document, file);
|
||||||
|
}
|
||||||
|
|
||||||
|
JSONObject response = new JSONObject();
|
||||||
|
response.put("status", "ok");
|
||||||
|
return Response.ok().entity(response).build();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -23,13 +23,12 @@ public class TestAppResource extends BaseJerseyTest {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testAppResource() throws JSONException {
|
public void testAppResource() throws JSONException {
|
||||||
// Login app1
|
// Login admin
|
||||||
clientUtil.createUser("app1");
|
String adminAuthenticationToken = clientUtil.login("admin", "admin", false);
|
||||||
String app1Token = clientUtil.login("app1");
|
|
||||||
|
|
||||||
// Check the application info
|
// Check the application info
|
||||||
WebResource appResource = resource().path("/app");
|
WebResource appResource = resource().path("/app");
|
||||||
appResource.addFilter(new CookieAuthenticationFilter(app1Token));
|
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
|
||||||
ClientResponse response = appResource.get(ClientResponse.class);
|
ClientResponse response = appResource.get(ClientResponse.class);
|
||||||
response = appResource.get(ClientResponse.class);
|
response = appResource.get(ClientResponse.class);
|
||||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||||
@ -43,6 +42,13 @@ public class TestAppResource extends BaseJerseyTest {
|
|||||||
Long totalMemory = json.getLong("total_memory");
|
Long totalMemory = json.getLong("total_memory");
|
||||||
Assert.assertTrue(totalMemory > 0 && totalMemory > freeMemory);
|
Assert.assertTrue(totalMemory > 0 && totalMemory > freeMemory);
|
||||||
Assert.assertEquals(0, json.getInt("document_count"));
|
Assert.assertEquals(0, json.getInt("document_count"));
|
||||||
|
|
||||||
|
// OCR-ize all files
|
||||||
|
appResource = resource().path("/app/batch/ocr");
|
||||||
|
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
|
||||||
|
response = appResource.post(ClientResponse.class);
|
||||||
|
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||||
|
json = response.getEntity(JSONObject.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -6,14 +6,21 @@ import com.sun.jersey.api.client.ClientResponse;
|
|||||||
import com.sun.jersey.api.client.ClientResponse.Status;
|
import com.sun.jersey.api.client.ClientResponse.Status;
|
||||||
import com.sun.jersey.api.client.WebResource;
|
import com.sun.jersey.api.client.WebResource;
|
||||||
import com.sun.jersey.core.util.MultivaluedMapImpl;
|
import com.sun.jersey.core.util.MultivaluedMapImpl;
|
||||||
|
import com.sun.jersey.multipart.FormDataBodyPart;
|
||||||
|
import com.sun.jersey.multipart.FormDataMultiPart;
|
||||||
|
|
||||||
import junit.framework.Assert;
|
import junit.framework.Assert;
|
||||||
import org.codehaus.jettison.json.JSONArray;
|
import org.codehaus.jettison.json.JSONArray;
|
||||||
import org.codehaus.jettison.json.JSONException;
|
import org.codehaus.jettison.json.JSONException;
|
||||||
import org.codehaus.jettison.json.JSONObject;
|
import org.codehaus.jettison.json.JSONObject;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.InputStream;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
|
|
||||||
|
import javax.ws.rs.core.MediaType;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Exhaustive test of the document resource.
|
* Exhaustive test of the document resource.
|
||||||
*
|
*
|
||||||
@ -59,6 +66,20 @@ public class TestDocumentResource extends BaseJerseyTest {
|
|||||||
String document1Id = json.optString("id");
|
String document1Id = json.optString("id");
|
||||||
Assert.assertNotNull(document1Id);
|
Assert.assertNotNull(document1Id);
|
||||||
|
|
||||||
|
// Add a file
|
||||||
|
WebResource fileResource = resource().path("/file");
|
||||||
|
fileResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||||
|
FormDataMultiPart form = new FormDataMultiPart();
|
||||||
|
InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png");
|
||||||
|
FormDataBodyPart fdp = new FormDataBodyPart("file",
|
||||||
|
new BufferedInputStream(file),
|
||||||
|
MediaType.APPLICATION_OCTET_STREAM_TYPE);
|
||||||
|
form.bodyPart(fdp);
|
||||||
|
form.field("id", document1Id);
|
||||||
|
response = fileResource.type(MediaType.MULTIPART_FORM_DATA).put(ClientResponse.class, form);
|
||||||
|
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||||
|
json = response.getEntity(JSONObject.class);
|
||||||
|
|
||||||
// Share this document
|
// Share this document
|
||||||
WebResource fileShareResource = resource().path("/share");
|
WebResource fileShareResource = resource().path("/share");
|
||||||
fileShareResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
fileShareResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||||
@ -91,7 +112,7 @@ public class TestDocumentResource extends BaseJerseyTest {
|
|||||||
documentResource = resource().path("/document/list");
|
documentResource = resource().path("/document/list");
|
||||||
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||||
getParams = new MultivaluedMapImpl();
|
getParams = new MultivaluedMapImpl();
|
||||||
getParams.putSingle("search", "Sup");
|
getParams.putSingle("search", "uranium");
|
||||||
response = documentResource.queryParams(getParams).get(ClientResponse.class);
|
response = documentResource.queryParams(getParams).get(ClientResponse.class);
|
||||||
json = response.getEntity(JSONObject.class);
|
json = response.getEntity(JSONObject.class);
|
||||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||||
@ -154,7 +175,7 @@ public class TestDocumentResource extends BaseJerseyTest {
|
|||||||
documentResource = resource().path("/document/list");
|
documentResource = resource().path("/document/list");
|
||||||
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||||
getParams = new MultivaluedMapImpl();
|
getParams = new MultivaluedMapImpl();
|
||||||
getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng for");
|
getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng uranium");
|
||||||
response = documentResource.queryParams(getParams).get(ClientResponse.class);
|
response = documentResource.queryParams(getParams).get(ClientResponse.class);
|
||||||
json = response.getEntity(JSONObject.class);
|
json = response.getEntity(JSONObject.class);
|
||||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||||
|
@ -53,7 +53,7 @@ public class TestFileResource extends BaseJerseyTest {
|
|||||||
WebResource fileResource = resource().path("/file");
|
WebResource fileResource = resource().path("/file");
|
||||||
fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken));
|
fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken));
|
||||||
FormDataMultiPart form = new FormDataMultiPart();
|
FormDataMultiPart form = new FormDataMultiPart();
|
||||||
InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png");
|
InputStream file = this.getClass().getResourceAsStream("/file/PIA00452.jpg");
|
||||||
FormDataBodyPart fdp = new FormDataBodyPart("file",
|
FormDataBodyPart fdp = new FormDataBodyPart("file",
|
||||||
new BufferedInputStream(file),
|
new BufferedInputStream(file),
|
||||||
MediaType.APPLICATION_OCTET_STREAM_TYPE);
|
MediaType.APPLICATION_OCTET_STREAM_TYPE);
|
||||||
@ -88,7 +88,7 @@ public class TestFileResource extends BaseJerseyTest {
|
|||||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||||
InputStream is = response.getEntityInputStream();
|
InputStream is = response.getEntityInputStream();
|
||||||
byte[] fileBytes = ByteStreams.toByteArray(is);
|
byte[] fileBytes = ByteStreams.toByteArray(is);
|
||||||
Assert.assertEquals(292641, fileBytes.length);
|
Assert.assertEquals(163510, fileBytes.length);
|
||||||
|
|
||||||
// Get the thumbnail data
|
// Get the thumbnail data
|
||||||
fileResource = resource().path("/file/" + file1Id + "/data");
|
fileResource = resource().path("/file/" + file1Id + "/data");
|
||||||
@ -99,7 +99,7 @@ public class TestFileResource extends BaseJerseyTest {
|
|||||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||||
is = response.getEntityInputStream();
|
is = response.getEntityInputStream();
|
||||||
fileBytes = ByteStreams.toByteArray(is);
|
fileBytes = ByteStreams.toByteArray(is);
|
||||||
Assert.assertEquals(34050, fileBytes.length);
|
Assert.assertEquals(41935, fileBytes.length);
|
||||||
|
|
||||||
// Get all files from a document
|
// Get all files from a document
|
||||||
fileResource = resource().path("/file/list");
|
fileResource = resource().path("/file/list");
|
||||||
|
Loading…
Reference in New Issue
Block a user