diff --git a/docs-core/pom.xml b/docs-core/pom.xml
index 0bf5cf36..3b6e8fc6 100644
--- a/docs-core/pom.xml
+++ b/docs-core/pom.xml
@@ -117,6 +117,11 @@
imgscalr-lib
+
+ org.apache.pdfbox
+ pdfbox
+
+
jna
diff --git a/docs-core/src/main/java/com/sismics/docs/core/event/OcrFileAsyncEvent.java b/docs-core/src/main/java/com/sismics/docs/core/event/ExtractFileAsyncEvent.java
similarity index 77%
rename from docs-core/src/main/java/com/sismics/docs/core/event/OcrFileAsyncEvent.java
rename to docs-core/src/main/java/com/sismics/docs/core/event/ExtractFileAsyncEvent.java
index c0926b9a..28dd6faa 100644
--- a/docs-core/src/main/java/com/sismics/docs/core/event/OcrFileAsyncEvent.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/event/ExtractFileAsyncEvent.java
@@ -3,11 +3,11 @@ package com.sismics.docs.core.event;
import com.google.common.base.Objects;
/**
- * OCR all files in database event.
+ * Extract file content event.
*
* @author bgamard
*/
-public class OcrFileAsyncEvent {
+public class ExtractFileAsyncEvent {
@Override
public String toString() {
return Objects.toStringHelper(this)
diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/OcrFileAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/ExtractFileAsyncListener.java
similarity index 62%
rename from docs-core/src/main/java/com/sismics/docs/core/listener/async/OcrFileAsyncListener.java
rename to docs-core/src/main/java/com/sismics/docs/core/listener/async/ExtractFileAsyncListener.java
index 1324bf39..8b532560 100644
--- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/OcrFileAsyncListener.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/ExtractFileAsyncListener.java
@@ -9,33 +9,33 @@ import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.dao.jpa.DocumentDao;
import com.sismics.docs.core.dao.jpa.FileDao;
-import com.sismics.docs.core.event.OcrFileAsyncEvent;
+import com.sismics.docs.core.event.ExtractFileAsyncEvent;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.docs.core.util.TransactionUtil;
/**
- * Listener on OCR all files in database.
+ * Listener on extract content from all files.
*
* @author bgamard
*/
-public class OcrFileAsyncListener {
+public class ExtractFileAsyncListener {
/**
* Logger.
*/
- private static final Logger log = LoggerFactory.getLogger(OcrFileAsyncListener.class);
+ private static final Logger log = LoggerFactory.getLogger(ExtractFileAsyncListener.class);
/**
- * OCR all files.
+ * Extract content from all files.
*
- * @param ocrFileAsyncEvent OCR all files in database event
+ * @param extractFileAsyncEvent Extract file content event
* @throws Exception
*/
@Subscribe
- public void on(final OcrFileAsyncEvent ocrFileAsyncEvent) throws Exception {
+ public void on(final ExtractFileAsyncEvent extractFileAsyncEvent) throws Exception {
if (log.isInfoEnabled()) {
- log.info("OCR all files in database event: " + ocrFileAsyncEvent.toString());
+ log.info("Extract file content event: " + extractFileAsyncEvent.toString());
}
TransactionUtil.handle(new Runnable() {
@@ -47,10 +47,9 @@ public class OcrFileAsyncListener {
for (File file : fileList) {
long startTime = System.currentTimeMillis();
Document document = documentDao.getById(file.getDocumentId());
- String content = FileUtil.ocrFile(document, file);
- file.setContent(content);
+ file.setContent(FileUtil.extractContent(document, file));
TransactionUtil.commit();
- log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
+ log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
}
}
});
diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java
index f01a9daa..6ea1a214 100644
--- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java
@@ -39,7 +39,7 @@ public class FileCreatedAsyncListener {
-        // OCR the file
+        // Extract the file content
final File file = fileCreatedAsyncEvent.getFile();
long startTime = System.currentTimeMillis();
- final String content = FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), file);
+ final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file);
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
-        // Store the OCR-ization result in the database
+        // Store the extracted content in the database
diff --git a/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java b/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java
index f95ec2e5..cf997a4b 100644
--- a/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/model/context/AppContext.java
@@ -16,7 +16,7 @@ import com.sismics.docs.core.listener.async.DocumentDeletedAsyncListener;
import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener;
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
import com.sismics.docs.core.listener.async.FileDeletedAsyncListener;
-import com.sismics.docs.core.listener.async.OcrFileAsyncListener;
+import com.sismics.docs.core.listener.async.ExtractFileAsyncListener;
import com.sismics.docs.core.listener.async.RebuildIndexAsyncListener;
import com.sismics.docs.core.listener.sync.DeadEventListener;
import com.sismics.docs.core.model.jpa.Config;
@@ -82,7 +82,7 @@ public class AppContext {
asyncEventBus.register(new DocumentUpdatedAsyncListener());
asyncEventBus.register(new DocumentDeletedAsyncListener());
asyncEventBus.register(new RebuildIndexAsyncListener());
- asyncEventBus.register(new OcrFileAsyncListener());
+ asyncEventBus.register(new ExtractFileAsyncListener());
}
/**
diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java
index fc0fc2ca..5ab00ae4 100644
--- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java
@@ -6,11 +6,15 @@ import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
+import java.util.List;
import javax.imageio.ImageIO;
import net.sourceforge.tess4j.Tesseract;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.util.PDFTextStripper;
import org.imgscalr.Scalr;
import org.imgscalr.Scalr.Method;
import org.imgscalr.Scalr.Mode;
@@ -20,6 +24,7 @@ import org.slf4j.LoggerFactory;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.util.ImageUtil;
+import com.sismics.util.mime.MimeType;
/**
* File entity utilities.
@@ -33,18 +38,32 @@ public class FileUtil {
private static final Logger log = LoggerFactory.getLogger(FileUtil.class);
/**
- * OCR a file.
+ * Extract content from a file.
+ *
+ * @param document Document linked to the file
+ * @param file File to extract
+ * @return Content extract
+ */
+    public static String extractContent(Document document, File file) {
+        String mimeType = file.getMimeType();
+
+        if (ImageUtil.isImage(mimeType)) {
+            // Images are sent through optical character recognition
+            return ocrFile(document, file);
+        }
+        if (MimeType.APPLICATION_PDF.equals(mimeType)) {
+            // Constant-first comparison: a file without a MIME type must not throw here
+            return extractPdf(file);
+        }
+
+        // Neither an image nor a PDF: no content can be extracted
+        return null;
+    }
+
+ /**
+ * Optical character recognition on a file.
*
* @param document Document linked to the file
* @param file File to OCR
- * @return OCR-ized content
+ * @return Content extracted
*/
- public static String ocrFile(Document document, final File file) {
- if (!ImageUtil.isImage(file.getMimeType())) {
- // The file is not OCR-izable
- return null;
- }
-
+ private static String ocrFile(Document document, File file) {
Tesseract instance = Tesseract.getInstance();
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
String content = null;
@@ -72,6 +91,35 @@ public class FileUtil {
return content;
}
+    /**
+     * Extract text from a PDF.
+     * The PDF is read from the storage directory, where it is stored under the file ID.
+     * I/O errors are logged and not rethrown.
+     *
+     * @param file File to extract
+     * @return Content extracted, or null if the PDF could not be read
+     */
+    private static String extractPdf(File file) {
+        java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
+        String content = null;
+        PDDocument pdfDocument = null;
+        try {
+            pdfDocument = PDDocument.load(storedfile);
+            content = new PDFTextStripper().getText(pdfDocument);
+        } catch (IOException e) {
+            // Best effort: an unreadable PDF simply yields no content
+            log.error("Error while extracting text from the PDF " + storedfile, e);
+        } finally {
+            if (pdfDocument != null) {
+                try {
+                    pdfDocument.close();
+                } catch (IOException e) {
+                    // Closing is cleanup only: report the failure without discarding the result
+                    log.warn("Error while closing the PDF " + storedfile, e);
+                }
+            }
+        }
+
+        return content;
+    }
+
+
/**
* Save a file on the storage filesystem.
*
@@ -84,7 +132,12 @@ public class FileUtil {
Files.copy(is, path);
// Generate file variations
- saveVariations(file, path.toFile());
+ try {
+ saveVariations(file, path.toFile());
+ } catch (IOException e) {
+ // Don't rethrow Exception from file variations generation
+ log.error("Error creating file variations", e);
+ }
}
/**
@@ -95,8 +148,22 @@ public class FileUtil {
* @throws IOException
*/
public static void saveVariations(File file, java.io.File originalFile) throws IOException {
+ BufferedImage image = null;
if (ImageUtil.isImage(file.getMimeType())) {
- BufferedImage image = ImageIO.read(originalFile);
+ image = ImageIO.read(originalFile);
+ } else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
+ // Generate preview from the first page of the PDF
+ PDDocument pdfDocument = PDDocument.load(originalFile);
+ @SuppressWarnings("unchecked")
+ List pageList = pdfDocument.getDocumentCatalog().getAllPages();
+ if (pageList.size() > 0) {
+ PDPage page = pageList.get(0);
+ image = page.convertToImage();
+ }
+ }
+
+ if (image != null) {
+ // Generate thumbnails from image
BufferedImage web = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 1280, Scalr.OP_ANTIALIAS);
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 256, Scalr.OP_ANTIALIAS);
image.flush();
diff --git a/docs-parent/TODO b/docs-parent/TODO
index a0c153ff..e69de29b 100644
--- a/docs-parent/TODO
+++ b/docs-parent/TODO
@@ -1,2 +0,0 @@
-- Extract text from PDF for indexing, see PDFBox (server)
-- Make thumbnail of the first page of PDF, see PDFBox (server)
\ No newline at end of file
diff --git a/docs-parent/pom.xml b/docs-parent/pom.xml
index a033fb5c..d8e29a15 100644
--- a/docs-parent/pom.xml
+++ b/docs-parent/pom.xml
@@ -15,7 +15,7 @@
1.7
UTF-8
-
+
1.5
2.6
2.1
@@ -62,6 +62,7 @@
8.1.2.v20120308
1.0.1
1.7
+ 1.8.2
@@ -436,24 +437,30 @@
${org.imgscalr.imgscalr-lib.version}
+
+ org.apache.pdfbox
+ pdfbox
+ ${org.apache.pdfbox.pdfbox.version}
+
+
- jna
- jna
- 1.0
-
-
-
- jai
- imageio
- 1.0
-
-
-
- tess4j
- tess4j
- 1.0
-
+ jna
+ jna
+ 1.0
+
+
+
+ jai
+ imageio
+ 1.0
+
+
+
+ tess4j
+ tess4j
+ 1.0
+
@@ -478,64 +485,64 @@
- org.apache.maven.plugins
- maven-install-plugin
- 2.3.1
-
-
-
- install-jna
- validate
-
- ${project.basedir}/lib/jna.jar
- default
- jna
- jna
- 1.0
- jar
- true
-
-
- install-file
-
-
-
-
- install-jai-imageio
- validate
-
- ${project.basedir}/lib/jai_imageio.jar
- default
- jai
- imageio
- 1.0
- jar
- true
-
-
- install-file
-
-
-
-
- install-tess4j
- validate
-
- ${project.basedir}/lib/tess4j.jar
- default
- tess4j
- tess4j
- 1.0
- jar
- true
-
-
- install-file
-
-
-
-
-
+ org.apache.maven.plugins
+ maven-install-plugin
+ 2.3.1
+
+
+
+ install-jna
+ validate
+
+ ${project.basedir}/lib/jna.jar
+ default
+ jna
+ jna
+ 1.0
+ jar
+ true
+
+
+ install-file
+
+
+
+
+ install-jai-imageio
+ validate
+
+ ${project.basedir}/lib/jai_imageio.jar
+ default
+ jai
+ imageio
+ 1.0
+ jar
+ true
+
+
+ install-file
+
+
+
+
+ install-tess4j
+ validate
+
+ ${project.basedir}/lib/tess4j.jar
+ default
+ tess4j
+ tess4j
+ 1.0
+ jar
+ true
+
+
+ install-file
+
+
+
+
+
diff --git a/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java b/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java
index b93c694c..5e90d110 100644
--- a/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java
+++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java
@@ -26,7 +26,7 @@ import com.sismics.docs.core.dao.jpa.DocumentDao;
import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
-import com.sismics.docs.core.event.OcrFileAsyncEvent;
+import com.sismics.docs.core.event.ExtractFileAsyncEvent;
import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.ConfigUtil;
@@ -163,7 +163,7 @@ public class AppResource extends BaseResource {
checkBaseFunction(BaseFunction.ADMIN);
-        // Raise a OCR file event
+        // Raise an extract file content event
- AppContext.getInstance().getAsyncEventBus().post(new OcrFileAsyncEvent());
+ AppContext.getInstance().getAsyncEventBus().post(new ExtractFileAsyncEvent());
JSONObject response = new JSONObject();
response.put("status", "ok");
diff --git a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java
index b4e38286..614fd58c 100644
--- a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java
+++ b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java
@@ -15,6 +15,7 @@ import org.codehaus.jettison.json.JSONObject;
import org.joda.time.format.DateTimeFormat;
import org.junit.Test;
+import com.google.common.io.ByteStreams;
import com.sismics.docs.rest.filter.CookieAuthenticationFilter;
import com.sun.jersey.api.client.ClientResponse;
import com.sun.jersey.api.client.ClientResponse.Status;
@@ -354,4 +355,67 @@ public class TestDocumentResource extends BaseJerseyTest {
json = response.getEntity(JSONObject.class);
Assert.assertEquals(Status.BAD_REQUEST, Status.fromStatusCode(response.getStatus()));
}
+
+    /**
+     * Test PDF extraction.
+     * Uploads a PDF to a new document, then checks that the document is found by a
+     * full content search and that a thumbnail is served for the file.
+     *
+     * @throws Exception
+     */
+    @Test
+    public void testPdfExtraction() throws Exception {
+        // Login document2
+        clientUtil.createUser("document2");
+        String document2Token = clientUtil.login("document2");
+
+        // Create a document
+        WebResource documentResource = resource().path("/document");
+        documentResource.addFilter(new CookieAuthenticationFilter(document2Token));
+        MultivaluedMapImpl postParams = new MultivaluedMapImpl();
+        postParams.add("title", "My super title document 1");
+        postParams.add("description", "My super description for document 1");
+        postParams.add("language", "eng");
+        long create1Date = new Date().getTime();
+        postParams.add("create_date", create1Date);
+        ClientResponse response = documentResource.put(ClientResponse.class, postParams);
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        JSONObject json = response.getEntity(JSONObject.class);
+        // getString fails the test right here if the response carries no ID
+        String document1Id = json.getString("id");
+
+        // Add a PDF file
+        WebResource fileResource = resource().path("/file");
+        fileResource.addFilter(new CookieAuthenticationFilter(document2Token));
+        FormDataMultiPart form = new FormDataMultiPart();
+        InputStream file = this.getClass().getResourceAsStream("/file/wikipedia.pdf");
+        FormDataBodyPart fdp = new FormDataBodyPart("file",
+                new BufferedInputStream(file),
+                MediaType.APPLICATION_OCTET_STREAM_TYPE);
+        form.bodyPart(fdp);
+        form.field("id", document1Id);
+        response = fileResource.type(MediaType.MULTIPART_FORM_DATA).put(ClientResponse.class, form);
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        json = response.getEntity(JSONObject.class);
+        String file1Id = json.getString("id");
+
+        // Search documents by query in full content
+        // NOTE(review): content extraction is triggered by an asynchronous event; presumably
+        // it has completed by the time the search runs in the test setup - confirm
+        documentResource = resource().path("/document/list");
+        documentResource.addFilter(new CookieAuthenticationFilter(document2Token));
+        MultivaluedMapImpl getParams = new MultivaluedMapImpl();
+        getParams.putSingle("search", "full:vrandecic");
+        response = documentResource.queryParams(getParams).get(ClientResponse.class);
+        json = response.getEntity(JSONObject.class);
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        Assert.assertEquals(1, json.getJSONArray("documents").length());
+
+        // Get the file thumbnail data
+        fileResource = resource().path("/file/" + file1Id + "/data");
+        fileResource.addFilter(new CookieAuthenticationFilter(document2Token));
+        getParams = new MultivaluedMapImpl();
+        getParams.putSingle("size", "thumb");
+        response = fileResource.queryParams(getParams).get(ClientResponse.class);
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        InputStream is = response.getEntityInputStream();
+        byte[] fileBytes = ByteStreams.toByteArray(is);
+        // NOTE(review): pins the exact byte size of the returned thumbnail; presumably
+        // sensitive to PDF rendering and image encoding changes - confirm
+        Assert.assertEquals(3457, fileBytes.length);
+    }
}
\ No newline at end of file
diff --git a/docs-web/src/test/resources/file/wikipedia.pdf b/docs-web/src/test/resources/file/wikipedia.pdf
new file mode 100644
index 00000000..08bc5206
Binary files /dev/null and b/docs-web/src/test/resources/file/wikipedia.pdf differ