Index and generate thumbnails from PDF

This commit is contained in:
jendib 2013-08-18 13:48:08 +02:00
parent 4f8076427b
commit d3f59554f8
11 changed files with 245 additions and 105 deletions

View File

@ -117,6 +117,11 @@
<artifactId>imgscalr-lib</artifactId> <artifactId>imgscalr-lib</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
</dependency>
<!-- OCR dependencies --> <!-- OCR dependencies -->
<dependency> <dependency>
<groupId>jna</groupId> <groupId>jna</groupId>

View File

@ -3,11 +3,11 @@ package com.sismics.docs.core.event;
import com.google.common.base.Objects; import com.google.common.base.Objects;
/** /**
* OCR all files in database event. * Extract file content event.
* *
* @author bgamard * @author bgamard
*/ */
public class OcrFileAsyncEvent { public class ExtractFileAsyncEvent {
@Override @Override
public String toString() { public String toString() {
return Objects.toStringHelper(this) return Objects.toStringHelper(this)

View File

@ -9,33 +9,33 @@ import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe; import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.dao.jpa.DocumentDao; import com.sismics.docs.core.dao.jpa.DocumentDao;
import com.sismics.docs.core.dao.jpa.FileDao; import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.event.OcrFileAsyncEvent; import com.sismics.docs.core.event.ExtractFileAsyncEvent;
import com.sismics.docs.core.model.jpa.Document; import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.FileUtil; import com.sismics.docs.core.util.FileUtil;
import com.sismics.docs.core.util.TransactionUtil; import com.sismics.docs.core.util.TransactionUtil;
/** /**
* Listener on OCR all files in database. * Listener on extract content from all files.
* *
* @author bgamard * @author bgamard
*/ */
public class OcrFileAsyncListener { public class ExtractFileAsyncListener {
/** /**
* Logger. * Logger.
*/ */
private static final Logger log = LoggerFactory.getLogger(OcrFileAsyncListener.class); private static final Logger log = LoggerFactory.getLogger(ExtractFileAsyncListener.class);
/** /**
* OCR all files. * Extract content from all files.
* *
* @param ocrFileAsyncEvent OCR all files in database event * @param extractFileAsyncEvent Extract file content event
* @throws Exception * @throws Exception
*/ */
@Subscribe @Subscribe
public void on(final OcrFileAsyncEvent ocrFileAsyncEvent) throws Exception { public void on(final ExtractFileAsyncEvent extractFileAsyncEvent) throws Exception {
if (log.isInfoEnabled()) { if (log.isInfoEnabled()) {
log.info("OCR all files in database event: " + ocrFileAsyncEvent.toString()); log.info("Extract file content event: " + extractFileAsyncEvent.toString());
} }
TransactionUtil.handle(new Runnable() { TransactionUtil.handle(new Runnable() {
@ -47,10 +47,9 @@ public class OcrFileAsyncListener {
for (File file : fileList) { for (File file : fileList) {
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
Document document = documentDao.getById(file.getDocumentId()); Document document = documentDao.getById(file.getDocumentId());
String content = FileUtil.ocrFile(document, file); file.setContent(FileUtil.extractContent(document, file));
file.setContent(content);
TransactionUtil.commit(); TransactionUtil.commit();
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime)); log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
} }
} }
}); });

View File

@ -39,7 +39,7 @@ public class FileCreatedAsyncListener {
// OCR the file // OCR the file
final File file = fileCreatedAsyncEvent.getFile(); final File file = fileCreatedAsyncEvent.getFile();
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
final String content = FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), file); final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file);
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime)); log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
// Store the OCR-ization result in the database // Store the OCR-ization result in the database

View File

@ -16,7 +16,7 @@ import com.sismics.docs.core.listener.async.DocumentDeletedAsyncListener;
import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener; import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener;
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener; import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
import com.sismics.docs.core.listener.async.FileDeletedAsyncListener; import com.sismics.docs.core.listener.async.FileDeletedAsyncListener;
import com.sismics.docs.core.listener.async.OcrFileAsyncListener; import com.sismics.docs.core.listener.async.ExtractFileAsyncListener;
import com.sismics.docs.core.listener.async.RebuildIndexAsyncListener; import com.sismics.docs.core.listener.async.RebuildIndexAsyncListener;
import com.sismics.docs.core.listener.sync.DeadEventListener; import com.sismics.docs.core.listener.sync.DeadEventListener;
import com.sismics.docs.core.model.jpa.Config; import com.sismics.docs.core.model.jpa.Config;
@ -82,7 +82,7 @@ public class AppContext {
asyncEventBus.register(new DocumentUpdatedAsyncListener()); asyncEventBus.register(new DocumentUpdatedAsyncListener());
asyncEventBus.register(new DocumentDeletedAsyncListener()); asyncEventBus.register(new DocumentDeletedAsyncListener());
asyncEventBus.register(new RebuildIndexAsyncListener()); asyncEventBus.register(new RebuildIndexAsyncListener());
asyncEventBus.register(new OcrFileAsyncListener()); asyncEventBus.register(new ExtractFileAsyncListener());
} }
/** /**

View File

@ -6,11 +6,15 @@ import java.io.InputStream;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.List;
import javax.imageio.ImageIO; import javax.imageio.ImageIO;
import net.sourceforge.tess4j.Tesseract; import net.sourceforge.tess4j.Tesseract;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFTextStripper;
import org.imgscalr.Scalr; import org.imgscalr.Scalr;
import org.imgscalr.Scalr.Method; import org.imgscalr.Scalr.Method;
import org.imgscalr.Scalr.Mode; import org.imgscalr.Scalr.Mode;
@ -20,6 +24,7 @@ import org.slf4j.LoggerFactory;
import com.sismics.docs.core.model.jpa.Document; import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.model.jpa.File;
import com.sismics.util.ImageUtil; import com.sismics.util.ImageUtil;
import com.sismics.util.mime.MimeType;
/** /**
* File entity utilities. * File entity utilities.
@ -33,18 +38,32 @@ public class FileUtil {
private static final Logger log = LoggerFactory.getLogger(FileUtil.class); private static final Logger log = LoggerFactory.getLogger(FileUtil.class);
/**
 * Extract content from a file, choosing the extraction strategy from its MIME type.
 *
 * Images are sent through optical character recognition and PDF files
 * through PDF text extraction. Any other MIME type, or a file without
 * a MIME type, produces no content.
 *
 * @param document Document linked to the file
 * @param file File to extract
 * @return Content extracted, or null if the MIME type is missing or not handled
 */
public static String extractContent(Document document, File file) {
    String mimeType = file.getMimeType();
    if (mimeType == null) {
        // Nothing to dispatch on; avoids a NullPointerException further down
        return null;
    }

    if (ImageUtil.isImage(mimeType)) {
        return ocrFile(document, file);
    }
    // Constant-first comparison keeps this null-safe on its own
    if (MimeType.APPLICATION_PDF.equals(mimeType)) {
        return extractPdf(file);
    }

    // Not an extractable type
    return null;
}
/**
* Optical character recognition on a file.
* *
* @param document Document linked to the file * @param document Document linked to the file
* @param file File to OCR * @param file File to OCR
* @return OCR-ized content * @return Content extracted
*/ */
public static String ocrFile(Document document, final File file) { private static String ocrFile(Document document, File file) {
if (!ImageUtil.isImage(file.getMimeType())) {
// The file is not OCR-izable
return null;
}
Tesseract instance = Tesseract.getInstance(); Tesseract instance = Tesseract.getInstance();
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile(); java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
String content = null; String content = null;
@ -72,6 +91,35 @@ public class FileUtil {
return content; return content;
} }
/**
 * Extract text from a PDF.
 *
 * The PDF is read from the storage directory, where it is stored under the
 * file ID. Extraction errors are logged and not propagated to the caller.
 *
 * @param file File to extract
 * @return Content extracted, or null if the PDF could not be read
 */
private static String extractPdf(File file) {
    java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(storedfile);
        return new PDFTextStripper().getText(pdfDocument);
    } catch (IOException e) {
        log.error("Error while extracting text from the PDF " + storedfile, e);
        return null;
    } finally {
        if (pdfDocument != null) {
            try {
                pdfDocument.close();
            } catch (IOException e) {
                // Closing is best-effort, but leave a trace instead of hiding the failure
                log.warn("Error while closing the PDF " + storedfile, e);
            }
        }
    }
}
/** /**
* Save a file on the storage filesystem. * Save a file on the storage filesystem.
* *
@ -84,7 +132,12 @@ public class FileUtil {
Files.copy(is, path); Files.copy(is, path);
// Generate file variations // Generate file variations
try {
saveVariations(file, path.toFile()); saveVariations(file, path.toFile());
} catch (IOException e) {
// Don't rethrow Exception from file variations generation
log.error("Error creating file variations", e);
}
} }
/** /**
@ -95,8 +148,22 @@ public class FileUtil {
* @throws IOException * @throws IOException
*/ */
public static void saveVariations(File file, java.io.File originalFile) throws IOException { public static void saveVariations(File file, java.io.File originalFile) throws IOException {
BufferedImage image = null;
if (ImageUtil.isImage(file.getMimeType())) { if (ImageUtil.isImage(file.getMimeType())) {
BufferedImage image = ImageIO.read(originalFile); image = ImageIO.read(originalFile);
} else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
// Generate preview from the first page of the PDF
PDDocument pdfDocument = PDDocument.load(originalFile);
@SuppressWarnings("unchecked")
List<PDPage> pageList = pdfDocument.getDocumentCatalog().getAllPages();
if (pageList.size() > 0) {
PDPage page = pageList.get(0);
image = page.convertToImage();
}
}
if (image != null) {
// Generate thumbnails from image
BufferedImage web = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 1280, Scalr.OP_ANTIALIAS); BufferedImage web = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 1280, Scalr.OP_ANTIALIAS);
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 256, Scalr.OP_ANTIALIAS); BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 256, Scalr.OP_ANTIALIAS);
image.flush(); image.flush();

View File

@ -1,2 +0,0 @@
- Extract text from PDF for indexing, see PDFBox (server)
- Make thumbnail of the first page of PDF, see PDFBox (server)

View File

@ -62,6 +62,7 @@
<org.mortbay.jetty.jetty-maven-plugin.version>8.1.2.v20120308</org.mortbay.jetty.jetty-maven-plugin.version> <org.mortbay.jetty.jetty-maven-plugin.version>8.1.2.v20120308</org.mortbay.jetty.jetty-maven-plugin.version>
<org.vafer.jdeb.version>1.0.1</org.vafer.jdeb.version> <org.vafer.jdeb.version>1.0.1</org.vafer.jdeb.version>
<com.samaxes.maven.minify-maven-plugin.version>1.7</com.samaxes.maven.minify-maven-plugin.version> <com.samaxes.maven.minify-maven-plugin.version>1.7</com.samaxes.maven.minify-maven-plugin.version>
<org.apache.pdfbox.pdfbox.version>1.8.2</org.apache.pdfbox.pdfbox.version>
</properties> </properties>
<scm> <scm>
@ -436,6 +437,12 @@
<version>${org.imgscalr.imgscalr-lib.version}</version> <version>${org.imgscalr.imgscalr-lib.version}</version>
</dependency> </dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>${org.apache.pdfbox.pdfbox.version}</version>
</dependency>
<!-- OCR dependencies --> <!-- OCR dependencies -->
<dependency> <dependency>
<groupId>jna</groupId> <groupId>jna</groupId>

View File

@ -26,7 +26,7 @@ import com.sismics.docs.core.dao.jpa.DocumentDao;
import com.sismics.docs.core.dao.jpa.FileDao; import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria; import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto; import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.event.OcrFileAsyncEvent; import com.sismics.docs.core.event.ExtractFileAsyncEvent;
import com.sismics.docs.core.model.context.AppContext; import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.ConfigUtil; import com.sismics.docs.core.util.ConfigUtil;
@ -163,7 +163,7 @@ public class AppResource extends BaseResource {
checkBaseFunction(BaseFunction.ADMIN); checkBaseFunction(BaseFunction.ADMIN);
// Raise a OCR file event // Raise a OCR file event
AppContext.getInstance().getAsyncEventBus().post(new OcrFileAsyncEvent()); AppContext.getInstance().getAsyncEventBus().post(new ExtractFileAsyncEvent());
JSONObject response = new JSONObject(); JSONObject response = new JSONObject();
response.put("status", "ok"); response.put("status", "ok");

View File

@ -15,6 +15,7 @@ import org.codehaus.jettison.json.JSONObject;
import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormat;
import org.junit.Test; import org.junit.Test;
import com.google.common.io.ByteStreams;
import com.sismics.docs.rest.filter.CookieAuthenticationFilter; import com.sismics.docs.rest.filter.CookieAuthenticationFilter;
import com.sun.jersey.api.client.ClientResponse; import com.sun.jersey.api.client.ClientResponse;
import com.sun.jersey.api.client.ClientResponse.Status; import com.sun.jersey.api.client.ClientResponse.Status;
@ -354,4 +355,67 @@ public class TestDocumentResource extends BaseJerseyTest {
json = response.getEntity(JSONObject.class); json = response.getEntity(JSONObject.class);
Assert.assertEquals(Status.BAD_REQUEST, Status.fromStatusCode(response.getStatus())); Assert.assertEquals(Status.BAD_REQUEST, Status.fromStatusCode(response.getStatus()));
} }
/**
 * Test PDF extraction.
 *
 * Creates a document, attaches a PDF file to it, then checks that the
 * document can be found by a full content search and that thumbnail data
 * is served for the PDF file.
 *
 * @throws Exception
 */
@Test
public void testPdfExtraction() throws Exception {
    // Login document2
    clientUtil.createUser("document2");
    String document2Token = clientUtil.login("document2");

    // Create a document
    WebResource documentResource = resource().path("/document");
    documentResource.addFilter(new CookieAuthenticationFilter(document2Token));
    MultivaluedMapImpl postParams = new MultivaluedMapImpl();
    postParams.add("title", "My super title document 1");
    postParams.add("description", "My super description for document 1");
    postParams.add("language", "eng");
    long create1Date = new Date().getTime();
    postParams.add("create_date", create1Date);
    ClientResponse response = documentResource.put(ClientResponse.class, postParams);
    Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
    JSONObject json = response.getEntity(JSONObject.class);
    // getString fails the test when the id is absent; optString would return an empty string
    String document1Id = json.getString("id");
    Assert.assertNotNull(document1Id);

    // Add a PDF file
    WebResource fileResource = resource().path("/file");
    fileResource.addFilter(new CookieAuthenticationFilter(document2Token));
    InputStream file = this.getClass().getResourceAsStream("/file/wikipedia.pdf");
    Assert.assertNotNull("Test resource /file/wikipedia.pdf not found", file);
    String file1Id;
    try {
        FormDataMultiPart form = new FormDataMultiPart();
        FormDataBodyPart fdp = new FormDataBodyPart("file",
                new BufferedInputStream(file),
                MediaType.APPLICATION_OCTET_STREAM_TYPE);
        form.bodyPart(fdp);
        form.field("id", document1Id);
        response = fileResource.type(MediaType.MULTIPART_FORM_DATA).put(ClientResponse.class, form);
        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
        json = response.getEntity(JSONObject.class);
        file1Id = json.getString("id");
    } finally {
        // Release the classpath resource whatever the outcome of the upload
        file.close();
    }

    // Search documents by query in full content
    documentResource = resource().path("/document/list");
    documentResource.addFilter(new CookieAuthenticationFilter(document2Token));
    MultivaluedMapImpl getParams = new MultivaluedMapImpl();
    getParams.putSingle("search", "full:vrandecic");
    response = documentResource.queryParams(getParams).get(ClientResponse.class);
    // Check the status first so an error response is reported as such
    Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
    json = response.getEntity(JSONObject.class);
    Assert.assertEquals(1, json.getJSONArray("documents").length());

    // Get the file thumbnail data
    fileResource = resource().path("/file/" + file1Id + "/data");
    fileResource.addFilter(new CookieAuthenticationFilter(document2Token));
    getParams = new MultivaluedMapImpl();
    getParams.putSingle("size", "thumb");
    response = fileResource.queryParams(getParams).get(ClientResponse.class);
    Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
    InputStream is = response.getEntityInputStream();
    try {
        byte[] fileBytes = ByteStreams.toByteArray(is);
        // NOTE(review): exact byte count presumably depends on the rendering and scaling output - confirm it is stable across environments
        Assert.assertEquals(3457, fileBytes.length);
    } finally {
        is.close();
    }
}
} }

Binary file not shown.