From 2c7083aa43b7e6636614460eb09c1be63bf3bd0e Mon Sep 17 00:00:00 2001 From: jendib Date: Sun, 18 Aug 2013 14:11:08 +0200 Subject: [PATCH] Force loading of corrupted PDF --- .../core/listener/async/FileCreatedAsyncListener.java | 2 +- .../main/java/com/sismics/docs/core/util/FileUtil.java | 4 ++-- .../java/com/sismics/docs/rest/resource/AppResource.java | 8 ++++---- docs-web/src/main/webapp/partial/docs/document.edit.html | 2 +- .../test/java/com/sismics/docs/rest/TestAppResource.java | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java index 6ea1a214..ffb2af22 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java +++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java @@ -40,7 +40,7 @@ public class FileCreatedAsyncListener { final File file = fileCreatedAsyncEvent.getFile(); long startTime = System.currentTimeMillis(); final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file); - log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime)); + log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime)); // Store the OCR-ization result in the database TransactionUtil.handle(new Runnable() { diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index 5ab00ae4..869b2f04 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -103,7 +103,7 @@ public class FileUtil { java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile(); try { PDFTextStripper stripper = new PDFTextStripper(); - pdfDocument = PDDocument.load(storedfile); + pdfDocument = PDDocument.load(storedfile.getAbsolutePath(), true); content = stripper.getText(pdfDocument); } catch (IOException e) { log.error("Error while extracting text from the PDF " + storedfile, e); @@ -153,7 +153,7 @@ public class FileUtil { image = ImageIO.read(originalFile); } else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) { // Generate preview from the first page of the PDF - PDDocument pdfDocument = PDDocument.load(originalFile); + PDDocument pdfDocument = PDDocument.load(originalFile.getAbsolutePath(), true); @SuppressWarnings("unchecked") List pageList = pdfDocument.getDocumentCatalog().getAllPages(); if (pageList.size() > 0) { diff --git a/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java b/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java index 5e90d110..06402533 100644 --- a/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java +++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/AppResource.java @@ -148,21 +148,21 @@ public class AppResource extends BaseResource { } /** - * OCR-ize all files again. + * Extract content from all files again. * * @return Response * @throws JSONException */ @POST - @Path("batch/ocr") + @Path("batch/extract") @Produces(MediaType.APPLICATION_JSON) - public Response batchOcr() throws JSONException { + public Response batchExtract() throws JSONException { if (!authenticate()) { throw new ForbiddenClientException(); } checkBaseFunction(BaseFunction.ADMIN); - // Raise a OCR file event + // Raise an extract file content event AppContext.getInstance().getAsyncEventBus().post(new ExtractFileAsyncEvent()); JSONObject response = new JSONObject(); diff --git a/docs-web/src/main/webapp/partial/docs/document.edit.html b/docs-web/src/main/webapp/partial/docs/document.edit.html index c2fb318e..e3b7eb9d 100644 --- a/docs-web/src/main/webapp/partial/docs/document.edit.html +++ b/docs-web/src/main/webapp/partial/docs/document.edit.html @@ -33,7 +33,7 @@
- +
diff --git a/docs-web/src/test/java/com/sismics/docs/rest/TestAppResource.java b/docs-web/src/test/java/com/sismics/docs/rest/TestAppResource.java index 92600e29..3cfc0040 100644 --- a/docs-web/src/test/java/com/sismics/docs/rest/TestAppResource.java +++ b/docs-web/src/test/java/com/sismics/docs/rest/TestAppResource.java @@ -44,7 +44,7 @@ public class TestAppResource extends BaseJerseyTest { Assert.assertEquals(0, json.getInt("document_count")); // OCR-ize all files - appResource = resource().path("/app/batch/ocr"); + appResource = resource().path("/app/batch/extract"); appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken)); response = appResource.post(ClientResponse.class); Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));