Force loading of corrupted PDFs; rename the batch OCR endpoint to batch/extract; accept PDF uploads

This commit is contained in:
jendib 2013-08-18 14:11:08 +02:00
parent 77c5a10aba
commit 2c7083aa43
5 changed files with 9 additions and 9 deletions

View File

@ -40,7 +40,7 @@ public class FileCreatedAsyncListener {
final File file = fileCreatedAsyncEvent.getFile(); final File file = fileCreatedAsyncEvent.getFile();
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file); final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file);
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime)); log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
// Store the OCR-ization result in the database // Store the OCR-ization result in the database
TransactionUtil.handle(new Runnable() { TransactionUtil.handle(new Runnable() {

View File

@ -103,7 +103,7 @@ public class FileUtil {
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile(); java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
try { try {
PDFTextStripper stripper = new PDFTextStripper(); PDFTextStripper stripper = new PDFTextStripper();
pdfDocument = PDDocument.load(storedfile); pdfDocument = PDDocument.load(storedfile.getAbsolutePath(), true);
content = stripper.getText(pdfDocument); content = stripper.getText(pdfDocument);
} catch (IOException e) { } catch (IOException e) {
log.error("Error while extracting text from the PDF " + storedfile, e); log.error("Error while extracting text from the PDF " + storedfile, e);
@ -153,7 +153,7 @@ public class FileUtil {
image = ImageIO.read(originalFile); image = ImageIO.read(originalFile);
} else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) { } else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
// Generate preview from the first page of the PDF // Generate preview from the first page of the PDF
PDDocument pdfDocument = PDDocument.load(originalFile); PDDocument pdfDocument = PDDocument.load(originalFile.getAbsolutePath(), true);
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
List<PDPage> pageList = pdfDocument.getDocumentCatalog().getAllPages(); List<PDPage> pageList = pdfDocument.getDocumentCatalog().getAllPages();
if (pageList.size() > 0) { if (pageList.size() > 0) {

View File

@ -148,21 +148,21 @@ public class AppResource extends BaseResource {
} }
/** /**
* OCR-ize all files again. * Extract content from all files again.
* *
* @return Response * @return Response
* @throws JSONException * @throws JSONException
*/ */
@POST @POST
@Path("batch/ocr") @Path("batch/extract")
@Produces(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON)
public Response batchOcr() throws JSONException { public Response batchExtract() throws JSONException {
if (!authenticate()) { if (!authenticate()) {
throw new ForbiddenClientException(); throw new ForbiddenClientException();
} }
checkBaseFunction(BaseFunction.ADMIN); checkBaseFunction(BaseFunction.ADMIN);
// Raise a OCR file event // Raise an extract file content event
AppContext.getInstance().getAsyncEventBus().post(new ExtractFileAsyncEvent()); AppContext.getInstance().getAsyncEventBus().post(new ExtractFileAsyncEvent());
JSONObject response = new JSONObject(); JSONObject response = new JSONObject();

View File

@ -33,7 +33,7 @@
<div class="control-group"> <div class="control-group">
<label class="control-label" for="inputFiles">New files</label> <label class="control-label" for="inputFiles">New files</label>
<div class="controls"> <div class="controls">
<file class="input-block-level" id="inputFiles" multiple="multiple" ng-model="newFiles" accept="image/png,image/jpg,image/jpeg,image/gif" /> <file class="input-block-level" id="inputFiles" multiple="multiple" ng-model="newFiles" accept="image/png,image/jpg,image/jpeg,image/gif,application/pdf" />
</div> </div>
</div> </div>
<div class="control-group"> <div class="control-group">

View File

@ -44,7 +44,7 @@ public class TestAppResource extends BaseJerseyTest {
Assert.assertEquals(0, json.getInt("document_count")); Assert.assertEquals(0, json.getInt("document_count"));
// OCR-ize all files // OCR-ize all files
appResource = resource().path("/app/batch/ocr"); appResource = resource().path("/app/batch/extract");
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken)); appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
response = appResource.post(ClientResponse.class); response = appResource.post(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));