Force loading of corrupted PDF

This commit is contained in:
jendib 2013-08-18 14:11:08 +02:00
parent 77c5a10aba
commit 2c7083aa43
5 changed files with 9 additions and 9 deletions

View File

@ -40,7 +40,7 @@ public class FileCreatedAsyncListener {
final File file = fileCreatedAsyncEvent.getFile();
long startTime = System.currentTimeMillis();
final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file);
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
// Store the extracted content in the database
TransactionUtil.handle(new Runnable() {

View File

@ -103,7 +103,7 @@ public class FileUtil {
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
try {
PDFTextStripper stripper = new PDFTextStripper();
pdfDocument = PDDocument.load(storedfile);
pdfDocument = PDDocument.load(storedfile.getAbsolutePath(), true);
content = stripper.getText(pdfDocument);
} catch (IOException e) {
log.error("Error while extracting text from the PDF " + storedfile, e);
@ -153,7 +153,7 @@ public class FileUtil {
image = ImageIO.read(originalFile);
} else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
// Generate preview from the first page of the PDF
PDDocument pdfDocument = PDDocument.load(originalFile);
PDDocument pdfDocument = PDDocument.load(originalFile.getAbsolutePath(), true);
@SuppressWarnings("unchecked")
List<PDPage> pageList = pdfDocument.getDocumentCatalog().getAllPages();
if (pageList.size() > 0) {

View File

@ -148,21 +148,21 @@ public class AppResource extends BaseResource {
}
/**
* OCR-ize all files again.
* Extract content from all files again.
*
* @return Response
* @throws JSONException
*/
@POST
@Path("batch/ocr")
@Path("batch/extract")
@Produces(MediaType.APPLICATION_JSON)
public Response batchOcr() throws JSONException {
public Response batchExtract() throws JSONException {
if (!authenticate()) {
throw new ForbiddenClientException();
}
checkBaseFunction(BaseFunction.ADMIN);
// Raise a OCR file event
// Raise an extract file content event
AppContext.getInstance().getAsyncEventBus().post(new ExtractFileAsyncEvent());
JSONObject response = new JSONObject();

View File

@ -33,7 +33,7 @@
<div class="control-group">
<label class="control-label" for="inputFiles">New files</label>
<div class="controls">
<file class="input-block-level" id="inputFiles" multiple="multiple" ng-model="newFiles" accept="image/png,image/jpg,image/jpeg,image/gif" />
<file class="input-block-level" id="inputFiles" multiple="multiple" ng-model="newFiles" accept="image/png,image/jpg,image/jpeg,image/gif,application/pdf" />
</div>
</div>
<div class="control-group">

View File

@ -44,7 +44,7 @@ public class TestAppResource extends BaseJerseyTest {
Assert.assertEquals(0, json.getInt("document_count"));
// Extract content from all files again
appResource = resource().path("/app/batch/ocr");
appResource = resource().path("/app/batch/extract");
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
response = appResource.post(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));