Closes #373: high quality PDF to image conversion before OCR

This commit is contained in:
bgamard 2020-02-13 17:43:07 +01:00
parent a7423caeb1
commit 90a49efa4a
5 changed files with 41 additions and 21 deletions

View File

@ -190,6 +190,25 @@
<artifactId>postgresql</artifactId> <artifactId>postgresql</artifactId>
</dependency> </dependency>
<!-- JDK 11 JAXB dependencies -->
<dependency>
<groupId>javax.xml.bind</groupId>
<artifactId>jaxb-api</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>com.sun.xml.bind</groupId>
<artifactId>jaxb-core</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>com.sun.xml.bind</groupId>
<artifactId>jaxb-impl</artifactId>
<version>2.3.0</version>
</dependency>
<!-- Test dependencies --> <!-- Test dependencies -->
<dependency> <dependency>
<groupId>junit</groupId> <groupId>junit</groupId>

View File

@ -6,6 +6,7 @@ import com.sismics.util.mime.MimeType;
import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility; import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -60,7 +61,7 @@ public class PdfFormatHandler implements FormatHandler {
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) { for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
log.info("OCR page " + (pageIndex + 1) + "/" + pdfDocument.getNumberOfPages() + " of PDF file containing only images"); log.info("OCR page " + (pageIndex + 1) + "/" + pdfDocument.getNumberOfPages() + " of PDF file containing only images");
sb.append(" "); sb.append(" ");
sb.append(FileUtil.ocrFile(language, renderer.renderImage(pageIndex))); sb.append(FileUtil.ocrFile(language, renderer.renderImageWithDPI(pageIndex, 300, ImageType.GRAY)));
} }
return sb.toString(); return sb.toString();
} catch (Exception e) { } catch (Exception e) {

View File

@ -0,0 +1,19 @@
package com.sismics.util.format;
import com.sismics.docs.core.util.format.PdfFormatHandler;
import org.junit.Assert;
import org.junit.Test;
import java.nio.file.Paths;
/**
 * Exercises {@link PdfFormatHandler#extractContent} against the sample PDF
 * stored on the classpath for issue #373.
 */
public class TestPdfFormatHandler {
    /**
     * Extracts the content of {@code file/issue373.pdf}, passing {@code "deu"} as the
     * first argument, and checks that every expected word appears in the returned text.
     *
     * @throws Exception if locating the classpath resource or extracting the content fails
     */
    @Test
    public void testIssue373() throws Exception {
        // Words that must be found in the extracted text, checked in this order.
        final String[] expectedWords = {
            "Aufrechterhaltung",
            "Außentemperatur",
            "Grundumsatzmessungen",
            "ermitteln"
        };

        // Resolve the sample document from the system class loader.
        java.nio.file.Path samplePdf =
                Paths.get(ClassLoader.getSystemResource("file/issue373.pdf").toURI());

        String extractedText = new PdfFormatHandler().extractContent("deu", samplePdf);

        for (String word : expectedWords) {
            Assert.assertTrue(extractedText.contains(word));
        }
    }
}

Binary file not shown.

View File

@ -26,25 +26,6 @@
<artifactId>docs-web-common</artifactId> <artifactId>docs-web-common</artifactId>
</dependency> </dependency>
<!-- JDK 11 JAXB dependencies -->
<dependency>
<groupId>javax.xml.bind</groupId>
<artifactId>jaxb-api</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>com.sun.xml.bind</groupId>
<artifactId>jaxb-core</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>com.sun.xml.bind</groupId>
<artifactId>jaxb-impl</artifactId>
<version>2.3.0</version>
</dependency>
<!-- Dependencies to Jersey --> <!-- Dependencies to Jersey -->
<dependency> <dependency>
<groupId>org.glassfish.jersey.containers</groupId> <groupId>org.glassfish.jersey.containers</groupId>