mirror of
https://github.com/sismics/docs.git
synced 2024-11-22 05:57:57 +01:00
Closes #373: high quality PDF to image conversion before OCR
This commit is contained in:
parent
a7423caeb1
commit
90a49efa4a
@ -190,6 +190,25 @@
|
||||
<artifactId>postgresql</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- JDK 11 JAXB dependencies -->
|
||||
<dependency>
|
||||
<groupId>javax.xml.bind</groupId>
|
||||
<artifactId>jaxb-api</artifactId>
|
||||
<version>2.3.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.sun.xml.bind</groupId>
|
||||
<artifactId>jaxb-core</artifactId>
|
||||
<version>2.3.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.sun.xml.bind</groupId>
|
||||
<artifactId>jaxb-impl</artifactId>
|
||||
<version>2.3.0</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Test dependencies -->
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
|
@ -6,6 +6,7 @@ import com.sismics.util.mime.MimeType;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.rendering.ImageType;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.slf4j.Logger;
|
||||
@ -60,7 +61,7 @@ public class PdfFormatHandler implements FormatHandler {
|
||||
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
|
||||
log.info("OCR page " + (pageIndex + 1) + "/" + pdfDocument.getNumberOfPages() + " of PDF file containing only images");
|
||||
sb.append(" ");
|
||||
sb.append(FileUtil.ocrFile(language, renderer.renderImage(pageIndex)));
|
||||
sb.append(FileUtil.ocrFile(language, renderer.renderImageWithDPI(pageIndex, 300, ImageType.GRAY)));
|
||||
}
|
||||
return sb.toString();
|
||||
} catch (Exception e) {
|
||||
|
@ -0,0 +1,19 @@
|
||||
package com.sismics.util.format;
|
||||
|
||||
import com.sismics.docs.core.util.format.PdfFormatHandler;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.nio.file.Paths;
|
||||
|
||||
public class TestPdfFormatHandler {
|
||||
@Test
|
||||
public void testIssue373() throws Exception {
|
||||
PdfFormatHandler formatHandler = new PdfFormatHandler();
|
||||
String content = formatHandler.extractContent("deu", Paths.get(ClassLoader.getSystemResource("file/issue373.pdf").toURI()));
|
||||
Assert.assertTrue(content.contains("Aufrechterhaltung"));
|
||||
Assert.assertTrue(content.contains("Außentemperatur"));
|
||||
Assert.assertTrue(content.contains("Grundumsatzmessungen"));
|
||||
Assert.assertTrue(content.contains("ermitteln"));
|
||||
}
|
||||
}
|
BIN
docs-core/src/test/resources/file/issue373.pdf
Normal file
BIN
docs-core/src/test/resources/file/issue373.pdf
Normal file
Binary file not shown.
@ -26,25 +26,6 @@
|
||||
<artifactId>docs-web-common</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- JDK 11 JAXB dependencies -->
|
||||
<dependency>
|
||||
<groupId>javax.xml.bind</groupId>
|
||||
<artifactId>jaxb-api</artifactId>
|
||||
<version>2.3.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.sun.xml.bind</groupId>
|
||||
<artifactId>jaxb-core</artifactId>
|
||||
<version>2.3.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.sun.xml.bind</groupId>
|
||||
<artifactId>jaxb-impl</artifactId>
|
||||
<version>2.3.0</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Dependencies to Jersey -->
|
||||
<dependency>
|
||||
<groupId>org.glassfish.jersey.containers</groupId>
|
||||
|
Loading…
Reference in New Issue
Block a user