mirror of
https://github.com/sismics/docs.git
synced 2024-11-25 15:17:57 +01:00
Closes #373: high quality PDF to image conversion before OCR
This commit is contained in:
parent
a7423caeb1
commit
90a49efa4a
@ -190,6 +190,25 @@
|
|||||||
<artifactId>postgresql</artifactId>
|
<artifactId>postgresql</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<!-- JDK 11 JAXB dependencies -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>javax.xml.bind</groupId>
|
||||||
|
<artifactId>jaxb-api</artifactId>
|
||||||
|
<version>2.3.0</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.sun.xml.bind</groupId>
|
||||||
|
<artifactId>jaxb-core</artifactId>
|
||||||
|
<version>2.3.0</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.sun.xml.bind</groupId>
|
||||||
|
<artifactId>jaxb-impl</artifactId>
|
||||||
|
<version>2.3.0</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- Test dependencies -->
|
<!-- Test dependencies -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>junit</groupId>
|
<groupId>junit</groupId>
|
||||||
|
@ -6,6 +6,7 @@ import com.sismics.util.mime.MimeType;
|
|||||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.rendering.ImageType;
|
||||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||||
import org.apache.pdfbox.text.PDFTextStripper;
|
import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@ -60,7 +61,7 @@ public class PdfFormatHandler implements FormatHandler {
|
|||||||
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
|
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
|
||||||
log.info("OCR page " + (pageIndex + 1) + "/" + pdfDocument.getNumberOfPages() + " of PDF file containing only images");
|
log.info("OCR page " + (pageIndex + 1) + "/" + pdfDocument.getNumberOfPages() + " of PDF file containing only images");
|
||||||
sb.append(" ");
|
sb.append(" ");
|
||||||
sb.append(FileUtil.ocrFile(language, renderer.renderImage(pageIndex)));
|
sb.append(FileUtil.ocrFile(language, renderer.renderImageWithDPI(pageIndex, 300, ImageType.GRAY)));
|
||||||
}
|
}
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
@ -0,0 +1,19 @@
|
|||||||
|
package com.sismics.util.format;
|
||||||
|
|
||||||
|
import com.sismics.docs.core.util.format.PdfFormatHandler;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
|
||||||
|
public class TestPdfFormatHandler {
|
||||||
|
@Test
|
||||||
|
public void testIssue373() throws Exception {
|
||||||
|
PdfFormatHandler formatHandler = new PdfFormatHandler();
|
||||||
|
String content = formatHandler.extractContent("deu", Paths.get(ClassLoader.getSystemResource("file/issue373.pdf").toURI()));
|
||||||
|
Assert.assertTrue(content.contains("Aufrechterhaltung"));
|
||||||
|
Assert.assertTrue(content.contains("Außentemperatur"));
|
||||||
|
Assert.assertTrue(content.contains("Grundumsatzmessungen"));
|
||||||
|
Assert.assertTrue(content.contains("ermitteln"));
|
||||||
|
}
|
||||||
|
}
|
BIN
docs-core/src/test/resources/file/issue373.pdf
Normal file
BIN
docs-core/src/test/resources/file/issue373.pdf
Normal file
Binary file not shown.
@ -26,25 +26,6 @@
|
|||||||
<artifactId>docs-web-common</artifactId>
|
<artifactId>docs-web-common</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- JDK 11 JAXB dependencies -->
|
|
||||||
<dependency>
|
|
||||||
<groupId>javax.xml.bind</groupId>
|
|
||||||
<artifactId>jaxb-api</artifactId>
|
|
||||||
<version>2.3.0</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.sun.xml.bind</groupId>
|
|
||||||
<artifactId>jaxb-core</artifactId>
|
|
||||||
<version>2.3.0</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.sun.xml.bind</groupId>
|
|
||||||
<artifactId>jaxb-impl</artifactId>
|
|
||||||
<version>2.3.0</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<!-- Dependencies to Jersey -->
|
<!-- Dependencies to Jersey -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.glassfish.jersey.containers</groupId>
|
<groupId>org.glassfish.jersey.containers</groupId>
|
||||||
|
Loading…
Reference in New Issue
Block a user