diff --git a/docs-core/pom.xml b/docs-core/pom.xml index 77ea42ed..d80e2d8d 100644 --- a/docs-core/pom.xml +++ b/docs-core/pom.xml @@ -189,7 +189,26 @@ org.postgresql postgresql - + + + + javax.xml.bind + jaxb-api + 2.3.0 + + + + com.sun.xml.bind + jaxb-core + 2.3.0 + + + + com.sun.xml.bind + jaxb-impl + 2.3.0 + + junit diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java index 08c698a1..670358b9 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java @@ -6,6 +6,7 @@ import com.sismics.util.mime.MimeType; import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.multipdf.PDFMergerUtility; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; import org.slf4j.Logger; @@ -60,7 +61,7 @@ public class PdfFormatHandler implements FormatHandler { for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) { log.info("OCR page " + (pageIndex + 1) + "/" + pdfDocument.getNumberOfPages() + " of PDF file containing only images"); sb.append(" "); - sb.append(FileUtil.ocrFile(language, renderer.renderImage(pageIndex))); + sb.append(FileUtil.ocrFile(language, renderer.renderImageWithDPI(pageIndex, 300, ImageType.GRAY))); } return sb.toString(); } catch (Exception e) { diff --git a/docs-core/src/test/java/com/sismics/util/format/TestPdfFormatHandler.java b/docs-core/src/test/java/com/sismics/util/format/TestPdfFormatHandler.java new file mode 100644 index 00000000..7b664df7 --- /dev/null +++ b/docs-core/src/test/java/com/sismics/util/format/TestPdfFormatHandler.java @@ -0,0 +1,19 @@ +package com.sismics.util.format; + +import com.sismics.docs.core.util.format.PdfFormatHandler; +import org.junit.Assert; +import org.junit.Test; + +import java.nio.file.Paths; + +public class TestPdfFormatHandler { + @Test + public void testIssue373() throws Exception { + PdfFormatHandler formatHandler = new PdfFormatHandler(); + String content = formatHandler.extractContent("deu", Paths.get(ClassLoader.getSystemResource("file/issue373.pdf").toURI())); + Assert.assertTrue(content.contains("Aufrechterhaltung")); + Assert.assertTrue(content.contains("Außentemperatur")); + Assert.assertTrue(content.contains("Grundumsatzmessungen")); + Assert.assertTrue(content.contains("ermitteln")); + } +} diff --git a/docs-core/src/test/resources/file/issue373.pdf b/docs-core/src/test/resources/file/issue373.pdf new file mode 100644 index 00000000..180fc9b7 Binary files /dev/null and b/docs-core/src/test/resources/file/issue373.pdf differ diff --git a/docs-web/pom.xml b/docs-web/pom.xml index 9fd91074..e3abadea 100644 --- a/docs-web/pom.xml +++ b/docs-web/pom.xml @@ -26,25 +26,6 @@ docs-web-common - - - javax.xml.bind - jaxb-api - 2.3.0 - - - - com.sun.xml.bind - jaxb-core - 2.3.0 - - - - com.sun.xml.bind - jaxb-impl - 2.3.0 - - org.glassfish.jersey.containers