mirror of
https://github.com/sismics/docs.git
synced 2024-11-22 22:07:56 +01:00
#186: OCR a PDF if it contains no text
This commit is contained in:
parent
a66a1e6f8e
commit
647ad841df
@ -27,7 +27,6 @@ public class FileCreatedAsyncListener {
|
|||||||
* File created.
|
* File created.
|
||||||
*
|
*
|
||||||
* @param fileCreatedAsyncEvent File created event
|
* @param fileCreatedAsyncEvent File created event
|
||||||
* @throws Exception e
|
|
||||||
*/
|
*/
|
||||||
@Subscribe
|
@Subscribe
|
||||||
public void on(final FileCreatedAsyncEvent fileCreatedAsyncEvent) {
|
public void on(final FileCreatedAsyncEvent fileCreatedAsyncEvent) {
|
||||||
|
@ -58,7 +58,37 @@ public class FileUtil {
|
|||||||
} else if (VideoUtil.isVideo(file.getMimeType())) {
|
} else if (VideoUtil.isVideo(file.getMimeType())) {
|
||||||
content = VideoUtil.getMetadata(unencryptedFile);
|
content = VideoUtil.getMetadata(unencryptedFile);
|
||||||
} else if (unencryptedPdfFile != null) {
|
} else if (unencryptedPdfFile != null) {
|
||||||
content = PdfUtil.extractPdf(unencryptedPdfFile);
|
content = PdfUtil.extractPdf(unencryptedPdfFile, language);
|
||||||
|
}
|
||||||
|
|
||||||
|
return content;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Optical character recognition on an image.
|
||||||
|
*
|
||||||
|
* @param image Buffered image
|
||||||
|
* @param language Language to OCR
|
||||||
|
* @return Content extracted
|
||||||
|
*/
|
||||||
|
public static String ocrFile(BufferedImage image, String language) {
|
||||||
|
// Upscale, grayscale and deskew the image
|
||||||
|
String content = null;
|
||||||
|
BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
|
||||||
|
image.flush();
|
||||||
|
ImageDeskew imageDeskew = new ImageDeskew(resizedImage);
|
||||||
|
BufferedImage deskewedImage = Scalr.rotate(resizedImage, - imageDeskew.getSkewAngle(), Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
|
||||||
|
resizedImage.flush();
|
||||||
|
image = deskewedImage;
|
||||||
|
|
||||||
|
// OCR the file
|
||||||
|
try {
|
||||||
|
Tesseract instance = Tesseract.getInstance();
|
||||||
|
log.info("Starting OCR with TESSDATA_PREFIX=" + System.getenv("TESSDATA_PREFIX") + ";LC_NUMERIC=" + System.getenv("LC_NUMERIC"));
|
||||||
|
instance.setLanguage(language);
|
||||||
|
content = instance.doOCR(image);
|
||||||
|
} catch (Throwable e) {
|
||||||
|
log.error("Error while OCR-izing the image", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
return content;
|
return content;
|
||||||
@ -72,8 +102,6 @@ public class FileUtil {
|
|||||||
* @return Content extracted
|
* @return Content extracted
|
||||||
*/
|
*/
|
||||||
private static String ocrFile(Path unecryptedFile, String language) {
|
private static String ocrFile(Path unecryptedFile, String language) {
|
||||||
Tesseract instance = Tesseract.getInstance();
|
|
||||||
String content = null;
|
|
||||||
BufferedImage image;
|
BufferedImage image;
|
||||||
try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
|
try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
|
||||||
image = ImageIO.read(inputStream);
|
image = ImageIO.read(inputStream);
|
||||||
@ -82,24 +110,7 @@ public class FileUtil {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Upscale, grayscale and deskew the image
|
return ocrFile(image, language);
|
||||||
BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
|
|
||||||
image.flush();
|
|
||||||
ImageDeskew imageDeskew = new ImageDeskew(resizedImage);
|
|
||||||
BufferedImage deskewedImage = Scalr.rotate(resizedImage, - imageDeskew.getSkewAngle(), Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
|
|
||||||
resizedImage.flush();
|
|
||||||
image = deskewedImage;
|
|
||||||
|
|
||||||
// OCR the file
|
|
||||||
try {
|
|
||||||
log.info("Starting OCR with TESSDATA_PREFIX=" + System.getenv("TESSDATA_PREFIX") + ";LC_NUMERIC=" + System.getenv("LC_NUMERIC"));
|
|
||||||
instance.setLanguage(language);
|
|
||||||
content = instance.doOCR(image);
|
|
||||||
} catch (Throwable e) {
|
|
||||||
log.error("Error while OCR-izing the image", e);
|
|
||||||
}
|
|
||||||
|
|
||||||
return content;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -59,24 +59,31 @@ public class PdfUtil {
|
|||||||
* Extract text from a PDF.
|
* Extract text from a PDF.
|
||||||
*
|
*
|
||||||
* @param unencryptedPdfFile Unencrypted PDF file
|
* @param unencryptedPdfFile Unencrypted PDF file
|
||||||
|
* @param language Language
|
||||||
* @return Content extracted
|
* @return Content extracted
|
||||||
*/
|
*/
|
||||||
public static String extractPdf(Path unencryptedPdfFile) {
|
public static String extractPdf(Path unencryptedPdfFile, String language) {
|
||||||
String content = null;
|
String content = null;
|
||||||
PDDocument pdfDocument = null;
|
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
|
||||||
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile)) {
|
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||||
PDFTextStripper stripper = new PDFTextStripper();
|
content = new PDFTextStripper().getText(pdfDocument);
|
||||||
pdfDocument = PDDocument.load(inputStream);
|
|
||||||
content = stripper.getText(pdfDocument);
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("Error while extracting text from the PDF", e);
|
log.error("Error while extracting text from the PDF", e);
|
||||||
} finally {
|
}
|
||||||
if (pdfDocument != null) {
|
|
||||||
try {
|
// No text content, try to OCR it
|
||||||
pdfDocument.close();
|
if (language != null && content != null && content.trim().isEmpty()) {
|
||||||
} catch (IOException e) {
|
StringBuilder sb = new StringBuilder();
|
||||||
// NOP
|
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
|
||||||
|
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||||
|
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
||||||
|
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
|
||||||
|
sb.append(" ");
|
||||||
|
sb.append(FileUtil.ocrFile(renderer.renderImage(pageIndex), language));
|
||||||
}
|
}
|
||||||
|
return sb.toString();
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("Error while OCR-izing the PDF", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,7 +17,7 @@ import java.nio.file.StandardCopyOption;
|
|||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test of the file entity utilities.
|
* Test of the file utilities.
|
||||||
*
|
*
|
||||||
* @author bgamard
|
* @author bgamard
|
||||||
*/
|
*/
|
||||||
@ -42,6 +42,25 @@ public class TestFileUtil {
|
|||||||
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
|
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void extractContentPdf() throws Exception {
|
||||||
|
Path path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI());
|
||||||
|
File file = new File();
|
||||||
|
file.setMimeType(MimeType.APPLICATION_PDF);
|
||||||
|
String content = FileUtil.extractContent(null, file, path, path);
|
||||||
|
Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void extractContentScannedPdf() throws Exception {
|
||||||
|
Path path = Paths.get(ClassLoader.getSystemResource("file/scanned.pdf").toURI());
|
||||||
|
File file = new File();
|
||||||
|
file.setMimeType(MimeType.APPLICATION_PDF);
|
||||||
|
String content = FileUtil.extractContent("eng", file, path, path);
|
||||||
|
System.out.println(content);
|
||||||
|
Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void convertToPdfTest() throws Exception {
|
public void convertToPdfTest() throws Exception {
|
||||||
try (InputStream inputStream0 = Resources.getResource("file/apollo_landscape.jpg").openStream();
|
try (InputStream inputStream0 = Resources.getResource("file/apollo_landscape.jpg").openStream();
|
||||||
@ -52,7 +71,7 @@ public class TestFileUtil {
|
|||||||
// Document
|
// Document
|
||||||
DocumentDto documentDto = new DocumentDto();
|
DocumentDto documentDto = new DocumentDto();
|
||||||
documentDto.setTitle("My super document 1");
|
documentDto.setTitle("My super document 1");
|
||||||
documentDto.setDescription("Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis id turpis iaculis, commodo est ac, efficitur quam. Nam accumsan magna in orci vulputate ultricies. Sed vulputate neque magna, at laoreet leo ultricies vel. Proin eu hendrerit felis. Quisque sit amet arcu efficitur, pulvinar orci sed, imperdiet elit. Nunc posuere ex sed fermentum congue. Aliquam ultrices convallis finibus. Praesent iaculis justo vitae dictum auctor. Praesent suscipit imperdiet erat ac maximus. Aenean pharetra quam sed fermentum commodo. Donec sagittis ipsum nibh, id congue dolor venenatis quis. In tincidunt nisl non ex sollicitudin, a imperdiet neque scelerisque. Nullam lacinia ac orci sed faucibus. Donec tincidunt venenatis justo, nec fermentum justo rutrum a.");
|
documentDto.setDescription("Lorem ipsum dolor sit amet, consectetur adipiscing elit.\r\n Duis id turpis iaculis, commodo est ac, efficitur quam.\t Nam accumsan magna in orci vulputate ultricies. Sed vulputate neque magna, at laoreet leo ultricies vel. Proin eu hendrerit felis. Quisque sit amet arcu efficitur, pulvinar orci sed, imperdiet elit. Nunc posuere ex sed fermentum congue. Aliquam ultrices convallis finibus. Praesent iaculis justo vitae dictum auctor. Praesent suscipit imperdiet erat ac maximus. Aenean pharetra quam sed fermentum commodo. Donec sagittis ipsum nibh, id congue dolor venenatis quis. In tincidunt nisl non ex sollicitudin, a imperdiet neque scelerisque. Nullam lacinia ac orci sed faucibus. Donec tincidunt venenatis justo, nec fermentum justo rutrum a.");
|
||||||
documentDto.setSubject("A set of random picture");
|
documentDto.setSubject("A set of random picture");
|
||||||
documentDto.setIdentifier("ID-2016-08-00001");
|
documentDto.setIdentifier("ID-2016-08-00001");
|
||||||
documentDto.setPublisher("My Publisher, Inc.");
|
documentDto.setPublisher("My Publisher, Inc.");
|
||||||
|
BIN
docs-core/src/test/resources/file/scanned.pdf
Normal file
BIN
docs-core/src/test/resources/file/scanned.pdf
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user