diff --git a/.travis.yml b/.travis.yml index fa9309df..30fe4095 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,8 +17,6 @@ after_success: - docker push $REPO env: global: - - TESSDATA_PREFIX=/usr/share/tesseract-ocr - - LC_NUMERIC=C - secure: LRGpjWORb0qy6VuypZjTAfA8uRHlFUMTwb77cenS9PPRBxuSnctC531asS9Xg3DqC5nsRxBBprgfCKotn5S8nBSD1ceHh84NASyzLSBft3xSMbg7f/2i7MQ+pGVwLncusBU6E/drnMFwZBleo+9M8Tf96axY5zuUp90MUTpSgt0= - secure: bCDDR6+I7PmSkuTYZv1HF/z98ANX/SFEESUCqxVmV5Gs0zFC0vQXaPJQ2xaJNRop1HZBFMZLeMMPleb0iOs985smpvK2F6Rbop9Tu+Vyo0uKqv9tbZ7F8Nfgnv9suHKZlL84FNeUQZJX6vsFIYPEJ/r7K5P/M0PdUy++fEwxEhU= - secure: ewXnzbkgCIHpDWtaWGMa1OYZJ/ki99zcIl4jcDPIC0eB3njX/WgfcC6i0Ke9mLqDqwXarWJ6helm22sNh+xtQiz6isfBtBX+novfRt9AANrBe3koCMUemMDy7oh5VflBaFNP0DVb8LSCnwf6dx6ZB5E9EB8knvk40quc/cXpGjY= diff --git a/Dockerfile b/Dockerfile index 7c8a6453..0690a346 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,9 +4,6 @@ MAINTAINER b.gamard@sismics.com RUN apt-get update && apt-get -y -q install ffmpeg mediainfo tesseract-ocr tesseract-ocr-fra tesseract-ocr-ita tesseract-ocr-kor tesseract-ocr-rus tesseract-ocr-ukr tesseract-ocr-spa tesseract-ocr-ara tesseract-ocr-hin tesseract-ocr-deu tesseract-ocr-pol tesseract-ocr-jpn tesseract-ocr-por tesseract-ocr-tha tesseract-ocr-jpn tesseract-ocr-chi-sim tesseract-ocr-chi-tra && \ apt-get clean && rm -rf /var/lib/apt/lists/* -ENV TESSDATA_PREFIX /usr/share/tesseract-ocr/4.00/ -ENV LC_NUMERIC C - # Remove the embedded javax.mail jar from Jetty RUN rm -f /opt/jetty/lib/jndi/javax.mail.glassfish-*.jar diff --git a/docs-core/pom.xml b/docs-core/pom.xml index a168bfc7..280e8d1d 100644 --- a/docs-core/pom.xml +++ b/docs-core/pom.xml @@ -138,35 +138,25 @@ bcprov-jdk15on - - fr.opensagres.xdocreport - org.odftoolkit.odfdom.converter.pdf - - - - fr.opensagres.xdocreport - org.apache.poi.xwpf.converter.pdf - + + fr.opensagres.xdocreport + org.odftoolkit.odfdom.converter.pdf + - net.java.dev.jna - jna + fr.opensagres.xdocreport + org.apache.poi.xwpf.converter.pdf - - 
com.levigo.jbig2 - levigo-jbig2-imageio - - com.twelvemonkeys.imageio imageio-jpeg - + - com.github.jai-imageio - jai-imageio-core + com.twelvemonkeys.imageio + imageio-tiff @@ -174,10 +164,15 @@ jai-imageio-jpeg2000 + + com.levigo.jbig2 + levigo-jbig2-imageio + + + org.postgresql postgresql - 42.2.2.jre7 diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/ActionUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/ActionUtil.java index 9df75c19..2c225e74 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/ActionUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/ActionUtil.java @@ -5,6 +5,7 @@ import com.sismics.docs.core.dao.jpa.dto.DocumentDto; import com.sismics.docs.core.util.action.Action; import com.sismics.docs.core.util.action.AddTagAction; import com.sismics.docs.core.util.action.RemoveTagAction; +import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.json.JsonObject; @@ -18,7 +19,7 @@ public class ActionUtil { /** * Logger. */ - private static final org.slf4j.Logger log = LoggerFactory.getLogger(LuceneUtil.class); + private static final Logger log = LoggerFactory.getLogger(ActionUtil.class); /** * Find the action associated to an action type. 
diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index ddf4ec81..fbcbf2e6 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -1,6 +1,9 @@ package com.sismics.docs.core.util; +import com.google.common.base.Charsets; import com.google.common.base.Strings; +import com.google.common.collect.Lists; +import com.google.common.io.CharStreams; import com.sismics.docs.core.constant.Constants; import com.sismics.docs.core.dao.jpa.FileDao; import com.sismics.docs.core.dao.jpa.UserDao; @@ -8,25 +11,23 @@ import com.sismics.docs.core.event.DocumentUpdatedAsyncEvent; import com.sismics.docs.core.event.FileCreatedAsyncEvent; import com.sismics.docs.core.model.jpa.File; import com.sismics.docs.core.model.jpa.User; -import com.sismics.tess4j.Tesseract; import com.sismics.util.ImageDeskew; import com.sismics.util.Scalr; import com.sismics.util.context.ThreadLocalContext; +import com.sismics.util.io.InputStreamReaderThread; import com.sismics.util.mime.MimeTypeUtil; import org.apache.commons.lang.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import javax.crypto.Cipher; import javax.crypto.CipherInputStream; +import javax.imageio.ImageIO; import java.awt.image.BufferedImage; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; +import java.util.*; /** * File entity utilities. @@ -34,11 +35,6 @@ import java.util.Set; * @author bgamard */ public class FileUtil { - /** - * Logger. - */ - private static final Logger log = LoggerFactory.getLogger(FileUtil.class); - /** * File ID of files currently being processed. 
*/
@@ -50,28 +46,34 @@ public class FileUtil {
      * @param language Language to OCR
      * @param image Buffered image
      * @return Content extracted
+     * @throws Exception If the image cannot be written as TIFF, or if the tesseract process cannot be started or read
      */
-    public static String ocrFile(String language, BufferedImage image) {
+    public static String ocrFile(String language, BufferedImage image) throws Exception {
         // Upscale, grayscale and deskew the image
-        String content = null;
         BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
         image.flush();
         ImageDeskew imageDeskew = new ImageDeskew(resizedImage);
         BufferedImage deskewedImage = Scalr.rotate(resizedImage, - imageDeskew.getSkewAngle(), Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
         resizedImage.flush();
-        image = deskewedImage;
+        Path tmpFile = ThreadLocalContext.get().createTemporaryFile();
+        // ImageIO.write signals a missing writer plugin by returning false instead of throwing:
+        // fail here rather than running tesseract on an empty file and returning empty text
+        if (!ImageIO.write(deskewedImage, "tiff", tmpFile.toFile())) {
+            throw new IOException("No ImageIO writer found for format: tiff");
+        }
 
-        // OCR the file
-        try {
-            Tesseract instance = Tesseract.getInstance();
-            log.info("Starting OCR with TESSDATA_PREFIX=" + System.getenv("TESSDATA_PREFIX") + ";LC_NUMERIC=" + System.getenv("LC_NUMERIC"));
-            instance.setLanguage(language);
-            content = instance.doOCR(image);
-        } catch (Throwable e) {
-            log.error("Error while OCR-izing the image", e);
+        // Run the tesseract command line on the temporary image, recognized text is printed on stdout
+        List<String> result = Lists.newLinkedList(Arrays.asList("tesseract", tmpFile.toAbsolutePath().toString(), "stdout", "-l", language));
+        ProcessBuilder pb = new ProcessBuilder(result);
+        Process process = pb.start();
+
+        // Consume the process error stream
+        final String commandName = pb.command().get(0);
+        new InputStreamReaderThread(process.getErrorStream(), commandName).start();
+
+        // Consume the data as text
+        try (InputStream is = process.getInputStream()) {
+            return CharStreams.toString(new InputStreamReader(is, Charsets.UTF_8));
         }
-
-        return content;
     }
 
     /**
diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/format/ImageFormatHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/ImageFormatHandler.java
index e4016e1f..80592761 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/format/ImageFormatHandler.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/ImageFormatHandler.java @@ -17,7 +17,6 @@ import org.slf4j.LoggerFactory; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; -import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; @@ -45,19 +44,16 @@ public class ImageFormatHandler implements FormatHandler { } @Override - public BufferedImage generateThumbnail(Path file) throws IOException { + public BufferedImage generateThumbnail(Path file) throws Exception { try (InputStream inputStream = Files.newInputStream(file)) { return ImageIO.read(inputStream); } } @Override - public String extractContent(String language, Path file) { + public String extractContent(String language, Path file) throws Exception { try (InputStream inputStream = Files.newInputStream(file)) { return FileUtil.ocrFile(language, ImageIO.read(inputStream)); - } catch (IOException e) { - log.error("Error reading the image", e); - return null; } } diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java index f47f8522..08c698a1 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/PdfFormatHandler.java @@ -58,6 +58,7 @@ public class PdfFormatHandler implements FormatHandler { PDDocument pdfDocument = PDDocument.load(inputStream)) { PDFRenderer renderer = new PDFRenderer(pdfDocument); for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) { + log.info("OCR page " + (pageIndex + 1) + "/" + pdfDocument.getNumberOfPages() + " of PDF file containing only images"); sb.append(" "); sb.append(FileUtil.ocrFile(language, renderer.renderImage(pageIndex))); } diff 
--git a/docs-core/src/main/java/com/sismics/tess4j/ImageHelper.java b/docs-core/src/main/java/com/sismics/tess4j/ImageHelper.java deleted file mode 100644 index d3322213..00000000 --- a/docs-core/src/main/java/com/sismics/tess4j/ImageHelper.java +++ /dev/null @@ -1,173 +0,0 @@ -/** - * Copyright @ 2008 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package com.sismics.tess4j; - -import java.awt.Graphics2D; -import java.awt.Image; -import java.awt.RenderingHints; -import java.awt.Toolkit; -import java.awt.Transparency; -import java.awt.datatransfer.Clipboard; -import java.awt.datatransfer.DataFlavor; -import java.awt.image.*; - -public class ImageHelper { - - /** - * Convenience method that returns a scaled instance of the provided - * {@code BufferedImage}. - * - * @param image the original image to be scaled - * @param targetWidth the desired width of the scaled instance, in pixels - * @param targetHeight the desired height of the scaled instance, in pixels - * @return a scaled version of the original {@code BufferedImage} - */ - public static BufferedImage getScaledInstance(BufferedImage image, int targetWidth, int targetHeight) { - int type = (image.getTransparency() == Transparency.OPAQUE) - ? 
BufferedImage.TYPE_INT_RGB : BufferedImage.TYPE_INT_ARGB; - BufferedImage tmp = new BufferedImage(targetWidth, targetHeight, type); - Graphics2D g2 = tmp.createGraphics(); - g2.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC); - g2.drawImage(image, 0, 0, targetWidth, targetHeight, null); - g2.dispose(); - return tmp; - } - - /** - * A replacement for the standard - * BufferedImage.getSubimage method. - * - * @param image - * @param x the X coordinate of the upper-left corner of the specified - * rectangular region - * @param y the Y coordinate of the upper-left corner of the specified - * rectangular region - * @param width the width of the specified rectangular region - * @param height the height of the specified rectangular region - * @return a BufferedImage that is the subimage of image. - */ - public static BufferedImage getSubImage(BufferedImage image, int x, int y, int width, int height) { - int type = (image.getTransparency() == Transparency.OPAQUE) - ? BufferedImage.TYPE_INT_RGB : BufferedImage.TYPE_INT_ARGB; - BufferedImage tmp = new BufferedImage(width, height, type); - Graphics2D g2 = tmp.createGraphics(); - g2.drawImage(image.getSubimage(x, y, width, height), 0, 0, null); - g2.dispose(); - return tmp; - } - - /** - * A simple method to convert an image to binary or B/W image. - * - * @param image input image - * @return a monochrome image - */ - public static BufferedImage convertImageToBinary(BufferedImage image) { - BufferedImage tmp = new BufferedImage(image.getWidth(), image.getHeight(), BufferedImage.TYPE_BYTE_BINARY); - Graphics2D g2 = tmp.createGraphics(); - g2.drawImage(image, 0, 0, null); - g2.dispose(); - return tmp; - } - - /** - * A simple method to convert an image to binary or B/W image. 
- * - * @param image input image - * @return a monochrome image - * @deprecated As of release 1.1, renamed to {@link #convertImageToBinary(BufferedImage image)} - */ - @Deprecated - public static BufferedImage convertImage2Binary(BufferedImage image) { - return convertImageToBinary(image); - } - - /** - * A simple method to convert an image to gray scale. - * - * @param image input image - * @return a monochrome image - */ - public static BufferedImage convertImageToGrayscale(BufferedImage image) { - BufferedImage tmp = new BufferedImage(image.getWidth(), image.getHeight(), BufferedImage.TYPE_BYTE_GRAY); - Graphics2D g2 = tmp.createGraphics(); - g2.drawImage(image, 0, 0, null); - g2.dispose(); - return tmp; - } - - private static final short[] invertTable; - - static { - invertTable = new short[256]; - for (int i = 0; i < 256; i++) { - invertTable[i] = (short) (255 - i); - } - } - - /** - * Inverts image color. - * - * @param image input image - * @return an inverted-color image - */ - public static BufferedImage invertImageColor(BufferedImage image) { - BufferedImage tmp = new BufferedImage(image.getWidth(), image.getHeight(), image.getType()); - BufferedImageOp invertOp = new LookupOp(new ShortLookupTable(0, invertTable), null); - return invertOp.filter(image, tmp); - } - - /** - * Rotates an image. 
- * - * @param image the original image - * @param angle the degree of rotation - * @return a rotated image - */ - public static BufferedImage rotateImage(BufferedImage image, double angle) { - double theta = Math.toRadians(angle); - double sin = Math.abs(Math.sin(theta)); - double cos = Math.abs(Math.cos(theta)); - int w = image.getWidth(); - int h = image.getHeight(); - int newW = (int) Math.floor(w * cos + h * sin); - int newH = (int) Math.floor(h * cos + w * sin); - - BufferedImage tmp = new BufferedImage(newW, newH, image.getType()); - Graphics2D g2d = tmp.createGraphics(); - g2d.setRenderingHint(RenderingHints.KEY_INTERPOLATION, - RenderingHints.VALUE_INTERPOLATION_BICUBIC); - g2d.translate((newW - w) / 2, (newH - h) / 2); - g2d.rotate(theta, w / 2, h / 2); - g2d.drawImage(image, 0, 0, null); - g2d.dispose(); - return tmp; - } - - /** - * Gets an image from Clipboard. - * - * @return image - */ - public static Image getClipboardImage() { - Clipboard clipboard = Toolkit.getDefaultToolkit().getSystemClipboard(); - try { - return (Image) clipboard.getData(DataFlavor.imageFlavor); - } catch (Exception e) { - return null; - } - } -} diff --git a/docs-core/src/main/java/com/sismics/tess4j/ImageIOHelper.java b/docs-core/src/main/java/com/sismics/tess4j/ImageIOHelper.java deleted file mode 100644 index 7c56a5ec..00000000 --- a/docs-core/src/main/java/com/sismics/tess4j/ImageIOHelper.java +++ /dev/null @@ -1,143 +0,0 @@ -/** - * Copyright @ 2008 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package com.sismics.tess4j; - -import java.awt.Toolkit; -import java.awt.image.BufferedImage; -import java.awt.image.DataBufferByte; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; - -import javax.imageio.IIOImage; -import javax.imageio.ImageIO; -import javax.imageio.ImageReadParam; -import javax.imageio.ImageReader; -import javax.imageio.ImageWriteParam; -import javax.imageio.ImageWriter; -import javax.imageio.metadata.IIOMetadata; -import javax.imageio.metadata.IIOMetadataNode; -import javax.imageio.stream.ImageOutputStream; - -import org.w3c.dom.NodeList; - -import com.github.jaiimageio.impl.plugins.tiff.TIFFImageReaderSpi; -import com.github.jaiimageio.impl.plugins.tiff.TIFFImageWriterSpi; -import com.github.jaiimageio.plugins.tiff.TIFFImageWriteParam; - -public class ImageIOHelper { - - final static String TIFF_FORMAT = "tiff"; - - - /** - * Gets pixel data of an - * IIOImage object. 
- * - * @param oimage an - * IIOImage object - * @return a byte buffer of pixel data - * @throws Exception - */ - public static ByteBuffer getImageByteBuffer(BufferedImage oimage) throws IOException { - // Get tif writer and set output to file - ImageWriter writer = new TIFFImageWriterSpi().createWriterInstance(); - - // Set up the writeParam - // We are using the old JAI ImageIO plugin, because for some reason, OCR don't work with TwelveMonkeys' plugin - ImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.US); - tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED); - - // Get the stream metadata - IIOMetadata streamMetadata = writer.getDefaultStreamMetadata(tiffWriteParam); - ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - ImageOutputStream ios = ImageIO.createImageOutputStream(outputStream); - writer.setOutput(ios); - writer.write(streamMetadata, new IIOImage(oimage, null, null), tiffWriteParam); - writer.dispose(); - - // Read the writed image - ios.seek(0); - ImageReader reader = new TIFFImageReaderSpi().createReaderInstance(); - ImageReadParam param = reader.getDefaultReadParam(); - reader.setInput(ios, true, true); - BufferedImage bi; - try { - bi = reader.read(0, param); - } finally { - reader.dispose(); - ios.close(); - } - - return convertImageData(bi); - } - - /** - * Converts BufferedImage to ByteBuffer. - * - * @param bi Input image - * @return pixel data - */ - public static ByteBuffer convertImageData(BufferedImage bi) { - byte[] pixelData = ((DataBufferByte) bi.getRaster().getDataBuffer()).getData(); - // return ByteBuffer.wrap(pixelData); - ByteBuffer buf = ByteBuffer.allocateDirect(pixelData.length); - buf.order(ByteOrder.nativeOrder()); - buf.put(pixelData); - buf.flip(); - return buf; - } - - /** - * Reads image meta data. 
- * - * @param oimage - * @return a map of meta data - */ - public static Map readImageData(IIOImage oimage) { - Map dict = new HashMap(); - - IIOMetadata imageMetadata = oimage.getMetadata(); - if (imageMetadata != null) { - IIOMetadataNode dimNode = (IIOMetadataNode) imageMetadata.getAsTree("javax_imageio_1.0"); - NodeList nodes = dimNode.getElementsByTagName("HorizontalPixelSize"); - int dpiX; - if (nodes.getLength() > 0) { - float dpcWidth = Float.parseFloat(nodes.item(0).getAttributes().item(0).getNodeValue()); - dpiX = (int) Math.round(25.4f / dpcWidth); - } else { - dpiX = Toolkit.getDefaultToolkit().getScreenResolution(); - } - dict.put("dpiX", String.valueOf(dpiX)); - - nodes = dimNode.getElementsByTagName("VerticalPixelSize"); - int dpiY; - if (nodes.getLength() > 0) { - float dpcHeight = Float.parseFloat(nodes.item(0).getAttributes().item(0).getNodeValue()); - dpiY = (int) Math.round(25.4f / dpcHeight); - } else { - dpiY = Toolkit.getDefaultToolkit().getScreenResolution(); - } - dict.put("dpiY", String.valueOf(dpiY)); - } - - return dict; - } -} diff --git a/docs-core/src/main/java/com/sismics/tess4j/TessAPI.java b/docs-core/src/main/java/com/sismics/tess4j/TessAPI.java deleted file mode 100644 index cb60acde..00000000 --- a/docs-core/src/main/java/com/sismics/tess4j/TessAPI.java +++ /dev/null @@ -1,686 +0,0 @@ -/** - * Copyright @ 2012 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. 
- */ -package com.sismics.tess4j; - -import com.sun.jna.*; -import com.sun.jna.ptr.*; -import java.nio.*; - -/** - * A Java wrapper for - * Tesseract OCR 3.02 API using - * JNA Interface Mapping. - */ -public interface TessAPI extends Library { - - static final boolean WINDOWS = System.getProperty("os.name").toLowerCase().startsWith("windows"); - /** - * Native library name. - */ - public static final String LIB_NAME = "libtesseract302"; - public static final String LIB_NAME_NON_WIN = "tesseract"; - /** - * An instance of the class library. - */ - public static final TessAPI INSTANCE = (TessAPI) Native.loadLibrary(WINDOWS ? LIB_NAME : LIB_NAME_NON_WIN, TessAPI.class); - - /** - * When Tesseract/Cube is initialized we can choose to instantiate/load/run - * only the Tesseract part, only the Cube part or both along with the - * combiner. The preference of which engine to use is stored in - * tessedit_ocr_engine_mode.

ATTENTION: When - * modifying this enum, please make sure to make the appropriate changes to - * all the enums mirroring it (e.g. OCREngine in - * cityblock/workflow/detection/detection_storage.proto). Such enums will - * mention the connection to OcrEngineMode in the comments. - */ - public static interface TessOcrEngineMode { - - public static final int OEM_TESSERACT_ONLY = (int) 0; - public static final int OEM_CUBE_ONLY = (int) 1; - public static final int OEM_TESSERACT_CUBE_COMBINED = (int) 2; - public static final int OEM_DEFAULT = (int) 3; - }; - - /** - * Possible modes for page layout analysis. These *must* be kept in order of - * decreasing amount of layout analysis to be done, except for - * OSD_ONLY, so that the inequality test macros below work. - */ - public static interface TessPageSegMode { - - public static final int PSM_OSD_ONLY = (int) 0; - public static final int PSM_AUTO_OSD = (int) 1; - public static final int PSM_AUTO_ONLY = (int) 2; - public static final int PSM_AUTO = (int) 3; - public static final int PSM_SINGLE_COLUMN = (int) 4; - public static final int PSM_SINGLE_BLOCK_VERT_TEXT = (int) 5; - public static final int PSM_SINGLE_BLOCK = (int) 6; - public static final int PSM_SINGLE_LINE = (int) 7; - public static final int PSM_SINGLE_WORD = (int) 8; - public static final int PSM_CIRCLE_WORD = (int) 9; - public static final int PSM_SINGLE_CHAR = (int) 10; - public static final int PSM_COUNT = (int) 11; - }; - - /** - * Enum of the elements of the page hierarchy, used in - * ResultIterator to provide functions that operate on each - * level without having to have 5x as many functions. 
- */ - public static interface TessPageIteratorLevel { - - public static final int RIL_BLOCK = (int) 0; - public static final int RIL_PARA = (int) 1; - public static final int RIL_TEXTLINE = (int) 2; - public static final int RIL_WORD = (int) 3; - public static final int RIL_SYMBOL = (int) 4; - }; - - public static interface TessPolyBlockType { - - public static final int PT_UNKNOWN = (int) 0; - public static final int PT_FLOWING_TEXT = (int) 1; - public static final int PT_HEADING_TEXT = (int) 2; - public static final int PT_PULLOUT_TEXT = (int) 3; - public static final int PT_TABLE = (int) 4; - public static final int PT_VERTICAL_TEXT = (int) 5; - public static final int PT_CAPTION_TEXT = (int) 6; - public static final int PT_FLOWING_IMAGE = (int) 7; - public static final int PT_HEADING_IMAGE = (int) 8; - public static final int PT_PULLOUT_IMAGE = (int) 9; - public static final int PT_HORZ_LINE = (int) 10; - public static final int PT_VERT_LINE = (int) 11; - public static final int PT_NOISE = (int) 12; - public static final int PT_COUNT = (int) 13; - }; - - /** - *
-     *  +------------------+
-     *  | 1 Aaaa Aaaa Aaaa |
-     *  | Aaa aa aaa aa    |
-     *  | aaaaaa A aa aaa. |
-     *  |                2 |
-     *  |   #######  c c C |
-     *  |   #######  c c c |
-     *  | < #######  c c c |
-     *  | < #######  c   c |
-     *  | < #######  .   c |
-     *  | 3 #######      c |
-     *  +------------------+
-     * 
- * Orientation Example:
- * ====================
- * Above is a - * diagram of some (1) English and (2) Chinese text and a (3) photo - * credit.
- *
- * Upright Latin characters are represented as A and a. '<' represents - * a latin character rotated anti-clockwise 90 degrees. Upright - * Chinese characters are represented C and c.
- *
- * NOTA BENE: enum values here should match goodoc.proto
- *
- * If you orient your head so that "up" aligns with Orientation, then - * the characters will appear "right side up" and readable.
- *
- * In the example above, both the - * English and Chinese paragraphs are oriented so their "up" is the top of - * the page (page up). The photo credit is read with one's head turned - * leftward ("up" is to page left).
- *
The values of this enum - * match the convention of Tesseract's osdetect.h - */ - public static interface TessOrientation { - - public static final int ORIENTATION_PAGE_UP = (int) 0; - public static final int ORIENTATION_PAGE_RIGHT = (int) 1; - public static final int ORIENTATION_PAGE_DOWN = (int) 2; - public static final int ORIENTATION_PAGE_LEFT = (int) 3; - }; - - /** - * The grapheme clusters within a line of text are laid out logically in - * this direction, judged when looking at the text line rotated so that its - * Orientation is "page up".

For English text, the writing - * direction is left-to-right. For the Chinese text in the above example, - * the writing direction is top-to-bottom. - */ - public static interface TessWritingDirection { - - public static final int WRITING_DIRECTION_LEFT_TO_RIGHT = (int) 0; - public static final int WRITING_DIRECTION_RIGHT_TO_LEFT = (int) 1; - public static final int WRITING_DIRECTION_TOP_TO_BOTTOM = (int) 2; - }; - - /** - * The text lines are read in the given sequence.

In English, - * the order is top-to-bottom. In Chinese, vertical text lines are read - * right-to-left. Mongolian is written in vertical columns top to bottom - * like Chinese, but the lines order left-to right.

Note that - * only some combinations make sense. For example, - * WRITING_DIRECTION_LEFT_TO_RIGHT implies - * TEXTLINE_ORDER_TOP_TO_BOTTOM. - */ - public static interface TessTextlineOrder { - - public static final int TEXTLINE_ORDER_LEFT_TO_RIGHT = (int) 0; - public static final int TEXTLINE_ORDER_RIGHT_TO_LEFT = (int) 1; - public static final int TEXTLINE_ORDER_TOP_TO_BOTTOM = (int) 2; - }; - public static final int TRUE = (int) 1; - public static final int FALSE = (int) 0; - - /** - * Returns the version identifier. - */ - String TessVersion(); - - void TessDeleteText(Pointer text); - - void TessDeleteTextArray(PointerByReference arr); - - void TessDeleteIntArray(IntBuffer arr); - - /** - * Creates an instance of the base class for all Tesseract APIs. - */ - TessAPI.TessBaseAPI TessBaseAPICreate(); - - /** - * Disposes the TesseractAPI instance. - */ - void TessBaseAPIDelete(TessAPI.TessBaseAPI handle); - - /** - * Set the name of the input file. Needed only for training and reading a - * UNLV zone file. - */ - void TessBaseAPISetInputName(TessAPI.TessBaseAPI handle, String name); - - /** - * Set the name of the bonus output files. Needed only for debugging. - */ - void TessBaseAPISetOutputName(TessAPI.TessBaseAPI handle, String name); - - /** - * Set the value of an internal "parameter." Supply the name of the - * parameter and the value as a string, just as you would in a config file. - * Returns false if the name lookup failed. E.g., - * SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, - * y and z. Or - * SetVariable("classify_bln_numeric_mode", "1"); to set - * numeric-only mode. - * SetVariable may be used before - * Init, but settings will revert to defaults on - * End().

Note: Must be called after - * Init(). Only works for non-init variables (init variables - * should be passed to - * Init()). - */ - int TessBaseAPISetVariable(TessAPI.TessBaseAPI handle, String name, String value); - - /** - * Returns true (1) if the parameter was found among Tesseract parameters. - * Fills in value with the value of the parameter. - */ - int TessBaseAPIGetIntVariable(TessAPI.TessBaseAPI handle, String name, IntBuffer value); - - int TessBaseAPIGetBoolVariable(TessAPI.TessBaseAPI handle, String name, IntBuffer value); - - int TessBaseAPIGetDoubleVariable(TessAPI.TessBaseAPI handle, String name, DoubleBuffer value); - - String TessBaseAPIGetStringVariable(TessAPI.TessBaseAPI handle, String name); - - /** - * Print Tesseract parameters to the given file.

Note: Must not - * be the first method called after instance create. - */ - void TessBaseAPIPrintVariablesToFile(TessAPI.TessBaseAPI handle, String filename); - - /** - * Instances are now mostly thread-safe and totally independent, but some - * global parameters remain. Basically it is safe to use multiple - * TessBaseAPIs in different threads in parallel, UNLESS: you use - * SetVariable on some of the Params in classify and textord. - * If you do, then the effect will be to change it for all your - * instances.

Start tesseract. Returns zero on success and -1 - * on failure. NOTE that the only members that may be called before Init are - * those listed above here in the class definition.

The - * datapath must be the name of the parent directory of - * tessdata and must end in / . Any name after the last / will be stripped. - * The language is (usually) an - * ISO 639-3 string or - * NULL will default to eng. It is entirely safe (and - * eventually will be efficient too) to call Init multiple times on the same - * instance to change language, or just to reset the classifier. The - * language may be a string of the form [~][+[~]]* indicating - * that multiple languages are to be loaded. E.g., hin+eng will load Hindi - * and English. Languages may specify internally that they want to be loaded - * with one or more other languages, so the ~ sign is available to override - * that. E.g., if hin were set to load eng by default, then hin+~eng would - * force loading only hin. The number of loaded languages is limited only by - * memory, with the caveat that loading additional languages will impact - * both speed and accuracy, as there is more work to do to decide on the - * applicable language, and there is more chance of hallucinating incorrect - * words. WARNING: On changing languages, all Tesseract parameters are reset - * back to their default values. (Which may vary between languages.) If you - * have a rare need to set a Variable that controls initialization for a - * second call to - * Init you should explicitly call - * End() and then use - * SetVariable before - * Init. This is only a very rare use case, since there are - * very few uses that require any parameters to be set before - * Init.

If - * set_only_non_debug_params is true, only params that do not - * contain "debug" in the name will be set. - */ - int TessBaseAPIInit1(TessAPI.TessBaseAPI handle, String datapath, String language, int oem, PointerByReference configs, int configs_size); - - int TessBaseAPIInit2(TessAPI.TessBaseAPI handle, String datapath, String language, int oem); - - int TessBaseAPIInit3(TessAPI.TessBaseAPI handle, String datapath, String language); - - /** - * Returns the languages string used in the last valid initialization. If - * the last initialization specified "deu+hin" then that will be returned. - * If hin loaded eng automatically as well, then that will not be included - * in this list. To find the languages actually loaded, use - * GetLoadedLanguagesAsVector. The returned string should NOT - * be deleted. - */ - String TessBaseAPIGetInitLanguagesAsString(TessAPI.TessBaseAPI handle); - - /** - * Returns the loaded languages in the vector of STRINGs. Includes all - * languages loaded by the last - * Init, including those loaded as dependencies of other loaded - * languages. - */ - PointerByReference TessBaseAPIGetLoadedLanguagesAsVector(TessAPI.TessBaseAPI handle); - - /** - * Returns the available languages in the vector of STRINGs. - */ - PointerByReference TessBaseAPIGetAvailableLanguagesAsVector(TessAPI.TessBaseAPI handle); - - /** - * Init only the lang model component of Tesseract. The only functions that - * work after this init are - * SetVariable and - * IsValidWord. WARNING: temporary! This function will be - * removed from here and placed in a separate API at some future time. - */ - int TessBaseAPIInitLangMod(TessAPI.TessBaseAPI handle, String datapath, String language); - - /** - * Init only for page layout analysis. Use only for calls to - * SetImage and - * AnalysePage. Calls that attempt recognition will generate an - * error. 
- */ - void TessBaseAPIInitForAnalysePage(TessAPI.TessBaseAPI handle); - - /** - * Read a "config" file containing a set of param, value pairs. Searches the - * standard places: - * tessdata/configs, - * tessdata/tessconfigs and also accepts a relative or absolute - * path name. Note: only non-init params will be set (init params are set by - * Init()). - */ - void TessBaseAPIReadConfigFile(TessAPI.TessBaseAPI handle, String filename, int init_only); - - /** - * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. The - * mode is stored as an IntParam so it can also be modified by - * ReadConfigFile or - * SetVariable("tessedit_pageseg_mode", mode as string). - */ - void TessBaseAPISetPageSegMode(TessAPI.TessBaseAPI handle, int mode); - - /** - * Return the current page segmentation mode. - */ - int TessBaseAPIGetPageSegMode(TessAPI.TessBaseAPI handle); - - /** - * Recognize a rectangle from an image and return the result as a string. - * May be called many times for a single - * Init. Currently has no error checking. Greyscale of 8 and - * color of 24 or 32 bits per pixel may be given. Palette color images will - * not work properly and must be converted to 24 bit. Binary images of 1 bit - * per pixel may also be given but they must be byte packed with the MSB of - * the first byte being the first pixel, and a 1 represents WHITE. For - * binary images set bytes_per_pixel=0. The recognized text is returned as a - * char* which is coded as UTF8 and must be freed with the delete [] - * operator.

Note that - * TesseractRect is the simplified convenience interface. For - * advanced uses, use - * SetImage, (optionally) - * SetRectangle, - * Recognize, and one or more of the - * Get*Text functions below. - */ - Pointer TessBaseAPIRect(TessAPI.TessBaseAPI handle, ByteBuffer imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height); - - /** - * Call between pages or documents etc to free up memory and forget adaptive - * data. - */ - void TessBaseAPIClearAdaptiveClassifier(TessAPI.TessBaseAPI handle); - - /** - * Provide an image for Tesseract to recognize. Format is as TesseractRect - * above. Does not copy the image buffer, or take ownership. The source - * image may be destroyed after Recognize is called, either explicitly or - * implicitly via one of the - * Get*Text functions. - * SetImage clears all recognition results, and sets the - * rectangle to the full image, so it may be followed immediately by a - * GetUTF8Text, and it will automatically perform recognition. - */ - void TessBaseAPISetImage(TessAPI.TessBaseAPI handle, ByteBuffer imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line); - - /** - * Set the resolution of the source image in pixels per inch so font size - * information can be calculated in results. Call this after SetImage(). - */ - void TessBaseAPISetSourceResolution(TessAPI.TessBaseAPI handle, int ppi); - - /** - * Restrict recognition to a sub-rectangle of the image. Call after - * SetImage. Each - * SetRectangle clears the recognition results so multiple - * rectangles can be recognized with the same image. - */ - void TessBaseAPISetRectangle(TessAPI.TessBaseAPI handle, int left, int top, int width, int height); - - /** Scale factor from original image. */ - int TessBaseAPIGetThresholdedImageScaleFactor(TessAPI.TessBaseAPI handle); - - /** Dump the internal binary image to a PGM file. 
*/ - void TessBaseAPIDumpPGM(TessAPI.TessBaseAPI handle, String filename); - - /** - * Runs page layout analysis in the mode set by SetPageSegMode. May - * optionally be called prior to Recognize to get access to just the page - * layout results. Returns an iterator to the results. Returns NULL on - * error. The returned iterator must be deleted after use. WARNING! This - * class points to data held within the TessBaseAPI class, and therefore can - * only be used while the TessBaseAPI class still exists and has not been - * subjected to a call of - * Init, - * SetImage, - * Recognize, - * Clear, - * End, DetectOS, or anything else that changes the internal - * PAGE_RES. - */ - TessAPI.TessPageIterator TessBaseAPIAnalyseLayout(TessAPI.TessBaseAPI handle); - - /** - * Recognize the image from SetAndThresholdImage, generating Tesseract - * internal structures. Returns 0 on success. Optional. The - * Get*Text functions below will call - * Recognize if needed. After Recognize, the output is kept - * internally until the next - * SetImage. - */ - int TessBaseAPIRecognize(TessAPI.TessBaseAPI handle, TessAPI.ETEXT_DESC monitor); - - /** - * Variant on Recognize used for testing chopper. - */ - int TessBaseAPIRecognizeForChopTest(TessAPI.TessBaseAPI handle, TessAPI.ETEXT_DESC monitor); - - /** - * Get a reading-order iterator to the results of LayoutAnalysis and/or - * Recognize. The returned iterator must be deleted after use. WARNING! This - * class points to data held within the TessBaseAPI class, and therefore can - * only be used while the TessBaseAPI class still exists and has not been - * subjected to a call of - * Init, - * SetImage, - * Recognize, - * Clear, - * End, DetectOS, or anything else that changes the internal - * PAGE_RES. - */ - TessAPI.TessResultIterator TessBaseAPIGetIterator(TessAPI.TessBaseAPI handle); - - /** - * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. - * The returned iterator must be deleted after use. 
- * WARNING! This class points to data held within the TessBaseAPI class, and - * therefore can only be used while the TessBaseAPI class still exists and - * has not been subjected to a call of Init, SetImage, Recognize, Clear, End - * DetectOS, or anything else that changes the internal PAGE_RES. - */ - TessAPI.TessMutableIterator TessBaseAPIGetMutableIterator(TessAPI.TessBaseAPI handle); - - /** - * Recognizes all the pages in the named file, as a multi-page tiff or list - * of filenames, or single image, and gets the appropriate kind of text - * according to parameters: - * tessedit_create_boxfile, - * tessedit_make_boxes_from_boxes, - * tessedit_write_unlv, - * tessedit_create_hocr. Calls ProcessPage on each page in the - * input file, which may be a multi-page tiff, single-page other file - * format, or a plain text list of images to read. If tessedit_page_number - * is non-negative, processing begins at that page of a multi-page tiff - * file, or filelist. The text is returned in text_out. Returns false on - * error. If non-zero timeout_millisec terminates processing after the - * timeout on a single page. If non-NULL and non-empty, and some page fails - * for some reason, the page is reprocessed with the retry_config config - * file. Useful for interactively debugging a bad page. - */ - Pointer TessBaseAPIProcessPages(TessAPI.TessBaseAPI handle, String filename, String retry_config, int timeout_millisec); - - /** - * The recognized text is returned as a char* which is coded as UTF-8 and - * must be freed with the delete [] operator. - */ - Pointer TessBaseAPIGetUTF8Text(TessAPI.TessBaseAPI handle); - - /** - * Make a HTML-formatted string with hOCR markup from the internal data - * structures. page_number is 0-based but will appear in the output as - * 1-based. 
- */ - Pointer TessBaseAPIGetHOCRText(TessAPI.TessBaseAPI handle, int page_number); - - /** - * The recognized text is returned as a char* which is coded in the same - * format as a box file used in training. Returned string must be freed with - * the delete [] operator. Constructs coordinates in the original image - - * not just the rectangle. page_number is a 0-based page index that will - * appear in the box file. - */ - Pointer TessBaseAPIGetBoxText(TessAPI.TessBaseAPI handle, int page_number); - - /** - * The recognized text is returned as a char* which is coded as UNLV format - * Latin-1 with specific reject and suspect codes and must be freed with the - * delete [] operator. - */ - Pointer TessBaseAPIGetUNLVText(TessAPI.TessBaseAPI handle); - - /** - * Returns the (average) confidence value between 0 and 100. - */ - int TessBaseAPIMeanTextConf(TessAPI.TessBaseAPI handle); - - /** - * Returns all word confidences (between 0 and 100) in an array, terminated - * by -1. The calling function must delete [] after use. The number of - * confidences should correspond to the number of space-delimited words in - * GetUTF8Text. - */ - IntByReference TessBaseAPIAllWordConfidences(TessAPI.TessBaseAPI handle); - - /** - * Applies the given word to the adaptive classifier if possible. The word - * must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can tell the - * boundaries of the graphemes. Assumes that SetImage/SetRectangle have been - * used to set the image to the given word. The mode arg should be - * PSM_SINGLE_WORD or PSM_CIRCLE_WORD, as that will be used to control - * layout analysis. The currently set PageSegMode is preserved. Returns - * false if adaption was not possible for some reason. - */ - int TessBaseAPIAdaptToWordStr(TessAPI.TessBaseAPI handle, int mode, String wordstr); - - /** - * Free up recognition results and any stored image data, without actually - * freeing any recognition data that would be time-consuming to reload. 
- * Afterwards, you must call - * SetImage or - * TesseractRect before doing any - * Recognize or - * Get* operation. - */ - void TessBaseAPIClear(TessAPI.TessBaseAPI handle); - - /** - * Close down tesseract and free up all memory. - * End() is equivalent to destructing and reconstructing your - * TessBaseAPI. Once - * End() has been used, none of the other API functions may be - * used other than - * Init and anything declared above it in the class definition. - */ - void TessBaseAPIEnd(TessAPI.TessBaseAPI handle); - - /** - * Check whether a word is valid according to Tesseract's language model. - * - * @return 0 if the word is invalid, non-zero if valid. @warning temporary! - * This function will be removed from here and placed in a separate API at - * some future time. - */ - int TessBaseAPIIsValidWord(TessAPI.TessBaseAPI handle, String word); - - int TessBaseAPIGetTextDirection(TessAPI.TessBaseAPI handle, IntBuffer out_offset, FloatBuffer out_slope); - - /** - * This method returns the string form of the specified unichar. 
- */ - String TessBaseAPIGetUnichar(TessAPI.TessBaseAPI handle, int unichar_id); - - /* Page iterator */ - void TessPageIteratorDelete(TessAPI.TessPageIterator handle); - - TessAPI.TessPageIterator TessPageIteratorCopy(TessAPI.TessPageIterator handle); - - void TessPageIteratorBegin(TessAPI.TessPageIterator handle); - - int TessPageIteratorNext(TessAPI.TessPageIterator handle, int level); - - int TessPageIteratorIsAtBeginningOf(TessAPI.TessPageIterator handle, int level); - - int TessPageIteratorIsAtFinalElement(TessAPI.TessPageIterator handle, int level, int element); - - int TessPageIteratorBoundingBox(TessAPI.TessPageIterator handle, int level, IntBuffer left, IntBuffer top, IntBuffer right, IntBuffer bottom); - - int TessPageIteratorBlockType(TessAPI.TessPageIterator handle); - - int TessPageIteratorBaseline(TessAPI.TessPageIterator handle, int level, IntBuffer x1, IntBuffer y1, IntBuffer x2, IntBuffer y2); - - void TessPageIteratorOrientation(TessAPI.TessPageIterator handle, IntBuffer orientation, IntBuffer writing_direction, IntBuffer textline_order, FloatBuffer deskew_angle); - - /* Result iterator */ - void TessResultIteratorDelete(TessAPI.TessResultIterator handle); - - TessAPI.TessResultIterator TessResultIteratorCopy(TessAPI.TessResultIterator handle); - - TessAPI.TessPageIterator TessResultIteratorGetPageIterator(TessAPI.TessResultIterator handle); - - TessAPI.TessPageIterator TessResultIteratorGetPageIteratorConst(TessAPI.TessResultIterator handle); - - Pointer TessResultIteratorGetUTF8Text(TessAPI.TessResultIterator handle, int level); - - float TessResultIteratorConfidence(TessAPI.TessResultIterator handle, int level); - - String TessResultIteratorWordFontAttributes(TessAPI.TessResultIterator handle, IntBuffer is_bold, IntBuffer is_italic, IntBuffer is_underlined, IntBuffer is_monospace, IntBuffer is_serif, IntBuffer is_smallcaps, IntBuffer pointsize, IntBuffer font_id); - - int TessResultIteratorWordIsFromDictionary(TessAPI.TessResultIterator 
handle); - - int TessResultIteratorWordIsNumeric(TessAPI.TessResultIterator handle); - - int TessResultIteratorSymbolIsSuperscript(TessAPI.TessResultIterator handle); - - int TessResultIteratorSymbolIsSubscript(TessAPI.TessResultIterator handle); - - int TessResultIteratorSymbolIsDropcap(TessAPI.TessResultIterator handle); - - public static class TessBaseAPI extends PointerType { - - public TessBaseAPI(Pointer address) { - super(address); - } - - public TessBaseAPI() { - super(); - } - }; - - public static class ETEXT_DESC extends PointerType { - - public ETEXT_DESC(Pointer address) { - super(address); - } - - public ETEXT_DESC() { - super(); - } - }; - - public static class TessPageIterator extends PointerType { - - public TessPageIterator(Pointer address) { - super(address); - } - - public TessPageIterator() { - super(); - } - }; - - public static class TessMutableIterator extends PointerType { - - public TessMutableIterator(Pointer address) { - super(address); - } - - public TessMutableIterator() { - super(); - } - }; - - public static class TessResultIterator extends PointerType { - - public TessResultIterator(Pointer address) { - super(address); - } - - public TessResultIterator() { - super(); - } - }; -} diff --git a/docs-core/src/main/java/com/sismics/tess4j/Tesseract.java b/docs-core/src/main/java/com/sismics/tess4j/Tesseract.java deleted file mode 100644 index 4111ce4c..00000000 --- a/docs-core/src/main/java/com/sismics/tess4j/Tesseract.java +++ /dev/null @@ -1,251 +0,0 @@ -/** - * Copyright @ 2012 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. 
You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package com.sismics.tess4j; - -import java.awt.Rectangle; -import java.awt.image.BufferedImage; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Enumeration; -import java.util.List; -import java.util.Properties; - -import com.sun.jna.Pointer; - -/** - * An object layer on top of - * TessAPI, provides character recognition support for common image - * formats, and multi-page TIFF images beyond the uncompressed, binary TIFF - * format supported by Tesseract OCR engine. The extended capabilities are - * provided by the - * Java Advanced Imaging Image I/O Tools.

Support for - * PDF documents is available through - * Ghost4J, a - * JNA wrapper for - * GPL Ghostscript, which should be installed and included in - * system path.

Any program that uses the library will need to - * ensure that the required libraries (the - * .jar files for - * jna, - * jai-imageio, and - * ghost4j) are in its compile and run-time - * classpath. - */ -public class Tesseract { - - private static Tesseract instance; - private final static Rectangle EMPTY_RECTANGLE = new Rectangle(); - private String language = "eng"; - private String datapath = null; - private int psm = TessAPI.TessPageSegMode.PSM_AUTO; - private boolean hocr; - private int pageNum; - private int ocrEngineMode = TessAPI.TessOcrEngineMode.OEM_DEFAULT; - private Properties prop = new Properties(); - public final static String htmlBeginTag = - "\n" - + "\n\n\n" - + "\n\n" - + "\n\n"; - public final static String htmlEndTag = "\n\n"; - - /** - * Private constructor. - */ - private Tesseract() { - System.setProperty("jna.encoding", "UTF8"); - } - - /** - * Gets an instance of the class library. - * - * @return instance - */ - public static synchronized Tesseract getInstance() { - if (instance == null) { - instance = new Tesseract(); - } - - return instance; - } - - /** - * Sets tessdata path. - * - * @param datapath the tessdata path to set - */ - public void setDatapath(String datapath) { - this.datapath = datapath; - } - - /** - * Sets language for OCR. - * - * @param language the language code, which follows ISO 639-3 standard. - */ - public void setLanguage(String language) { - this.language = language; - } - - /** - * Sets OCR engine mode. - * - * @param ocrEngineMode the OcrEngineMode to set - */ - public void setOcrEngineMode(int ocrEngineMode) { - this.ocrEngineMode = ocrEngineMode; - } - - /** - * Sets page segmentation mode. - * - * @param mode the page segmentation mode to set - */ - public void setPageSegMode(int mode) { - this.psm = mode; - } - - /** - * Enables hocr output. - * - * @param hocr to enable or disable hocr output - */ - public void setHocr(boolean hocr) { - this.hocr = hocr; - prop.setProperty("tessedit_create_hocr", hocr ? 
"1" : "0"); - } - - /** - * Set the value of Tesseract's internal parameter. - * - * @param key variable name, e.g., - * tessedit_create_hocr, - * tessedit_char_whitelist, etc. - * @param value value for corresponding variable, e.g., "1", "0", - * "0123456789", etc. - */ - public void setTessVariable(String key, String value) { - prop.setProperty(key, value); - } - - /** - * Performs OCR operation. - * - * @param bi a buffered image - * @return the recognized text - * @throws TesseractException - */ - public String doOCR(BufferedImage bi) throws TesseractException { - return doOCR(bi, null); - } - - /** - * Performs OCR operation. - * - * @param bi a buffered image - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or - * null indicates the whole image. - * @return the recognized text - * @throws TesseractException - */ - public String doOCR(BufferedImage bi, Rectangle rect) throws TesseractException { - List imageList = new ArrayList(); - imageList.add(bi); - return doOCR(imageList, rect); - } - - /** - * Performs OCR operation. - * - * @param imageList a list of - * BufferedImage objects - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or - * null indicates the whole image. 
- * @return the recognized text - * @throws TesseractException - */ - public String doOCR(List imageList, Rectangle rect) throws TesseractException { - StringBuilder sb = new StringBuilder(); - pageNum = 0; - - for (BufferedImage oimage : imageList) { - pageNum++; - try { - ByteBuffer buf = ImageIOHelper.getImageByteBuffer(oimage); - String pageText = doOCR(oimage.getWidth(), oimage.getHeight(), buf, rect, oimage.getColorModel().getPixelSize()); - sb.append(pageText); - } catch (IOException ioe) { - //skip the problematic image - System.err.println(ioe.getMessage()); - } - } - - if (hocr) { - sb.insert(0, htmlBeginTag).append(htmlEndTag); - } - return sb.toString(); - } - - /** - * Performs OCR operation. Use - * SetImage, (optionally) - * SetRectangle, and one or more of the - * Get*Text functions. - * - * @param xsize width of image - * @param ysize height of image - * @param buf pixel data - * @param rect the bounding rectangle defines the region of the image to be - * recognized. A rectangle of zero dimension or - * null indicates the whole image. - * @param bpp bits per pixel, represents the bit depth of the image, with 1 - * for binary bitmap, 8 for gray, and 24 for color RGB. 
- * @return the recognized text - * @throws TesseractException - */ - public String doOCR(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) throws TesseractException { - TessAPI api = TessAPI.INSTANCE; - TessAPI.TessBaseAPI handle = api.TessBaseAPICreate(); - api.TessBaseAPIInit2(handle, datapath, language, ocrEngineMode); - api.TessBaseAPISetPageSegMode(handle, psm); - - Enumeration em = prop.propertyNames(); - while (em.hasMoreElements()) { - String key = (String) em.nextElement(); - api.TessBaseAPISetVariable(handle, key, prop.getProperty(key)); - } - - int bytespp = bpp / 8; - int bytespl = (int) Math.ceil(xsize * bpp / 8.0); - api.TessBaseAPISetImage(handle, buf, xsize, ysize, bytespp, bytespl); - - if (rect != null && !rect.equals(EMPTY_RECTANGLE)) { - api.TessBaseAPISetRectangle(handle, rect.x, rect.y, rect.width, rect.height); - } - - Pointer utf8Text = hocr ? api.TessBaseAPIGetHOCRText(handle, pageNum - 1) : api.TessBaseAPIGetUTF8Text(handle); - String str = utf8Text.getString(0); - api.TessDeleteText(utf8Text); - api.TessBaseAPIDelete(handle); - - return str; - } -} diff --git a/docs-core/src/main/java/com/sismics/tess4j/TesseractException.java b/docs-core/src/main/java/com/sismics/tess4j/TesseractException.java deleted file mode 100644 index f8c4e09e..00000000 --- a/docs-core/src/main/java/com/sismics/tess4j/TesseractException.java +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Copyright @ 2010 Quan Nguyen - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sismics.tess4j; - -public class TesseractException extends Exception { - - private static final long serialVersionUID = 1L; - - public TesseractException() { - super(); - } - - public TesseractException(String message) { - super(message); - } - - public TesseractException(Throwable cause) { - super(cause); - } - - public TesseractException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/docs-core/src/main/java/com/sismics/util/Scalr.java b/docs-core/src/main/java/com/sismics/util/Scalr.java index 83396f42..e7776eb5 100644 --- a/docs-core/src/main/java/com/sismics/util/Scalr.java +++ b/docs-core/src/main/java/com/sismics/util/Scalr.java @@ -6,6 +6,11 @@ import java.awt.image.BufferedImage; import java.awt.image.BufferedImageOp; import java.awt.image.ImagingOpException; +/** + * Extends Scalr. + * + * @author bgamard + */ public class Scalr extends org.imgscalr.Scalr { /** * Rotate an image by a specific amount. 
diff --git a/pom.xml b/pom.xml index d533f1c2..87ede494 100644 --- a/pom.xml +++ b/pom.xml @@ -40,14 +40,14 @@ 3.1.0 1.0.5 4.2.1 - 3.2.1 + 3.3.2 1.6.5 - 1.3.1 + 1.3.0 + 42.2.2 1.2 1.5.7 1.5.6 1.11.2 - 1.3.0 9.3.11.v20160721 9.3.11.v20160721 @@ -435,6 +435,12 @@ ${com.sun.mail.javax.mail.version} + + org.postgresql + postgresql + ${org.postgresql.postgresql.version} + + com.twelvemonkeys.servlet @@ -442,42 +448,35 @@ ${com.twelvemonkeys.imageio.version} - - - net.java.dev.jna - jna - ${net.java.dev.jna.jna.version} - - - + com.twelvemonkeys.imageio imageio-jpeg ${com.twelvemonkeys.imageio.version} + + + com.twelvemonkeys.imageio + imageio-tiff + ${com.twelvemonkeys.imageio.version} + + com.github.jai-imageio jai-imageio-jpeg2000 ${com.github.jai-imageio.jai-imageio-jpeg2000.version} - - + + com.levigo.jbig2 levigo-jbig2-imageio ${com.levigo.jbig2.levigo-jbig2-imageio.version} - - - - com.github.jai-imageio - jai-imageio-core - ${com.github.jai-imageio.jai-imageio-core.version} - - +