/** * Copyright @ 2012 Quan Nguyen * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.sismics.tess4j; import java.awt.Rectangle; import java.awt.image.BufferedImage; import java.awt.image.RenderedImage; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Enumeration; import java.util.List; import java.util.Properties; import javax.imageio.IIOImage; import com.sun.jna.Pointer; /** * An object layer on top of * TessAPI, provides character recognition support for common image * formats, and multi-page TIFF images beyond the uncompressed, binary TIFF * format supported by Tesseract OCR engine. The extended capabilities are * provided by the * Java Advanced Imaging Image I/O Tools.

Support for * PDF documents is available through * Ghost4J, a * JNA wrapper for * GPL Ghostscript, which should be installed and included in * system path.

Any program that uses the library will need to * ensure that the required libraries (the * .jar files for * jna, * jai-imageio, and * ghost4j) are in its compile and run-time * classpath. */ public class Tesseract { private static Tesseract instance; private final static Rectangle EMPTY_RECTANGLE = new Rectangle(); private String language = "eng"; private String datapath = null; private int psm = TessAPI.TessPageSegMode.PSM_AUTO; private boolean hocr; private int pageNum; private int ocrEngineMode = TessAPI.TessOcrEngineMode.OEM_DEFAULT; private Properties prop = new Properties(); public final static String htmlBeginTag = "\n" + "\n\n\n" + "\n\n" + "\n\n"; public final static String htmlEndTag = "\n\n"; /** * Private constructor. */ private Tesseract() { System.setProperty("jna.encoding", "UTF8"); } /** * Gets an instance of the class library. * * @return instance */ public static synchronized Tesseract getInstance() { if (instance == null) { instance = new Tesseract(); } return instance; } /** * Sets tessdata path. * * @param datapath the tessdata path to set */ public void setDatapath(String datapath) { this.datapath = datapath; } /** * Sets language for OCR. * * @param language the language code, which follows ISO 639-3 standard. */ public void setLanguage(String language) { this.language = language; } /** * Sets OCR engine mode. * * @param ocrEngineMode the OcrEngineMode to set */ public void setOcrEngineMode(int ocrEngineMode) { this.ocrEngineMode = ocrEngineMode; } /** * Sets page segmentation mode. * * @param mode the page segmentation mode to set */ public void setPageSegMode(int mode) { this.psm = mode; } /** * Enables hocr output. * * @param hocr to enable or disable hocr output */ public void setHocr(boolean hocr) { this.hocr = hocr; prop.setProperty("tessedit_create_hocr", hocr ? "1" : "0"); } /** * Set the value of Tesseract's internal parameter. * * @param key variable name, e.g., * tessedit_create_hocr, * tessedit_char_whitelist, etc. * @param value value for corresponding variable, e.g., "1", "0", * "0123456789", etc. */ public void setTessVariable(String key, String value) { prop.setProperty(key, value); } /** * Performs OCR operation. * * @param bi a buffered image * @return the recognized text * @throws TesseractException */ public String doOCR(BufferedImage bi) throws TesseractException { return doOCR(bi, null); } /** * Performs OCR operation. * * @param bi a buffered image * @param rect the bounding rectangle defines the region of the image to be * recognized. A rectangle of zero dimension or * null indicates the whole image. * @return the recognized text * @throws TesseractException */ public String doOCR(BufferedImage bi, Rectangle rect) throws TesseractException { IIOImage oimage = new IIOImage(bi, null, null); List imageList = new ArrayList(); imageList.add(oimage); return doOCR(imageList, rect); } /** * Performs OCR operation. * * @param imageList a list of * IIOImage objects * @param rect the bounding rectangle defines the region of the image to be * recognized. A rectangle of zero dimension or * null indicates the whole image. * @return the recognized text * @throws TesseractException */ public String doOCR(List imageList, Rectangle rect) throws TesseractException { StringBuilder sb = new StringBuilder(); pageNum = 0; for (IIOImage oimage : imageList) { pageNum++; try { ByteBuffer buf = ImageIOHelper.getImageByteBuffer(oimage); RenderedImage ri = oimage.getRenderedImage(); String pageText = doOCR(ri.getWidth(), ri.getHeight(), buf, rect, ri.getColorModel().getPixelSize()); sb.append(pageText); } catch (IOException ioe) { //skip the problematic image System.err.println(ioe.getMessage()); } } if (hocr) { sb.insert(0, htmlBeginTag).append(htmlEndTag); } return sb.toString(); } /** * Performs OCR operation. Use * SetImage, (optionally) * SetRectangle, and one or more of the * Get*Text functions. * * @param xsize width of image * @param ysize height of image * @param buf pixel data * @param rect the bounding rectangle defines the region of the image to be * recognized. A rectangle of zero dimension or * null indicates the whole image. * @param bpp bits per pixel, represents the bit depth of the image, with 1 * for binary bitmap, 8 for gray, and 24 for color RGB. * @return the recognized text * @throws TesseractException */ public String doOCR(int xsize, int ysize, ByteBuffer buf, Rectangle rect, int bpp) throws TesseractException { TessAPI api = TessAPI.INSTANCE; TessAPI.TessBaseAPI handle = api.TessBaseAPICreate(); api.TessBaseAPIInit2(handle, datapath, language, ocrEngineMode); api.TessBaseAPISetPageSegMode(handle, psm); Enumeration em = prop.propertyNames(); while (em.hasMoreElements()) { String key = (String) em.nextElement(); api.TessBaseAPISetVariable(handle, key, prop.getProperty(key)); } int bytespp = bpp / 8; int bytespl = (int) Math.ceil(xsize * bpp / 8.0); api.TessBaseAPISetImage(handle, buf, xsize, ysize, bytespp, bytespl); if (rect != null && !rect.equals(EMPTY_RECTANGLE)) { api.TessBaseAPISetRectangle(handle, rect.x, rect.y, rect.width, rect.height); } Pointer utf8Text = hocr ? api.TessBaseAPIGetHOCRText(handle, pageNum - 1) : api.TessBaseAPIGetUTF8Text(handle); String str = utf8Text.getString(0); api.TessDeleteText(utf8Text); api.TessBaseAPIDelete(handle); return str; } }