mirror of
https://github.com/sismics/docs.git
synced 2024-11-23 22:37:55 +01:00
Index and generate thumbnails from PDF
This commit is contained in:
parent
4f8076427b
commit
d3f59554f8
@ -117,6 +117,11 @@
|
|||||||
<artifactId>imgscalr-lib</artifactId>
|
<artifactId>imgscalr-lib</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.pdfbox</groupId>
|
||||||
|
<artifactId>pdfbox</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- OCR dependencies -->
|
<!-- OCR dependencies -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>jna</groupId>
|
<groupId>jna</groupId>
|
||||||
|
@ -3,11 +3,11 @@ package com.sismics.docs.core.event;
|
|||||||
import com.google.common.base.Objects;
|
import com.google.common.base.Objects;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* OCR all files in database event.
|
* Extract file content event.
|
||||||
*
|
*
|
||||||
* @author bgamard
|
* @author bgamard
|
||||||
*/
|
*/
|
||||||
public class OcrFileAsyncEvent {
|
public class ExtractFileAsyncEvent {
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return Objects.toStringHelper(this)
|
return Objects.toStringHelper(this)
|
@ -9,33 +9,33 @@ import org.slf4j.LoggerFactory;
|
|||||||
import com.google.common.eventbus.Subscribe;
|
import com.google.common.eventbus.Subscribe;
|
||||||
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
||||||
import com.sismics.docs.core.dao.jpa.FileDao;
|
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||||
import com.sismics.docs.core.event.OcrFileAsyncEvent;
|
import com.sismics.docs.core.event.ExtractFileAsyncEvent;
|
||||||
import com.sismics.docs.core.model.jpa.Document;
|
import com.sismics.docs.core.model.jpa.Document;
|
||||||
import com.sismics.docs.core.model.jpa.File;
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
import com.sismics.docs.core.util.FileUtil;
|
import com.sismics.docs.core.util.FileUtil;
|
||||||
import com.sismics.docs.core.util.TransactionUtil;
|
import com.sismics.docs.core.util.TransactionUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Listener on OCR all files in database.
|
* Listener on extract content from all files.
|
||||||
*
|
*
|
||||||
* @author bgamard
|
* @author bgamard
|
||||||
*/
|
*/
|
||||||
public class OcrFileAsyncListener {
|
public class ExtractFileAsyncListener {
|
||||||
/**
|
/**
|
||||||
* Logger.
|
* Logger.
|
||||||
*/
|
*/
|
||||||
private static final Logger log = LoggerFactory.getLogger(OcrFileAsyncListener.class);
|
private static final Logger log = LoggerFactory.getLogger(ExtractFileAsyncListener.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* OCR all files.
|
* Extract content from all files.
|
||||||
*
|
*
|
||||||
* @param ocrFileAsyncEvent OCR all files in database event
|
* @param extractFileAsyncEvent Extract file content event
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
@Subscribe
|
@Subscribe
|
||||||
public void on(final OcrFileAsyncEvent ocrFileAsyncEvent) throws Exception {
|
public void on(final ExtractFileAsyncEvent extractFileAsyncEvent) throws Exception {
|
||||||
if (log.isInfoEnabled()) {
|
if (log.isInfoEnabled()) {
|
||||||
log.info("OCR all files in database event: " + ocrFileAsyncEvent.toString());
|
log.info("Extract file content event: " + extractFileAsyncEvent.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
TransactionUtil.handle(new Runnable() {
|
TransactionUtil.handle(new Runnable() {
|
||||||
@ -47,10 +47,9 @@ public class OcrFileAsyncListener {
|
|||||||
for (File file : fileList) {
|
for (File file : fileList) {
|
||||||
long startTime = System.currentTimeMillis();
|
long startTime = System.currentTimeMillis();
|
||||||
Document document = documentDao.getById(file.getDocumentId());
|
Document document = documentDao.getById(file.getDocumentId());
|
||||||
String content = FileUtil.ocrFile(document, file);
|
file.setContent(FileUtil.extractContent(document, file));
|
||||||
file.setContent(content);
|
|
||||||
TransactionUtil.commit();
|
TransactionUtil.commit();
|
||||||
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
|
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
@ -39,7 +39,7 @@ public class FileCreatedAsyncListener {
|
|||||||
// OCR the file
|
// OCR the file
|
||||||
final File file = fileCreatedAsyncEvent.getFile();
|
final File file = fileCreatedAsyncEvent.getFile();
|
||||||
long startTime = System.currentTimeMillis();
|
long startTime = System.currentTimeMillis();
|
||||||
final String content = FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), file);
|
final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file);
|
||||||
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
|
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
|
||||||
|
|
||||||
// Store the OCR-ization result in the database
|
// Store the OCR-ization result in the database
|
||||||
|
@ -16,7 +16,7 @@ import com.sismics.docs.core.listener.async.DocumentDeletedAsyncListener;
|
|||||||
import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener;
|
import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener;
|
||||||
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
|
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
|
||||||
import com.sismics.docs.core.listener.async.FileDeletedAsyncListener;
|
import com.sismics.docs.core.listener.async.FileDeletedAsyncListener;
|
||||||
import com.sismics.docs.core.listener.async.OcrFileAsyncListener;
|
import com.sismics.docs.core.listener.async.ExtractFileAsyncListener;
|
||||||
import com.sismics.docs.core.listener.async.RebuildIndexAsyncListener;
|
import com.sismics.docs.core.listener.async.RebuildIndexAsyncListener;
|
||||||
import com.sismics.docs.core.listener.sync.DeadEventListener;
|
import com.sismics.docs.core.listener.sync.DeadEventListener;
|
||||||
import com.sismics.docs.core.model.jpa.Config;
|
import com.sismics.docs.core.model.jpa.Config;
|
||||||
@ -82,7 +82,7 @@ public class AppContext {
|
|||||||
asyncEventBus.register(new DocumentUpdatedAsyncListener());
|
asyncEventBus.register(new DocumentUpdatedAsyncListener());
|
||||||
asyncEventBus.register(new DocumentDeletedAsyncListener());
|
asyncEventBus.register(new DocumentDeletedAsyncListener());
|
||||||
asyncEventBus.register(new RebuildIndexAsyncListener());
|
asyncEventBus.register(new RebuildIndexAsyncListener());
|
||||||
asyncEventBus.register(new OcrFileAsyncListener());
|
asyncEventBus.register(new ExtractFileAsyncListener());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -6,11 +6,15 @@ import java.io.InputStream;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import javax.imageio.ImageIO;
|
import javax.imageio.ImageIO;
|
||||||
|
|
||||||
import net.sourceforge.tess4j.Tesseract;
|
import net.sourceforge.tess4j.Tesseract;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.util.PDFTextStripper;
|
||||||
import org.imgscalr.Scalr;
|
import org.imgscalr.Scalr;
|
||||||
import org.imgscalr.Scalr.Method;
|
import org.imgscalr.Scalr.Method;
|
||||||
import org.imgscalr.Scalr.Mode;
|
import org.imgscalr.Scalr.Mode;
|
||||||
@ -20,6 +24,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
import com.sismics.docs.core.model.jpa.Document;
|
import com.sismics.docs.core.model.jpa.Document;
|
||||||
import com.sismics.docs.core.model.jpa.File;
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
import com.sismics.util.ImageUtil;
|
import com.sismics.util.ImageUtil;
|
||||||
|
import com.sismics.util.mime.MimeType;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* File entity utilities.
|
* File entity utilities.
|
||||||
@ -33,18 +38,32 @@ public class FileUtil {
|
|||||||
private static final Logger log = LoggerFactory.getLogger(FileUtil.class);
|
private static final Logger log = LoggerFactory.getLogger(FileUtil.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* OCR a file.
|
* Extract content from a file.
|
||||||
|
*
|
||||||
|
* @param document Document linked to the file
|
||||||
|
* @param file File to extract
|
||||||
|
* @return Content extract
|
||||||
|
*/
|
||||||
|
public static String extractContent(Document document, File file) {
|
||||||
|
String content = null;
|
||||||
|
|
||||||
|
if (ImageUtil.isImage(file.getMimeType())) {
|
||||||
|
content = ocrFile(document, file);
|
||||||
|
} else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
|
||||||
|
content = extractPdf(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
return content;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Optical character recognition on a file.
|
||||||
*
|
*
|
||||||
* @param document Document linked to the file
|
* @param document Document linked to the file
|
||||||
* @param file File to OCR
|
* @param file File to OCR
|
||||||
* @return OCR-ized content
|
* @return Content extracted
|
||||||
*/
|
*/
|
||||||
public static String ocrFile(Document document, final File file) {
|
private static String ocrFile(Document document, File file) {
|
||||||
if (!ImageUtil.isImage(file.getMimeType())) {
|
|
||||||
// The file is not OCR-izable
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
Tesseract instance = Tesseract.getInstance();
|
Tesseract instance = Tesseract.getInstance();
|
||||||
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
|
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
|
||||||
String content = null;
|
String content = null;
|
||||||
@ -72,6 +91,35 @@ public class FileUtil {
|
|||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract text from a PDF.
|
||||||
|
*
|
||||||
|
* @param file File to extract
|
||||||
|
* @return Content extracted
|
||||||
|
*/
|
||||||
|
private static String extractPdf(File file) {
|
||||||
|
String content = null;
|
||||||
|
PDDocument pdfDocument = null;
|
||||||
|
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
|
||||||
|
try {
|
||||||
|
PDFTextStripper stripper = new PDFTextStripper();
|
||||||
|
pdfDocument = PDDocument.load(storedfile);
|
||||||
|
content = stripper.getText(pdfDocument);
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.error("Error while extracting text from the PDF " + storedfile, e);
|
||||||
|
} finally {
|
||||||
|
if (pdfDocument != null) {
|
||||||
|
try {
|
||||||
|
pdfDocument.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
// NOP
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return content;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Save a file on the storage filesystem.
|
* Save a file on the storage filesystem.
|
||||||
*
|
*
|
||||||
@ -84,7 +132,12 @@ public class FileUtil {
|
|||||||
Files.copy(is, path);
|
Files.copy(is, path);
|
||||||
|
|
||||||
// Generate file variations
|
// Generate file variations
|
||||||
saveVariations(file, path.toFile());
|
try {
|
||||||
|
saveVariations(file, path.toFile());
|
||||||
|
} catch (IOException e) {
|
||||||
|
// Don't rethrow Exception from file variations generation
|
||||||
|
log.error("Error creating file variations", e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -95,8 +148,22 @@ public class FileUtil {
|
|||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public static void saveVariations(File file, java.io.File originalFile) throws IOException {
|
public static void saveVariations(File file, java.io.File originalFile) throws IOException {
|
||||||
|
BufferedImage image = null;
|
||||||
if (ImageUtil.isImage(file.getMimeType())) {
|
if (ImageUtil.isImage(file.getMimeType())) {
|
||||||
BufferedImage image = ImageIO.read(originalFile);
|
image = ImageIO.read(originalFile);
|
||||||
|
} else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
|
||||||
|
// Generate preview from the first page of the PDF
|
||||||
|
PDDocument pdfDocument = PDDocument.load(originalFile);
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
List<PDPage> pageList = pdfDocument.getDocumentCatalog().getAllPages();
|
||||||
|
if (pageList.size() > 0) {
|
||||||
|
PDPage page = pageList.get(0);
|
||||||
|
image = page.convertToImage();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (image != null) {
|
||||||
|
// Generate thumbnails from image
|
||||||
BufferedImage web = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 1280, Scalr.OP_ANTIALIAS);
|
BufferedImage web = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 1280, Scalr.OP_ANTIALIAS);
|
||||||
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 256, Scalr.OP_ANTIALIAS);
|
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 256, Scalr.OP_ANTIALIAS);
|
||||||
image.flush();
|
image.flush();
|
||||||
|
@ -1,2 +0,0 @@
|
|||||||
- Extract text from PDF for indexing, see PDFBox (server)
|
|
||||||
- Make thumbnail of the first page of PDF, see PDFBox (server)
|
|
@ -15,7 +15,7 @@
|
|||||||
<maven.compiler.target>1.7</maven.compiler.target>
|
<maven.compiler.target>1.7</maven.compiler.target>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
|
||||||
<!-- Dependencies version (external) -->
|
<!-- Dependencies version (external) -->
|
||||||
<org.apache.commons.commons-compress.version>1.5</org.apache.commons.commons-compress.version>
|
<org.apache.commons.commons-compress.version>1.5</org.apache.commons.commons-compress.version>
|
||||||
<commons-lang.commons-lang.version>2.6</commons-lang.commons-lang.version>
|
<commons-lang.commons-lang.version>2.6</commons-lang.commons-lang.version>
|
||||||
<commons-io.commons-io.version>2.1</commons-io.commons-io.version>
|
<commons-io.commons-io.version>2.1</commons-io.commons-io.version>
|
||||||
@ -62,6 +62,7 @@
|
|||||||
<org.mortbay.jetty.jetty-maven-plugin.version>8.1.2.v20120308</org.mortbay.jetty.jetty-maven-plugin.version>
|
<org.mortbay.jetty.jetty-maven-plugin.version>8.1.2.v20120308</org.mortbay.jetty.jetty-maven-plugin.version>
|
||||||
<org.vafer.jdeb.version>1.0.1</org.vafer.jdeb.version>
|
<org.vafer.jdeb.version>1.0.1</org.vafer.jdeb.version>
|
||||||
<com.samaxes.maven.minify-maven-plugin.version>1.7</com.samaxes.maven.minify-maven-plugin.version>
|
<com.samaxes.maven.minify-maven-plugin.version>1.7</com.samaxes.maven.minify-maven-plugin.version>
|
||||||
|
<org.apache.pdfbox.pdfbox.version>1.8.2</org.apache.pdfbox.pdfbox.version>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<scm>
|
<scm>
|
||||||
@ -436,24 +437,30 @@
|
|||||||
<version>${org.imgscalr.imgscalr-lib.version}</version>
|
<version>${org.imgscalr.imgscalr-lib.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.pdfbox</groupId>
|
||||||
|
<artifactId>pdfbox</artifactId>
|
||||||
|
<version>${org.apache.pdfbox.pdfbox.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- OCR dependencies -->
|
<!-- OCR dependencies -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>jna</groupId>
|
<groupId>jna</groupId>
|
||||||
<artifactId>jna</artifactId>
|
<artifactId>jna</artifactId>
|
||||||
<version>1.0</version>
|
<version>1.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>jai</groupId>
|
<groupId>jai</groupId>
|
||||||
<artifactId>imageio</artifactId>
|
<artifactId>imageio</artifactId>
|
||||||
<version>1.0</version>
|
<version>1.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>tess4j</groupId>
|
<groupId>tess4j</groupId>
|
||||||
<artifactId>tess4j</artifactId>
|
<artifactId>tess4j</artifactId>
|
||||||
<version>1.0</version>
|
<version>1.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</dependencyManagement>
|
</dependencyManagement>
|
||||||
|
|
||||||
@ -478,64 +485,64 @@
|
|||||||
<build>
|
<build>
|
||||||
<plugins>
|
<plugins>
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
<artifactId>maven-install-plugin</artifactId>
|
<artifactId>maven-install-plugin</artifactId>
|
||||||
<version>2.3.1</version>
|
<version>2.3.1</version>
|
||||||
<executions>
|
<executions>
|
||||||
|
|
||||||
<execution>
|
<execution>
|
||||||
<id>install-jna</id>
|
<id>install-jna</id>
|
||||||
<phase>validate</phase>
|
<phase>validate</phase>
|
||||||
<configuration>
|
<configuration>
|
||||||
<file>${project.basedir}/lib/jna.jar</file>
|
<file>${project.basedir}/lib/jna.jar</file>
|
||||||
<repositoryLayout>default</repositoryLayout>
|
<repositoryLayout>default</repositoryLayout>
|
||||||
<groupId>jna</groupId>
|
<groupId>jna</groupId>
|
||||||
<artifactId>jna</artifactId>
|
<artifactId>jna</artifactId>
|
||||||
<version>1.0</version>
|
<version>1.0</version>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
<generatePom>true</generatePom>
|
<generatePom>true</generatePom>
|
||||||
</configuration>
|
</configuration>
|
||||||
<goals>
|
<goals>
|
||||||
<goal>install-file</goal>
|
<goal>install-file</goal>
|
||||||
</goals>
|
</goals>
|
||||||
</execution>
|
</execution>
|
||||||
|
|
||||||
<execution>
|
<execution>
|
||||||
<id>install-jai-imageio</id>
|
<id>install-jai-imageio</id>
|
||||||
<phase>validate</phase>
|
<phase>validate</phase>
|
||||||
<configuration>
|
<configuration>
|
||||||
<file>${project.basedir}/lib/jai_imageio.jar</file>
|
<file>${project.basedir}/lib/jai_imageio.jar</file>
|
||||||
<repositoryLayout>default</repositoryLayout>
|
<repositoryLayout>default</repositoryLayout>
|
||||||
<groupId>jai</groupId>
|
<groupId>jai</groupId>
|
||||||
<artifactId>imageio</artifactId>
|
<artifactId>imageio</artifactId>
|
||||||
<version>1.0</version>
|
<version>1.0</version>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
<generatePom>true</generatePom>
|
<generatePom>true</generatePom>
|
||||||
</configuration>
|
</configuration>
|
||||||
<goals>
|
<goals>
|
||||||
<goal>install-file</goal>
|
<goal>install-file</goal>
|
||||||
</goals>
|
</goals>
|
||||||
</execution>
|
</execution>
|
||||||
|
|
||||||
<execution>
|
<execution>
|
||||||
<id>install-tess4j</id>
|
<id>install-tess4j</id>
|
||||||
<phase>validate</phase>
|
<phase>validate</phase>
|
||||||
<configuration>
|
<configuration>
|
||||||
<file>${project.basedir}/lib/tess4j.jar</file>
|
<file>${project.basedir}/lib/tess4j.jar</file>
|
||||||
<repositoryLayout>default</repositoryLayout>
|
<repositoryLayout>default</repositoryLayout>
|
||||||
<groupId>tess4j</groupId>
|
<groupId>tess4j</groupId>
|
||||||
<artifactId>tess4j</artifactId>
|
<artifactId>tess4j</artifactId>
|
||||||
<version>1.0</version>
|
<version>1.0</version>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
<generatePom>true</generatePom>
|
<generatePom>true</generatePom>
|
||||||
</configuration>
|
</configuration>
|
||||||
<goals>
|
<goals>
|
||||||
<goal>install-file</goal>
|
<goal>install-file</goal>
|
||||||
</goals>
|
</goals>
|
||||||
</execution>
|
</execution>
|
||||||
|
|
||||||
</executions>
|
</executions>
|
||||||
</plugin>
|
</plugin>
|
||||||
</plugins>
|
</plugins>
|
||||||
</build>
|
</build>
|
||||||
</profile>
|
</profile>
|
||||||
|
@ -26,7 +26,7 @@ import com.sismics.docs.core.dao.jpa.DocumentDao;
|
|||||||
import com.sismics.docs.core.dao.jpa.FileDao;
|
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||||
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
|
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
|
||||||
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
||||||
import com.sismics.docs.core.event.OcrFileAsyncEvent;
|
import com.sismics.docs.core.event.ExtractFileAsyncEvent;
|
||||||
import com.sismics.docs.core.model.context.AppContext;
|
import com.sismics.docs.core.model.context.AppContext;
|
||||||
import com.sismics.docs.core.model.jpa.File;
|
import com.sismics.docs.core.model.jpa.File;
|
||||||
import com.sismics.docs.core.util.ConfigUtil;
|
import com.sismics.docs.core.util.ConfigUtil;
|
||||||
@ -163,7 +163,7 @@ public class AppResource extends BaseResource {
|
|||||||
checkBaseFunction(BaseFunction.ADMIN);
|
checkBaseFunction(BaseFunction.ADMIN);
|
||||||
|
|
||||||
// Raise a OCR file event
|
// Raise a OCR file event
|
||||||
AppContext.getInstance().getAsyncEventBus().post(new OcrFileAsyncEvent());
|
AppContext.getInstance().getAsyncEventBus().post(new ExtractFileAsyncEvent());
|
||||||
|
|
||||||
JSONObject response = new JSONObject();
|
JSONObject response = new JSONObject();
|
||||||
response.put("status", "ok");
|
response.put("status", "ok");
|
||||||
|
@ -15,6 +15,7 @@ import org.codehaus.jettison.json.JSONObject;
|
|||||||
import org.joda.time.format.DateTimeFormat;
|
import org.joda.time.format.DateTimeFormat;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import com.google.common.io.ByteStreams;
|
||||||
import com.sismics.docs.rest.filter.CookieAuthenticationFilter;
|
import com.sismics.docs.rest.filter.CookieAuthenticationFilter;
|
||||||
import com.sun.jersey.api.client.ClientResponse;
|
import com.sun.jersey.api.client.ClientResponse;
|
||||||
import com.sun.jersey.api.client.ClientResponse.Status;
|
import com.sun.jersey.api.client.ClientResponse.Status;
|
||||||
@ -354,4 +355,67 @@ public class TestDocumentResource extends BaseJerseyTest {
|
|||||||
json = response.getEntity(JSONObject.class);
|
json = response.getEntity(JSONObject.class);
|
||||||
Assert.assertEquals(Status.BAD_REQUEST, Status.fromStatusCode(response.getStatus()));
|
Assert.assertEquals(Status.BAD_REQUEST, Status.fromStatusCode(response.getStatus()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test PDF extraction.
|
||||||
|
*
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testPdfExtraction() throws Exception {
|
||||||
|
// Login document2
|
||||||
|
clientUtil.createUser("document2");
|
||||||
|
String document2Token = clientUtil.login("document2");
|
||||||
|
|
||||||
|
// Create a document
|
||||||
|
WebResource documentResource = resource().path("/document");
|
||||||
|
documentResource.addFilter(new CookieAuthenticationFilter(document2Token));
|
||||||
|
MultivaluedMapImpl postParams = new MultivaluedMapImpl();
|
||||||
|
postParams.add("title", "My super title document 1");
|
||||||
|
postParams.add("description", "My super description for document 1");
|
||||||
|
postParams.add("language", "eng");
|
||||||
|
long create1Date = new Date().getTime();
|
||||||
|
postParams.add("create_date", create1Date);
|
||||||
|
ClientResponse response = documentResource.put(ClientResponse.class, postParams);
|
||||||
|
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||||
|
JSONObject json = response.getEntity(JSONObject.class);
|
||||||
|
String document1Id = json.optString("id");
|
||||||
|
Assert.assertNotNull(document1Id);
|
||||||
|
|
||||||
|
// Add a PDF file
|
||||||
|
WebResource fileResource = resource().path("/file");
|
||||||
|
fileResource.addFilter(new CookieAuthenticationFilter(document2Token));
|
||||||
|
FormDataMultiPart form = new FormDataMultiPart();
|
||||||
|
InputStream file = this.getClass().getResourceAsStream("/file/wikipedia.pdf");
|
||||||
|
FormDataBodyPart fdp = new FormDataBodyPart("file",
|
||||||
|
new BufferedInputStream(file),
|
||||||
|
MediaType.APPLICATION_OCTET_STREAM_TYPE);
|
||||||
|
form.bodyPart(fdp);
|
||||||
|
form.field("id", document1Id);
|
||||||
|
response = fileResource.type(MediaType.MULTIPART_FORM_DATA).put(ClientResponse.class, form);
|
||||||
|
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||||
|
json = response.getEntity(JSONObject.class);
|
||||||
|
String file1Id = json.getString("id");
|
||||||
|
|
||||||
|
// Search documents by query in full content
|
||||||
|
documentResource = resource().path("/document/list");
|
||||||
|
documentResource.addFilter(new CookieAuthenticationFilter(document2Token));
|
||||||
|
MultivaluedMapImpl getParams = new MultivaluedMapImpl();
|
||||||
|
getParams.putSingle("search", "full:vrandecic");
|
||||||
|
response = documentResource.queryParams(getParams).get(ClientResponse.class);
|
||||||
|
json = response.getEntity(JSONObject.class);
|
||||||
|
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||||
|
Assert.assertTrue(json.getJSONArray("documents").length() == 1);
|
||||||
|
|
||||||
|
// Get the file thumbnail data
|
||||||
|
fileResource = resource().path("/file/" + file1Id + "/data");
|
||||||
|
fileResource.addFilter(new CookieAuthenticationFilter(document2Token));
|
||||||
|
getParams = new MultivaluedMapImpl();
|
||||||
|
getParams.putSingle("size", "thumb");
|
||||||
|
response = fileResource.queryParams(getParams).get(ClientResponse.class);
|
||||||
|
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||||
|
InputStream is = response.getEntityInputStream();
|
||||||
|
byte[] fileBytes = ByteStreams.toByteArray(is);
|
||||||
|
Assert.assertEquals(3457, fileBytes.length);
|
||||||
|
}
|
||||||
}
|
}
|
BIN
docs-web/src/test/resources/file/wikipedia.pdf
Normal file
BIN
docs-web/src/test/resources/file/wikipedia.pdf
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user