Closes #140: video file support

This commit is contained in:
Benjamin Gamard 2018-03-02 19:05:20 +01:00
parent 18c9833104
commit 4a676b01e1
18 changed files with 312 additions and 24 deletions

View File

@ -3,7 +3,7 @@ dist: trusty
language: java
before_install:
- sudo apt-get -qq update
- sudo apt-get -y -q install tesseract-ocr tesseract-ocr-fra tesseract-ocr-ita tesseract-ocr-kor tesseract-ocr-rus tesseract-ocr-ukr tesseract-ocr-spa tesseract-ocr-ara tesseract-ocr-hin tesseract-ocr-deu tesseract-ocr-pol tesseract-ocr-jpn tesseract-ocr-por tesseract-ocr-tha tesseract-ocr-jpn tesseract-ocr-chi-sim tesseract-ocr-chi-tra
- sudo apt-get -y -q install ffmpeg mediainfo tesseract-ocr tesseract-ocr-fra tesseract-ocr-ita tesseract-ocr-kor tesseract-ocr-rus tesseract-ocr-ukr tesseract-ocr-spa tesseract-ocr-ara tesseract-ocr-hin tesseract-ocr-deu tesseract-ocr-pol tesseract-ocr-jpn tesseract-ocr-por tesseract-ocr-tha tesseract-ocr-jpn tesseract-ocr-chi-sim tesseract-ocr-chi-tra
- sudo apt-get -y -q install haveged && sudo service haveged start
after_success:
- mvn -Pprod -DskipTests clean install

View File

@ -1,7 +1,7 @@
FROM sismics/jetty:9.2.20-jdk7
MAINTAINER b.gamard@sismics.com
RUN apt-get update && apt-get -y -q install tesseract-ocr tesseract-ocr-fra tesseract-ocr-ita tesseract-ocr-kor tesseract-ocr-rus tesseract-ocr-ukr tesseract-ocr-spa tesseract-ocr-ara tesseract-ocr-hin tesseract-ocr-deu tesseract-ocr-pol tesseract-ocr-jpn tesseract-ocr-por tesseract-ocr-tha tesseract-ocr-jpn tesseract-ocr-chi-sim tesseract-ocr-chi-tra && \
RUN apt-get update && apt-get -y -q install ffmpeg mediainfo tesseract-ocr tesseract-ocr-fra tesseract-ocr-ita tesseract-ocr-kor tesseract-ocr-rus tesseract-ocr-ukr tesseract-ocr-spa tesseract-ocr-ara tesseract-ocr-hin tesseract-ocr-deu tesseract-ocr-pol tesseract-ocr-jpn tesseract-ocr-por tesseract-ocr-tha tesseract-ocr-jpn tesseract-ocr-chi-sim tesseract-ocr-chi-tra && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV TESSDATA_PREFIX /usr/share/tesseract-ocr

View File

@ -30,6 +30,7 @@ Features
- Responsive user interface
- Optical character recognition
- Support image, PDF, ODT and DOCX files
- Video file support ![New!](https://www.sismics.com/public/img/new.png)
- Flexible search engine
- Full text search in all supported files
- All [Dublin Core](http://dublincore.org/) metadata
@ -47,7 +48,7 @@ Features
- Document sharing by URL
- RESTful Web API
- Fully featured Android client
- [Mass files importer](https://github.com/sismics/docs/tree/master/docs-importer) (single or scan mode) ![New!](https://www.sismics.com/public/img/new.png)
- [Bulk files importer](https://github.com/sismics/docs/tree/master/docs-importer) (single or scan mode) ![New!](https://www.sismics.com/public/img/new.png)
- Tested to 100k documents
Download

View File

@ -30,15 +30,13 @@ public class FileCreatedAsyncListener {
* @throws Exception e
*/
@Subscribe
public void on(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
public void on(final FileCreatedAsyncEvent fileCreatedAsyncEvent) {
if (log.isInfoEnabled()) {
log.info("File created event: " + fileCreatedAsyncEvent.toString());
}
// Guess the mime type a second time, for open document format (first detected as simple ZIP file)
final File file = fileCreatedAsyncEvent.getFile();
// Extract text content from the file
final File file = fileCreatedAsyncEvent.getFile();
long startTime = System.currentTimeMillis();
final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getLanguage(), file,
fileCreatedAsyncEvent.getUnencryptedFile(), fileCreatedAsyncEvent.getUnencryptedPdfFile());

View File

@ -12,6 +12,7 @@ import com.sismics.tess4j.Tesseract;
import com.sismics.util.ImageDeskew;
import com.sismics.util.ImageUtil;
import com.sismics.util.Scalr;
import com.sismics.util.VideoUtil;
import com.sismics.util.context.ThreadLocalContext;
import com.sismics.util.mime.MimeTypeUtil;
import org.apache.commons.lang.StringUtils;
@ -54,6 +55,8 @@ public class FileUtil {
if (ImageUtil.isImage(file.getMimeType())) {
content = ocrFile(unencryptedFile, language);
} else if (VideoUtil.isVideo(file.getMimeType())) {
content = VideoUtil.getMetadata(unencryptedFile);
} else if (unencryptedPdfFile != null) {
content = PdfUtil.extractPdf(unencryptedPdfFile);
}
@ -114,8 +117,12 @@ public class FileUtil {
Files.copy(new CipherInputStream(inputStream, cipher), path);
}
// Generate file variations
saveVariations(file, unencryptedFile, unencryptedPdfFile, cipher);
// Generate file variations (errors non-blocking)
try {
saveVariations(file, unencryptedFile, unencryptedPdfFile, cipher);
} catch (Exception e) {
log.error("Unable to generate thumbnails", e);
}
}
/**
@ -132,6 +139,8 @@ public class FileUtil {
try (InputStream inputStream = Files.newInputStream(unencryptedFile)) {
image = ImageIO.read(inputStream);
}
} else if (VideoUtil.isVideo(file.getMimeType())) {
image = VideoUtil.getThumbnail(unencryptedFile);
} else if (unencryptedPdfFile != null) {
// Generate preview from the first page of the PDF
image = PdfUtil.renderFirstPage(unencryptedPdfFile);

View File

@ -22,13 +22,12 @@ import java.util.Iterator;
* @author jtremeaux
*/
public class ImageUtil {
/**
* Write a high quality JPEG.
*
* @param image
* @param image Image
* @param outputStream Output stream
* @throws IOException
* @throws IOException e
*/
public static void writeJpeg(BufferedImage image, OutputStream outputStream) throws IOException {
Iterator<ImageWriter> iter = ImageIO.getImageWritersByFormatName("jpeg");
@ -94,6 +93,14 @@ public class ImageUtil {
.toString();
}
/**
* Return true if a pixel is black.
*
* @param image Image
* @param x X
* @param y Y
* @return True if black
*/
public static boolean isBlack(BufferedImage image, int x, int y) {
if (image.getType() == BufferedImage.TYPE_BYTE_BINARY) {
WritableRaster raster = image.getRaster();
@ -105,7 +112,16 @@ public class ImageUtil {
return isBlack(image, x, y, luminanceValue);
}
public static boolean isBlack(BufferedImage image, int x, int y, int luminanceCutOff) {
/**
* Return true if a pixel is black.
*
* @param image Image
* @param x X
* @param y Y
* @param luminanceCutOff Luminance cutoff
* @return True if black
*/
private static boolean isBlack(BufferedImage image, int x, int y, int luminanceCutOff) {
int pixelRGBValue;
int r;
int g;
@ -124,7 +140,7 @@ public class ImageUtil {
b = (pixelRGBValue) & 0xff;
luminance = (r * 0.299) + (g * 0.587) + (b * 0.114);
} catch (Exception e) {
// ignore.
// NOP
}
return luminance < luminanceCutOff;

View File

@ -0,0 +1,84 @@
package com.sismics.util;
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.ByteStreams;
import com.sismics.util.io.InputStreamReaderThread;
import com.sismics.util.mime.MimeType;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;
/**
* Video processing utilities.
*
* @author bgamard
*/
public class VideoUtil {
/**
* Returns true if this MIME type is a video.
* @param mimeType MIME type
* @return True if video
*/
public static boolean isVideo(String mimeType) {
return mimeType.equals(MimeType.VIDEO_MP4) || mimeType.equals(MimeType.VIDEO_WEBM);
}
/**
* Generate a thumbnail from a video file.
*
* @param file Video file
* @return Thumbnail
*/
public static BufferedImage getThumbnail(Path file) throws Exception {
List<String> result = Lists.newLinkedList(Arrays.asList("ffmpeg", "-i"));
result.add(file.toAbsolutePath().toString());
result.addAll(Arrays.asList("-vf", "\"thumbnail\"", "-frames:v", "1", "-f", "mjpeg", "-"));
ProcessBuilder pb = new ProcessBuilder(result);
Process process = pb.start();
// Consume the process error stream
final String commandName = pb.command().get(0);
new InputStreamReaderThread(process.getErrorStream(), commandName).start();
// Consume the data as an image
try (InputStream is = process.getInputStream()) {
return ImageIO.read(is);
}
}
/**
* Extract metadata from a video file.
*
* @param file Video file
* @return Metadata
*/
public static String getMetadata(Path file) {
List<String> result = Lists.newLinkedList();
result.add("mediainfo");
result.add(file.toAbsolutePath().toString());
ProcessBuilder pb = new ProcessBuilder(result);
Process process;
try {
process = pb.start();
} catch (IOException e) {
return null;
}
// Consume the process error stream
final String commandName = pb.command().get(0);
new InputStreamReaderThread(process.getErrorStream(), commandName).start();
// Consume the data as a string
try (InputStream is = process.getInputStream()) {
return new String(ByteStreams.toByteArray(is), Charsets.UTF_8);
} catch (Exception e) {
return null;
}
}
}

View File

@ -0,0 +1,49 @@
package com.sismics.util.io;
import com.google.common.io.Closer;
import org.apache.log4j.Logger;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
 * Thread that consumes data from an input stream and logs it.
 *
 * Each line read from the stream is logged at INFO level, prefixed with the given name.
 * The stream is closed when the end of the stream is reached or when reading fails.
 *
 * @author jtremeaux
 */
public class InputStreamReaderThread extends Thread {
    private static final Logger logger = Logger.getLogger(InputStreamReaderThread.class);

    private final InputStream is;

    private final String name;

    private final Closer closer = Closer.create();

    /**
     * Create a new consumer thread (not started).
     *
     * @param input Input stream to consume, closed by this thread when done
     * @param name Name used as a prefix for the thread name and for each logged line
     */
    public InputStreamReaderThread(InputStream input, String name) {
        super(name + " InputStreamReader thread");
        this.is = closer.register(input);
        this.name = name;
    }

    @Override
    public void run() {
        try {
            // Lines are decoded with the platform default charset
            BufferedReader reader = closer.register(new BufferedReader(new InputStreamReader(is)));
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                // Plain concatenation: the name must never be interpreted as a format string,
                // otherwise a '%' in it would throw and stop the stream from being consumed
                logger.info(name + ": " + line);
            }
        } catch (IOException x) {
            // Reading can fail simply because the stream was closed underneath us; not an error
            logger.debug(name + ": stream reading interrupted", x);
        } finally {
            try {
                closer.close();
            } catch (Exception e) {
                // NOP: nothing useful can be done if closing fails
            }
        }
    }
}

View File

@ -24,5 +24,9 @@ public class MimeType {
public static final String TEXT_CSV = "text/csv";
public static final String VIDEO_WEBM = "video/webm";
public static final String VIDEO_MP4 = "video/mp4";
public static final String DEFAULT = "application/octet-stream";
}

View File

@ -1,5 +1,9 @@
package com.sismics.util.mime;
import com.google.common.base.Charsets;
import com.sismics.docs.core.model.jpa.File;
import org.apache.commons.compress.utils.IOUtils;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
@ -8,13 +12,6 @@ import java.nio.file.Path;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.utils.IOUtils;
import com.google.common.base.Charsets;
import com.sismics.docs.core.model.jpa.File;
/**
* Utility to check MIME types.
*
@ -60,6 +57,11 @@ public class MimeTypeUtil {
return MimeType.IMAGE_PNG;
} else if (headerBytes[0] == ((byte) 0x25) && headerBytes[1] == ((byte) 0x50) && headerBytes[2] == ((byte) 0x44) && headerBytes[3] == ((byte) 0x46)) {
return MimeType.APPLICATION_PDF;
} else if (headerBytes[0] == ((byte) 0x00) && headerBytes[1] == ((byte) 0x00) && headerBytes[2] == ((byte) 0x00) && (headerBytes[3] == ((byte) 0x14) || headerBytes[3] == ((byte) 0x18))
&& headerBytes[4] == ((byte) 0x66) && headerBytes[5] == ((byte) 0x74) && headerBytes[6] == ((byte) 0x79) && headerBytes[7] == ((byte) 0x70)) {
return MimeType.VIDEO_MP4;
} else if (headerBytes[0] == ((byte) 0x1a) && headerBytes[1] == ((byte) 0x45) && headerBytes[2] == ((byte) 0xdf) && headerBytes[3] == ((byte) 0xa3)) {
return MimeType.VIDEO_WEBM;
}
// Detect by file extension
@ -100,6 +102,10 @@ public class MimeTypeUtil {
return "txt";
case MimeType.TEXT_CSV:
return "csv";
case MimeType.VIDEO_MP4:
return "mp4";
case MimeType.VIDEO_WEBM:
return "webm";
default:
return "bin";
}

View File

@ -0,0 +1,10 @@
email.template.password_recovery.subject=R\u00E9initialiser votre mot de passe
email.template.password_recovery.hello=Bonjour {0}.
email.template.password_recovery.instruction1=Nous avons re\u00E7u une demande de r\u00E9initialisation de mot de passe.<br/>Si vous n'avez rien demand\u00E9, vous pouvez ignorer ce mail.
email.template.password_recovery.instruction2=Pour r\u00E9initialiser votre mot de passe, cliquez sur le lien ci-dessous :
email.template.password_recovery.click_here=Cliquez ici pour r\u00E9initialiser votre mot de passe.
email.template.route_step_validate.subject=Un document n\u00E9cessite votre attention
email.template.route_step_validate.hello=Bonjour {0}.
email.template.route_step_validate.instruction1=Une \u00E9tape de workflow vous a \u00E9t\u00E9 attribu\u00E9e et n\u00E9cessite votre attention.
email.template.route_step_validate.instruction2=Pour voir le document et valider le workflow, veuillez visiter le lien ci-dessous :
email.no_html.error=Votre client mail ne supporte pas les messages HTML

View File

@ -0,0 +1,10 @@
email.template.password_recovery.subject=\u8BF7\u91CD\u7F6E\u60A8\u7684\u5BC6\u7801
email.template.password_recovery.hello=\u60A8\u597D {0}.
email.template.password_recovery.instruction1=\u6211\u4EEC\u6536\u5230\u4E86\u4E00\u4E2A\u91CD\u7F6E\u60A8\u7684\u5BC6\u7801\u7684\u8BF7\u6C42\u3002<br/>\u5982\u679C\u60A8\u6CA1\u6709\u53D1\u9001\u8BE5\u8BF7\u6C42\uFF0C\u8BF7\u5FFD\u7565\u6B64\u7535\u5B50\u90AE\u4EF6
email.template.password_recovery.instruction2=\u8981\u91CD\u7F6E\u60A8\u7684\u5BC6\u7801\uFF0C\u8BF7\u8BBF\u95EE\u4EE5\u4E0B\u94FE\u63A5\uFF1A
email.template.password_recovery.click_here=\u8BF7\u70B9\u51FB\u6B64\u5904\u91CD\u7F6E\u60A8\u7684\u5BC6\u7801
email.template.route_step_validate.subject=\u4E00\u4EFD\u6587\u4EF6\u9700\u8981\u4F60\u7684\u5173\u6CE8
email.template.route_step_validate.hello={0}\uFF0C\u60A8\u597D.
email.template.route_step_validate.instruction1=\u5DE5\u4F5C\u6D41\u6B65\u9AA4\u5DF2\u7ECF\u5206\u914D\u7ED9\u60A8\uFF0C\u9700\u8981\u60A8\u7684\u5173\u6CE8\u3002
email.template.route_step_validate.instruction2=\u8981\u67E5\u770B\u6587\u6863\u5E76\u9A8C\u8BC1\u5DE5\u4F5C\u6D41\u7A0B\uFF0C\u8BF7\u8BBF\u95EE\u4EE5\u4E0B\u94FE\u63A5\uFF1A
email.no_html.error=\u60A8\u7684\u7535\u5B50\u90AE\u4EF6\u5BA2\u6237\u7AEF\u4E0D\u652F\u6301HTML\u683C\u5F0F\u90AE\u4EF6

View File

@ -0,0 +1,10 @@
email.template.password_recovery.subject=\u8ACB\u91CD\u65B0\u8A2D\u7F6E\u60A8\u7684\u5BC6\u78BC
email.template.password_recovery.hello=\u60A8\u597D{0}\uFF01
email.template.password_recovery.instruction1=\u6211\u5011\u6536\u5230\u4E86\u91CD\u7F6E\u5BC6\u78BC\u7684\u8ACB\u6C42\u3002<br/>\u5982\u679C\u60A8\u6C92\u6709\u8ACB\u6C42\u5E6B\u52A9\uFF0C\u8ACB\u5FFD\u7565\u6B64\u96FB\u5B50\u90F5\u4EF6\u3002
email.template.password_recovery.instruction2=\u8981\u91CD\u7F6E\u60A8\u7684\u5BC6\u78BC\uFF0C\u8ACB\u8A2A\u554F\u4EE5\u4E0B\u93C8\u63A5\uFF1A
email.template.password_recovery.click_here=\u9EDE\u64CA\u9019\u88E1\u91CD\u7F6E\u60A8\u7684\u5BC6\u78BC
email.template.route_step_validate.subject=\u4E00\u4EFD\u6587\u4EF6\u9700\u8981\u4F60\u7684\u95DC\u6CE8
email.template.route_step_validate.hello={0}\uFF0C\u60A8\u597D.
email.template.route_step_validate.instruction1=\u5DE5\u4F5C\u6D41\u6B65\u9A5F\u5DF2\u7D93\u5206\u914D\u7D66\u60A8\uFF0C\u9700\u8981\u60A8\u7684\u95DC\u6CE8\u3002
email.template.route_step_validate.instruction2=\u8981\u67E5\u770B\u6587\u6A94\u4E26\u9A57\u8B49\u5DE5\u4F5C\u6D41\u7A0B\uFF0C\u8ACB\u8A2A\u554F\u4EE5\u4E0B\u93C8\u63A5\uFF1A
email.no_html.error=\u60A8\u7684\u96FB\u5B50\u90F5\u4EF6\u5BA2\u6236\u7AEF\u4E0D\u652F\u6301HTML\u683C\u5F0F\u90F5\u4EF6

View File

@ -11,7 +11,7 @@ import org.junit.Test;
public class TestImageUtil {
@Test
public void computeGravatarTest() throws Exception {
public void computeGravatarTest() {
Assert.assertEquals("0bc83cb571cd1c50ba6f3e8a78ef1346", ImageUtil.computeGravatar("MyEmailAddress@example.com "));
}
}

View File

@ -20,12 +20,17 @@
</div>
</div>
<div class="text-center" ng-if="$stateParams.fileId">
<div class="text-center" style="position: relative;" ng-if="$stateParams.fileId">
<img ng-src="../api/file/{{ $stateParams.fileId }}/data?size=web"
ng-init="error = false"
img-error="error = true"
ng-show="!error" />
<a href class="video-overlay" ng-if="file.mimetype.substring(0, 6) == 'video/'"
ng-init="videoPlayer = false" ng-click="videoPlayer = true">
<span class="glyphicon glyphicon-play-circle" ng-if="!videoPlayer"></span>
<video ng-if="videoPlayer" autoplay="autoplay" loop="loop"
controls="controls" ng-src="../api/file/{{ $stateParams.fileId }}/data"></video>
</a>
<p class="well-lg" ng-show="error">
<span class="glyphicon glyphicon-warning-sign"></span>
{{ 'file.view.not_found' | translate }}

View File

@ -392,6 +392,33 @@ input[readonly].share-link {
}
}
// Video player
// Clickable overlay stretched over the file preview; the enclosing element must be
// positioned (e.g. position: relative) for the absolute positioning below to apply to it.
.video-overlay {
display: block;
position: absolute;
// Pin all four edges so the overlay fills its positioned ancestor
top: 0;
bottom: 0;
left: 0;
right: 0;
// Font size and color are inherited by the nested play icon
font-size: 500%;
color: #242424;
.glyphicon {
// White halo keeps the dark icon readable over the preview image
text-shadow: 0 0 20px #fff;
// Vertical centering: move down by half the container, then back up by half the icon height.
// NOTE(review): `top` only has an effect if .glyphicon is positioned by the icon stylesheet - confirm
top: 50%;
transform: translateY(-50%);
}
// Lighten the icon on hover
&:hover {
color: #444;
}
video {
// Use the default cursor over the player itself, and let it span the overlay width
cursor: default;
width: 100%;
}
}
// Vertical alignment
.vertical-center {
min-height: 100vh;

View File

@ -603,6 +603,65 @@ public class TestDocumentResource extends BaseJerseyTest {
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
}
/**
 * Test video extraction.
 *
 * Uploads a WebM video to a new document, then checks that the document can be found
 * through a full text search and that a JPEG thumbnail is served for the file.
 *
 * @throws Exception e
 */
@Test
public void testVideoExtraction() throws Exception {
    // Login document_video
    clientUtil.createUser("document_video");
    String documentPlainToken = clientUtil.login("document_video");

    // Create a document
    long create1Date = new Date().getTime();
    JsonObject json = target().path("/document").request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
            .put(Entity.form(new Form()
                    .param("title", "My super title document 1")
                    .param("description", "My super description for document 1")
                    .param("language", "eng")
                    .param("create_date", Long.toString(create1Date))), JsonObject.class);
    String document1Id = json.getString("id");
    Assert.assertNotNull(document1Id);

    // Add a video file
    String file1Id;
    try (InputStream is = Resources.getResource("file/video.webm").openStream()) {
        StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "video.webm");
        try (FormDataMultiPart multiPart = new FormDataMultiPart()) {
            json = target()
                    .register(MultiPartFeature.class)
                    .path("/file").request()
                    .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
                    .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart),
                            MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class);
            file1Id = json.getString("id");
            Assert.assertNotNull(file1Id);
        }
    }

    // Search documents by query in full content
    json = target().path("/document/list")
            .queryParam("search", "full:vp9")
            .request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
            .get(JsonObject.class);
    // assertEquals reports the actual number of documents on failure
    Assert.assertEquals(1, json.getJsonArray("documents").size());

    // Get the file thumbnail data
    Response response = target().path("/file/" + file1Id + "/data")
            .queryParam("size", "thumb")
            .request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
            .get();
    InputStream is = (InputStream) response.getEntity();
    byte[] fileBytes = ByteStreams.toByteArray(is);
    // Only check that a thumbnail was produced, without pinning an exact size
    Assert.assertTrue(fileBytes.length > 0);
    Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
}
/**
* Test EML import.
*

Binary file not shown.