mirror of https://github.com/sismics/docs.git
278 lines
11 KiB
Java
278 lines
11 KiB
Java
package com.sismics.docs.core.util;
|
|
|
|
import com.google.common.base.Strings;
|
|
import com.sismics.docs.core.constant.Constants;
|
|
import com.sismics.docs.core.dao.jpa.FileDao;
|
|
import com.sismics.docs.core.dao.jpa.UserDao;
|
|
import com.sismics.docs.core.event.DocumentUpdatedAsyncEvent;
|
|
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
|
|
import com.sismics.docs.core.model.jpa.File;
|
|
import com.sismics.docs.core.model.jpa.User;
|
|
import com.sismics.tess4j.Tesseract;
|
|
import com.sismics.util.ImageDeskew;
|
|
import com.sismics.util.ImageUtil;
|
|
import com.sismics.util.Scalr;
|
|
import com.sismics.util.VideoUtil;
|
|
import com.sismics.util.context.ThreadLocalContext;
|
|
import com.sismics.util.mime.MimeTypeUtil;
|
|
import org.apache.commons.lang.StringUtils;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import javax.crypto.Cipher;
|
|
import javax.crypto.CipherInputStream;
|
|
import javax.crypto.CipherOutputStream;
|
|
import javax.imageio.ImageIO;
|
|
import java.awt.image.BufferedImage;
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.io.OutputStream;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
|
|
/**
|
|
* File entity utilities.
|
|
*
|
|
* @author bgamard
|
|
*/
|
|
public class FileUtil {
|
|
/**
|
|
* Logger.
|
|
*/
|
|
private static final Logger log = LoggerFactory.getLogger(FileUtil.class);
|
|
|
|
/**
|
|
* Extract content from a file.
|
|
*
|
|
* @param language Language to extract
|
|
* @param file File to extract
|
|
* @param unencryptedFile Unencrypted file
|
|
* @param unencryptedPdfFile Unencrypted PDF file
|
|
* @return Content extract
|
|
*/
|
|
public static String extractContent(String language, File file, Path unencryptedFile, Path unencryptedPdfFile) {
|
|
String content = null;
|
|
|
|
if (ImageUtil.isImage(file.getMimeType())) {
|
|
content = ocrFile(unencryptedFile, language);
|
|
} else if (VideoUtil.isVideo(file.getMimeType())) {
|
|
content = VideoUtil.getMetadata(unencryptedFile);
|
|
} else if (unencryptedPdfFile != null) {
|
|
content = PdfUtil.extractPdf(unencryptedPdfFile);
|
|
}
|
|
|
|
return content;
|
|
}
|
|
|
|
/**
|
|
* Optical character recognition on a file.
|
|
*
|
|
* @param unecryptedFile Unencrypted file
|
|
* @param language Language to OCR
|
|
* @return Content extracted
|
|
*/
|
|
private static String ocrFile(Path unecryptedFile, String language) {
|
|
Tesseract instance = Tesseract.getInstance();
|
|
String content = null;
|
|
BufferedImage image;
|
|
try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
|
|
image = ImageIO.read(inputStream);
|
|
} catch (IOException e) {
|
|
log.error("Error reading the image", e);
|
|
return null;
|
|
}
|
|
|
|
// Upscale, grayscale and deskew the image
|
|
BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
|
|
image.flush();
|
|
ImageDeskew imageDeskew = new ImageDeskew(resizedImage);
|
|
BufferedImage deskewedImage = Scalr.rotate(resizedImage, - imageDeskew.getSkewAngle(), Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
|
|
resizedImage.flush();
|
|
image = deskewedImage;
|
|
|
|
// OCR the file
|
|
try {
|
|
log.info("Starting OCR with TESSDATA_PREFIX=" + System.getenv("TESSDATA_PREFIX") + ";LC_NUMERIC=" + System.getenv("LC_NUMERIC"));
|
|
instance.setLanguage(language);
|
|
content = instance.doOCR(image);
|
|
} catch (Throwable e) {
|
|
log.error("Error while OCR-izing the image", e);
|
|
}
|
|
|
|
return content;
|
|
}
|
|
|
|
/**
|
|
* Save a file on the storage filesystem.
|
|
*
|
|
* @param unencryptedFile Unencrypted file
|
|
* @param unencryptedPdfFile Unencrypted PDF file
|
|
* @param file File to save
|
|
* @param privateKey Private key used for encryption
|
|
*/
|
|
public static void save(Path unencryptedFile, Path unencryptedPdfFile, File file, String privateKey) throws Exception {
|
|
Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey);
|
|
Path path = DirectoryUtil.getStorageDirectory().resolve(file.getId());
|
|
try (InputStream inputStream = Files.newInputStream(unencryptedFile)) {
|
|
Files.copy(new CipherInputStream(inputStream, cipher), path);
|
|
}
|
|
|
|
// Generate file variations (errors non-blocking)
|
|
try {
|
|
saveVariations(file, unencryptedFile, unencryptedPdfFile, cipher);
|
|
} catch (Exception e) {
|
|
log.error("Unable to generate thumbnails", e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate file variations.
|
|
*
|
|
* @param file File from database
|
|
* @param unencryptedFile Unencrypted file
|
|
* @param unencryptedPdfFile Unencrypted PDF file
|
|
* @param cipher Cipher to use for encryption
|
|
*/
|
|
private static void saveVariations(File file, Path unencryptedFile, Path unencryptedPdfFile, Cipher cipher) throws Exception {
|
|
BufferedImage image = null;
|
|
if (ImageUtil.isImage(file.getMimeType())) {
|
|
try (InputStream inputStream = Files.newInputStream(unencryptedFile)) {
|
|
image = ImageIO.read(inputStream);
|
|
}
|
|
} else if (VideoUtil.isVideo(file.getMimeType())) {
|
|
image = VideoUtil.getThumbnail(unencryptedFile);
|
|
} else if (unencryptedPdfFile != null) {
|
|
// Generate preview from the first page of the PDF
|
|
image = PdfUtil.renderFirstPage(unencryptedPdfFile);
|
|
}
|
|
|
|
if (image != null) {
|
|
// Generate thumbnails from image
|
|
BufferedImage web = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 1280);
|
|
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.ULTRA_QUALITY, Scalr.Mode.AUTOMATIC, 256);
|
|
image.flush();
|
|
|
|
// Write "web" encrypted image
|
|
Path outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_web");
|
|
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
|
|
ImageUtil.writeJpeg(web, outputStream);
|
|
}
|
|
|
|
// Write "thumb" encrypted image
|
|
outputFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_thumb");
|
|
try (OutputStream outputStream = new CipherOutputStream(Files.newOutputStream(outputFile), cipher)) {
|
|
ImageUtil.writeJpeg(thumbnail, outputStream);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Remove a file from the storage filesystem.
|
|
*
|
|
* @param file File to delete
|
|
*/
|
|
public static void delete(File file) throws IOException {
|
|
Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId());
|
|
Path webFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_web");
|
|
Path thumbnailFile = DirectoryUtil.getStorageDirectory().resolve(file.getId() + "_thumb");
|
|
|
|
if (Files.exists(storedFile)) {
|
|
Files.delete(storedFile);
|
|
}
|
|
if (Files.exists(webFile)) {
|
|
Files.delete(webFile);
|
|
}
|
|
if (Files.exists(thumbnailFile)) {
|
|
Files.delete(thumbnailFile);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Create a new file.
|
|
*
|
|
* @param name File name, can be null
|
|
* @param unencryptedFile Path to the unencrypted file
|
|
* @param fileSize File size
|
|
* @param language File language, can be null if associated to no document
|
|
* @param userId User ID creating the file
|
|
* @param documentId Associated document ID or null if no document
|
|
* @return File ID
|
|
* @throws Exception e
|
|
*/
|
|
public static String createFile(String name, Path unencryptedFile, long fileSize, String language, String userId, String documentId) throws Exception {
|
|
// Validate mime type
|
|
String mimeType;
|
|
try {
|
|
mimeType = MimeTypeUtil.guessMimeType(unencryptedFile, name);
|
|
} catch (IOException e) {
|
|
throw new IOException("ErrorGuessMime", e);
|
|
}
|
|
|
|
// Validate user quota
|
|
UserDao userDao = new UserDao();
|
|
User user = userDao.getById(userId);
|
|
if (user.getStorageCurrent() + fileSize > user.getStorageQuota()) {
|
|
throw new IOException("QuotaReached");
|
|
}
|
|
|
|
// Validate global quota
|
|
String globalStorageQuotaStr = System.getenv(Constants.GLOBAL_QUOTA_ENV);
|
|
if (!Strings.isNullOrEmpty(globalStorageQuotaStr)) {
|
|
long globalStorageQuota = Long.valueOf(globalStorageQuotaStr);
|
|
long globalStorageCurrent = userDao.getGlobalStorageCurrent();
|
|
if (globalStorageCurrent + fileSize > globalStorageQuota) {
|
|
throw new IOException("QuotaReached");
|
|
}
|
|
}
|
|
|
|
// Get files of this document
|
|
FileDao fileDao = new FileDao();
|
|
int order = 0;
|
|
if (documentId != null) {
|
|
for (File file : fileDao.getByDocumentId(userId, documentId)) {
|
|
file.setOrder(order++);
|
|
}
|
|
}
|
|
|
|
// Create the file
|
|
File file = new File();
|
|
file.setOrder(order);
|
|
file.setDocumentId(documentId);
|
|
file.setName(StringUtils.abbreviate(name, 200));
|
|
file.setMimeType(mimeType);
|
|
file.setUserId(userId);
|
|
String fileId = fileDao.create(file, userId);
|
|
|
|
// Guess the mime type a second time, for open document format (first detected as simple ZIP file)
|
|
file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, unencryptedFile));
|
|
|
|
// Convert to PDF if necessary (for thumbnail and text extraction)
|
|
java.nio.file.Path unencryptedPdfFile = PdfUtil.convertToPdf(file, unencryptedFile);
|
|
|
|
// Save the file
|
|
FileUtil.save(unencryptedFile, unencryptedPdfFile, file, user.getPrivateKey());
|
|
|
|
// Update the user quota
|
|
user.setStorageCurrent(user.getStorageCurrent() + fileSize);
|
|
userDao.updateQuota(user);
|
|
|
|
// Raise a new file created event and document updated event if we have a document
|
|
if (documentId != null) {
|
|
FileCreatedAsyncEvent fileCreatedAsyncEvent = new FileCreatedAsyncEvent();
|
|
fileCreatedAsyncEvent.setUserId(userId);
|
|
fileCreatedAsyncEvent.setLanguage(language);
|
|
fileCreatedAsyncEvent.setFile(file);
|
|
fileCreatedAsyncEvent.setUnencryptedFile(unencryptedFile);
|
|
fileCreatedAsyncEvent.setUnencryptedPdfFile(unencryptedPdfFile);
|
|
ThreadLocalContext.get().addAsyncEvent(fileCreatedAsyncEvent);
|
|
|
|
DocumentUpdatedAsyncEvent documentUpdatedAsyncEvent = new DocumentUpdatedAsyncEvent();
|
|
documentUpdatedAsyncEvent.setUserId(userId);
|
|
documentUpdatedAsyncEvent.setDocumentId(documentId);
|
|
ThreadLocalContext.get().addAsyncEvent(documentUpdatedAsyncEvent);
|
|
}
|
|
|
|
return fileId;
|
|
}
|
|
}
|