Closes #141: Never store full file content in memory

This commit is contained in:
Benjamin Gamard 2017-11-06 16:45:47 +01:00
parent 4d161aea07
commit 244ddc7ce2
17 changed files with 389 additions and 354 deletions

View File

@ -1,6 +1,7 @@
package com.sismics.docs.core.event;
import java.io.InputStream;
import java.nio.file.Path;
import com.google.common.base.MoreObjects;
import com.sismics.docs.core.model.jpa.File;
@ -22,16 +23,16 @@ public class FileCreatedAsyncEvent extends UserEvent {
private String language;
/**
* Unencrypted input stream containing the file.
* Unencrypted original file.
*/
private InputStream inputStream;
private Path unencryptedFile;
/**
* Unencrypted input stream containing a PDF representation
* of the file. May be null if the PDF conversion is not
* Unencrypted file containing PDF representation
* of the original file. May be null if the PDF conversion is not
* necessary or not possible.
*/
private InputStream pdfInputStream;
private Path unencryptedPdfFile;
public File getFile() {
return file;
@ -48,21 +49,23 @@ public class FileCreatedAsyncEvent extends UserEvent {
public void setLanguage(String language) {
this.language = language;
}
public InputStream getInputStream() {
return inputStream;
public Path getUnencryptedFile() {
return unencryptedFile;
}
public void setInputStream(InputStream inputStream) {
this.inputStream = inputStream;
}
public InputStream getPdfInputStream() {
return pdfInputStream;
public FileCreatedAsyncEvent setUnencryptedFile(Path unencryptedFile) {
this.unencryptedFile = unencryptedFile;
return this;
}
public void setPdfInputStream(InputStream pdfInputStream) {
this.pdfInputStream = pdfInputStream;
public Path getUnencryptedPdfFile() {
return unencryptedPdfFile;
}
public FileCreatedAsyncEvent setUnencryptedPdfFile(Path unencryptedPdfFile) {
this.unencryptedPdfFile = unencryptedPdfFile;
return this;
}
@Override

View File

@ -0,0 +1,35 @@
package com.sismics.docs.core.event;
import com.google.common.base.MoreObjects;
import com.sismics.docs.core.model.jpa.File;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.List;
/**
 * Cleanup temporary files event.
 *
 * Carries the list of temporary files that should be deleted by whoever
 * handles this event.
 *
 * @author bgamard
 */
public class TemporaryFileCleanupAsyncEvent {
    /**
     * Temporary files (unmodifiable snapshot taken when the event is created).
     */
    private final List<Path> fileList;

    /**
     * Create a cleanup event.
     *
     * The given list is copied, so the event can safely be handed to another
     * thread even if the caller keeps adding to its own list afterwards.
     *
     * @param fileList Temporary files to clean up, a null list is treated as empty
     */
    public TemporaryFileCleanupAsyncEvent(List<Path> fileList) {
        if (fileList == null) {
            this.fileList = java.util.Collections.<Path>emptyList();
        } else {
            this.fileList = java.util.Collections.unmodifiableList(new java.util.ArrayList<Path>(fileList));
        }
    }

    /**
     * Getter of fileList.
     *
     * @return Unmodifiable list of temporary files
     */
    public List<Path> getFileList() {
        return fileList;
    }

    @Override
    public String toString() {
        return MoreObjects.toStringHelper(this)
                .add("files", fileList)
                .toString();
    }
}

View File

@ -1,10 +1,5 @@
package com.sismics.docs.core.listener.async;
import java.text.MessageFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.dao.lucene.LuceneDao;
@ -12,6 +7,10 @@ import com.sismics.docs.core.event.FileCreatedAsyncEvent;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.docs.core.util.TransactionUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.text.MessageFormat;
/**
* Listener on file created.
@ -28,7 +27,7 @@ public class FileCreatedAsyncListener {
* File created.
*
* @param fileCreatedAsyncEvent File created event
* @throws Exception
* @throws Exception e
*/
@Subscribe
public void on(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
@ -42,11 +41,7 @@ public class FileCreatedAsyncListener {
// Extract text content from the file
long startTime = System.currentTimeMillis();
final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getLanguage(), file,
fileCreatedAsyncEvent.getInputStream(), fileCreatedAsyncEvent.getPdfInputStream());
fileCreatedAsyncEvent.getInputStream().close();
if (fileCreatedAsyncEvent.getPdfInputStream() != null) {
fileCreatedAsyncEvent.getPdfInputStream().close();
}
fileCreatedAsyncEvent.getUnencryptedFile(), fileCreatedAsyncEvent.getUnencryptedPdfFile());
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
// Store the text content in the database

View File

@ -0,0 +1,38 @@
package com.sismics.docs.core.listener.async;
import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.event.TemporaryFileCleanupAsyncEvent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * Listener to cleanup temporary files created during a request.
 *
 * @author bgamard
 */
public class TemporaryFileCleanupAsyncListener {
    /**
     * Logger.
     */
    private static final Logger log = LoggerFactory.getLogger(TemporaryFileCleanupAsyncListener.class);

    /**
     * Cleanup temporary files.
     *
     * Deletion is best-effort: a file that is already gone is ignored, and a
     * failure on one file is logged without preventing the deletion of the
     * remaining ones.
     *
     * @param event Temporary file cleanup event
     * @throws Exception e
     */
    @Subscribe
    public void on(final TemporaryFileCleanupAsyncEvent event) throws Exception {
        log.info("Cleanup temporary files event: {}", event);

        for (Path file : event.getFileList()) {
            try {
                Files.deleteIfExists(file);
            } catch (java.io.IOException e) {
                // Keep going, otherwise every file after this one would be leaked
                log.warn("Unable to delete temporary file: " + file, e);
            }
        }
    }
}

View File

@ -12,12 +12,8 @@ import com.google.common.eventbus.EventBus;
import com.lowagie.text.FontFactory;
import com.sismics.docs.core.constant.ConfigType;
import com.sismics.docs.core.dao.jpa.ConfigDao;
import com.sismics.docs.core.listener.async.DocumentCreatedAsyncListener;
import com.sismics.docs.core.listener.async.DocumentDeletedAsyncListener;
import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener;
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
import com.sismics.docs.core.listener.async.FileDeletedAsyncListener;
import com.sismics.docs.core.listener.async.RebuildIndexAsyncListener;
import com.sismics.docs.core.event.TemporaryFileCleanupAsyncEvent;
import com.sismics.docs.core.listener.async.*;
import com.sismics.docs.core.listener.sync.DeadEventListener;
import com.sismics.docs.core.model.jpa.Config;
import com.sismics.docs.core.service.IndexingService;
@ -86,6 +82,7 @@ public class AppContext {
asyncEventBus.register(new DocumentUpdatedAsyncListener());
asyncEventBus.register(new DocumentDeletedAsyncListener());
asyncEventBus.register(new RebuildIndexAsyncListener());
asyncEventBus.register(new TemporaryFileCleanupAsyncListener());
}
/**
@ -132,6 +129,7 @@ public class AppContext {
if (EnvironmentUtil.isUnitTest()) {
return new EventBus();
} else {
// /!\ Don't add more threads because a cleanup event is fired at the end of each request
ThreadPoolExecutor executor = new ThreadPoolExecutor(1, 1,
0L, TimeUnit.MILLISECONDS,
new LinkedBlockingQueue<Runnable>());

View File

@ -1,20 +1,22 @@
package com.sismics.docs.core.util;
import java.io.InputStream;
import java.math.BigInteger;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;
import java.security.Security;
import com.google.common.base.Strings;
import com.sismics.util.context.ThreadLocalContext;
import org.bouncycastle.jce.provider.BouncyCastleProvider;
import javax.crypto.Cipher;
import javax.crypto.CipherInputStream;
import javax.crypto.SecretKey;
import javax.crypto.SecretKeyFactory;
import javax.crypto.spec.PBEKeySpec;
import org.bouncycastle.jce.provider.BouncyCastleProvider;
import com.google.common.base.Strings;
import java.io.InputStream;
import java.math.BigInteger;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;
import java.security.Security;
/**
* Encryption utilities.
@ -55,7 +57,28 @@ public class EncryptionUtil {
public static InputStream decryptInputStream(InputStream is, String privateKey) throws Exception {
return new CipherInputStream(is, getCipher(privateKey, Cipher.DECRYPT_MODE));
}
/**
 * Decrypt a file into a newly created temporary file.
 *
 * The temporary file is obtained from the current thread local context.
 *
 * @param file Encrypted file
 * @param privateKey Private key, when null the input file is returned as is
 * @return Temporary file with the decrypted content, or the input file if there is no private key
 * @throws Exception e
 */
public static Path decryptFile(Path file, String privateKey) throws Exception {
    if (privateKey != null) {
        Path decryptedFile = ThreadLocalContext.get().createTemporaryFile();
        try (InputStream encryptedStream = Files.newInputStream(file)) {
            // Same cipher stream as the one built by decryptInputStream
            InputStream decryptedStream = decryptInputStream(encryptedStream, privateKey);
            Files.copy(decryptedStream, decryptedFile, StandardCopyOption.REPLACE_EXISTING);
        }
        return decryptedFile;
    }

    // For unit testing
    return file;
}
/**
* Return an encryption cipher.
*

View File

@ -36,34 +36,34 @@ public class FileUtil {
*
* @param language Language to extract
* @param file File to extract
* @param inputStream Unencrypted input stream
* @param pdfInputStream Unencrypted PDF input stream
* @param unencryptedFile Unencrypted file
* @param unencryptedPdfFile Unencrypted PDF file
* @return Content extract
*/
public static String extractContent(String language, File file, InputStream inputStream, InputStream pdfInputStream) {
public static String extractContent(String language, File file, Path unencryptedFile, Path unencryptedPdfFile) {
String content = null;
if (ImageUtil.isImage(file.getMimeType())) {
content = ocrFile(inputStream, language);
} else if (pdfInputStream != null) {
content = PdfUtil.extractPdf(pdfInputStream);
content = ocrFile(unencryptedFile, language);
} else if (unencryptedPdfFile != null) {
content = PdfUtil.extractPdf(unencryptedPdfFile);
}
return content;
}
/**
* Optical character recognition on a stream.
* Optical character recognition on a file.
*
* @param inputStream Unencrypted input stream
* @param unecryptedFile Unencrypted file
* @param language Language to OCR
* @return Content extracted
*/
private static String ocrFile(InputStream inputStream, String language) {
private static String ocrFile(Path unecryptedFile, String language) {
Tesseract instance = Tesseract.getInstance();
String content = null;
BufferedImage image;
try {
try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
image = ImageIO.read(inputStream);
} catch (IOException e) {
log.error("Error reading the image", e);
@ -90,38 +90,39 @@ public class FileUtil {
/**
* Save a file on the storage filesystem.
*
* @param inputStream Unencrypted input stream
* @param pdfInputStream PDF input stream
* @param unencryptedFile Unencrypted file
* @param unencryptedPdfFile Unencrypted PDF file
* @param file File to save
* @param privateKey Private key used for encryption
*/
public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception {
public static void save(Path unencryptedFile, Path unencryptedPdfFile, File file, String privateKey) throws Exception {
Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey);
Path path = DirectoryUtil.getStorageDirectory().resolve(file.getId());
Files.copy(new CipherInputStream(inputStream, cipher), path);
inputStream.reset();
try (InputStream inputStream = Files.newInputStream(unencryptedFile)) {
Files.copy(new CipherInputStream(inputStream, cipher), path);
}
// Generate file variations
saveVariations(file, inputStream, pdfInputStream, cipher);
saveVariations(file, unencryptedFile, unencryptedPdfFile, cipher);
}
/**
* Generate file variations.
*
* @param file File from database
* @param inputStream Unencrypted input stream
* @param pdfInputStream Unencrypted PDF input stream
* @param unencryptedFile Unencrypted file
* @param unencryptedPdfFile Unencrypted PDF file
* @param cipher Cipher to use for encryption
*/
private static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception {
private static void saveVariations(File file, Path unencryptedFile, Path unencryptedPdfFile, Cipher cipher) throws Exception {
BufferedImage image = null;
if (ImageUtil.isImage(file.getMimeType())) {
image = ImageIO.read(inputStream);
inputStream.reset();
} else if(pdfInputStream != null) {
try (InputStream inputStream = Files.newInputStream(unencryptedFile)) {
image = ImageIO.read(inputStream);
}
} else if (unencryptedPdfFile != null) {
// Generate preview from the first page of the PDF
image = PdfUtil.renderFirstPage(pdfInputStream);
pdfInputStream.reset();
image = PdfUtil.renderFirstPage(unencryptedPdfFile);
}
if (image != null) {

View File

@ -3,7 +3,6 @@ package com.sismics.docs.core.util;
import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.io.ByteStreams;
import com.google.common.io.CharStreams;
import com.google.common.io.Closer;
import com.google.common.io.Resources;
import com.lowagie.text.*;
@ -12,6 +11,7 @@ import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.pdf.PdfPage;
import com.sismics.util.ImageUtil;
import com.sismics.util.context.ThreadLocalContext;
import com.sismics.util.mime.MimeType;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
@ -34,7 +34,9 @@ import org.slf4j.LoggerFactory;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
@ -56,13 +58,13 @@ public class PdfUtil {
/**
* Extract text from a PDF.
*
* @param inputStream Unencrypted input stream
* @param unencryptedPdfFile Unencrypted PDF file
* @return Content extracted
*/
public static String extractPdf(InputStream inputStream) {
public static String extractPdf(Path unencryptedPdfFile) {
String content = null;
PDDocument pdfDocument = null;
try {
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile)) {
PDFTextStripper stripper = new PDFTextStripper();
pdfDocument = PDDocument.load(inputStream);
content = stripper.getText(pdfDocument);
@ -85,26 +87,25 @@ public class PdfUtil {
* Convert a file to PDF if necessary.
*
* @param file File
* @param inputStream InputStream
* @param reset Reset the stream after usage
* @return PDF input stream
* @param unencryptedFile Unencrypted file
* @return PDF temporary file
*/
public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception {
public static Path convertToPdf(File file, Path unencryptedFile) throws Exception {
if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
// It's already PDF, just return the input
return inputStream;
// It's already PDF, just return the file
return unencryptedFile;
}
if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
return convertOfficeDocument(inputStream, reset);
return convertOfficeDocument(unencryptedFile);
}
if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
return convertOpenDocumentText(inputStream, reset);
return convertOpenDocumentText(unencryptedFile);
}
if (file.getMimeType().equals(MimeType.TEXT_PLAIN) || file.getMimeType().equals(MimeType.TEXT_CSV)) {
return convertTextPlain(inputStream, reset);
return convertTextPlain(unencryptedFile);
}
// PDF conversion not necessary/possible
@ -114,64 +115,58 @@ public class PdfUtil {
/**
* Convert a text plain document to PDF.
*
* @param inputStream Unecnrypted input stream
* @param reset Reset the stream after usage
* @return PDF input stream
* @param unencryptedFile Unencrypted file
* @return PDF file
*/
private static InputStream convertTextPlain(InputStream inputStream, boolean reset) throws Exception {
private static Path convertTextPlain(Path unencryptedFile) throws Exception {
Document output = new Document(PageSize.A4, 40, 40, 40, 40);
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
OutputStream pdfOutputStream = Files.newOutputStream(tempFile);
PdfWriter.getInstance(output, pdfOutputStream);
output.open();
String content = CharStreams.toString(new InputStreamReader(inputStream, Charsets.UTF_8));
String content = new String(Files.readAllBytes(unencryptedFile), Charsets.UTF_8);
Font font = FontFactory.getFont("LiberationMono-Regular");
Paragraph paragraph = new Paragraph(content, font);
paragraph.setAlignment(Element.ALIGN_LEFT);
output.add(paragraph);
output.close();
if (reset) {
inputStream.reset();
}
return new ByteArrayInputStream(pdfOutputStream.toByteArray());
return tempFile;
}
/**
* Convert an open document text file to PDF.
*
* @param inputStream Unencrypted input stream
* @param reset Reset the stream after usage
* @return PDF input stream
* @param unencryptedFile Unencrypted file
* @return PDF file
*/
private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception {
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
PdfOptions options = PdfOptions.create();
PdfConverter.getInstance().convert(document, pdfOutputStream, options);
if (reset) {
inputStream.reset();
private static Path convertOpenDocumentText(Path unencryptedFile) throws Exception {
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
OutputStream outputStream = Files.newOutputStream(tempFile)) {
OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
PdfOptions options = PdfOptions.create();
PdfConverter.getInstance().convert(document, outputStream, options);
}
return new ByteArrayInputStream(pdfOutputStream.toByteArray());
return tempFile;
}
/**
* Convert an Office document to PDF.
*
* @param inputStream Unencrypted input stream
* @param reset Reset the stream after usage
* @return PDF input stream
* @param unencryptedFile Unencrypted file
* @return PDF file
*/
private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception {
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
XWPFDocument document = new XWPFDocument(inputStream);
org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, pdfOutputStream, options);
if (reset) {
inputStream.reset();
private static Path convertOfficeDocument(Path unencryptedFile) throws Exception {
Path tempFile = ThreadLocalContext.get().createTemporaryFile();
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
OutputStream outputStream = Files.newOutputStream(tempFile)) {
XWPFDocument document = new XWPFDocument(inputStream);
org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, outputStream, options);
}
return new ByteArrayInputStream(pdfOutputStream.toByteArray());
return tempFile;
}
/**
@ -182,10 +177,10 @@ public class PdfUtil {
* @param fitImageToPage Fit images to the page
* @param metadata Add a page with metadata
* @param margin Margins in millimeters
* @return PDF input stream
* @param outputStream Output stream to write to, will be closed
*/
public static InputStream convertToPdf(DocumentDto documentDto, List<File> fileList,
boolean fitImageToPage, boolean metadata, int margin) throws Exception {
public static void convertToPdf(DocumentDto documentDto, List<File> fileList,
boolean fitImageToPage, boolean metadata, int margin, OutputStream outputStream) throws Exception {
// Setup PDFBox
Closer closer = Closer.create();
MemoryUsageSetting memUsageSettings = MemoryUsageSetting.setupMixed(1000000); // 1MB max memory usage
@ -240,80 +235,75 @@ public class PdfUtil {
// Add files
for (File file : fileList) {
Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId());
try (InputStream storedFileInputStream = file.getPrivateKey() == null ? // Try to decrypt the file if we have a private key available
Files.newInputStream(storedFile) : EncryptionUtil.decryptInputStream(Files.newInputStream(storedFile), file.getPrivateKey())) {
if (ImageUtil.isImage(file.getMimeType())) {
PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
// Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension
PDImageXObject pdImage = null;
if (file.getMimeType().equals(MimeType.IMAGE_JPEG)) {
pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream);
} else if (file.getMimeType().equals(MimeType.IMAGE_GIF) || file.getMimeType().equals(MimeType.IMAGE_PNG)) {
BufferedImage bim = ImageIO.read(storedFileInputStream);
pdImage = LosslessFactory.createFromImage(doc, bim);
}
// Do we want to fill the page with the image?
if (fitImageToPage) {
// Fill the page with the image
float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * mmPerInch;
float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * mmPerInch;
// Compare page format and image format
if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) {
float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight();
contentStream.drawImage(pdImage, margin * mmPerInch, heightAvailable + margin * mmPerInch - imageHeight,
widthAvailable, imageHeight);
} else {
float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth();
contentStream.drawImage(pdImage, margin * mmPerInch, margin * mmPerInch,
imageWidth, heightAvailable);
}
// Decrypt the file to a temporary file
Path unencryptedFile = EncryptionUtil.decryptFile(storedFile, file.getPrivateKey());
if (ImageUtil.isImage(file.getMimeType())) {
PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page);
InputStream storedFileInputStream = Files.newInputStream(unencryptedFile)) {
// Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension
PDImageXObject pdImage = null;
if (file.getMimeType().equals(MimeType.IMAGE_JPEG)) {
pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream);
} else if (file.getMimeType().equals(MimeType.IMAGE_GIF) || file.getMimeType().equals(MimeType.IMAGE_PNG)) {
BufferedImage bim = ImageIO.read(storedFileInputStream);
pdImage = LosslessFactory.createFromImage(doc, bim);
}
// Do we want to fill the page with the image?
if (fitImageToPage) {
// Fill the page with the image
float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * mmPerInch;
float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * mmPerInch;
// Compare page format and image format
if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) {
float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight();
contentStream.drawImage(pdImage, margin * mmPerInch, heightAvailable + margin * mmPerInch - imageHeight,
widthAvailable, imageHeight);
} else {
// Draw the image as is
contentStream.drawImage(pdImage, margin * mmPerInch,
page.getMediaBox().getHeight() - pdImage.getHeight() - margin * mmPerInch);
float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth();
contentStream.drawImage(pdImage, margin * mmPerInch, margin * mmPerInch,
imageWidth, heightAvailable);
}
} else {
// Draw the image as is
contentStream.drawImage(pdImage, margin * mmPerInch,
page.getMediaBox().getHeight() - pdImage.getHeight() - margin * mmPerInch);
}
doc.addPage(page);
} else {
// Try to convert the file to PDF
InputStream pdfInputStream = convertToPdf(file, storedFileInputStream, false);
if (pdfInputStream != null) {
// This file is convertible to PDF, just add it to the end
try {
PDDocument mergeDoc = PDDocument.load(pdfInputStream, memUsageSettings);
closer.register(mergeDoc);
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.appendDocument(doc, mergeDoc);
} finally {
pdfInputStream.close();
}
}
// All other non-PDF-convertible files are ignored
}
doc.addPage(page);
} else {
// Try to convert the file to PDF
Path unencryptedPdfFile = convertToPdf(file, unencryptedFile);
if (unencryptedPdfFile != null) {
// This file is convertible to PDF, just add it to the end
PDDocument mergeDoc = PDDocument.load(unencryptedPdfFile.toFile(), memUsageSettings);
closer.register(mergeDoc);
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.appendDocument(doc, mergeDoc);
}
// All other non-PDF-convertible files are ignored
}
}
// Save to a temporary file
try (TemporaryFileStream temporaryFileStream = new TemporaryFileStream()) {
doc.save(temporaryFileStream.openWriteStream());
closer.close(); // Close all remaining opened PDF
return temporaryFileStream.openReadStream();
}
doc.save(outputStream); // Write to the output stream
closer.close(); // Close all remaining opened PDF
}
}
/**
* Render the first page of a PDF.
*
* @param inputStream PDF document
* @param unencryptedFile PDF document
* @return Render of the first page
*/
public static BufferedImage renderFirstPage(InputStream inputStream) throws IOException {
try (PDDocument pdfDocument = PDDocument.load(inputStream)) {
public static BufferedImage renderFirstPage(Path unencryptedFile) throws IOException {
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
PDDocument pdfDocument = PDDocument.load(inputStream)) {
PDFRenderer renderer = new PDFRenderer(pdfDocument);
return renderer.renderImage(0);
}

View File

@ -1,55 +0,0 @@
package com.sismics.docs.core.util;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.UUID;
/**
 * Utilities for writing and reading to a temporary file.
 *
 * The temporary file is created by the constructor and deleted by {@link #close()}.
 *
 * @author bgamard
 */
public class TemporaryFileStream implements Closeable {
    /**
     * Temporary file.
     */
    private final Path tempFile;

    /**
     * Construct a temporary file.
     *
     * @throws IOException If the temporary file cannot be created
     */
    public TemporaryFileStream() throws IOException {
        tempFile = Files.createTempFile(UUID.randomUUID().toString(), ".tmp");
    }

    /**
     * Open a stream for writing.
     *
     * @return OutputStream
     * @throws IOException If the temporary file cannot be opened for writing
     */
    public OutputStream openWriteStream() throws IOException {
        return Files.newOutputStream(tempFile);
    }

    /**
     * Open a stream for reading.
     *
     * @return InputStream
     * @throws IOException If the temporary file cannot be opened for reading
     */
    public InputStream openReadStream() throws IOException {
        return Files.newInputStream(tempFile);
    }

    /**
     * Delete the temporary file.
     *
     * Calling this method more than once has no effect, as required by {@link Closeable}.
     *
     * @throws IOException If the temporary file cannot be deleted
     */
    @Override
    public void close() throws IOException {
        Files.deleteIfExists(tempFile);
    }
}

View File

@ -1,9 +1,13 @@
package com.sismics.util.context;
import com.google.common.collect.Lists;
import com.sismics.docs.core.event.TemporaryFileCleanupAsyncEvent;
import com.sismics.docs.core.model.context.AppContext;
import javax.persistence.EntityManager;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
/**
@ -27,6 +31,12 @@ public class ThreadLocalContext {
*/
private List<Object> asyncEventList = Lists.newArrayList();
/**
* List of temporary files created during this request.
* They are deleted at the end of each request.
*/
private List<Path> temporaryFileList = Lists.newArrayList();
/**
* Private constructor.
*/
@ -82,6 +92,17 @@ public class ThreadLocalContext {
asyncEventList.add(asyncEvent);
}
/**
 * Create a new temporary file and record it in the temporary file list
 * of this context.
 *
 * @return Newly created temporary file
 * @throws IOException If the temporary file cannot be created
 */
public Path createTemporaryFile() throws IOException {
    final Path temporaryFile = Files.createTempFile("sismics_docs", null);
    temporaryFileList.add(temporaryFile);
    return temporaryFile;
}
/**
* Fire all pending async events.
*/
@ -89,5 +110,11 @@ public class ThreadLocalContext {
for (Object asyncEvent : asyncEventList) {
AppContext.getInstance().getAsyncEventBus().post(asyncEvent);
}
if (!temporaryFileList.isEmpty()) {
// Some files were created during this request, add a cleanup event to the queue
// It works because we are using a one thread executor
AppContext.getInstance().getAsyncEventBus().post(new TemporaryFileCleanupAsyncEvent(temporaryFileList));
}
}
}

View File

@ -3,6 +3,10 @@ package com.sismics.util.mime;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
@ -20,17 +24,17 @@ public class MimeTypeUtil {
/**
* Try to guess the MIME type of a file by its magic number (header).
*
* @param is Stream to inspect
* @param file File to inspect
* @param name File name
* @return MIME type
* @throws IOException e
*/
public static String guessMimeType(InputStream is, String name) throws IOException {
byte[] headerBytes = new byte[64];
is.mark(headerBytes.length);
is.read(headerBytes);
is.reset();
return guessMimeType(headerBytes, name);
public static String guessMimeType(Path file, String name) throws IOException {
try (InputStream is = Files.newInputStream(file)) {
byte[] headerBytes = new byte[64];
is.read(headerBytes);
return guessMimeType(headerBytes, name);
}
}
/**
@ -107,39 +111,38 @@ public class MimeTypeUtil {
* are simple ZIP files on the outside and much bigger on the inside.
*
* @param file File
* @param inputStream Input stream
* @param unencryptedFile File on disk
* @return MIME type
*/
public static String guessOpenDocumentFormat(File file, InputStream inputStream) {
public static String guessOpenDocumentFormat(File file, Path unencryptedFile) {
if (!MimeType.APPLICATION_ZIP.equals(file.getMimeType())) {
// open document formats are ZIP files
return file.getMimeType();
}
String mimeType = file.getMimeType();
try (ZipArchiveInputStream archiveInputStream = new ZipArchiveInputStream(inputStream, Charsets.ISO_8859_1.name())) {
ArchiveEntry archiveEntry = archiveInputStream.getNextEntry();
try (InputStream inputStream = Files.newInputStream(unencryptedFile);
ZipInputStream zipInputStream = new ZipInputStream(inputStream, Charsets.ISO_8859_1)) {
ZipEntry archiveEntry = zipInputStream.getNextEntry();
while (archiveEntry != null) {
if (archiveEntry.getName().equals("mimetype")) {
// Maybe it's an ODT file
String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1);
String content = new String(IOUtils.toByteArray(zipInputStream), Charsets.ISO_8859_1);
if (MimeType.OPEN_DOCUMENT_TEXT.equals(content.trim())) {
mimeType = MimeType.OPEN_DOCUMENT_TEXT;
break;
}
} else if (archiveEntry.getName().equals("[Content_Types].xml")) {
// Maybe it's a DOCX file
String content = new String(IOUtils.toByteArray(archiveInputStream), Charsets.ISO_8859_1);
String content = new String(IOUtils.toByteArray(zipInputStream), Charsets.ISO_8859_1);
if (content.contains(MimeType.OFFICE_DOCUMENT)) {
mimeType = MimeType.OFFICE_DOCUMENT;
break;
}
}
archiveEntry = archiveInputStream.getNextEntry();
archiveEntry = zipInputStream.getNextEntry();
}
inputStream.reset();
} catch (Exception e) {
// In case of any error, just give up and keep the ZIP MIME type
return file.getMimeType();

View File

@ -1,18 +1,20 @@
package com.sismics.docs.core.util;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.Date;
import org.junit.Assert;
import org.junit.Test;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.util.mime.MimeType;
import org.junit.Assert;
import org.junit.Test;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.Date;
/**
* Test of the file entity utilities.
@ -22,26 +24,22 @@ import com.sismics.util.mime.MimeType;
public class TestFileUtil {
/**
 * Checks that the text extracted from the sample ODT resource
 * ("file/document.odt") contains the expected sentence.
 */
@Test
public void extractContentOpenDocumentTextTest() throws Exception {
// Stream-based form: open the resource as a stream, convert it to a PDF stream,
// then pass both streams to FileUtil.extractContent.
try (InputStream inputStream = Resources.getResource("file/document.odt").openStream()) {
File file = new File();
file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
try (InputStream pdfInputStream = PdfUtil.convertToPdf(file, inputStream, false)) {
String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream);
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
}
// Path-based form: resolve the resource to a filesystem path, convert it to a PDF path,
// then pass both paths to FileUtil.extractContent.
// NOTE(review): this span appears to show both the removed and the added body of the test — confirm which one applies.
Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
File file = new File();
file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
Path pdfPath = PdfUtil.convertToPdf(file, path);
String content = FileUtil.extractContent(null, file, path, pdfPath);
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
/**
 * Checks that the text extracted from the sample DOCX resource
 * ("file/document.docx") contains the expected sentence.
 */
@Test
public void extractContentOfficeDocumentTest() throws Exception {
// Stream-based form: open the resource as a stream, convert it to a PDF stream,
// then pass both streams to FileUtil.extractContent.
try (InputStream inputStream = Resources.getResource("file/document.docx").openStream()) {
File file = new File();
file.setMimeType(MimeType.OFFICE_DOCUMENT);
try (InputStream pdfInputStream = PdfUtil.convertToPdf(file, inputStream, false)) {
String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream);
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
}
// Path-based form: resolve the resource to a filesystem path, convert it to a PDF path,
// then pass both paths to FileUtil.extractContent.
// NOTE(review): this span appears to show both the removed and the added body of the test — confirm which one applies.
Path path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
File file = new File();
file.setMimeType(MimeType.OFFICE_DOCUMENT);
Path pdfPath = PdfUtil.convertToPdf(file, path);
String content = FileUtil.extractContent(null, file, path, pdfPath);
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
@Test
@ -97,8 +95,9 @@ public class TestFileUtil {
file4.setId("document_odt");
file4.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
InputStream is = PdfUtil.convertToPdf(documentDto, Lists.newArrayList(file0, file1, file2, file3, file4), true, true, 10);
is.close();
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
PdfUtil.convertToPdf(documentDto, Lists.newArrayList(file0, file1, file2, file3, file4), true, true, 10, outputStream);
Assert.assertTrue(outputStream.toByteArray().length > 0);
}
}
}

View File

@ -1,16 +1,13 @@
package com.sismics.util;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.junit.Assert;
import org.junit.Test;
import com.google.common.io.Resources;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.util.mime.MimeType;
import com.sismics.util.mime.MimeTypeUtil;
import org.junit.Assert;
import org.junit.Test;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
* Test of the utilities to check MIME types.
@ -18,23 +15,18 @@ import com.sismics.util.mime.MimeTypeUtil;
* @author bgamard
*/
public class TestMimeTypeUtil {
/**
 * Checks that MimeTypeUtil.guessOpenDocumentFormat refines a File whose MIME type
 * is APPLICATION_ZIP into OPEN_DOCUMENT_TEXT for the sample ODT resource and into
 * OFFICE_DOCUMENT for the sample DOCX resource.
 */
@Test
public void guessOpenDocumentFormatTest() throws Exception {
// Detect ODT files
// Stream-based form: the resource is read fully into a byte array and wrapped
// in a ByteArrayInputStream before being passed to guessOpenDocumentFormat.
try (InputStream inputStream = Resources.getResource("file/document.odt").openStream();
InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
File file = new File();
file.setMimeType(MimeType.APPLICATION_ZIP);
Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream));
}
// Path-based form: the resource is resolved to a filesystem path and passed directly.
// NOTE(review): this method appears to show both the removed and the added variant of each check — confirm which one applies.
Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
File file = new File();
file.setMimeType(MimeType.APPLICATION_ZIP);
Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessOpenDocumentFormat(file, path));
// Detect DOCX files
// Stream-based form, same shape as the ODT check above.
try (InputStream inputStream = Resources.getResource("file/document.docx").openStream();
InputStream byteArrayInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
File file = new File();
file.setMimeType(MimeType.APPLICATION_ZIP);
Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessOpenDocumentFormat(file, byteArrayInputStream));
}
// Path-based form: reuses the path and file locals with a fresh File instance.
path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
file = new File();
file.setMimeType(MimeType.APPLICATION_ZIP);
Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessOpenDocumentFormat(file, path));
}
}

View File

@ -275,16 +275,10 @@ public class DocumentResource extends BaseResource {
StreamingOutput stream = new StreamingOutput() {
@Override
public void write(OutputStream outputStream) throws IOException, WebApplicationException {
try (InputStream inputStream = PdfUtil.convertToPdf(documentDto, fileList, fitImageToPage, metadata, margin)) {
ByteStreams.copy(inputStream, outputStream);
try {
PdfUtil.convertToPdf(documentDto, fileList, fitImageToPage, metadata, margin, outputStream);
} catch (Exception e) {
throw new IOException(e);
} finally {
try {
outputStream.close();
} catch (IOException e) {
// Ignore
}
}
}
};

View File

@ -14,10 +14,7 @@ import com.sismics.docs.core.event.FileCreatedAsyncEvent;
import com.sismics.docs.core.event.FileDeletedAsyncEvent;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.model.jpa.User;
import com.sismics.docs.core.util.DirectoryUtil;
import com.sismics.docs.core.util.EncryptionUtil;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.docs.core.util.PdfUtil;
import com.sismics.docs.core.util.*;
import com.sismics.rest.exception.ClientException;
import com.sismics.rest.exception.ForbiddenClientException;
import com.sismics.rest.exception.ServerException;
@ -37,13 +34,11 @@ import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.Response.Status;
import javax.ws.rs.core.StreamingOutput;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.*;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.text.MessageFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
@ -114,27 +109,29 @@ public class FileResource extends BaseResource {
}
}
// Keep unencrypted data in memory, because we will need it two times
byte[] fileData;
// Keep unencrypted data temporarily on disk, because we will need it two times
java.nio.file.Path unencryptedFile;
long fileSize;
try {
fileData = ByteStreams.toByteArray(fileBodyPart.getValueAs(InputStream.class));
unencryptedFile = ThreadLocalContext.get().createTemporaryFile();
Files.copy(fileBodyPart.getValueAs(InputStream.class), unencryptedFile, StandardCopyOption.REPLACE_EXISTING);
fileSize = Files.size(unencryptedFile);
} catch (IOException e) {
throw new ServerException("StreamError", "Error reading the input file", e);
}
InputStream fileInputStream = new ByteArrayInputStream(fileData);
// Validate mime type
String name = fileBodyPart.getContentDisposition() != null ?
fileBodyPart.getContentDisposition().getFileName() : null;
String mimeType;
try {
mimeType = MimeTypeUtil.guessMimeType(fileInputStream, name);
mimeType = MimeTypeUtil.guessMimeType(unencryptedFile, name);
} catch (IOException e) {
throw new ServerException("ErrorGuessMime", "Error guessing mime type", e);
}
// Validate quota
if (user.getStorageCurrent() + fileData.length > user.getStorageQuota()) {
if (user.getStorageCurrent() + fileSize > user.getStorageQuota()) {
throw new ClientException("QuotaReached", "Quota limit reached");
}
@ -158,16 +155,16 @@ public class FileResource extends BaseResource {
String fileId = fileDao.create(file, principal.getId());
// Guess the mime type a second time, for open document format (first detected as simple ZIP file)
file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileInputStream));
file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, unencryptedFile));
// Convert to PDF if necessary (for thumbnail and text extraction)
InputStream pdfIntputStream = PdfUtil.convertToPdf(file, fileInputStream, true);
java.nio.file.Path unencryptedPdfFile = PdfUtil.convertToPdf(file, unencryptedFile);
// Save the file
FileUtil.save(fileInputStream, pdfIntputStream, file, user.getPrivateKey());
FileUtil.save(unencryptedFile, unencryptedPdfFile, file, user.getPrivateKey());
// Update the user quota
user.setStorageCurrent(user.getStorageCurrent() + fileData.length);
user.setStorageCurrent(user.getStorageCurrent() + fileSize);
userDao.updateQuota(user);
// Raise a new file created event and document updated event if we have a document
@ -176,8 +173,8 @@ public class FileResource extends BaseResource {
fileCreatedAsyncEvent.setUserId(principal.getId());
fileCreatedAsyncEvent.setLanguage(documentDto.getLanguage());
fileCreatedAsyncEvent.setFile(file);
fileCreatedAsyncEvent.setInputStream(fileInputStream);
fileCreatedAsyncEvent.setPdfInputStream(pdfIntputStream);
fileCreatedAsyncEvent.setUnencryptedFile(unencryptedFile);
fileCreatedAsyncEvent.setUnencryptedPdfFile(unencryptedPdfFile);
ThreadLocalContext.get().addAsyncEvent(fileCreatedAsyncEvent);
DocumentUpdatedAsyncEvent documentUpdatedAsyncEvent = new DocumentUpdatedAsyncEvent();
@ -190,7 +187,7 @@ public class FileResource extends BaseResource {
JsonObjectBuilder response = Json.createObjectBuilder()
.add("status", "ok")
.add("id", fileId)
.add("size", fileData.length);
.add("size", fileSize);
return Response.ok().entity(response.build()).build();
} catch (Exception e) {
throw new ServerException("FileError", "Error adding a file", e);
@ -254,13 +251,13 @@ public class FileResource extends BaseResource {
// Raise a new file created event and document updated event (it wasn't sent during file creation)
try {
java.nio.file.Path storedFile = DirectoryUtil.getStorageDirectory().resolve(id);
InputStream fileInputStream = Files.newInputStream(storedFile);
final InputStream responseInputStream = EncryptionUtil.decryptInputStream(fileInputStream, user.getPrivateKey());
java.nio.file.Path unencryptedFile = EncryptionUtil.decryptFile(storedFile, user.getPrivateKey());
FileCreatedAsyncEvent fileCreatedAsyncEvent = new FileCreatedAsyncEvent();
fileCreatedAsyncEvent.setUserId(principal.getId());
fileCreatedAsyncEvent.setLanguage(documentDto.getLanguage());
fileCreatedAsyncEvent.setFile(file);
fileCreatedAsyncEvent.setInputStream(responseInputStream);
fileCreatedAsyncEvent.setUnencryptedFile(unencryptedFile);
fileCreatedAsyncEvent.setUnencryptedPdfFile(PdfUtil.convertToPdf(file, unencryptedFile));
ThreadLocalContext.get().addAsyncEvent(fileCreatedAsyncEvent);
DocumentUpdatedAsyncEvent documentUpdatedAsyncEvent = new DocumentUpdatedAsyncEvent();

View File

@ -332,7 +332,7 @@
"filter": {
"filesize": {
"mb": "Mo",
"kb": "Ko"
"kb": "ko"
}
},
"acl": {

View File

@ -1,10 +1,16 @@
package com.sismics.docs.rest;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Date;
import com.google.common.io.ByteStreams;
import com.google.common.io.Resources;
import com.sismics.docs.core.util.DirectoryUtil;
import com.sismics.util.filter.TokenBasedSecurityFilter;
import com.sismics.util.mime.MimeType;
import com.sismics.util.mime.MimeTypeUtil;
import org.glassfish.jersey.media.multipart.FormDataMultiPart;
import org.glassfish.jersey.media.multipart.MultiPartFeature;
import org.glassfish.jersey.media.multipart.file.StreamDataBodyPart;
import org.junit.Assert;
import org.junit.Test;
import javax.json.JsonArray;
import javax.json.JsonObject;
@ -13,19 +19,10 @@ import javax.ws.rs.core.Form;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.Response.Status;
import org.glassfish.jersey.media.multipart.FormDataMultiPart;
import org.glassfish.jersey.media.multipart.MultiPartFeature;
import org.glassfish.jersey.media.multipart.file.StreamDataBodyPart;
import org.junit.Assert;
import org.junit.Test;
import com.google.common.io.ByteStreams;
import com.google.common.io.Resources;
import com.sismics.docs.core.util.DirectoryUtil;
import com.sismics.util.filter.TokenBasedSecurityFilter;
import com.sismics.util.mime.MimeType;
import com.sismics.util.mime.MimeTypeUtil;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Date;
/**
* Exhaustive test of the file resource.
@ -123,10 +120,8 @@ public class TestFileResource extends BaseJerseyTest {
// Check that the files are not readable directly from FS
Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file1Id);
try (InputStream storedFileInputStream = new BufferedInputStream(Files.newInputStream(storedFile))) {
Assert.assertEquals(MimeType.DEFAULT, MimeTypeUtil.guessMimeType(storedFileInputStream, null));
}
Assert.assertEquals(MimeType.DEFAULT, MimeTypeUtil.guessMimeType(storedFile, null));
// Get all files from a document
json = target().path("/file/list")
.queryParam("id", document1Id)