#55: Export document in PDF (utilities)

This commit is contained in:
jendib 2015-12-13 22:29:23 +01:00
parent 5f82752416
commit 0d1a4ec7ea
11 changed files with 239 additions and 111 deletions

View File

@ -7,6 +7,7 @@ import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Lob;
import javax.persistence.Table;
import javax.persistence.Transient;
import com.google.common.base.MoreObjects;
@ -69,150 +70,85 @@ public class File implements Loggable {
private Integer order;
/**
* Getter of id.
*
* @return the id
* Private key to decrypt the file.
* Not saved to database, of course.
*/
@Transient
private String privateKey;
public String getId() {
return id;
}
/**
* Setter of id.
*
* @param id id
*/
public void setId(String id) {
this.id = id;
}
/**
* Getter of documentId.
*
* @return the documentId
*/
public String getDocumentId() {
return documentId;
}
/**
* Setter of documentId.
*
* @param documentId documentId
*/
public void setDocumentId(String documentId) {
this.documentId = documentId;
}
/**
* Getter of mimeType.
*
* @return the mimeType
*/
public String getMimeType() {
return mimeType;
}
/**
* Setter of mimeType.
*
* @param mimeType mimeType
*/
public void setMimeType(String mimeType) {
this.mimeType = mimeType;
}
/**
* Getter of createDate.
*
* @return the createDate
*/
public Date getCreateDate() {
return createDate;
}
/**
* Setter of createDate.
*
* @param createDate createDate
*/
public void setCreateDate(Date createDate) {
this.createDate = createDate;
}
/**
* Getter of deleteDate.
*
* @return the deleteDate
*/
@Override
public Date getDeleteDate() {
return deleteDate;
}
/**
* Setter of deleteDate.
*
* @param deleteDate deleteDate
*/
public void setDeleteDate(Date deleteDate) {
this.deleteDate = deleteDate;
}
/**
* Getter of content.
*
* @return the content
*/
public String getContent() {
return content;
}
/**
* Setter of content.
*
* @param content content
*/
public void setContent(String content) {
this.content = content;
}
/**
* Getter of order.
*
* @return the order
*/
public Integer getOrder() {
return order;
}
/**
* Setter of order.
*
* @param order order
*/
public void setOrder(Integer order) {
this.order = order;
}
/**
* Getter of userId.
*
* @return the userId
*/
public String getUserId() {
return userId;
}
/**
* Setter of userId.
*
* @param userId userId
*/
public void setUserId(String userId) {
this.userId = userId;
}
/**
 * Getter of privateKey.
 *
 * @return the private key used to decrypt the file, or null when none has been set on this instance
 */
public String getPrivateKey() {
return privateKey;
}
/**
 * Setter of privateKey.
 *
 * @param privateKey private key used to decrypt the file
 */
public void setPrivateKey(String privateKey) {
this.privateKey = privateKey;
}
@Override
public String toString() {
return MoreObjects.toStringHelper(this)

View File

@ -8,13 +8,22 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import javax.crypto.Cipher;
import javax.crypto.CipherInputStream;
import javax.crypto.CipherOutputStream;
import javax.imageio.ImageIO;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
@ -27,6 +36,7 @@ import org.odftoolkit.odfdom.doc.OdfTextDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.io.Closer;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.tess4j.Tesseract;
@ -130,23 +140,24 @@ public class FileUtil {
/**
* Convert a file to PDF if necessary.
*
* @param inputStream InputStream
* @param file File
* @param inputStream InputStream
* @param reset Reset the stream after usage
* @return PDF input stream
* @throws Exception
*/
public static InputStream convertToPdf(InputStream inputStream, File file) throws Exception {
public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception {
if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
// It's already PDF, just return the input
return inputStream;
}
if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
return convertOfficeDocument(inputStream);
return convertOfficeDocument(inputStream, reset);
}
if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
return convertOpenDocumentText(inputStream);
return convertOpenDocumentText(inputStream, reset);
}
// PDF conversion not necessary/possible
@ -157,15 +168,18 @@ public class FileUtil {
* Convert an open document text file to PDF.
*
* @param inputStream Unencrypted input stream
* @param reset Reset the stream after usage
* @return PDF input stream
* @throws Exception
*/
private static InputStream convertOpenDocumentText(InputStream inputStream) throws Exception {
private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception {
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
PdfOptions options = PdfOptions.create();
PdfConverter.getInstance().convert(document, pdfOutputStream, options);
inputStream.reset();
if (reset) {
inputStream.reset();
}
return new ByteArrayInputStream(pdfOutputStream.toByteArray());
}
@ -173,15 +187,18 @@ public class FileUtil {
* Convert an Office document to PDF.
*
* @param inputStream Unencrypted input stream
* @param reset Reset the stream after usage
* @return PDF input stream
* @throws Exception
*/
private static InputStream convertOfficeDocument(InputStream inputStream) throws Exception {
private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception {
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
XWPFDocument document = new XWPFDocument(inputStream);
org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, pdfOutputStream, options);
inputStream.reset();
if (reset) {
inputStream.reset();
}
return new ByteArrayInputStream(pdfOutputStream.toByteArray());
}
@ -272,4 +289,94 @@ public class FileUtil {
Files.delete(thumbnailFile);
}
}
/**
 * Convert a list of files to a single merged PDF.
 *
 * Each file is read from the storage directory using its ID, and decrypted first
 * when the file carries a private key. Images get their own A4 page, files that
 * {@link #convertToPdf(File, InputStream, boolean)} can turn into a PDF are appended
 * page by page, and every other file is ignored.
 *
 * The merged PDF is written to a temporary file which is deleted before this method
 * returns, while the returned stream is still open.
 * NOTE(review): reading it afterwards relies on the platform keeping an open, deleted
 * file readable - confirm on all supported platforms.
 *
 * @param fileList List of files, in the order they must appear in the PDF
 * @param fitImageToPage If true, scale each image (keeping its aspect ratio) to fill the page inside the margins;
 *        otherwise draw it at its own size
 * @param margin Margins in millimeters
 * @return PDF input stream, to be closed by the caller
 * @throws Exception If a file can't be read, decoded, converted or merged
 */
public static InputStream convertToPdf(List<File> fileList, boolean fitImageToPage, int margin) throws Exception {
    // TODO PDF Export: Option to add a front page with:
    // document title, document description, creator, date created, language,
    // list of all files (and information if it is in this document or not)
    // TODO PDF Export: Option to add the comments

    MemoryUsageSetting memUsageSettings = MemoryUsageSetting.setupMixed(1000000); // 1MB max memory usage
    memUsageSettings.setTempDir(new java.io.File(System.getProperty("java.io.tmpdir"))); // To OS temp

    // PDF coordinates are expressed in points: 72 points per inch, 25.4 millimeters per inch
    float pointsPerMm = 1 / (10 * 2.54f) * 72f;
    float marginPoints = margin * pointsPerMm;

    // The closer holds every merged source PDF: they must stay open until the target document is saved,
    // and must be released even if a file fails. Resources are closed in reverse order (target first).
    try (Closer closer = Closer.create();
            PDDocument doc = new PDDocument(memUsageSettings)) {
        // Add files
        for (File file : fileList) {
            Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId());
            try (InputStream storedFileInputStream = file.getPrivateKey() == null ? // Try to decrypt the file if we have a private key available
                    Files.newInputStream(storedFile) : EncryptionUtil.decryptInputStream(Files.newInputStream(storedFile), file.getPrivateKey())) {
                if (ImageUtil.isImage(file.getMimeType())) {
                    addImagePage(doc, file, storedFileInputStream, fitImageToPage, marginPoints);
                } else {
                    // Try to convert the file to PDF
                    InputStream pdfInputStream = convertToPdf(file, storedFileInputStream, false);
                    if (pdfInputStream != null) {
                        // This file is convertible to PDF, just add it to the end
                        try {
                            PDDocument mergeDoc = PDDocument.load(pdfInputStream, memUsageSettings);
                            closer.register(mergeDoc);
                            PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
                            pdfMergerUtility.appendDocument(doc, mergeDoc);
                        } finally {
                            pdfInputStream.close();
                        }
                    }
                    // All other non-PDF-convertible files are ignored
                }
            }
        }

        // Save to a temporary file
        try (TemporaryFileStream temporaryFileStream = new TemporaryFileStream()) {
            // Make sure the file is completely written and released before it is read back
            try (OutputStream outputStream = temporaryFileStream.openWriteStream()) {
                doc.save(outputStream);
            }
            return temporaryFileStream.openReadStream();
        }
    }
}

/**
 * Add an image as a new A4 page at the end of a PDF document.
 *
 * @param doc Target PDF document
 * @param file Image file (used for its MIME type and ID)
 * @param imageInputStream Unencrypted image input stream
 * @param fitImageToPage If true, scale the image to fill the page inside the margins
 * @param marginPoints Margins in points
 * @throws Exception If the image can't be decoded or drawn
 */
private static void addImagePage(PDDocument doc, File file, InputStream imageInputStream,
        boolean fitImageToPage, float marginPoints) throws Exception {
    // Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension
    PDImageXObject pdImage = null;
    if (MimeType.IMAGE_JPEG.equals(file.getMimeType())) {
        pdImage = JPEGFactory.createFromStream(doc, imageInputStream);
    } else if (MimeType.IMAGE_GIF.equals(file.getMimeType()) || MimeType.IMAGE_PNG.equals(file.getMimeType())) {
        // ImageIO returns null when no registered reader can decode the stream
        BufferedImage bim = ImageIO.read(imageInputStream);
        if (bim != null) {
            pdImage = LosslessFactory.createFromImage(doc, bim);
        }
    }
    if (pdImage == null) {
        // Fail with an explicit message instead of a NullPointerException further down
        throw new java.io.IOException("Unable to read image file " + file.getId()
                + " with MIME type " + file.getMimeType());
    }

    PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages
    try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
        if (fitImageToPage) {
            // Fill the page with the image
            float widthAvailable = page.getMediaBox().getWidth() - 2 * marginPoints;
            float heightAvailable = page.getMediaBox().getHeight() - 2 * marginPoints;

            // Compare page format and image format
            if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) {
                // Image is wider than the page: use the full width, anchored to the top margin
                float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight();
                contentStream.drawImage(pdImage, marginPoints, heightAvailable + marginPoints - imageHeight,
                        widthAvailable, imageHeight);
            } else {
                // Image is taller than the page: use the full height, anchored to the left margin
                float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth();
                contentStream.drawImage(pdImage, marginPoints, marginPoints,
                        imageWidth, heightAvailable);
            }
        } else {
            // Draw the image as is, from the top left corner inside the margins
            contentStream.drawImage(pdImage, marginPoints,
                    page.getMediaBox().getHeight() - pdImage.getHeight() - marginPoints);
        }
    }
    doc.addPage(page);
}
}

View File

@ -0,0 +1,55 @@
package com.sismics.docs.core.util;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.UUID;
/**
 * Utilities for writing and reading to a temporary file.
 * The file is created by the constructor and deleted by {@link #close()}.
 *
 * @author bgamard
 */
public class TemporaryFileStream implements Closeable {
    /**
     * Temporary file.
     */
    private final Path tempFile;

    /**
     * Construct a temporary file.
     *
     * @throws IOException If the temporary file can't be created
     */
    public TemporaryFileStream() throws IOException {
        tempFile = Files.createTempFile(UUID.randomUUID().toString(), ".tmp");
    }

    /**
     * Open a stream for writing.
     * The caller is responsible for closing the returned stream.
     *
     * @return OutputStream
     * @throws IOException If the temporary file can't be opened
     */
    public OutputStream openWriteStream() throws IOException {
        return Files.newOutputStream(tempFile);
    }

    /**
     * Open a stream for reading.
     * The caller is responsible for closing the returned stream.
     *
     * @return InputStream
     * @throws IOException If the temporary file can't be opened
     */
    public InputStream openReadStream() throws IOException {
        return Files.newInputStream(tempFile);
    }

    /**
     * Delete the temporary file.
     * Closing more than once has no effect, as required by {@link Closeable}.
     *
     * @throws IOException If the temporary file can't be deleted
     */
    @Override
    public void close() throws IOException {
        // deleteIfExists keeps a repeated close() from failing with NoSuchFileException
        Files.deleteIfExists(tempFile);
    }
}

View File

@ -6,9 +6,6 @@ package com.sismics.util.mime;
* @author jtremeaux
*/
public class MimeType {
public static final String IMAGE_X_ICON = "image/x-icon";
public static final String IMAGE_PNG = "image/png";
public static final String IMAGE_JPEG = "image/jpeg";

View File

@ -55,8 +55,6 @@ public class MimeTypeUtil {
} else if (headerBytes[0] == ((byte) 0x89) && headerBytes[1] == ((byte) 0x50) && headerBytes[2] == ((byte) 0x4e) && headerBytes[3] == ((byte) 0x47) &&
headerBytes[4] == ((byte) 0x0d) && headerBytes[5] == ((byte) 0x0a) && headerBytes[6] == ((byte) 0x1a) && headerBytes[7] == ((byte) 0x0a)) {
return MimeType.IMAGE_PNG;
} else if (headerBytes[0] == ((byte) 0x00) && headerBytes[1] == ((byte) 0x00) && headerBytes[2] == ((byte) 0x01) && headerBytes[3] == ((byte) 0x00)) {
return MimeType.IMAGE_X_ICON;
} else if (headerBytes[0] == ((byte) 0x25) && headerBytes[1] == ((byte) 0x50) && headerBytes[2] == ((byte) 0x44) && headerBytes[3] == ((byte) 0x46)) {
return MimeType.APPLICATION_PDF;
}
@ -80,8 +78,6 @@ public class MimeTypeUtil {
return "jpg";
case MimeType.IMAGE_PNG:
return "png";
case MimeType.IMAGE_X_ICON:
return "ico";
case MimeType.APPLICATION_PDF:
return "pdf";
case MimeType.OPEN_DOCUMENT_TEXT:

View File

@ -18,11 +18,6 @@ import com.google.common.io.ByteStreams;
* @author bgamard
*/
public class TestEncryptUtil {
/**
* Test private key.
*/
String pk = "OnceUponATime";
@Test
public void generatePrivateKeyTest() throws Exception {
String key = EncryptionUtil.generatePrivateKey();
@ -38,7 +33,7 @@ public class TestEncryptUtil {
} catch (IllegalArgumentException e) {
// NOP
}
Cipher cipher = EncryptionUtil.getEncryptionCipher(pk);
Cipher cipher = EncryptionUtil.getEncryptionCipher("OnceUponATime");
InputStream inputStream = new CipherInputStream(this.getClass().getResourceAsStream("/file/udhr.pdf"), cipher);
byte[] encryptedData = ByteStreams.toByteArray(inputStream);
byte[] assertData = ByteStreams.toByteArray(this.getClass().getResourceAsStream("/file/udhr_encrypted.pdf"));
@ -48,7 +43,8 @@ public class TestEncryptUtil {
@Test
public void decryptStreamTest() throws Exception {
InputStream inputStream = EncryptionUtil.decryptInputStream(this.getClass().getResourceAsStream("/file/udhr_encrypted.pdf"), pk);
InputStream inputStream = EncryptionUtil.decryptInputStream(
this.getClass().getResourceAsStream("/file/udhr_encrypted.pdf"), "OnceUponATime");
byte[] encryptedData = ByteStreams.toByteArray(inputStream);
byte[] assertData = ByteStreams.toByteArray(this.getClass().getResourceAsStream("/file/udhr.pdf"));

View File

@ -1,13 +1,14 @@
package com.sismics.docs.core.util;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import junit.framework.Assert;
import org.apache.pdfbox.io.IOUtils;
import org.junit.Test;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.util.mime.MimeType;
@ -20,11 +21,10 @@ import com.sismics.util.mime.MimeType;
public class TestFileUtil {
@Test
public void extractContentOpenDocumentTextTest() throws Exception {
try (InputStream inputStream = Resources.getResource("file/document.odt").openStream();
InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
try (InputStream inputStream = Resources.getResource("file/document.odt").openStream()) {
File file = new File();
file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) {
try (InputStream pdfInputStream = FileUtil.convertToPdf(file, inputStream, false)) {
String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream);
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
@ -33,14 +33,55 @@ public class TestFileUtil {
@Test
public void extractContentOfficeDocumentTest() throws Exception {
try (InputStream inputStream = Resources.getResource("file/document.docx").openStream();
InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
try (InputStream inputStream = Resources.getResource("file/document.docx").openStream()) {
File file = new File();
file.setMimeType(MimeType.OFFICE_DOCUMENT);
try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) {
try (InputStream pdfInputStream = FileUtil.convertToPdf(file, inputStream, false)) {
String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream);
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
}
}
/**
 * Smoke test of the merged PDF export: two JPEG images, a PDF read with a private key,
 * a DOCX and an ODT are copied to the storage directory and merged with images fitted
 * to the page and a 10 mm margin. The test passes when the conversion completes and
 * the resulting stream can be closed.
 */
@Test
public void convertToPdfTest() throws Exception {
    // Describe the five stored files
    File landscapeImage = new File();
    landscapeImage.setId("apollo_landscape");
    landscapeImage.setMimeType(MimeType.IMAGE_JPEG);

    File portraitImage = new File();
    portraitImage.setId("apollo_portrait");
    portraitImage.setMimeType(MimeType.IMAGE_JPEG);

    // The only file carrying a private key
    File encryptedPdf = new File();
    encryptedPdf.setId("udhr");
    encryptedPdf.setPrivateKey("OnceUponATime");
    encryptedPdf.setMimeType(MimeType.APPLICATION_PDF);

    File officeDocument = new File();
    officeDocument.setId("document_docx");
    officeDocument.setMimeType(MimeType.OFFICE_DOCUMENT);

    File openDocument = new File();
    openDocument.setId("document_odt");
    openDocument.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);

    try (InputStream landscapeStream = Resources.getResource("file/apollo_landscape.jpg").openStream();
            InputStream portraitStream = Resources.getResource("file/apollo_portrait.jpg").openStream();
            InputStream encryptedPdfStream = Resources.getResource("file/udhr_encrypted.pdf").openStream();
            InputStream officeStream = Resources.getResource("file/document.docx").openStream();
            InputStream openDocumentStream = Resources.getResource("file/document.odt").openStream()) {
        // Put each resource in the storage directory, named after the file ID
        Files.copy(landscapeStream, DirectoryUtil.getStorageDirectory().resolve("apollo_landscape"),
                StandardCopyOption.REPLACE_EXISTING);
        Files.copy(portraitStream, DirectoryUtil.getStorageDirectory().resolve("apollo_portrait"),
                StandardCopyOption.REPLACE_EXISTING);
        Files.copy(encryptedPdfStream, DirectoryUtil.getStorageDirectory().resolve("udhr"),
                StandardCopyOption.REPLACE_EXISTING);
        Files.copy(officeStream, DirectoryUtil.getStorageDirectory().resolve("document_docx"),
                StandardCopyOption.REPLACE_EXISTING);
        Files.copy(openDocumentStream, DirectoryUtil.getStorageDirectory().resolve("document_odt"),
                StandardCopyOption.REPLACE_EXISTING);

        // Merge everything, then release the resulting PDF stream
        InputStream mergedPdf = FileUtil.convertToPdf(
                Lists.newArrayList(landscapeImage, portraitImage, encryptedPdf, officeDocument, openDocument),
                true, 10);
        mergedPdf.close();
    }
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.7 KiB

View File

@ -30,7 +30,7 @@
<org.mindrot.jbcrypt>0.3m</org.mindrot.jbcrypt>
<org.apache.lucene.version>4.2.0</org.apache.lucene.version>
<org.imgscalr.imgscalr-lib.version>4.2</org.imgscalr.imgscalr-lib.version>
<org.apache.pdfbox.pdfbox.version>2.0.0-RC1</org.apache.pdfbox.pdfbox.version>
<org.apache.pdfbox.pdfbox.version>2.0.0-RC2</org.apache.pdfbox.pdfbox.version>
<org.bouncycastle.bcprov-jdk15on.version>1.53</org.bouncycastle.bcprov-jdk15on.version>
<joda-time.joda-time.version>2.9.1</joda-time.joda-time.version>
<org.hibernate.hibernate.version>4.1.0.Final</org.hibernate.hibernate.version>

View File

@ -150,7 +150,7 @@ public class FileResource extends BaseResource {
file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileInputStream));
// Convert to PDF if necessary (for thumbnail and text extraction)
InputStream pdfIntputStream = FileUtil.convertToPdf(fileInputStream, file);
InputStream pdfIntputStream = FileUtil.convertToPdf(file, fileInputStream, true);
// Save the file
FileUtil.save(fileInputStream, pdfIntputStream, file, user.getPrivateKey());