#55: Refactoring

This commit is contained in:
jendib 2015-12-20 02:23:35 +01:00
parent eb61b06784
commit 0591f8a39f
4 changed files with 255 additions and 219 deletions

View File

@ -1,47 +1,27 @@
package com.sismics.docs.core.util;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import javax.crypto.Cipher;
import javax.crypto.CipherInputStream;
import javax.crypto.CipherOutputStream;
import javax.imageio.ImageIO;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.imgscalr.Scalr;
import org.imgscalr.Scalr.Method;
import org.imgscalr.Scalr.Mode;
import org.odftoolkit.odfdom.converter.pdf.PdfConverter;
import org.odftoolkit.odfdom.converter.pdf.PdfOptions;
import org.odftoolkit.odfdom.doc.OdfTextDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.io.Closer;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.tess4j.Tesseract;
import com.sismics.util.ImageUtil;
import com.sismics.util.mime.MimeType;
/**
* File entity utilities.
@ -69,7 +49,7 @@ public class FileUtil {
if (ImageUtil.isImage(file.getMimeType())) {
content = ocrFile(inputStream, document);
} else if (pdfInputStream != null) {
content = extractPdf(pdfInputStream);
content = PdfUtil.extractPdf(pdfInputStream);
}
return content;
@ -109,99 +89,6 @@ public class FileUtil {
return content;
}
/**
 * Read the textual content of a PDF document.
 *
 * @param inputStream Unencrypted input stream
 * @return Text of the PDF, or null if the PDF could not be read
 */
private static String extractPdf(InputStream inputStream) {
    PDDocument loadedPdf = null;
    try {
        PDFTextStripper textStripper = new PDFTextStripper();
        loadedPdf = PDDocument.load(inputStream);
        return textStripper.getText(loadedPdf);
    } catch (IOException e) {
        log.error("Error while extracting text from the PDF", e);
        return null;
    } finally {
        // Closing is best effort: a failure here must not hide the extracted text
        if (loadedPdf != null) {
            try {
                loadedPdf.close();
            } catch (IOException ignored) {
                // NOP
            }
        }
    }
}
/**
 * Provide a PDF version of a file when one is available.
 * A PDF input is handed back untouched, Office and OpenDocument text files
 * are converted, every other type yields null.
 *
 * @param file File
 * @param inputStream InputStream
 * @param reset Reset the stream after usage
 * @return PDF input stream, or null if the file cannot be converted
 * @throws Exception
 */
public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception {
    InputStream pdfStream;
    if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
        // Already a PDF: the caller gets its own stream back
        pdfStream = inputStream;
    } else if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
        pdfStream = convertOfficeDocument(inputStream, reset);
    } else if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
        pdfStream = convertOpenDocumentText(inputStream, reset);
    } else {
        // PDF conversion not necessary/possible
        pdfStream = null;
    }
    return pdfStream;
}
/**
 * Render an OpenDocument text file as an in-memory PDF.
 *
 * @param inputStream Unencrypted input stream
 * @param reset If true, inputStream.reset() is called once the conversion is done
 * @return PDF input stream
 * @throws Exception
 */
private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception {
    OdfTextDocument odfDocument = OdfTextDocument.loadDocument(inputStream);
    ByteArrayOutputStream pdfBytes = new ByteArrayOutputStream();
    PdfOptions conversionOptions = PdfOptions.create();
    PdfConverter.getInstance().convert(odfDocument, pdfBytes, conversionOptions);

    // Rewind the source so the caller can read it again
    if (reset) {
        inputStream.reset();
    }

    byte[] pdfContent = pdfBytes.toByteArray();
    return new ByteArrayInputStream(pdfContent);
}
/**
 * Render an Office (XWPF) document as an in-memory PDF.
 *
 * @param inputStream Unencrypted input stream
 * @param reset If true, inputStream.reset() is called once the conversion is done
 * @return PDF input stream
 * @throws Exception
 */
private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception {
    XWPFDocument officeDocument = new XWPFDocument(inputStream);
    ByteArrayOutputStream pdfBytes = new ByteArrayOutputStream();
    // Fully qualified: the unqualified PdfOptions/PdfConverter names are the ODF ones
    org.apache.poi.xwpf.converter.pdf.PdfOptions conversionOptions =
            org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
    org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance()
            .convert(officeDocument, pdfBytes, conversionOptions);

    // Rewind the source so the caller can read it again
    if (reset) {
        inputStream.reset();
    }

    byte[] pdfContent = pdfBytes.toByteArray();
    return new ByteArrayInputStream(pdfContent);
}
/**
* Save a file on the storage filesystem.
*
@ -237,15 +124,8 @@ public class FileUtil {
inputStream.reset();
} else if(pdfInputStream != null) {
// Generate preview from the first page of the PDF
PDDocument pdfDocument = null;
try {
pdfDocument = PDDocument.load(pdfInputStream);
PDFRenderer renderer = new PDFRenderer(pdfDocument);
image = renderer.renderImage(0);
pdfInputStream.reset();
} finally {
pdfDocument.close();
}
image = PdfUtil.renderFirstPage(pdfInputStream);
pdfInputStream.reset();
}
if (image != null) {
@ -289,94 +169,4 @@ public class FileUtil {
Files.delete(thumbnailFile);
}
}
/**
* Convert a document and its files to a merged PDF file.
* Images are drawn on their own A4 page, files convertible to PDF are appended,
* and every other file is ignored. The result is written to a temporary file stream.
*
* @param fileList List of files
* @param fitImageToPage Fit images to the page
* @param margin Margins in millimeters
* @return PDF input stream
* @throws Exception If a file cannot be read, converted or merged
*/
public static InputStream convertToPdf(List<File> fileList, boolean fitImageToPage, int margin) throws Exception {
// TODO PDF Export: Option to add a front page with:
// document title, document description, creator, date created, language,
// list of all files (and information if it is in this document or not)
// TODO PDF Export: Option to add the comments
// Create a blank PDF
// The closer collects the merged source documents so they can be closed after the save.
// NOTE(review): closer.close() is only reached on the success path; if anything throws
// before it, the documents registered so far are never closed.
Closer closer = Closer.create();
MemoryUsageSetting memUsageSettings = MemoryUsageSetting.setupMixed(1000000); // 1MB max memory usage
memUsageSettings.setTempDir(new java.io.File(System.getProperty("java.io.tmpdir"))); // To OS temp
// Despite its name this is 72 / 25.4: it is multiplied with the millimeter margin
// to get the coordinates passed to drawImage (presumably PDF points, 1/72 inch).
float mmPerInch = 1 / (10 * 2.54f) * 72f;
try (PDDocument doc = new PDDocument(memUsageSettings)) {
// Add files
for (File file : fileList) {
Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId());
try (InputStream storedFileInputStream = file.getPrivateKey() == null ? // Try to decrypt the file if we have a private key available
Files.newInputStream(storedFile) : EncryptionUtil.decryptInputStream(Files.newInputStream(storedFile), file.getPrivateKey())) {
if (ImageUtil.isImage(file.getMimeType())) {
PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
// Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension
// NOTE(review): pdImage stays null for any image type other than JPEG, GIF or PNG,
// and ImageIO.read may return null; both cases end in a NullPointerException below.
PDImageXObject pdImage = null;
if (file.getMimeType().equals(MimeType.IMAGE_JPEG)) {
pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream);
} else if (file.getMimeType().equals(MimeType.IMAGE_GIF) || file.getMimeType().equals(MimeType.IMAGE_PNG)) {
BufferedImage bim = ImageIO.read(storedFileInputStream);
pdImage = LosslessFactory.createFromImage(doc, bim);
}
if (fitImageToPage) {
// Fill the page with the image
float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * mmPerInch;
float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * mmPerInch;
// Compare page format and image format
if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) {
// Image is proportionally wider than the available area: use the full width
// and place the image against the top margin
float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight();
contentStream.drawImage(pdImage, margin * mmPerInch, heightAvailable + margin * mmPerInch - imageHeight,
widthAvailable, imageHeight);
} else {
// Image is proportionally taller: use the full height, starting at the left margin
float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth();
contentStream.drawImage(pdImage, margin * mmPerInch, margin * mmPerInch,
imageWidth, heightAvailable);
}
} else {
// Draw the image as is
contentStream.drawImage(pdImage, margin * mmPerInch,
page.getMediaBox().getHeight() - pdImage.getHeight() - margin * mmPerInch);
}
}
// The page is only added once its content stream has been closed
doc.addPage(page);
} else {
// Try to convert the file to PDF
InputStream pdfInputStream = convertToPdf(file, storedFileInputStream, false);
if (pdfInputStream != null) {
// This file is convertible to PDF, just add it to the end
try {
PDDocument mergeDoc = PDDocument.load(pdfInputStream, memUsageSettings);
// The merged document is kept open until the target has been saved
closer.register(mergeDoc);
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.appendDocument(doc, mergeDoc);
} finally {
pdfInputStream.close();
}
}
// All other non-PDF-convertible files are ignored
}
}
}
// Save to a temporary file
try (TemporaryFileStream temporaryFileStream = new TemporaryFileStream()) {
doc.save(temporaryFileStream.openWriteStream());
closer.close(); // Close all remaining opened PDF
return temporaryFileStream.openReadStream();
}
}
}
}

View File

@ -0,0 +1,245 @@
package com.sismics.docs.core.util;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.odftoolkit.odfdom.converter.pdf.PdfConverter;
import org.odftoolkit.odfdom.converter.pdf.PdfOptions;
import org.odftoolkit.odfdom.doc.OdfTextDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.io.Closer;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.util.ImageUtil;
import com.sismics.util.mime.MimeType;
/**
* PDF utilities.
*
* @author bgamard
*/
public class PdfUtil {
/**
* Logger.
*/
private static final Logger log = LoggerFactory.getLogger(PdfUtil.class);
/**
* Extract text from a PDF.
*
* @param inputStream Unencrypted input stream
* @return Content extracted
*/
public static String extractPdf(InputStream inputStream) {
String content = null;
PDDocument pdfDocument = null;
try {
PDFTextStripper stripper = new PDFTextStripper();
pdfDocument = PDDocument.load(inputStream);
content = stripper.getText(pdfDocument);
} catch (IOException e) {
log.error("Error while extracting text from the PDF", e);
} finally {
if (pdfDocument != null) {
try {
pdfDocument.close();
} catch (IOException e) {
// NOP
}
}
}
return content;
}
/**
* Convert a file to PDF if necessary.
*
* @param file File
* @param inputStream InputStream
* @param reset Reset the stream after usage
* @return PDF input stream
* @throws Exception
*/
public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception {
if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
// It's already PDF, just return the input
return inputStream;
}
if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
return convertOfficeDocument(inputStream, reset);
}
if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
return convertOpenDocumentText(inputStream, reset);
}
// PDF conversion not necessary/possible
return null;
}
/**
* Convert an open document text file to PDF.
*
* @param inputStream Unencrypted input stream
* @param reset Reset the stream after usage
* @return PDF input stream
* @throws Exception
*/
private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception {
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
PdfOptions options = PdfOptions.create();
PdfConverter.getInstance().convert(document, pdfOutputStream, options);
if (reset) {
inputStream.reset();
}
return new ByteArrayInputStream(pdfOutputStream.toByteArray());
}
/**
* Convert an Office document to PDF.
*
* @param inputStream Unencrypted input stream
* @param reset Reset the stream after usage
* @return PDF input stream
* @throws Exception
*/
private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception {
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
XWPFDocument document = new XWPFDocument(inputStream);
org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, pdfOutputStream, options);
if (reset) {
inputStream.reset();
}
return new ByteArrayInputStream(pdfOutputStream.toByteArray());
}
/**
* Convert a document and its files to a merged PDF file.
*
* @param fileList List of files
* @param fitImageToPage Fit images to the page
* @param margin Margins in millimeters
* @return PDF input stream
* @throws IOException
*/
public static InputStream convertToPdf(List<File> fileList, boolean fitImageToPage, int margin) throws Exception {
// TODO PDF Export: Option to add a front page with:
// document title, document description, creator, date created, language,
// list of all files (and information if it is in this document or not)
// TODO PDF Export: Option to add the comments
// Create a blank PDF
Closer closer = Closer.create();
MemoryUsageSetting memUsageSettings = MemoryUsageSetting.setupMixed(1000000); // 1MB max memory usage
memUsageSettings.setTempDir(new java.io.File(System.getProperty("java.io.tmpdir"))); // To OS temp
float mmPerInch = 1 / (10 * 2.54f) * 72f;
try (PDDocument doc = new PDDocument(memUsageSettings)) {
// Add files
for (File file : fileList) {
Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId());
try (InputStream storedFileInputStream = file.getPrivateKey() == null ? // Try to decrypt the file if we have a private key available
Files.newInputStream(storedFile) : EncryptionUtil.decryptInputStream(Files.newInputStream(storedFile), file.getPrivateKey())) {
if (ImageUtil.isImage(file.getMimeType())) {
PDPage page = new PDPage(PDRectangle.A4); // Images into A4 pages
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
// Read the image using the correct handler. PDFBox can't do it because it relies wrongly on file extension
PDImageXObject pdImage = null;
if (file.getMimeType().equals(MimeType.IMAGE_JPEG)) {
pdImage = JPEGFactory.createFromStream(doc, storedFileInputStream);
} else if (file.getMimeType().equals(MimeType.IMAGE_GIF) || file.getMimeType().equals(MimeType.IMAGE_PNG)) {
BufferedImage bim = ImageIO.read(storedFileInputStream);
pdImage = LosslessFactory.createFromImage(doc, bim);
}
// Do we want to fill the page with the image?
if (fitImageToPage) {
// Fill the page with the image
float widthAvailable = page.getMediaBox().getWidth() - 2 * margin * mmPerInch;
float heightAvailable = page.getMediaBox().getHeight() - 2 * margin * mmPerInch;
// Compare page format and image format
if (widthAvailable / heightAvailable < (float) pdImage.getWidth() / (float) pdImage.getHeight()) {
float imageHeight = widthAvailable / pdImage.getWidth() * pdImage.getHeight();
contentStream.drawImage(pdImage, margin * mmPerInch, heightAvailable + margin * mmPerInch - imageHeight,
widthAvailable, imageHeight);
} else {
float imageWidth = heightAvailable / pdImage.getHeight() * pdImage.getWidth();
contentStream.drawImage(pdImage, margin * mmPerInch, margin * mmPerInch,
imageWidth, heightAvailable);
}
} else {
// Draw the image as is
contentStream.drawImage(pdImage, margin * mmPerInch,
page.getMediaBox().getHeight() - pdImage.getHeight() - margin * mmPerInch);
}
}
doc.addPage(page);
} else {
// Try to convert the file to PDF
InputStream pdfInputStream = convertToPdf(file, storedFileInputStream, false);
if (pdfInputStream != null) {
// This file is convertible to PDF, just add it to the end
try {
PDDocument mergeDoc = PDDocument.load(pdfInputStream, memUsageSettings);
closer.register(mergeDoc);
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.appendDocument(doc, mergeDoc);
} finally {
pdfInputStream.close();
}
}
// All other non-PDF-convertible files are ignored
}
}
}
// Save to a temporary file
try (TemporaryFileStream temporaryFileStream = new TemporaryFileStream()) {
doc.save(temporaryFileStream.openWriteStream());
closer.close(); // Close all remaining opened PDF
return temporaryFileStream.openReadStream();
}
}
}
/**
* Render the first page of a PDF.
*
* @param inputStream PDF document
* @return Render of the first page
* @throws IOException
*/
public static BufferedImage renderFirstPage(InputStream inputStream) throws IOException {
try (PDDocument pdfDocument = PDDocument.load(inputStream)) {
PDFRenderer renderer = new PDFRenderer(pdfDocument);
return renderer.renderImage(0);
}
}
}

View File

@ -4,8 +4,6 @@ import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import junit.framework.Assert;
import org.junit.Test;
import com.google.common.collect.Lists;
@ -13,6 +11,8 @@ import com.google.common.io.Resources;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.util.mime.MimeType;
import junit.framework.Assert;
/**
* Test of the file entity utilities.
*
@ -24,7 +24,7 @@ public class TestFileUtil {
try (InputStream inputStream = Resources.getResource("file/document.odt").openStream()) {
File file = new File();
file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
try (InputStream pdfInputStream = FileUtil.convertToPdf(file, inputStream, false)) {
try (InputStream pdfInputStream = PdfUtil.convertToPdf(file, inputStream, false)) {
String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream);
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
@ -36,7 +36,7 @@ public class TestFileUtil {
try (InputStream inputStream = Resources.getResource("file/document.docx").openStream()) {
File file = new File();
file.setMimeType(MimeType.OFFICE_DOCUMENT);
try (InputStream pdfInputStream = FileUtil.convertToPdf(file, inputStream, false)) {
try (InputStream pdfInputStream = PdfUtil.convertToPdf(file, inputStream, false)) {
String content = FileUtil.extractContent(null, file, inputStream, pdfInputStream);
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
@ -81,7 +81,7 @@ public class TestFileUtil {
file4.setId("document_odt");
file4.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
FileUtil.convertToPdf(Lists.newArrayList(file0, file1, file2, file3, file4), true, 10).close();
PdfUtil.convertToPdf(Lists.newArrayList(file0, file1, file2, file3, file4), true, 10).close();
}
}
}

View File

@ -53,6 +53,7 @@ import com.sismics.docs.core.model.jpa.User;
import com.sismics.docs.core.util.DirectoryUtil;
import com.sismics.docs.core.util.EncryptionUtil;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.docs.core.util.PdfUtil;
import com.sismics.rest.exception.ClientException;
import com.sismics.rest.exception.ForbiddenClientException;
import com.sismics.rest.exception.ServerException;
@ -150,7 +151,7 @@ public class FileResource extends BaseResource {
file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileInputStream));
// Convert to PDF if necessary (for thumbnail and text extraction)
InputStream pdfIntputStream = FileUtil.convertToPdf(file, fileInputStream, true);
InputStream pdfIntputStream = PdfUtil.convertToPdf(file, fileInputStream, true);
// Save the file
FileUtil.save(fileInputStream, pdfIntputStream, file, user.getPrivateKey());