Closes #215: handle pptx files

This commit is contained in:
Benjamin Gamard 2018-03-20 22:46:56 +01:00
parent 945e619d55
commit 3613f6f8de
11 changed files with 272 additions and 10 deletions

View File

@ -31,7 +31,7 @@ Features
- Responsive user interface
- Optical character recognition
- Support image, PDF, ODT and DOCX files
- Support image, PDF, ODT, DOCX and PPTX files
- Video file support ![New!](https://www.sismics.com/public/img/new.png)
- Flexible search engine
- Full text search in all supported files

View File

@ -15,6 +15,7 @@ public class FormatHandlerUtil {
*/
private static final List<Class<? extends FormatHandler>> FORMAT_HANDLERS = Lists.newArrayList(
DocxFormatHandler.class,
PptxFormatHandler.class,
OdtFormatHandler.class,
VideoFormatHandler.class,
PdfFormatHandler.class,

View File

@ -0,0 +1,97 @@
package com.sismics.docs.core.util.format;
import com.google.common.io.Closer;
import com.sismics.util.mime.MimeType;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import java.awt.*;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * PPTX format handler.
 *
 * Renders slides to images (for thumbnails and PDF export) and extracts the
 * text of a presentation. The parsed presentation is cached on the instance
 * after the first load, so one handler instance should be used for one file.
 *
 * @author bgamard
 */
public class PptxFormatHandler implements FormatHandler {
    /**
     * Cached PPTX loaded file.
     */
    private XMLSlideShow slideShow;

    /**
     * Tell whether this handler supports the given MIME type.
     *
     * @param mimeType MIME type to check
     * @return True only for the Office presentation (PPTX) MIME type
     */
    @Override
    public boolean accept(String mimeType) {
        return MimeType.OFFICE_PRESENTATION.equals(mimeType);
    }

    /**
     * Generate a thumbnail from the first slide of the presentation.
     *
     * @param file PPTX file
     * @return Image of the first slide, or null if the presentation has no slide
     * @throws Exception If the file cannot be loaded
     */
    @Override
    public BufferedImage generateThumbnail(Path file) throws Exception {
        XMLSlideShow pptx = loadPPtxFile(file);
        if (pptx.getSlides().length > 0) {
            // The first slide is used as the thumbnail
            return generateImageFromSlide(pptx, 0);
        }

        // Nothing to render
        return null;
    }

    /**
     * Extract the text content of the presentation.
     *
     * @param language Content language (not used by this handler)
     * @param file PPTX file
     * @return Text returned by the PowerPoint extractor
     * @throws Exception If the file cannot be loaded
     */
    @Override
    public String extractContent(String language, Path file) throws Exception {
        XMLSlideShow pptx = loadPPtxFile(file);
        return new XSLFPowerPointExtractor(pptx).getText();
    }

    /**
     * Append the presentation to a PDF document, one PDF page per slide.
     * Each page is created with the presentation page size and contains the
     * slide rendered as a lossless image.
     *
     * @param file PPTX file
     * @param doc PDF document to append to
     * @param fitImageToPage Not used by this handler
     * @param margin Not used by this handler
     * @param memUsageSettings Not used by this handler
     * @param closer Not used by this handler
     * @throws Exception If the file cannot be loaded or the PDF cannot be written
     */
    @Override
    public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
        XMLSlideShow pptx = loadPPtxFile(file);
        XSLFSlide[] slides = pptx.getSlides();
        Dimension pgsize = pptx.getPageSize();

        for (int slideIndex = 0; slideIndex < slides.length; slideIndex++) {
            // One PDF page per slide
            PDPage page = new PDPage(new PDRectangle(pgsize.width, pgsize.height));
            try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
                BufferedImage bim = generateImageFromSlide(pptx, slideIndex);
                PDImageXObject pdImage = LosslessFactory.createFromImage(doc, bim);
                // Align the top of the image with the top of the page
                contentStream.drawImage(pdImage, 0, page.getMediaBox().getHeight() - pdImage.getHeight());
            }
            doc.addPage(page);
        }
    }

    /**
     * Load the PPTX file, or return the already loaded one.
     * The file is parsed only on the first call; later calls return the cached
     * presentation whatever file is given.
     *
     * @param file PPTX file
     * @return Loaded presentation
     * @throws Exception If the file cannot be read or parsed
     */
    private XMLSlideShow loadPPtxFile(Path file) throws Exception {
        if (slideShow == null) {
            try (InputStream inputStream = Files.newInputStream(file)) {
                slideShow = new XMLSlideShow(inputStream);
            }
        }
        return slideShow;
    }

    /**
     * Generate an image from a PPTX slide.
     * The image has the presentation page size and a white background.
     *
     * @param pptx PPTX
     * @param slideIndex Slide index
     * @return Image
     */
    private BufferedImage generateImageFromSlide(XMLSlideShow pptx, int slideIndex) {
        Dimension pgsize = pptx.getPageSize();
        BufferedImage img = new BufferedImage(pgsize.width, pgsize.height, BufferedImage.TYPE_INT_RGB);
        Graphics2D graphics = img.createGraphics();
        try {
            // White background, then the slide on top
            graphics.setPaint(Color.white);
            graphics.fill(new Rectangle2D.Float(0, 0, pgsize.width, pgsize.height));
            pptx.getSlides()[slideIndex].draw(graphics);
        } finally {
            // Release the graphics context even if the slide fails to render
            graphics.dispose();
        }
        return img;
    }
}

View File

@ -20,6 +20,10 @@ public class MimeType {
public static final String OFFICE_DOCUMENT = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
public static final String OFFICE_PRESENTATION = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
public static final String OFFICE_SHEET = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
public static final String TEXT_PLAIN = "text/plain";
public static final String TEXT_CSV = "text/csv";

View File

@ -145,6 +145,9 @@ public class MimeTypeUtil {
if (content.contains(MimeType.OFFICE_DOCUMENT)) {
mimeType = MimeType.OFFICE_DOCUMENT;
break;
} else if (content.contains(MimeType.OFFICE_PRESENTATION)) {
mimeType = MimeType.OFFICE_PRESENTATION;
break;
}
}

View File

@ -44,6 +44,16 @@ public class TestFileUtil {
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
@Test
public void extractContentPowerpointTest() throws Exception {
    // Locate the sample presentation in the test resources
    Path pptxPath = Paths.get(ClassLoader.getSystemResource("file/apache.pptx").toURI());

    // The guessed MIME type must resolve to the PPTX handler
    String guessedMimeType = MimeTypeUtil.guessMimeType(pptxPath, "apache.pptx");
    FormatHandler handler = FormatHandlerUtil.find(guessedMimeType);
    Assert.assertNotNull(handler);
    Assert.assertTrue(handler instanceof PptxFormatHandler);

    // The extracted text must contain the expected word
    String extractedText = handler.extractContent("eng", pptxPath);
    Assert.assertTrue(extractedText.contains("Scaling"));
}
@Test
public void extractContentPdf() throws Exception {
Path path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI());
@ -70,7 +80,8 @@ public class TestFileUtil {
InputStream inputStream1 = Resources.getResource("file/apollo_portrait.jpg").openStream();
InputStream inputStream2 = Resources.getResource("file/udhr_encrypted.pdf").openStream();
InputStream inputStream3 = Resources.getResource("file/document.docx").openStream();
InputStream inputStream4 = Resources.getResource("file/document.odt").openStream()) {
InputStream inputStream4 = Resources.getResource("file/document.odt").openStream();
InputStream inputStream5 = Resources.getResource("file/apache.pptx").openStream()) {
// Document
DocumentDto documentDto = new DocumentDto();
documentDto.setTitle("My super document 1");
@ -117,9 +128,16 @@ public class TestFileUtil {
file4.setId("document_odt");
file4.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
// Sixth file
Files.copy(inputStream5, DirectoryUtil.getStorageDirectory().resolve("document_pptx"), StandardCopyOption.REPLACE_EXISTING);
File file5 = new File();
file5.setId("document_pptx");
file5.setMimeType(MimeType.OFFICE_PRESENTATION);
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
PdfUtil.convertToPdf(documentDto, Lists.newArrayList(file0, file1, file2, file3, file4), true, true, 10, outputStream);
PdfUtil.convertToPdf(documentDto, Lists.newArrayList(file0, file1, file2, file3, file4, file5), true, true, 10, outputStream);
Assert.assertTrue(outputStream.toByteArray().length > 0);
com.google.common.io.Files.write(outputStream.toByteArray(), new java.io.File("C:\\Users\\Jendib\\Downloads\\test.pdf"));
}
}
}

View File

@ -23,5 +23,9 @@ public class TestMimeTypeUtil {
// Detect DOCX files
path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessMimeType(path, "document.odt"));
// Detect PPTX files
path = Paths.get(ClassLoader.getSystemResource("file/apache.pptx").toURI());
Assert.assertEquals(MimeType.OFFICE_PRESENTATION, MimeTypeUtil.guessMimeType(path, "apache.pptx"));
}
}

Binary file not shown.

View File

@ -49,7 +49,7 @@ public class ClientUtil {
.param("username", username)
.param("email", username + "@docs.com")
.param("password", "12345678")
.param("storage_quota", "1000000")), JsonObject.class); // 1MB quota
.param("storage_quota", "10000000")), JsonObject.class); // 10MB quota
// Add to groups
for (String groupName : groupNameList) {

View File

@ -267,7 +267,12 @@ public class TestDocumentResource extends BaseJerseyTest {
Assert.assertEquals(document2Id, json.getString("id"));
// Export a document in PDF format
Response response = target().path("/document/" + document1Id).request()
Response response = target().path("/document/" + document1Id + "/pdf")
.queryParam("margin", "10")
.queryParam("metadata", "true")
.queryParam("comments", "true")
.queryParam("fitimagetopage", "true")
.request()
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, document1Token)
.get();
InputStream is = (InputStream) response.getEntity();
@ -394,6 +399,20 @@ public class TestDocumentResource extends BaseJerseyTest {
byte[] fileBytes = ByteStreams.toByteArray(is);
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
// Export a document in PDF format
response = target().path("/document/" + document1Id + "/pdf")
.queryParam("margin", "10")
.queryParam("metadata", "true")
.queryParam("comments", "true")
.queryParam("fitimagetopage", "true")
.request()
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken)
.get();
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
is = (InputStream) response.getEntity();
byte[] pdfBytes = ByteStreams.toByteArray(is);
Assert.assertTrue(pdfBytes.length > 0);
}
/**
@ -440,6 +459,20 @@ public class TestDocumentResource extends BaseJerseyTest {
byte[] fileBytes = ByteStreams.toByteArray(is);
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
// Export a document in PDF format
response = target().path("/document/" + document1Id + "/pdf")
.queryParam("margin", "10")
.queryParam("metadata", "true")
.queryParam("comments", "true")
.queryParam("fitimagetopage", "true")
.request()
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken)
.get();
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
is = (InputStream) response.getEntity();
byte[] pdfBytes = ByteStreams.toByteArray(is);
Assert.assertTrue(pdfBytes.length > 0);
}
/**
@ -486,6 +519,20 @@ public class TestDocumentResource extends BaseJerseyTest {
byte[] fileBytes = ByteStreams.toByteArray(is);
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
// Export a document in PDF format
response = target().path("/document/" + document1Id + "/pdf")
.queryParam("margin", "10")
.queryParam("metadata", "true")
.queryParam("comments", "true")
.queryParam("fitimagetopage", "true")
.request()
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken)
.get();
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
is = (InputStream) response.getEntity();
byte[] pdfBytes = ByteStreams.toByteArray(is);
Assert.assertTrue(pdfBytes.length > 0);
}
/**
@ -532,6 +579,20 @@ public class TestDocumentResource extends BaseJerseyTest {
byte[] fileBytes = ByteStreams.toByteArray(is);
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
// Export a document in PDF format
response = target().path("/document/" + document1Id + "/pdf")
.queryParam("margin", "10")
.queryParam("metadata", "true")
.queryParam("comments", "true")
.queryParam("fitimagetopage", "true")
.request()
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
.get();
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
is = (InputStream) response.getEntity();
byte[] pdfBytes = ByteStreams.toByteArray(is);
Assert.assertTrue(pdfBytes.length > 0);
}
/**
@ -543,12 +604,12 @@ public class TestDocumentResource extends BaseJerseyTest {
public void testVideoExtraction() throws Exception {
// Login document_video
clientUtil.createUser("document_video");
String documentPlainToken = clientUtil.login("document_video");
String documentVideoToken = clientUtil.login("document_video");
// Create a document
long create1Date = new Date().getTime();
JsonObject json = target().path("/document").request()
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
.put(Entity.form(new Form()
.param("title", "My super title document 1")
.param("description", "My super description for document 1")
@ -558,13 +619,13 @@ public class TestDocumentResource extends BaseJerseyTest {
Assert.assertNotNull(document1Id);
// Add a video file
String file1Id = clientUtil.addFileToDocument("file/video.webm", "video.webm", documentPlainToken, document1Id);
String file1Id = clientUtil.addFileToDocument("file/video.webm", "video.webm", documentVideoToken, document1Id);
// Search documents by query in full content
json = target().path("/document/list")
.queryParam("search", "full:vp9")
.request()
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
.get(JsonObject.class);
Assert.assertTrue(json.getJsonArray("documents").size() == 1);
@ -572,12 +633,86 @@ public class TestDocumentResource extends BaseJerseyTest {
Response response = target().path("/file/" + file1Id + "/data")
.queryParam("size", "thumb")
.request()
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
.get();
InputStream is = (InputStream) response.getEntity();
byte[] fileBytes = ByteStreams.toByteArray(is);
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
// Export a document in PDF format
response = target().path("/document/" + document1Id + "/pdf")
.queryParam("margin", "10")
.queryParam("metadata", "true")
.queryParam("comments", "true")
.queryParam("fitimagetopage", "true")
.request()
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
.get();
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
is = (InputStream) response.getEntity();
byte[] pdfBytes = ByteStreams.toByteArray(is);
Assert.assertTrue(pdfBytes.length > 0);
}
/**
 * Test PPTX extraction.
 *
 * Attaches a PPTX file to a new document, then checks full text search,
 * thumbnail generation and PDF export for that document.
 *
 * @throws Exception e
 */
@Test
public void testPptxExtraction() throws Exception {
    // Login document_pptx
    clientUtil.createUser("document_pptx");
    String documentPptxToken = clientUtil.login("document_pptx");

    // Create a document
    long create1Date = new Date().getTime();
    JsonObject json = target().path("/document").request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
            .put(Entity.form(new Form()
                    .param("title", "My super title document 1")
                    .param("description", "My super description for document 1")
                    .param("language", "eng")
                    .param("create_date", Long.toString(create1Date))), JsonObject.class);
    String document1Id = json.getString("id");
    Assert.assertNotNull(document1Id);

    // Add a PPTX file
    String file1Id = clientUtil.addFileToDocument("file/apache.pptx", "apache.pptx", documentPptxToken, document1Id);

    // Search documents by query in full content
    json = target().path("/document/list")
            .queryParam("search", "full:scaling")
            .request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
            .get(JsonObject.class);
    // assertEquals reports the actual number of documents on failure
    Assert.assertEquals(1, json.getJsonArray("documents").size());

    // Get the file thumbnail data
    Response response = target().path("/file/" + file1Id + "/data")
            .queryParam("size", "thumb")
            .request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
            .get();
    Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
    InputStream is = (InputStream) response.getEntity();
    byte[] fileBytes = ByteStreams.toByteArray(is);
    // Only non-emptiness is checked: the exact size of the rendered slide is not pinned
    Assert.assertTrue(fileBytes.length > 0);
    Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));

    // Export a document in PDF format
    response = target().path("/document/" + document1Id + "/pdf")
            .queryParam("margin", "10")
            .queryParam("metadata", "true")
            .queryParam("comments", "true")
            .queryParam("fitimagetopage", "true")
            .request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
            .get();
    Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
    is = (InputStream) response.getEntity();
    byte[] pdfBytes = ByteStreams.toByteArray(is);
    Assert.assertTrue(pdfBytes.length > 0);
}
/**

Binary file not shown.