Closes #215: handle pptx files

2025-05-08 18:53:22 +02:00 · 2018-03-20 22:46:56 +01:00 · 2018-03-20 22:46:56 +01:00 · 3613f6f8de
commit 3613f6f8de
parent 945e619d55
11 changed files with 272 additions and 10 deletions
--- a/README.md
+++ b/README.md
@ -31,7 +31,7 @@ Features

 - Responsive user interface
 - Optical character recognition
- Support image, PDF, ODT and DOCX files
+- Support image, PDF, ODT, DOCX and PPTX files
 - Video file support ![New!](https://www.sismics.com/public/img/new.png)
 - Flexible search engine
 - Full text search in all supported files
--- a/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandlerUtil.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandlerUtil.java
@ -15,6 +15,7 @@ public class FormatHandlerUtil {
     */
    private static final List<Class<? extends FormatHandler>> FORMAT_HANDLERS = Lists.newArrayList(
            DocxFormatHandler.class,
+            PptxFormatHandler.class,
            OdtFormatHandler.class,
            VideoFormatHandler.class,
            PdfFormatHandler.class,
--- a/docs-core/src/main/java/com/sismics/docs/core/util/format/PptxFormatHandler.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/PptxFormatHandler.java
@ -0,0 +1,97 @@
+package com.sismics.docs.core.util.format;
+
+import com.google.common.io.Closer;
+import com.sismics.util.mime.MimeType;
+import org.apache.pdfbox.io.MemoryUsageSetting;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageContentStream;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFSlide;
+
+import java.awt.*;
+import java.awt.geom.Rectangle2D;
+import java.awt.image.BufferedImage;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/**
+ * PPTX format handler.
+ *
+ * Renders slides to images (for thumbnails and PDF pages) and extracts the text content
+ * of a presentation. The parsed slide show is cached on the instance, keyed by the file it
+ * was loaded from, so successive calls on the same file do not re-parse it.
+ *
+ * @author bgamard
+ */
+public class PptxFormatHandler implements FormatHandler {
+    /**
+     * Cached PPTX loaded file.
+     */
+    private XMLSlideShow slideShow;
+
+    /**
+     * File the cached slide show was loaded from.
+     */
+    private Path slideShowFile;
+
+    @Override
+    public boolean accept(String mimeType) {
+        return MimeType.OFFICE_PRESENTATION.equals(mimeType);
+    }
+
+    /**
+     * Generate a thumbnail from the first slide.
+     *
+     * @param file PPTX file
+     * @return Image of the first slide, or null if the presentation has no slide
+     * @throws Exception If the file cannot be read or parsed
+     */
+    @Override
+    public BufferedImage generateThumbnail(Path file) throws Exception {
+        XMLSlideShow pptx = loadPPtxFile(file);
+        if (pptx.getSlides().length > 0) {
+            return generateImageFromSlide(pptx, 0);
+        }
+
+        return null;
+    }
+
+    /**
+     * Extract the text content of the presentation.
+     *
+     * @param language Language (not used by this handler)
+     * @param file PPTX file
+     * @return Text returned by the POI PowerPoint extractor
+     * @throws Exception If the file cannot be read or parsed
+     */
+    @Override
+    public String extractContent(String language, Path file) throws Exception {
+        XMLSlideShow pptx = loadPPtxFile(file);
+        return new XSLFPowerPointExtractor(pptx).getText();
+    }
+
+    /**
+     * Append the presentation to a PDF document, one page per slide.
+     * Each page has the slide dimensions and contains the slide rendered as a lossless image.
+     * The fitImageToPage, margin, memUsageSettings and closer parameters are not used by this handler.
+     *
+     * @param file PPTX file
+     * @param doc PDF document to append to
+     * @param fitImageToPage Not used
+     * @param margin Not used
+     * @param memUsageSettings Not used
+     * @param closer Not used
+     * @throws Exception If the file cannot be read or a page cannot be written
+     */
+    @Override
+    public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
+        XMLSlideShow pptx = loadPPtxFile(file);
+        XSLFSlide[] slides = pptx.getSlides();
+        Dimension pgsize = pptx.getPageSize();
+        for (int slideIndex = 0; slideIndex < slides.length; slideIndex++) {
+            // One PDF page per slide
+            PDPage page = new PDPage(new PDRectangle(pgsize.width, pgsize.height));
+            try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
+                BufferedImage bim = generateImageFromSlide(pptx, slideIndex);
+                PDImageXObject pdImage = LosslessFactory.createFromImage(doc, bim);
+                // PDF origin is bottom-left: offset so the image top meets the page top
+                contentStream.drawImage(pdImage, 0, page.getMediaBox().getHeight() - pdImage.getHeight());
+            }
+            doc.addPage(page);
+        }
+    }
+
+    /**
+     * Load a PPTX file, reusing the cached slide show when the same file is requested again.
+     *
+     * @param file PPTX file
+     * @return Parsed slide show
+     * @throws Exception If the file cannot be read or parsed
+     */
+    private XMLSlideShow loadPPtxFile(Path file) throws Exception {
+        // Reload when nothing is cached or when a different file is requested,
+        // otherwise a reused handler would return the previous presentation
+        if (slideShow == null || !file.equals(slideShowFile)) {
+            try (InputStream inputStream = Files.newInputStream(file)) {
+                slideShow = new XMLSlideShow(inputStream);
+                slideShowFile = file;
+            }
+        }
+        return slideShow;
+    }
+
+    /**
+     * Generate an image from a PPTX slide.
+     *
+     * @param pptx PPTX
+     * @param slideIndex Slide index
+     * @return Image
+     */
+    private BufferedImage generateImageFromSlide(XMLSlideShow pptx, int slideIndex) {
+        Dimension pgsize = pptx.getPageSize();
+        BufferedImage img = new BufferedImage(pgsize.width, pgsize.height,BufferedImage.TYPE_INT_RGB);
+        Graphics2D graphics = img.createGraphics();
+        try {
+            // White background before drawing the slide
+            graphics.setPaint(Color.white);
+            graphics.fill(new Rectangle2D.Float(0, 0, pgsize.width, pgsize.height));
+            pptx.getSlides()[slideIndex].draw(graphics);
+        } finally {
+            // Release the graphics context even if the slide rendering fails
+            graphics.dispose();
+        }
+        return img;
+    }
+}
--- a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
+++ b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
@ -20,6 +20,10 @@ public class MimeType {
    
    public static final String OFFICE_DOCUMENT = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

+    public static final String OFFICE_PRESENTATION = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
+
+    public static final String OFFICE_SHEET = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
+
    public static final String TEXT_PLAIN = "text/plain";

    public static final String TEXT_CSV = "text/csv";
--- a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
+++ b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
@ -145,6 +145,9 @@ public class MimeTypeUtil {
                    if (content.contains(MimeType.OFFICE_DOCUMENT)) {
                        mimeType =  MimeType.OFFICE_DOCUMENT;
                        break;
+                    } else if (content.contains(MimeType.OFFICE_PRESENTATION)) {
+                        mimeType = MimeType.OFFICE_PRESENTATION;
+                        break;
                    }
                }
    
--- a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java
+++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java
@ -44,6 +44,16 @@ public class TestFileUtil {
        Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
    }

+    @Test
+    public void extractContentPowerpointTest() throws Exception {
+        // Locate the sample presentation on the test classpath
+        Path pptxPath = Paths.get(ClassLoader.getSystemResource("file/apache.pptx").toURI());
+
+        // The guessed MIME type must resolve to the PPTX handler
+        FormatHandler handler = FormatHandlerUtil.find(
+                MimeTypeUtil.guessMimeType(pptxPath, "apache.pptx"));
+        Assert.assertNotNull(handler);
+        Assert.assertTrue(handler instanceof PptxFormatHandler);
+
+        // The extracted text must contain a word expected in the sample file
+        String extractedText = handler.extractContent("eng", pptxPath);
+        Assert.assertTrue(extractedText.contains("Scaling"));
+    }
+
    @Test
    public void extractContentPdf() throws Exception {
        Path path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI());
@ -70,7 +80,8 @@ public class TestFileUtil {
                InputStream inputStream1 = Resources.getResource("file/apollo_portrait.jpg").openStream();
                InputStream inputStream2 = Resources.getResource("file/udhr_encrypted.pdf").openStream();
                InputStream inputStream3 = Resources.getResource("file/document.docx").openStream();
-                InputStream inputStream4 = Resources.getResource("file/document.odt").openStream()) {
+                InputStream inputStream4 = Resources.getResource("file/document.odt").openStream();
+                InputStream inputStream5 = Resources.getResource("file/apache.pptx").openStream()) {
            // Document
            DocumentDto documentDto = new DocumentDto();
            documentDto.setTitle("My super document 1");
@ -117,9 +128,16 @@ public class TestFileUtil {
            file4.setId("document_odt");
            file4.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);

+            // Sixth file
+            Files.copy(inputStream5, DirectoryUtil.getStorageDirectory().resolve("document_pptx"), StandardCopyOption.REPLACE_EXISTING);
+            File file5 = new File();
+            file5.setId("document_pptx");
+            file5.setMimeType(MimeType.OFFICE_PRESENTATION);
+
            ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
-            PdfUtil.convertToPdf(documentDto, Lists.newArrayList(file0, file1, file2, file3, file4), true, true, 10, outputStream);
+            PdfUtil.convertToPdf(documentDto, Lists.newArrayList(file0, file1, file2, file3, file4, file5), true, true, 10, outputStream);
            Assert.assertTrue(outputStream.toByteArray().length > 0);
+            com.google.common.io.Files.write(outputStream.toByteArray(), new java.io.File("C:\\Users\\Jendib\\Downloads\\test.pdf"));
        }
    }
 }
--- a/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java
+++ b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java
@ -23,5 +23,9 @@ public class TestMimeTypeUtil {
        // Detect DOCX files
        path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
        Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessMimeType(path, "document.odt"));
+
+        // Detect PPTX files
+        path = Paths.get(ClassLoader.getSystemResource("file/apache.pptx").toURI());
+        Assert.assertEquals(MimeType.OFFICE_PRESENTATION, MimeTypeUtil.guessMimeType(path, "apache.pptx"));
    }
 }
--- a/docs-core/src/test/resources/file/apache.pptx
+++ b/docs-core/src/test/resources/file/apache.pptx
--- a/docs-web-common/src/test/java/com/sismics/docs/rest/util/ClientUtil.java
+++ b/docs-web-common/src/test/java/com/sismics/docs/rest/util/ClientUtil.java
@ -49,7 +49,7 @@ public class ClientUtil {
                        .param("username", username)
                        .param("email", username + "@docs.com")
                        .param("password", "12345678")
-                        .param("storage_quota", "1000000")), JsonObject.class); // 1MB quota
+                        .param("storage_quota", "10000000")), JsonObject.class); // 10MB quota
        
        // Add to groups
        for (String groupName : groupNameList) {
--- a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java
+++ b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java
@ -267,7 +267,12 @@ public class TestDocumentResource extends BaseJerseyTest {
        Assert.assertEquals(document2Id, json.getString("id"));

        // Export a document in PDF format
-        Response response = target().path("/document/" + document1Id).request()
+        Response response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document1Token)
                .get();
        InputStream is = (InputStream) response.getEntity();
@ -394,6 +399,20 @@ public class TestDocumentResource extends BaseJerseyTest {
        byte[] fileBytes = ByteStreams.toByteArray(is);
        Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
        Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
    }
    
    /**
@ -440,6 +459,20 @@ public class TestDocumentResource extends BaseJerseyTest {
        byte[] fileBytes = ByteStreams.toByteArray(is);
        Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
        Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
    }
    
    /**
@ -486,6 +519,20 @@ public class TestDocumentResource extends BaseJerseyTest {
        byte[] fileBytes = ByteStreams.toByteArray(is);
        Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
        Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
    }

    /**
@ -532,6 +579,20 @@ public class TestDocumentResource extends BaseJerseyTest {
        byte[] fileBytes = ByteStreams.toByteArray(is);
        Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
        Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
    }

    /**
@ -543,12 +604,12 @@ public class TestDocumentResource extends BaseJerseyTest {
    public void testVideoExtraction() throws Exception {
        // Login document_video
        clientUtil.createUser("document_video");
-        String documentPlainToken = clientUtil.login("document_video");
+        String documentVideoToken = clientUtil.login("document_video");

        // Create a document
        long create1Date = new Date().getTime();
        JsonObject json = target().path("/document").request()
-                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
                .put(Entity.form(new Form()
                        .param("title", "My super title document 1")
                        .param("description", "My super description for document 1")
@ -558,13 +619,13 @@ public class TestDocumentResource extends BaseJerseyTest {
        Assert.assertNotNull(document1Id);

        // Add a video file
-        String file1Id = clientUtil.addFileToDocument("file/video.webm", "video.webm", documentPlainToken, document1Id);
+        String file1Id = clientUtil.addFileToDocument("file/video.webm", "video.webm", documentVideoToken, document1Id);

        // Search documents by query in full content
        json = target().path("/document/list")
                .queryParam("search", "full:vp9")
                .request()
-                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
                .get(JsonObject.class);
        Assert.assertTrue(json.getJsonArray("documents").size() == 1);

@ -572,12 +633,86 @@ public class TestDocumentResource extends BaseJerseyTest {
        Response response = target().path("/file/" + file1Id + "/data")
                .queryParam("size", "thumb")
                .request()
-                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
                .get();
        InputStream is = (InputStream) response.getEntity();
        byte[] fileBytes = ByteStreams.toByteArray(is);
        Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
        Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
+    }
+
+    /**
+     * Test PPTX extraction: full text search, thumbnail generation and PDF export
+     * of a document holding a PPTX file.
+     *
+     * @throws Exception e
+     */
+    @Test
+    public void testPptxExtraction() throws Exception {
+        // Login document_pptx
+        clientUtil.createUser("document_pptx");
+        String documentPptxToken = clientUtil.login("document_pptx");
+
+        // Create a document
+        long create1Date = new Date().getTime();
+        JsonObject json = target().path("/document").request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
+                .put(Entity.form(new Form()
+                        .param("title", "My super title document 1")
+                        .param("description", "My super description for document 1")
+                        .param("language", "eng")
+                        .param("create_date", Long.toString(create1Date))), JsonObject.class);
+        String document1Id = json.getString("id");
+        Assert.assertNotNull(document1Id);
+
+        // Add a PPTX file
+        String file1Id = clientUtil.addFileToDocument("file/apache.pptx", "apache.pptx", documentPptxToken, document1Id);
+
+        // Search documents by query in full content
+        json = target().path("/document/list")
+                .queryParam("search", "full:scaling")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
+                .get(JsonObject.class);
+        // assertEquals reports the actual count on failure, unlike assertTrue(size == 1)
+        Assert.assertEquals(1, json.getJsonArray("documents").size());
+
+        // Get the file thumbnail data
+        Response response = target().path("/file/" + file1Id + "/data")
+                .queryParam("size", "thumb")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
+                .get();
+        // Fail on the HTTP status first so an error response is not mistaken for image data
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        InputStream is = (InputStream) response.getEntity();
+        byte[] fileBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(fileBytes.length > 0); // Only non-emptiness is checked, not an exact size
+        Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
    }

    /**
--- a/docs-web/src/test/resources/file/apache.pptx
+++ b/docs-web/src/test/resources/file/apache.pptx