diff --git a/README.md b/README.md
index d4b9734f..44bb3514 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ Features
 
 - Responsive user interface
 - Optical character recognition
-- Support image, PDF, ODT and DOCX files
+- Support image, PDF, ODT, DOCX and PPTX files
 - Video file support ![New!](https://www.sismics.com/public/img/new.png)
 - Flexible search engine
 - Full text search in all supported files
diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandlerUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandlerUtil.java
index 47cb778c..9fa77aef 100644
--- a/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandlerUtil.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/FormatHandlerUtil.java
@@ -15,6 +15,7 @@ public class FormatHandlerUtil {
      */
     private static final List<Class<? extends FormatHandler>> FORMAT_HANDLERS = Lists.newArrayList(
             DocxFormatHandler.class,
+            PptxFormatHandler.class,
             OdtFormatHandler.class,
             VideoFormatHandler.class,
             PdfFormatHandler.class,
diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/format/PptxFormatHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/format/PptxFormatHandler.java
new file mode 100644
index 00000000..24468d82
--- /dev/null
+++ b/docs-core/src/main/java/com/sismics/docs/core/util/format/PptxFormatHandler.java
@@ -0,0 +1,119 @@
+package com.sismics.docs.core.util.format;
+
+import com.google.common.io.Closer;
+import com.sismics.util.mime.MimeType;
+import org.apache.pdfbox.io.MemoryUsageSetting;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageContentStream;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFSlide;
+
+import java.awt.Color;
+import java.awt.Dimension;
+import java.awt.Graphics2D;
+import java.awt.geom.Rectangle2D;
+import java.awt.image.BufferedImage;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/**
+ * PPTX format handler.
+ *
+ * @author bgamard
+ */
+public class PptxFormatHandler implements FormatHandler {
+    /**
+     * Cached PPTX loaded file.
+     */
+    private XMLSlideShow slideShow;
+
+    /**
+     * Path the cached slide show was loaded from.
+     */
+    private Path slideShowFile;
+
+    @Override
+    public boolean accept(String mimeType) {
+        return MimeType.OFFICE_PRESENTATION.equals(mimeType);
+    }
+
+    @Override
+    public BufferedImage generateThumbnail(Path file) throws Exception {
+        // The first slide is used as thumbnail, null is returned for an empty presentation
+        XMLSlideShow pptx = loadPPtxFile(file);
+        if (pptx.getSlides().length > 0) {
+            return generateImageFromSlide(pptx, 0);
+        }
+
+        return null;
+    }
+
+    @Override
+    public String extractContent(String language, Path file) throws Exception {
+        XMLSlideShow pptx = loadPPtxFile(file);
+        return new XSLFPowerPointExtractor(pptx).getText();
+    }
+
+    @Override
+    public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
+        XMLSlideShow pptx = loadPPtxFile(file);
+        XSLFSlide[] slides = pptx.getSlides();
+        Dimension pgsize = pptx.getPageSize();
+        for (int slideIndex = 0; slideIndex < slides.length; slideIndex++) {
+            // One PDF page per slide, sized like the slide (fitImageToPage and margin are not used here)
+            PDPage page = new PDPage(new PDRectangle(pgsize.width, pgsize.height));
+            try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
+                BufferedImage bim = generateImageFromSlide(pptx, slideIndex);
+                PDImageXObject pdImage = LosslessFactory.createFromImage(doc, bim);
+                contentStream.drawImage(pdImage, 0, page.getMediaBox().getHeight() - pdImage.getHeight());
+            }
+            doc.addPage(page);
+        }
+    }
+
+    /**
+     * Load the PPTX file, reusing the cached slide show when the same file is requested again.
+     *
+     * @param file PPTX file
+     * @return Loaded slide show
+     * @throws Exception e
+     */
+    private XMLSlideShow loadPPtxFile(Path file) throws Exception {
+        if (slideShow == null || !file.equals(slideShowFile)) {
+            try (InputStream inputStream = Files.newInputStream(file)) {
+                slideShow = new XMLSlideShow(inputStream);
+                slideShowFile = file;
+            }
+        }
+        return slideShow;
+    }
+
+    /**
+     * Generate an image from a PPTX slide.
+     *
+     * @param pptx PPTX
+     * @param slideIndex Slide index
+     * @return Image
+     */
+    private BufferedImage generateImageFromSlide(XMLSlideShow pptx, int slideIndex) {
+        Dimension pgsize = pptx.getPageSize();
+        BufferedImage img = new BufferedImage(pgsize.width, pgsize.height, BufferedImage.TYPE_INT_RGB);
+        Graphics2D graphics = img.createGraphics();
+        try {
+            // Paint a white background before drawing the slide
+            graphics.setPaint(Color.white);
+            graphics.fill(new Rectangle2D.Float(0, 0, pgsize.width, pgsize.height));
+            pptx.getSlides()[slideIndex].draw(graphics);
+        } finally {
+            // Release the system resources held by the graphics context
+            graphics.dispose();
+        }
+        return img;
+    }
+}
diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
index a97f7807..1ea316b2 100644
--- a/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
+++ b/docs-core/src/main/java/com/sismics/util/mime/MimeType.java
@@ -20,6 +20,10 @@ public class MimeType {
 
     public static final String OFFICE_DOCUMENT = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
 
+    public static final String OFFICE_PRESENTATION = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
+
+    public static final String OFFICE_SHEET = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
+
     public static final String TEXT_PLAIN = "text/plain";
 
     public static final String TEXT_CSV = "text/csv";
diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
index 3ca6e1af..f36cd489 100644
--- a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
+++ b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java
@@ -145,6 +145,9 @@ public class MimeTypeUtil {
                     if (content.contains(MimeType.OFFICE_DOCUMENT)) {
                         mimeType = MimeType.OFFICE_DOCUMENT;
                         break;
+                    } else if (content.contains(MimeType.OFFICE_PRESENTATION)) {
+                        mimeType = MimeType.OFFICE_PRESENTATION;
+                        break;
                     }
                 }
 
diff --git a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java
index bf4f344f..5c5bce86 100644
--- a/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java
+++ b/docs-core/src/test/java/com/sismics/docs/core/util/TestFileUtil.java
@@ -44,6 +44,16 @@ public class TestFileUtil {
         Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
     }
 
+    @Test
+    public void extractContentPowerpointTest() throws Exception {
+        Path path = Paths.get(ClassLoader.getSystemResource("file/apache.pptx").toURI());
+        FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "apache.pptx"));
+        Assert.assertNotNull(formatHandler);
+        Assert.assertTrue(formatHandler instanceof PptxFormatHandler);
+        String content = formatHandler.extractContent("eng", path);
+        Assert.assertTrue(content.contains("Scaling"));
+    }
+
     @Test
     public void extractContentPdf() throws Exception {
         Path path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI());
@@ -70,7 +80,8 @@ public class TestFileUtil {
              InputStream inputStream1 = Resources.getResource("file/apollo_portrait.jpg").openStream();
              InputStream inputStream2 = Resources.getResource("file/udhr_encrypted.pdf").openStream();
              InputStream inputStream3 = Resources.getResource("file/document.docx").openStream();
-             InputStream inputStream4 = Resources.getResource("file/document.odt").openStream()) {
+             InputStream inputStream4 = Resources.getResource("file/document.odt").openStream();
+             InputStream inputStream5 = Resources.getResource("file/apache.pptx").openStream()) {
             // Document
             DocumentDto documentDto = new DocumentDto();
             documentDto.setTitle("My super document 1");
@@ -117,9 +128,15 @@ public class TestFileUtil {
             file4.setId("document_odt");
             file4.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
 
+            // Sixth file
+            Files.copy(inputStream5, DirectoryUtil.getStorageDirectory().resolve("document_pptx"), StandardCopyOption.REPLACE_EXISTING);
+            File file5 = new File();
+            file5.setId("document_pptx");
+            file5.setMimeType(MimeType.OFFICE_PRESENTATION);
+
             ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
-            PdfUtil.convertToPdf(documentDto, Lists.newArrayList(file0, file1, file2, file3, file4), true, true, 10, outputStream);
+            PdfUtil.convertToPdf(documentDto, Lists.newArrayList(file0, file1, file2, file3, file4, file5), true, true, 10, outputStream);
             Assert.assertTrue(outputStream.toByteArray().length > 0);
         }
     }
 }
diff --git a/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java
index e7e2c286..b6e64299 100644
--- a/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java
+++ b/docs-core/src/test/java/com/sismics/util/TestMimeTypeUtil.java
@@ -23,5 +23,9 @@ public class TestMimeTypeUtil {
         // Detect DOCX files
         path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
         Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessMimeType(path, "document.odt"));
+
+        // Detect PPTX files
+        path = Paths.get(ClassLoader.getSystemResource("file/apache.pptx").toURI());
+        Assert.assertEquals(MimeType.OFFICE_PRESENTATION, MimeTypeUtil.guessMimeType(path, "apache.pptx"));
     }
 }
diff --git a/docs-core/src/test/resources/file/apache.pptx b/docs-core/src/test/resources/file/apache.pptx
new file mode 100644
index 00000000..d3dc6cfe
Binary files /dev/null and b/docs-core/src/test/resources/file/apache.pptx differ
diff --git a/docs-web-common/src/test/java/com/sismics/docs/rest/util/ClientUtil.java b/docs-web-common/src/test/java/com/sismics/docs/rest/util/ClientUtil.java
index de999f55..1f56111b 100644
--- a/docs-web-common/src/test/java/com/sismics/docs/rest/util/ClientUtil.java
+++ b/docs-web-common/src/test/java/com/sismics/docs/rest/util/ClientUtil.java
@@ -49,7 +49,7 @@ public class ClientUtil {
                         .param("username", username)
                         .param("email", username + "@docs.com")
                         .param("password", "12345678")
-                        .param("storage_quota", "1000000")), JsonObject.class); // 1MB quota
+                        .param("storage_quota", "10000000")), JsonObject.class); // 10MB quota
 
         // Add to groups
         for (String groupName : groupNameList) {
diff --git a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java
index f0dff767..d7880fb4 100644
--- a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java
+++ b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java
@@ -267,7 +267,12 @@ public class TestDocumentResource extends BaseJerseyTest {
         Assert.assertEquals(document2Id, json.getString("id"));
 
         // Export a document in PDF format
-        Response response = target().path("/document/" + document1Id).request()
+        Response response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
                 .cookie(TokenBasedSecurityFilter.COOKIE_NAME, document1Token)
                 .get();
         InputStream is = (InputStream) response.getEntity();
@@ -394,6 +399,20 @@ public class TestDocumentResource extends BaseJerseyTest {
         byte[] fileBytes = ByteStreams.toByteArray(is);
         Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
         Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
     }
 
     /**
@@ -440,6 +459,20 @@ public class TestDocumentResource extends BaseJerseyTest {
         byte[] fileBytes = ByteStreams.toByteArray(is);
         Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
         Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
     }
 
     /**
@@ -486,6 +519,20 @@ public class TestDocumentResource extends BaseJerseyTest {
         byte[] fileBytes = ByteStreams.toByteArray(is);
         Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
         Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
     }
 
     /**
@@ -532,6 +579,20 @@ public class TestDocumentResource extends BaseJerseyTest {
         byte[] fileBytes = ByteStreams.toByteArray(is);
         Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
         Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
     }
 
     /**
@@ -543,12 +604,12 @@ public class TestDocumentResource extends BaseJerseyTest {
     public void testVideoExtraction() throws Exception {
         // Login document_video
         clientUtil.createUser("document_video");
-        String documentPlainToken = clientUtil.login("document_video");
+        String documentVideoToken = clientUtil.login("document_video");
 
         // Create a document
         long create1Date = new Date().getTime();
         JsonObject json = target().path("/document").request()
-                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
                 .put(Entity.form(new Form()
                         .param("title", "My super title document 1")
                         .param("description", "My super description for document 1")
@@ -558,13 +619,13 @@ public class TestDocumentResource extends BaseJerseyTest {
         Assert.assertNotNull(document1Id);
 
         // Add a video file
-        String file1Id = clientUtil.addFileToDocument("file/video.webm", "video.webm", documentPlainToken, document1Id);
+        String file1Id = clientUtil.addFileToDocument("file/video.webm", "video.webm", documentVideoToken, document1Id);
 
         // Search documents by query in full content
         json = target().path("/document/list")
                 .queryParam("search", "full:vp9")
                 .request()
-                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
                 .get(JsonObject.class);
         Assert.assertTrue(json.getJsonArray("documents").size() == 1);
 
@@ -572,12 +633,86 @@ public class TestDocumentResource extends BaseJerseyTest {
         Response response = target().path("/file/" + file1Id + "/data")
                 .queryParam("size", "thumb")
                 .request()
-                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
                 .get();
         InputStream is = (InputStream) response.getEntity();
         byte[] fileBytes = ByteStreams.toByteArray(is);
         Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
         Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
+    }
+
+    /**
+     * Test PPTX extraction.
+     *
+     * @throws Exception e
+     */
+    @Test
+    public void testPptxExtraction() throws Exception {
+        // Login document_pptx
+        clientUtil.createUser("document_pptx");
+        String documentPptxToken = clientUtil.login("document_pptx");
+
+        // Create a document
+        long create1Date = new Date().getTime();
+        JsonObject json = target().path("/document").request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
+                .put(Entity.form(new Form()
+                        .param("title", "My super title document 1")
+                        .param("description", "My super description for document 1")
+                        .param("language", "eng")
+                        .param("create_date", Long.toString(create1Date))), JsonObject.class);
+        String document1Id = json.getString("id");
+        Assert.assertNotNull(document1Id);
+
+        // Add a PPTX file
+        String file1Id = clientUtil.addFileToDocument("file/apache.pptx", "apache.pptx", documentPptxToken, document1Id);
+
+        // Search documents by query in full content
+        json = target().path("/document/list")
+                .queryParam("search", "full:scaling")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
+                .get(JsonObject.class);
+        Assert.assertEquals(1, json.getJsonArray("documents").size());
+
+        // Get the file thumbnail data
+        Response response = target().path("/file/" + file1Id + "/data")
+                .queryParam("size", "thumb")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
+                .get();
+        InputStream is = (InputStream) response.getEntity();
+        byte[] fileBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(fileBytes.length > 0); // Rendered images differ in size from OS to OS due to font issues
+        Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
+
+        // Export a document in PDF format
+        response = target().path("/document/" + document1Id + "/pdf")
+                .queryParam("margin", "10")
+                .queryParam("metadata", "true")
+                .queryParam("comments", "true")
+                .queryParam("fitimagetopage", "true")
+                .request()
+                .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
+                .get();
+        Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
+        is = (InputStream) response.getEntity();
+        byte[] pdfBytes = ByteStreams.toByteArray(is);
+        Assert.assertTrue(pdfBytes.length > 0);
     }
 
     /**
diff --git a/docs-web/src/test/resources/file/apache.pptx b/docs-web/src/test/resources/file/apache.pptx
new file mode 100644
index 00000000..d3dc6cfe
Binary files /dev/null and b/docs-web/src/test/resources/file/apache.pptx differ