mirror of
https://github.com/sismics/docs.git
synced 2024-11-22 05:57:57 +01:00
Closes #215: handle pptx files
This commit is contained in:
parent
945e619d55
commit
3613f6f8de
@ -31,7 +31,7 @@ Features
|
||||
|
||||
- Responsive user interface
|
||||
- Optical character recognition
|
||||
- Support image, PDF, ODT and DOCX files
|
||||
- Support image, PDF, ODT, DOCX, PPTX files
|
||||
- Video file support ![New!](https://www.sismics.com/public/img/new.png)
|
||||
- Flexible search engine
|
||||
- Full text search in all supported files
|
||||
|
@ -15,6 +15,7 @@ public class FormatHandlerUtil {
|
||||
*/
|
||||
private static final List<Class<? extends FormatHandler>> FORMAT_HANDLERS = Lists.newArrayList(
|
||||
DocxFormatHandler.class,
|
||||
PptxFormatHandler.class,
|
||||
OdtFormatHandler.class,
|
||||
VideoFormatHandler.class,
|
||||
PdfFormatHandler.class,
|
||||
|
@ -0,0 +1,97 @@
|
||||
package com.sismics.docs.core.util.format;
|
||||
|
||||
import com.google.common.io.Closer;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.XSLFSlide;
|
||||
|
||||
import java.awt.*;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
/**
|
||||
* PPTX format handler.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class PptxFormatHandler implements FormatHandler {
|
||||
/**
|
||||
* Cached PPTX loaded file.
|
||||
*/
|
||||
private XMLSlideShow slideShow;
|
||||
|
||||
@Override
|
||||
public boolean accept(String mimeType) {
|
||||
return MimeType.OFFICE_PRESENTATION.equals(mimeType);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BufferedImage generateThumbnail(Path file) throws Exception {
|
||||
XMLSlideShow pptx = loadPPtxFile(file);
|
||||
if (pptx.getSlides().length > 0) {
|
||||
return generateImageFromSlide(pptx, 0);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String extractContent(String language, Path file) throws Exception {
|
||||
XMLSlideShow pptx = loadPPtxFile(file);
|
||||
return new XSLFPowerPointExtractor(pptx).getText();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void appendToPdf(Path file, PDDocument doc, boolean fitImageToPage, int margin, MemoryUsageSetting memUsageSettings, Closer closer) throws Exception {
|
||||
XMLSlideShow pptx = loadPPtxFile(file);
|
||||
XSLFSlide[] slides = pptx.getSlides();
|
||||
Dimension pgsize = pptx.getPageSize();
|
||||
for (int slideIndex = 0; slideIndex < slides.length; slideIndex++) {
|
||||
// One PDF page per slide
|
||||
PDPage page = new PDPage(new PDRectangle(pgsize.width, pgsize.height));
|
||||
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
|
||||
BufferedImage bim = generateImageFromSlide(pptx, slideIndex);
|
||||
PDImageXObject pdImage = LosslessFactory.createFromImage(doc, bim);
|
||||
contentStream.drawImage(pdImage, 0, page.getMediaBox().getHeight() - pdImage.getHeight());
|
||||
}
|
||||
doc.addPage(page);
|
||||
}
|
||||
}
|
||||
|
||||
private XMLSlideShow loadPPtxFile(Path file) throws Exception {
|
||||
if (slideShow == null) {
|
||||
try (InputStream inputStream = Files.newInputStream(file)) {
|
||||
slideShow = new XMLSlideShow(inputStream);
|
||||
}
|
||||
}
|
||||
return slideShow;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate an image from a PPTX slide.
|
||||
*
|
||||
* @param pptx PPTX
|
||||
* @param slideIndex Slide index
|
||||
* @return Image
|
||||
*/
|
||||
private BufferedImage generateImageFromSlide(XMLSlideShow pptx, int slideIndex) {
|
||||
Dimension pgsize = pptx.getPageSize();
|
||||
BufferedImage img = new BufferedImage(pgsize.width, pgsize.height,BufferedImage.TYPE_INT_RGB);
|
||||
Graphics2D graphics = img.createGraphics();
|
||||
graphics.setPaint(Color.white);
|
||||
graphics.fill(new Rectangle2D.Float(0, 0, pgsize.width, pgsize.height));
|
||||
pptx.getSlides()[slideIndex].draw(graphics);
|
||||
return img;
|
||||
}
|
||||
}
|
@ -20,6 +20,10 @@ public class MimeType {
|
||||
|
||||
public static final String OFFICE_DOCUMENT = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
|
||||
|
||||
public static final String OFFICE_PRESENTATION = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
|
||||
|
||||
public static final String OFFICE_SHEET = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
|
||||
|
||||
public static final String TEXT_PLAIN = "text/plain";
|
||||
|
||||
public static final String TEXT_CSV = "text/csv";
|
||||
|
@ -145,6 +145,9 @@ public class MimeTypeUtil {
|
||||
if (content.contains(MimeType.OFFICE_DOCUMENT)) {
|
||||
mimeType = MimeType.OFFICE_DOCUMENT;
|
||||
break;
|
||||
} else if (content.contains(MimeType.OFFICE_PRESENTATION)) {
|
||||
mimeType = MimeType.OFFICE_PRESENTATION;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -44,6 +44,16 @@ public class TestFileUtil {
|
||||
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractContentPowerpointTest() throws Exception {
|
||||
Path path = Paths.get(ClassLoader.getSystemResource("file/apache.pptx").toURI());
|
||||
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "apache.pptx"));
|
||||
Assert.assertNotNull(formatHandler);
|
||||
Assert.assertTrue(formatHandler instanceof PptxFormatHandler);
|
||||
String content = formatHandler.extractContent("eng", path);
|
||||
Assert.assertTrue(content.contains("Scaling"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void extractContentPdf() throws Exception {
|
||||
Path path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI());
|
||||
@ -70,7 +80,8 @@ public class TestFileUtil {
|
||||
InputStream inputStream1 = Resources.getResource("file/apollo_portrait.jpg").openStream();
|
||||
InputStream inputStream2 = Resources.getResource("file/udhr_encrypted.pdf").openStream();
|
||||
InputStream inputStream3 = Resources.getResource("file/document.docx").openStream();
|
||||
InputStream inputStream4 = Resources.getResource("file/document.odt").openStream()) {
|
||||
InputStream inputStream4 = Resources.getResource("file/document.odt").openStream();
|
||||
InputStream inputStream5 = Resources.getResource("file/apache.pptx").openStream()) {
|
||||
// Document
|
||||
DocumentDto documentDto = new DocumentDto();
|
||||
documentDto.setTitle("My super document 1");
|
||||
@ -117,9 +128,16 @@ public class TestFileUtil {
|
||||
file4.setId("document_odt");
|
||||
file4.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
|
||||
|
||||
// Sixth file
|
||||
Files.copy(inputStream5, DirectoryUtil.getStorageDirectory().resolve("document_pptx"), StandardCopyOption.REPLACE_EXISTING);
|
||||
File file5 = new File();
|
||||
file5.setId("document_pptx");
|
||||
file5.setMimeType(MimeType.OFFICE_PRESENTATION);
|
||||
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
PdfUtil.convertToPdf(documentDto, Lists.newArrayList(file0, file1, file2, file3, file4), true, true, 10, outputStream);
|
||||
PdfUtil.convertToPdf(documentDto, Lists.newArrayList(file0, file1, file2, file3, file4, file5), true, true, 10, outputStream);
|
||||
Assert.assertTrue(outputStream.toByteArray().length > 0);
|
||||
com.google.common.io.Files.write(outputStream.toByteArray(), new java.io.File("C:\\Users\\Jendib\\Downloads\\test.pdf"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -23,5 +23,9 @@ public class TestMimeTypeUtil {
|
||||
// Detect DOCX files
|
||||
path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
|
||||
Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessMimeType(path, "document.odt"));
|
||||
|
||||
// Detect PPTX files
|
||||
path = Paths.get(ClassLoader.getSystemResource("file/apache.pptx").toURI());
|
||||
Assert.assertEquals(MimeType.OFFICE_PRESENTATION, MimeTypeUtil.guessMimeType(path, "apache.pptx"));
|
||||
}
|
||||
}
|
||||
|
BIN
docs-core/src/test/resources/file/apache.pptx
Normal file
BIN
docs-core/src/test/resources/file/apache.pptx
Normal file
Binary file not shown.
@ -49,7 +49,7 @@ public class ClientUtil {
|
||||
.param("username", username)
|
||||
.param("email", username + "@docs.com")
|
||||
.param("password", "12345678")
|
||||
.param("storage_quota", "1000000")), JsonObject.class); // 1MB quota
|
||||
.param("storage_quota", "10000000")), JsonObject.class); // 10MB quota
|
||||
|
||||
// Add to groups
|
||||
for (String groupName : groupNameList) {
|
||||
|
@ -267,7 +267,12 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
Assert.assertEquals(document2Id, json.getString("id"));
|
||||
|
||||
// Export a document in PDF format
|
||||
Response response = target().path("/document/" + document1Id).request()
|
||||
Response response = target().path("/document/" + document1Id + "/pdf")
|
||||
.queryParam("margin", "10")
|
||||
.queryParam("metadata", "true")
|
||||
.queryParam("comments", "true")
|
||||
.queryParam("fitimagetopage", "true")
|
||||
.request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, document1Token)
|
||||
.get();
|
||||
InputStream is = (InputStream) response.getEntity();
|
||||
@ -394,6 +399,20 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
byte[] fileBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
|
||||
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
|
||||
|
||||
// Export a document in PDF format
|
||||
response = target().path("/document/" + document1Id + "/pdf")
|
||||
.queryParam("margin", "10")
|
||||
.queryParam("metadata", "true")
|
||||
.queryParam("comments", "true")
|
||||
.queryParam("fitimagetopage", "true")
|
||||
.request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentOdtToken)
|
||||
.get();
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
is = (InputStream) response.getEntity();
|
||||
byte[] pdfBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(pdfBytes.length > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -440,6 +459,20 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
byte[] fileBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
|
||||
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
|
||||
|
||||
// Export a document in PDF format
|
||||
response = target().path("/document/" + document1Id + "/pdf")
|
||||
.queryParam("margin", "10")
|
||||
.queryParam("metadata", "true")
|
||||
.queryParam("comments", "true")
|
||||
.queryParam("fitimagetopage", "true")
|
||||
.request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentDocxToken)
|
||||
.get();
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
is = (InputStream) response.getEntity();
|
||||
byte[] pdfBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(pdfBytes.length > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -486,6 +519,20 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
byte[] fileBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
|
||||
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
|
||||
|
||||
// Export a document in PDF format
|
||||
response = target().path("/document/" + document1Id + "/pdf")
|
||||
.queryParam("margin", "10")
|
||||
.queryParam("metadata", "true")
|
||||
.queryParam("comments", "true")
|
||||
.queryParam("fitimagetopage", "true")
|
||||
.request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPdfToken)
|
||||
.get();
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
is = (InputStream) response.getEntity();
|
||||
byte[] pdfBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(pdfBytes.length > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -532,6 +579,20 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
byte[] fileBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
|
||||
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
|
||||
|
||||
// Export a document in PDF format
|
||||
response = target().path("/document/" + document1Id + "/pdf")
|
||||
.queryParam("margin", "10")
|
||||
.queryParam("metadata", "true")
|
||||
.queryParam("comments", "true")
|
||||
.queryParam("fitimagetopage", "true")
|
||||
.request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
|
||||
.get();
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
is = (InputStream) response.getEntity();
|
||||
byte[] pdfBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(pdfBytes.length > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -543,12 +604,12 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
public void testVideoExtraction() throws Exception {
|
||||
// Login document_video
|
||||
clientUtil.createUser("document_video");
|
||||
String documentPlainToken = clientUtil.login("document_video");
|
||||
String documentVideoToken = clientUtil.login("document_video");
|
||||
|
||||
// Create a document
|
||||
long create1Date = new Date().getTime();
|
||||
JsonObject json = target().path("/document").request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
|
||||
.put(Entity.form(new Form()
|
||||
.param("title", "My super title document 1")
|
||||
.param("description", "My super description for document 1")
|
||||
@ -558,13 +619,13 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
Assert.assertNotNull(document1Id);
|
||||
|
||||
// Add a video file
|
||||
String file1Id = clientUtil.addFileToDocument("file/video.webm", "video.webm", documentPlainToken, document1Id);
|
||||
String file1Id = clientUtil.addFileToDocument("file/video.webm", "video.webm", documentVideoToken, document1Id);
|
||||
|
||||
// Search documents by query in full content
|
||||
json = target().path("/document/list")
|
||||
.queryParam("search", "full:vp9")
|
||||
.request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
|
||||
.get(JsonObject.class);
|
||||
Assert.assertTrue(json.getJsonArray("documents").size() == 1);
|
||||
|
||||
@ -572,12 +633,86 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
Response response = target().path("/file/" + file1Id + "/data")
|
||||
.queryParam("size", "thumb")
|
||||
.request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
|
||||
.get();
|
||||
InputStream is = (InputStream) response.getEntity();
|
||||
byte[] fileBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
|
||||
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
|
||||
|
||||
// Export a document in PDF format
|
||||
response = target().path("/document/" + document1Id + "/pdf")
|
||||
.queryParam("margin", "10")
|
||||
.queryParam("metadata", "true")
|
||||
.queryParam("comments", "true")
|
||||
.queryParam("fitimagetopage", "true")
|
||||
.request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentVideoToken)
|
||||
.get();
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
is = (InputStream) response.getEntity();
|
||||
byte[] pdfBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(pdfBytes.length > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test PPTX extraction.
|
||||
*
|
||||
* @throws Exception e
|
||||
*/
|
||||
@Test
|
||||
public void testPptxExtraction() throws Exception {
|
||||
// Login document_pptx
|
||||
clientUtil.createUser("document_pptx");
|
||||
String documentPptxToken = clientUtil.login("document_pptx");
|
||||
|
||||
// Create a document
|
||||
long create1Date = new Date().getTime();
|
||||
JsonObject json = target().path("/document").request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
|
||||
.put(Entity.form(new Form()
|
||||
.param("title", "My super title document 1")
|
||||
.param("description", "My super description for document 1")
|
||||
.param("language", "eng")
|
||||
.param("create_date", Long.toString(create1Date))), JsonObject.class);
|
||||
String document1Id = json.getString("id");
|
||||
Assert.assertNotNull(document1Id);
|
||||
|
||||
// Add a PPTX file
|
||||
String file1Id = clientUtil.addFileToDocument("file/apache.pptx", "apache.pptx", documentPptxToken, document1Id);
|
||||
|
||||
// Search documents by query in full content
|
||||
json = target().path("/document/list")
|
||||
.queryParam("search", "full:scaling")
|
||||
.request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
|
||||
.get(JsonObject.class);
|
||||
Assert.assertTrue(json.getJsonArray("documents").size() == 1);
|
||||
|
||||
// Get the file thumbnail data
|
||||
Response response = target().path("/file/" + file1Id + "/data")
|
||||
.queryParam("size", "thumb")
|
||||
.request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
|
||||
.get();
|
||||
InputStream is = (InputStream) response.getEntity();
|
||||
byte[] fileBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
|
||||
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
|
||||
|
||||
// Export a document in PDF format
|
||||
response = target().path("/document/" + document1Id + "/pdf")
|
||||
.queryParam("margin", "10")
|
||||
.queryParam("metadata", "true")
|
||||
.queryParam("comments", "true")
|
||||
.queryParam("fitimagetopage", "true")
|
||||
.request()
|
||||
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPptxToken)
|
||||
.get();
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
is = (InputStream) response.getEntity();
|
||||
byte[] pdfBytes = ByteStreams.toByteArray(is);
|
||||
Assert.assertTrue(pdfBytes.length > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
|
BIN
docs-web/src/test/resources/file/apache.pptx
Normal file
BIN
docs-web/src/test/resources/file/apache.pptx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user