From 330de495dbffa24aa999da13f8fda642a04f5bdc Mon Sep 17 00:00:00 2001 From: Benjamin Gamard Date: Sun, 11 Jun 2017 11:33:30 +0200 Subject: [PATCH] #118: extract text content from text plain files (WIP) --- .../com/sismics/docs/core/util/FileUtil.java | 10 ++-- .../com/sismics/docs/core/util/PdfUtil.java | 28 ++++++--- .../com/sismics/util/mime/MimeTypeUtil.java | 36 ++++++----- .../docs/rest/TestDocumentResource.java | 59 +++++++++++++++++++ docs-web/src/test/resources/file/document.txt | 2 + 5 files changed, 106 insertions(+), 29 deletions(-) create mode 100644 docs-web/src/test/resources/file/document.txt diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java index 2a005fce..a12637fa 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java @@ -64,11 +64,12 @@ public class FileUtil { private static String ocrFile(InputStream inputStream, String language) { Tesseract instance = Tesseract.getInstance(); String content = null; - BufferedImage image = null; + BufferedImage image; try { image = ImageIO.read(inputStream); } catch (IOException e) { log.error("Error reading the image", e); + return null; } // Upscale and grayscale the image @@ -92,10 +93,9 @@ public class FileUtil { * Save a file on the storage filesystem. 
* * @param inputStream Unencrypted input stream - * @param pdf + * @param pdfInputStream PDF input stream * @param file File to save * @param privateKey Private key used for encryption - * @throws Exception */ public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception { Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey); @@ -114,9 +114,8 @@ public class FileUtil { * @param inputStream Unencrypted input stream * @param pdfInputStream Unencrypted PDF input stream * @param cipher Cipher to use for encryption - * @throws Exception */ - public static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception { + private static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception { BufferedImage image = null; if (ImageUtil.isImage(file.getMimeType())) { image = ImageIO.read(inputStream); @@ -151,7 +150,6 @@ public class FileUtil { * Remove a file from the storage filesystem. 
* * @param file File to delete - * @throws IOException */ public static void delete(File file) throws IOException { Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId()); diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java index f1268545..7ef06102 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java @@ -86,7 +86,6 @@ public class PdfUtil { * @param inputStream InputStream * @param reset Reset the stream after usage * @return PDF input stream - * @throws Exception */ public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception { if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { @@ -101,18 +100,36 @@ public class PdfUtil { if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { return convertOpenDocumentText(inputStream, reset); } - + + if (file.getMimeType().equals(MimeType.TEXT_PLAIN) || file.getMimeType().equals(MimeType.TEXT_CSV)) { + return convertTextPlain(inputStream, reset); + } + // PDF conversion not necessary/possible return null; } - + + /** + * Convert a text plain document to PDF. + * + * @param inputStream Unencrypted input stream + * @param reset Reset the stream after usage + * @return PDF input stream + */ + private static InputStream convertTextPlain(InputStream inputStream, boolean reset) throws Exception { + if (reset) { + inputStream.reset(); + } + // TODO Create a PDF from the text plain + return null; + } + /** * Convert an open document text file to PDF. 
* * @param inputStream Unencrypted input stream * @param reset Reset the stream after usage * @return PDF input stream - * @throws Exception */ private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception { ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); @@ -131,7 +148,6 @@ public class PdfUtil { * @param inputStream Unencrypted input stream * @param reset Reset the stream after usage * @return PDF input stream - * @throws Exception */ private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception { ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); @@ -153,7 +169,6 @@ public class PdfUtil { * @param metadata Add a page with metadata * @param margin Margins in millimeters * @return PDF input stream - * @throws IOException */ public static InputStream convertToPdf(DocumentDto documentDto, List fileList, boolean fitImageToPage, boolean metadata, int margin) throws Exception { @@ -282,7 +297,6 @@ public class PdfUtil { * * @param inputStream PDF document * @return Render of the first page - * @throws IOException */ public static BufferedImage renderFirstPage(InputStream inputStream) throws IOException { try (PDDocument pdfDocument = PDDocument.load(inputStream)) { diff --git a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java index 124d03ab..740a6c7c 100644 --- a/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java +++ b/docs-core/src/main/java/com/sismics/util/mime/MimeTypeUtil.java @@ -78,22 +78,26 @@ public class MimeTypeUtil { */ public static String getFileExtension(String mimeType) { switch (mimeType) { - case MimeType.APPLICATION_ZIP: - return "zip"; - case MimeType.IMAGE_GIF: - return "gif"; - case MimeType.IMAGE_JPEG: - return "jpg"; - case MimeType.IMAGE_PNG: - return "png"; - case MimeType.APPLICATION_PDF: - return "pdf"; - case 
MimeType.OPEN_DOCUMENT_TEXT: - return "odt"; - case MimeType.OFFICE_DOCUMENT: - return "docx"; - default: - return "bin"; + case MimeType.APPLICATION_ZIP: + return "zip"; + case MimeType.IMAGE_GIF: + return "gif"; + case MimeType.IMAGE_JPEG: + return "jpg"; + case MimeType.IMAGE_PNG: + return "png"; + case MimeType.APPLICATION_PDF: + return "pdf"; + case MimeType.OPEN_DOCUMENT_TEXT: + return "odt"; + case MimeType.OFFICE_DOCUMENT: + return "docx"; + case MimeType.TEXT_PLAIN: + return "txt"; + case MimeType.TEXT_CSV: + return "csv"; + default: + return "bin"; } } diff --git a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java index 6af38820..d463613e 100644 --- a/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java +++ b/docs-web/src/test/java/com/sismics/docs/rest/TestDocumentResource.java @@ -545,4 +545,63 @@ public class TestDocumentResource extends BaseJerseyTest { Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null)); } + + /** + * Test plain text extraction. 
+ * + * @throws Exception e + */ + @Test + public void testPlainTextExtraction() throws Exception { + // Login document_plain + clientUtil.createUser("document_plain"); + String documentPlainToken = clientUtil.login("document_plain"); + + // Create a document + long create1Date = new Date().getTime(); + JsonObject json = target().path("/document").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken) + .put(Entity.form(new Form() + .param("title", "My super title document 1") + .param("description", "My super description for document 1") + .param("language", "eng") + .param("create_date", Long.toString(create1Date))), JsonObject.class); + String document1Id = json.getString("id"); + Assert.assertNotNull(document1Id); + + // Add a plain text file + String file1Id; + try (InputStream is = Resources.getResource("file/document.txt").openStream()) { + StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.txt"); + try (FormDataMultiPart multiPart = new FormDataMultiPart()) { + json = target() + .register(MultiPartFeature.class) + .path("/file").request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken) + .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart), + MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class); + file1Id = json.getString("id"); + Assert.assertNotNull(file1Id); + } + } + + // Search documents by query in full content + json = target().path("/document/list") + .queryParam("search", "full:love") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken) + .get(JsonObject.class); + Assert.assertTrue(json.getJsonArray("documents").size() == 1); + + // Get the file thumbnail data + Response response = target().path("/file/" + file1Id + "/data") + .queryParam("size", "thumb") + .request() + .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken) + .get(); + InputStream is = (InputStream) response.getEntity(); + byte[] fileBytes = 
ByteStreams.toByteArray(is); + Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues + Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null)); + } } \ No newline at end of file diff --git a/docs-web/src/test/resources/file/document.txt b/docs-web/src/test/resources/file/document.txt new file mode 100644 index 00000000..58b1387f --- /dev/null +++ b/docs-web/src/test/resources/file/document.txt @@ -0,0 +1,2 @@ +This is a test document +Please love me \ No newline at end of file