#118: extract text content from text plain files (WIP)

This commit is contained in:
Benjamin Gamard 2017-06-11 11:33:30 +02:00
parent dcc7fe55f4
commit 330de495db
5 changed files with 106 additions and 29 deletions

View File

@ -64,11 +64,12 @@ public class FileUtil {
private static String ocrFile(InputStream inputStream, String language) { private static String ocrFile(InputStream inputStream, String language) {
Tesseract instance = Tesseract.getInstance(); Tesseract instance = Tesseract.getInstance();
String content = null; String content = null;
BufferedImage image = null; BufferedImage image;
try { try {
image = ImageIO.read(inputStream); image = ImageIO.read(inputStream);
} catch (IOException e) { } catch (IOException e) {
log.error("Error reading the image", e); log.error("Error reading the image", e);
return null;
} }
// Upscale and grayscale the image // Upscale and grayscale the image
@ -92,10 +93,9 @@ public class FileUtil {
* Save a file on the storage filesystem. * Save a file on the storage filesystem.
* *
* @param inputStream Unencrypted input stream * @param inputStream Unencrypted input stream
* @param pdf * @param pdfInputStream PDF input stream
* @param file File to save * @param file File to save
* @param privateKey Private key used for encryption * @param privateKey Private key used for encryption
* @throws Exception
*/ */
public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception { public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception {
Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey); Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey);
@ -114,9 +114,8 @@ public class FileUtil {
* @param inputStream Unencrypted input stream * @param inputStream Unencrypted input stream
* @param pdfInputStream Unencrypted PDF input stream * @param pdfInputStream Unencrypted PDF input stream
* @param cipher Cipher to use for encryption * @param cipher Cipher to use for encryption
* @throws Exception
*/ */
public static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception { private static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception {
BufferedImage image = null; BufferedImage image = null;
if (ImageUtil.isImage(file.getMimeType())) { if (ImageUtil.isImage(file.getMimeType())) {
image = ImageIO.read(inputStream); image = ImageIO.read(inputStream);
@ -151,7 +150,6 @@ public class FileUtil {
* Remove a file from the storage filesystem. * Remove a file from the storage filesystem.
* *
* @param file File to delete * @param file File to delete
* @throws IOException
*/ */
public static void delete(File file) throws IOException { public static void delete(File file) throws IOException {
Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId()); Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId());

View File

@ -86,7 +86,6 @@ public class PdfUtil {
* @param inputStream InputStream * @param inputStream InputStream
* @param reset Reset the stream after usage * @param reset Reset the stream after usage
* @return PDF input stream * @return PDF input stream
* @throws Exception
*/ */
public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception { public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception {
if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) { if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
@ -101,18 +100,36 @@ public class PdfUtil {
if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) { if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
return convertOpenDocumentText(inputStream, reset); return convertOpenDocumentText(inputStream, reset);
} }
if (file.getMimeType().equals(MimeType.TEXT_PLAIN) || file.getMimeType().equals(MimeType.TEXT_CSV)) {
return convertTextPlain(inputStream, reset);
}
// PDF conversion not necessary/possible // PDF conversion not necessary/possible
return null; return null;
} }
/**
 * Convert a plain text document to PDF.
 *
 * Work in progress: no PDF is produced yet, so this method always
 * returns null. The only effect is the optional stream reset.
 *
 * @param inputStream Unencrypted input stream
 * @param reset Reset the stream (calls {@code inputStream.reset()}) when true
 * @return PDF input stream, currently always null
 * @throws Exception If resetting the stream fails
 */
private static InputStream convertTextPlain(InputStream inputStream, boolean reset) throws Exception {
    // TODO Create a PDF from the text plain
    InputStream pdfInputStream = null;

    if (reset) {
        inputStream.reset();
    }

    return pdfInputStream;
}
/** /**
* Convert an open document text file to PDF. * Convert an open document text file to PDF.
* *
* @param inputStream Unencrypted input stream * @param inputStream Unencrypted input stream
* @param reset Reset the stream after usage * @param reset Reset the stream after usage
* @return PDF input stream * @return PDF input stream
* @throws Exception
*/ */
private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception { private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception {
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
@ -131,7 +148,6 @@ public class PdfUtil {
* @param inputStream Unencrypted input stream * @param inputStream Unencrypted input stream
* @param reset Reset the stream after usage * @param reset Reset the stream after usage
* @return PDF input stream * @return PDF input stream
* @throws Exception
*/ */
private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception { private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception {
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream(); ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
@ -153,7 +169,6 @@ public class PdfUtil {
* @param metadata Add a page with metadata * @param metadata Add a page with metadata
* @param margin Margins in millimeters * @param margin Margins in millimeters
* @return PDF input stream * @return PDF input stream
* @throws IOException
*/ */
public static InputStream convertToPdf(DocumentDto documentDto, List<File> fileList, public static InputStream convertToPdf(DocumentDto documentDto, List<File> fileList,
boolean fitImageToPage, boolean metadata, int margin) throws Exception { boolean fitImageToPage, boolean metadata, int margin) throws Exception {
@ -282,7 +297,6 @@ public class PdfUtil {
* *
* @param inputStream PDF document * @param inputStream PDF document
* @return Render of the first page * @return Render of the first page
* @throws IOException
*/ */
public static BufferedImage renderFirstPage(InputStream inputStream) throws IOException { public static BufferedImage renderFirstPage(InputStream inputStream) throws IOException {
try (PDDocument pdfDocument = PDDocument.load(inputStream)) { try (PDDocument pdfDocument = PDDocument.load(inputStream)) {

View File

@ -78,22 +78,26 @@ public class MimeTypeUtil {
*/ */
public static String getFileExtension(String mimeType) { public static String getFileExtension(String mimeType) {
switch (mimeType) { switch (mimeType) {
case MimeType.APPLICATION_ZIP: case MimeType.APPLICATION_ZIP:
return "zip"; return "zip";
case MimeType.IMAGE_GIF: case MimeType.IMAGE_GIF:
return "gif"; return "gif";
case MimeType.IMAGE_JPEG: case MimeType.IMAGE_JPEG:
return "jpg"; return "jpg";
case MimeType.IMAGE_PNG: case MimeType.IMAGE_PNG:
return "png"; return "png";
case MimeType.APPLICATION_PDF: case MimeType.APPLICATION_PDF:
return "pdf"; return "pdf";
case MimeType.OPEN_DOCUMENT_TEXT: case MimeType.OPEN_DOCUMENT_TEXT:
return "odt"; return "odt";
case MimeType.OFFICE_DOCUMENT: case MimeType.OFFICE_DOCUMENT:
return "docx"; return "docx";
default: case MimeType.TEXT_PLAIN:
return "bin"; return "txt";
case MimeType.TEXT_CSV:
return "csv";
default:
return "bin";
} }
} }

View File

@ -545,4 +545,63 @@ public class TestDocumentResource extends BaseJerseyTest {
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null)); Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
} }
/**
 * Test plain text extraction.
 *
 * Uploads the plain text resource file/document.txt to a new document,
 * checks that a full content search for "love" returns exactly that
 * document, then checks that the file thumbnail is served as a JPEG.
 *
 * @throws Exception e
 */
@Test
public void testPlainTextExtraction() throws Exception {
    // Login document_plain
    clientUtil.createUser("document_plain");
    String documentPlainToken = clientUtil.login("document_plain");

    // Create a document
    long create1Date = new Date().getTime();
    JsonObject json = target().path("/document").request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
            .put(Entity.form(new Form()
                    .param("title", "My super title document 1")
                    .param("description", "My super description for document 1")
                    .param("language", "eng")
                    .param("create_date", Long.toString(create1Date))), JsonObject.class);
    String document1Id = json.getString("id");
    Assert.assertNotNull(document1Id);

    // Add a plain text file
    String file1Id;
    try (InputStream is = Resources.getResource("file/document.txt").openStream()) {
        StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.txt");
        try (FormDataMultiPart multiPart = new FormDataMultiPart()) {
            json = target()
                    .register(MultiPartFeature.class)
                    .path("/file").request()
                    .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
                    .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart),
                            MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class);
            file1Id = json.getString("id");
            Assert.assertNotNull(file1Id);
        }
    }

    // Search documents by query in full content
    json = target().path("/document/list")
            .queryParam("search", "full:love")
            .request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
            .get(JsonObject.class);
    // assertEquals reports the actual count on failure, unlike assertTrue(size == 1)
    Assert.assertEquals(1, json.getJsonArray("documents").size());

    // Get the file thumbnail data
    Response response = target().path("/file/" + file1Id + "/data")
            .queryParam("size", "thumb")
            .request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
            .get();
    byte[] fileBytes;
    // Close the entity stream once fully read so the connection is released
    try (InputStream is = (InputStream) response.getEntity()) {
        fileBytes = ByteStreams.toByteArray(is);
    }
    Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
    Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
}
} }

View File

@ -0,0 +1,2 @@
This is a test document
Please love me