mirror of
https://github.com/sismics/docs.git
synced 2024-11-25 15:17:57 +01:00
#118: extract text content from text plain files (WIP)
This commit is contained in:
parent
dcc7fe55f4
commit
330de495db
@ -64,11 +64,12 @@ public class FileUtil {
|
|||||||
private static String ocrFile(InputStream inputStream, String language) {
|
private static String ocrFile(InputStream inputStream, String language) {
|
||||||
Tesseract instance = Tesseract.getInstance();
|
Tesseract instance = Tesseract.getInstance();
|
||||||
String content = null;
|
String content = null;
|
||||||
BufferedImage image = null;
|
BufferedImage image;
|
||||||
try {
|
try {
|
||||||
image = ImageIO.read(inputStream);
|
image = ImageIO.read(inputStream);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
log.error("Error reading the image", e);
|
log.error("Error reading the image", e);
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Upscale and grayscale the image
|
// Upscale and grayscale the image
|
||||||
@ -92,10 +93,9 @@ public class FileUtil {
|
|||||||
* Save a file on the storage filesystem.
|
* Save a file on the storage filesystem.
|
||||||
*
|
*
|
||||||
* @param inputStream Unencrypted input stream
|
* @param inputStream Unencrypted input stream
|
||||||
* @param pdf
|
* @param pdfInputStream PDF input stream
|
||||||
* @param file File to save
|
* @param file File to save
|
||||||
* @param privateKey Private key used for encryption
|
* @param privateKey Private key used for encryption
|
||||||
* @throws Exception
|
|
||||||
*/
|
*/
|
||||||
public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception {
|
public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception {
|
||||||
Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey);
|
Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey);
|
||||||
@ -114,9 +114,8 @@ public class FileUtil {
|
|||||||
* @param inputStream Unencrypted input stream
|
* @param inputStream Unencrypted input stream
|
||||||
* @param pdfInputStream Unencrypted PDF input stream
|
* @param pdfInputStream Unencrypted PDF input stream
|
||||||
* @param cipher Cipher to use for encryption
|
* @param cipher Cipher to use for encryption
|
||||||
* @throws Exception
|
|
||||||
*/
|
*/
|
||||||
public static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception {
|
private static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception {
|
||||||
BufferedImage image = null;
|
BufferedImage image = null;
|
||||||
if (ImageUtil.isImage(file.getMimeType())) {
|
if (ImageUtil.isImage(file.getMimeType())) {
|
||||||
image = ImageIO.read(inputStream);
|
image = ImageIO.read(inputStream);
|
||||||
@ -151,7 +150,6 @@ public class FileUtil {
|
|||||||
* Remove a file from the storage filesystem.
|
* Remove a file from the storage filesystem.
|
||||||
*
|
*
|
||||||
* @param file File to delete
|
* @param file File to delete
|
||||||
* @throws IOException
|
|
||||||
*/
|
*/
|
||||||
public static void delete(File file) throws IOException {
|
public static void delete(File file) throws IOException {
|
||||||
Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId());
|
Path storedFile = DirectoryUtil.getStorageDirectory().resolve(file.getId());
|
||||||
|
@ -86,7 +86,6 @@ public class PdfUtil {
|
|||||||
* @param inputStream InputStream
|
* @param inputStream InputStream
|
||||||
* @param reset Reset the stream after usage
|
* @param reset Reset the stream after usage
|
||||||
* @return PDF input stream
|
* @return PDF input stream
|
||||||
* @throws Exception
|
|
||||||
*/
|
*/
|
||||||
public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception {
|
public static InputStream convertToPdf(File file, InputStream inputStream, boolean reset) throws Exception {
|
||||||
if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
|
if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
|
||||||
@ -101,18 +100,36 @@ public class PdfUtil {
|
|||||||
if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
|
if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
|
||||||
return convertOpenDocumentText(inputStream, reset);
|
return convertOpenDocumentText(inputStream, reset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (file.getMimeType().equals(MimeType.TEXT_PLAIN) || file.getMimeType().equals(MimeType.TEXT_CSV)) {
|
||||||
|
return convertTextPlain(inputStream, reset);
|
||||||
|
}
|
||||||
|
|
||||||
// PDF conversion not necessary/possible
|
// PDF conversion not necessary/possible
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert a text plain document to PDF.
|
||||||
|
*
|
||||||
|
* @param inputStream Unencrypted input stream
|
||||||
|
* @param reset Reset the stream after usage
|
||||||
|
* @return PDF input stream
|
||||||
|
*/
|
||||||
|
private static InputStream convertTextPlain(InputStream inputStream, boolean reset) throws Exception {
|
||||||
|
if (reset) {
|
||||||
|
inputStream.reset();
|
||||||
|
}
|
||||||
|
// TODO Create a PDF from the text plain
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert an open document text file to PDF.
|
* Convert an open document text file to PDF.
|
||||||
*
|
*
|
||||||
* @param inputStream Unencrypted input stream
|
* @param inputStream Unencrypted input stream
|
||||||
* @param reset Reset the stream after usage
|
* @param reset Reset the stream after usage
|
||||||
* @return PDF input stream
|
* @return PDF input stream
|
||||||
* @throws Exception
|
|
||||||
*/
|
*/
|
||||||
private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception {
|
private static InputStream convertOpenDocumentText(InputStream inputStream, boolean reset) throws Exception {
|
||||||
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
|
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
|
||||||
@ -131,7 +148,6 @@ public class PdfUtil {
|
|||||||
* @param inputStream Unencrypted input stream
|
* @param inputStream Unencrypted input stream
|
||||||
* @param reset Reset the stream after usage
|
* @param reset Reset the stream after usage
|
||||||
* @return PDF input stream
|
* @return PDF input stream
|
||||||
* @throws Exception
|
|
||||||
*/
|
*/
|
||||||
private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception {
|
private static InputStream convertOfficeDocument(InputStream inputStream, boolean reset) throws Exception {
|
||||||
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
|
ByteArrayOutputStream pdfOutputStream = new ByteArrayOutputStream();
|
||||||
@ -153,7 +169,6 @@ public class PdfUtil {
|
|||||||
* @param metadata Add a page with metadata
|
* @param metadata Add a page with metadata
|
||||||
* @param margin Margins in millimeters
|
* @param margin Margins in millimeters
|
||||||
* @return PDF input stream
|
* @return PDF input stream
|
||||||
* @throws IOException
|
|
||||||
*/
|
*/
|
||||||
public static InputStream convertToPdf(DocumentDto documentDto, List<File> fileList,
|
public static InputStream convertToPdf(DocumentDto documentDto, List<File> fileList,
|
||||||
boolean fitImageToPage, boolean metadata, int margin) throws Exception {
|
boolean fitImageToPage, boolean metadata, int margin) throws Exception {
|
||||||
@ -282,7 +297,6 @@ public class PdfUtil {
|
|||||||
*
|
*
|
||||||
* @param inputStream PDF document
|
* @param inputStream PDF document
|
||||||
* @return Render of the first page
|
* @return Render of the first page
|
||||||
* @throws IOException
|
|
||||||
*/
|
*/
|
||||||
public static BufferedImage renderFirstPage(InputStream inputStream) throws IOException {
|
public static BufferedImage renderFirstPage(InputStream inputStream) throws IOException {
|
||||||
try (PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
try (PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||||
|
@ -78,22 +78,26 @@ public class MimeTypeUtil {
|
|||||||
*/
|
*/
|
||||||
public static String getFileExtension(String mimeType) {
|
public static String getFileExtension(String mimeType) {
|
||||||
switch (mimeType) {
|
switch (mimeType) {
|
||||||
case MimeType.APPLICATION_ZIP:
|
case MimeType.APPLICATION_ZIP:
|
||||||
return "zip";
|
return "zip";
|
||||||
case MimeType.IMAGE_GIF:
|
case MimeType.IMAGE_GIF:
|
||||||
return "gif";
|
return "gif";
|
||||||
case MimeType.IMAGE_JPEG:
|
case MimeType.IMAGE_JPEG:
|
||||||
return "jpg";
|
return "jpg";
|
||||||
case MimeType.IMAGE_PNG:
|
case MimeType.IMAGE_PNG:
|
||||||
return "png";
|
return "png";
|
||||||
case MimeType.APPLICATION_PDF:
|
case MimeType.APPLICATION_PDF:
|
||||||
return "pdf";
|
return "pdf";
|
||||||
case MimeType.OPEN_DOCUMENT_TEXT:
|
case MimeType.OPEN_DOCUMENT_TEXT:
|
||||||
return "odt";
|
return "odt";
|
||||||
case MimeType.OFFICE_DOCUMENT:
|
case MimeType.OFFICE_DOCUMENT:
|
||||||
return "docx";
|
return "docx";
|
||||||
default:
|
case MimeType.TEXT_PLAIN:
|
||||||
return "bin";
|
return "txt";
|
||||||
|
case MimeType.TEXT_CSV:
|
||||||
|
return "csv";
|
||||||
|
default:
|
||||||
|
return "bin";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -545,4 +545,63 @@ public class TestDocumentResource extends BaseJerseyTest {
|
|||||||
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
|
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
|
||||||
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
|
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test plain text extraction.
|
||||||
|
*
|
||||||
|
* @throws Exception e
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testPlainTextExtraction() throws Exception {
|
||||||
|
// Login document_plain
|
||||||
|
clientUtil.createUser("document_plain");
|
||||||
|
String documentPlainToken = clientUtil.login("document_plain");
|
||||||
|
|
||||||
|
// Create a document
|
||||||
|
long create1Date = new Date().getTime();
|
||||||
|
JsonObject json = target().path("/document").request()
|
||||||
|
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
|
||||||
|
.put(Entity.form(new Form()
|
||||||
|
.param("title", "My super title document 1")
|
||||||
|
.param("description", "My super description for document 1")
|
||||||
|
.param("language", "eng")
|
||||||
|
.param("create_date", Long.toString(create1Date))), JsonObject.class);
|
||||||
|
String document1Id = json.getString("id");
|
||||||
|
Assert.assertNotNull(document1Id);
|
||||||
|
|
||||||
|
// Add a plain text file
|
||||||
|
String file1Id;
|
||||||
|
try (InputStream is = Resources.getResource("file/document.txt").openStream()) {
|
||||||
|
StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.txt");
|
||||||
|
try (FormDataMultiPart multiPart = new FormDataMultiPart()) {
|
||||||
|
json = target()
|
||||||
|
.register(MultiPartFeature.class)
|
||||||
|
.path("/file").request()
|
||||||
|
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
|
||||||
|
.put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart),
|
||||||
|
MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class);
|
||||||
|
file1Id = json.getString("id");
|
||||||
|
Assert.assertNotNull(file1Id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Search documents by query in full content
|
||||||
|
json = target().path("/document/list")
|
||||||
|
.queryParam("search", "full:love")
|
||||||
|
.request()
|
||||||
|
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
|
||||||
|
.get(JsonObject.class);
|
||||||
|
Assert.assertTrue(json.getJsonArray("documents").size() == 1);
|
||||||
|
|
||||||
|
// Get the file thumbnail data
|
||||||
|
Response response = target().path("/file/" + file1Id + "/data")
|
||||||
|
.queryParam("size", "thumb")
|
||||||
|
.request()
|
||||||
|
.cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
|
||||||
|
.get();
|
||||||
|
InputStream is = (InputStream) response.getEntity();
|
||||||
|
byte[] fileBytes = ByteStreams.toByteArray(is);
|
||||||
|
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
|
||||||
|
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
|
||||||
|
}
|
||||||
}
|
}
|
2
docs-web/src/test/resources/file/document.txt
Normal file
2
docs-web/src/test/resources/file/document.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
This is a test document
|
||||||
|
Please love me
|
Loading…
Reference in New Issue
Block a user