Closes #639: rework mime type resolution using java api

This commit is contained in:
bgamard 2022-05-16 18:44:26 +02:00
parent 1ccce3f942
commit 1f7c0afc1e
11 changed files with 52 additions and 109 deletions

View File

@ -13,7 +13,7 @@ public class MimeType {
public static final String IMAGE_GIF = "image/gif";
public static final String APPLICATION_ZIP = "application/zip";
public static final String APPLICATION_PDF = "application/pdf";
public static final String OPEN_DOCUMENT_TEXT = "application/vnd.oasis.opendocument.text";

View File

@ -1,16 +1,9 @@
package com.sismics.util.mime;
import com.google.common.base.Charsets;
import org.apache.commons.compress.utils.IOUtils;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
/**
* Utility to check MIME types.
@ -19,7 +12,7 @@ import java.util.zip.ZipInputStream;
*/
public class MimeTypeUtil {
/**
* Try to guess the MIME type of a file by its magic number (header).
* Try to guess the MIME type of a file.
*
* @param file File to inspect
* @param name File name
@ -27,59 +20,17 @@ public class MimeTypeUtil {
* @throws IOException e
*/
public static String guessMimeType(Path file, String name) throws IOException {
String mimeType = name == null ?
null : URLConnection.getFileNameMap().getContentTypeFor(name);
String mimeType = Files.probeContentType(file);
if (mimeType == null && name != null) {
mimeType = URLConnection.getFileNameMap().getContentTypeFor(name);
}
if (mimeType == null) {
try (InputStream is = Files.newInputStream(file)) {
final byte[] headerBytes = new byte[64];
is.read(headerBytes);
mimeType = guessMimeType(headerBytes, name);
}
return MimeType.DEFAULT;
}
return guessOpenDocumentFormat(mimeType, file);
}
/**
* Try to guess the MIME type of a file by its magic number (header).
*
* @param headerBytes File header (first bytes)
* @param name File name
* @return MIME type
*/
public static String guessMimeType(byte[] headerBytes, String name) {
String header = new String(headerBytes, StandardCharsets.US_ASCII);
// Detect by header bytes
if (header.startsWith("PK")) {
return MimeType.APPLICATION_ZIP;
} else if (header.startsWith("GIF87a") || header.startsWith("GIF89a")) {
return MimeType.IMAGE_GIF;
} else if (headerBytes[0] == ((byte) 0xff) && headerBytes[1] == ((byte) 0xd8)) {
return MimeType.IMAGE_JPEG;
} else if (headerBytes[0] == ((byte) 0x89) && headerBytes[1] == ((byte) 0x50) && headerBytes[2] == ((byte) 0x4e) && headerBytes[3] == ((byte) 0x47) &&
headerBytes[4] == ((byte) 0x0d) && headerBytes[5] == ((byte) 0x0a) && headerBytes[6] == ((byte) 0x1a) && headerBytes[7] == ((byte) 0x0a)) {
return MimeType.IMAGE_PNG;
} else if (headerBytes[0] == ((byte) 0x25) && headerBytes[1] == ((byte) 0x50) && headerBytes[2] == ((byte) 0x44) && headerBytes[3] == ((byte) 0x46)) {
return MimeType.APPLICATION_PDF;
} else if (headerBytes[0] == ((byte) 0x00) && headerBytes[1] == ((byte) 0x00) && headerBytes[2] == ((byte) 0x00)
&& (headerBytes[3] == ((byte) 0x14) || headerBytes[3] == ((byte) 0x18) || headerBytes[3] == ((byte) 0x20))
&& headerBytes[4] == ((byte) 0x66) && headerBytes[5] == ((byte) 0x74) && headerBytes[6] == ((byte) 0x79) && headerBytes[7] == ((byte) 0x70)) {
return MimeType.VIDEO_MP4;
} else if (headerBytes[0] == ((byte) 0x1a) && headerBytes[1] == ((byte) 0x45) && headerBytes[2] == ((byte) 0xdf) && headerBytes[3] == ((byte) 0xa3)) {
return MimeType.VIDEO_WEBM;
}
// Detect by file extension
if (name != null) {
if (name.endsWith(".txt")) {
return MimeType.TEXT_PLAIN;
} else if (name.endsWith(".csv")) {
return MimeType.TEXT_CSV;
}
}
return MimeType.DEFAULT;
return mimeType;
}
/**
@ -116,52 +67,4 @@ public class MimeTypeUtil {
return "bin";
}
}
/**
* Guess the MIME type of open document formats (docx and odt).
* It's more costly than the simple header check, but needed because open document formats
* are simple ZIP files on the outside and much bigger on the inside.
*
* @param mimeType Currently detected MIME type
* @param file File on disk
* @return MIME type
*/
private static String guessOpenDocumentFormat(String mimeType, Path file) {
if (!MimeType.APPLICATION_ZIP.equals(mimeType)) {
// open document formats are ZIP files
return mimeType;
}
try (InputStream inputStream = Files.newInputStream(file);
ZipInputStream zipInputStream = new ZipInputStream(inputStream, Charsets.ISO_8859_1)) {
ZipEntry archiveEntry = zipInputStream.getNextEntry();
while (archiveEntry != null) {
if (archiveEntry.getName().equals("mimetype")) {
// Maybe it's an ODT file
String content = new String(IOUtils.toByteArray(zipInputStream), Charsets.ISO_8859_1);
if (MimeType.OPEN_DOCUMENT_TEXT.equals(content.trim())) {
mimeType = MimeType.OPEN_DOCUMENT_TEXT;
break;
}
} else if (archiveEntry.getName().equals("[Content_Types].xml")) {
// Maybe it's a DOCX file
String content = new String(IOUtils.toByteArray(zipInputStream), Charsets.ISO_8859_1);
if (content.contains(MimeType.OFFICE_DOCUMENT)) {
mimeType = MimeType.OFFICE_DOCUMENT;
break;
} else if (content.contains(MimeType.OFFICE_PRESENTATION)) {
mimeType = MimeType.OFFICE_PRESENTATION;
break;
}
}
archiveEntry = zipInputStream.getNextEntry();
}
} catch (Exception e) {
// In case of any error, just give up and keep the ZIP MIME type
return mimeType;
}
return mimeType;
}
}

View File

@ -15,7 +15,7 @@ import java.nio.file.Paths;
*/
public class TestMimeTypeUtil {
@Test
public void guessOpenDocumentFormatTest() throws Exception {
public void test() throws Exception {
// Detect ODT files
Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessMimeType(path, "document.odt"));
@ -28,7 +28,44 @@ public class TestMimeTypeUtil {
path = Paths.get(ClassLoader.getSystemResource("file/apache.pptx").toURI());
Assert.assertEquals(MimeType.OFFICE_PRESENTATION, MimeTypeUtil.guessMimeType(path, "apache.pptx"));
// Detect XLSX files
path = Paths.get(ClassLoader.getSystemResource("file/document.xlsx").toURI());
Assert.assertEquals(MimeType.OFFICE_SHEET, MimeTypeUtil.guessMimeType(path, "document.xlsx"));
// Detect TXT files
Assert.assertEquals(MimeType.TEXT_PLAIN, MimeTypeUtil.guessMimeType(path, "file.txt"));
path = Paths.get(ClassLoader.getSystemResource("file/document.txt").toURI());
Assert.assertEquals(MimeType.TEXT_PLAIN, MimeTypeUtil.guessMimeType(path, "document.txt"));
// Detect CSV files
path = Paths.get(ClassLoader.getSystemResource("file/document.csv").toURI());
Assert.assertEquals(MimeType.TEXT_CSV, MimeTypeUtil.guessMimeType(path, "document.csv"));
// Detect PDF files
path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI());
Assert.assertEquals(MimeType.APPLICATION_PDF, MimeTypeUtil.guessMimeType(path, "udhr.pdf"));
// Detect JPEG files
path = Paths.get(ClassLoader.getSystemResource("file/apollo_portrait.jpg").toURI());
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(path, "apollo_portrait.jpg"));
// Detect GIF files
path = Paths.get(ClassLoader.getSystemResource("file/image.gif").toURI());
Assert.assertEquals(MimeType.IMAGE_GIF, MimeTypeUtil.guessMimeType(path, "image.gif"));
// Detect PNG files
path = Paths.get(ClassLoader.getSystemResource("file/image.png").toURI());
Assert.assertEquals(MimeType.IMAGE_PNG, MimeTypeUtil.guessMimeType(path, "image.png"));
// Detect ZIP files
path = Paths.get(ClassLoader.getSystemResource("file/document.zip").toURI());
Assert.assertEquals(MimeType.APPLICATION_ZIP, MimeTypeUtil.guessMimeType(path, "document.zip"));
// Detect WEBM files
path = Paths.get(ClassLoader.getSystemResource("file/video.webm").toURI());
Assert.assertEquals(MimeType.VIDEO_WEBM, MimeTypeUtil.guessMimeType(path, "video.webm"));
// Detect MP4 files
path = Paths.get(ClassLoader.getSystemResource("file/video.mp4").toURI());
Assert.assertEquals(MimeType.VIDEO_MP4, MimeTypeUtil.guessMimeType(path, "video.mp4"));
}
}

View File

@ -0,0 +1,2 @@
col1,col2
test,me
1 col1 col2
2 test me

View File

@ -0,0 +1 @@
test me.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.4 KiB

Binary file not shown.

Binary file not shown.