mirror of
https://github.com/sismics/docs.git
synced 2024-11-21 21:47:57 +01:00
Delete files from storage when necessary,
batch to cleanup storage for orphan files, better Lucene directory reader management
This commit is contained in:
parent
00ed2e3c25
commit
6b5c1b2b51
@ -10,8 +10,6 @@ import java.util.Set;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.TermsFilter;
|
||||
@ -23,8 +21,6 @@ import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.sismics.docs.core.model.context.AppContext;
|
||||
import com.sismics.docs.core.model.jpa.Document;
|
||||
@ -38,11 +34,6 @@ import com.sismics.docs.core.util.LuceneUtil.LuceneRunnable;
|
||||
* @author bgamard
|
||||
*/
|
||||
public class LuceneDao {
|
||||
/**
|
||||
* Logger.
|
||||
*/
|
||||
private static final Logger log = LoggerFactory.getLogger(LuceneDao.class);
|
||||
|
||||
/**
|
||||
* Destroy and rebuild index.
|
||||
*
|
||||
@ -178,17 +169,12 @@ public class LuceneDao {
|
||||
TermsFilter userFilter = new TermsFilter(terms);
|
||||
|
||||
// Search
|
||||
Set<String> documentIdList = new HashSet<String>();
|
||||
if (!DirectoryReader.indexExists(AppContext.getInstance().getLuceneDirectory())) {
|
||||
log.warn("Lucene directory not yet initialized");
|
||||
return documentIdList;
|
||||
}
|
||||
IndexReader reader = DirectoryReader.open(AppContext.getInstance().getLuceneDirectory());
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
IndexSearcher searcher = new IndexSearcher(AppContext.getInstance().getIndexingService().getDirectoryReader());
|
||||
TopDocs topDocs = searcher.search(query, userFilter, Integer.MAX_VALUE);
|
||||
ScoreDoc[] docs = topDocs.scoreDocs;
|
||||
|
||||
// Extract document IDs
|
||||
Set<String> documentIdList = new HashSet<String>();
|
||||
for (int i = 0; i < docs.length; i++) {
|
||||
org.apache.lucene.document.Document document = searcher.doc(docs[i].doc);
|
||||
String type = document.get("type");
|
||||
@ -201,8 +187,6 @@ public class LuceneDao {
|
||||
documentIdList.add(documentId);
|
||||
}
|
||||
|
||||
reader.close();
|
||||
|
||||
return documentIdList;
|
||||
}
|
||||
|
||||
|
@ -1,11 +1,15 @@
|
||||
package com.sismics.docs.core.listener.async;
|
||||
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.eventbus.Subscribe;
|
||||
import com.sismics.docs.core.dao.lucene.LuceneDao;
|
||||
import com.sismics.docs.core.event.FileDeletedAsyncEvent;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.docs.core.util.DirectoryUtil;
|
||||
|
||||
/**
|
||||
* Listener on file deleted.
|
||||
@ -30,10 +34,20 @@ public class FileDeletedAsyncListener {
|
||||
log.info("File deleted event: " + fileDeletedAsyncEvent.toString());
|
||||
}
|
||||
|
||||
// TODO Delete the file from storage
|
||||
// Delete the file from storage
|
||||
File file = fileDeletedAsyncEvent.getFile();
|
||||
java.io.File thumbnailFile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId() + "_thumb").toFile();
|
||||
java.io.File storedFile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
|
||||
|
||||
if (thumbnailFile.exists()) {
|
||||
thumbnailFile.delete();
|
||||
}
|
||||
if (storedFile.exists()) {
|
||||
storedFile.delete();
|
||||
}
|
||||
|
||||
// Update Lucene index
|
||||
LuceneDao luceneDao = new LuceneDao();
|
||||
luceneDao.deleteDocument(fileDeletedAsyncEvent.getFile().getId());
|
||||
luceneDao.deleteDocument(file.getId());
|
||||
}
|
||||
}
|
||||
|
@ -7,8 +7,6 @@ import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.ThreadPoolExecutor;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
import com.google.common.eventbus.AsyncEventBus;
|
||||
import com.google.common.eventbus.EventBus;
|
||||
import com.sismics.docs.core.constant.ConfigType;
|
||||
@ -51,11 +49,6 @@ public class AppContext {
|
||||
*/
|
||||
private IndexingService indexingService;
|
||||
|
||||
/**
|
||||
* Lucene directory.
|
||||
*/
|
||||
private Directory luceneDirectory;
|
||||
|
||||
/**
|
||||
* Asynchronous executors.
|
||||
*/
|
||||
@ -71,8 +64,6 @@ public class AppContext {
|
||||
Config luceneStorageConfig = configDao.getById(ConfigType.LUCENE_DIRECTORY_STORAGE);
|
||||
indexingService = new IndexingService(luceneStorageConfig != null ? luceneStorageConfig.getValue() : null);
|
||||
indexingService.startAndWait();
|
||||
|
||||
luceneDirectory = indexingService.getDirectory();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -165,20 +156,11 @@ public class AppContext {
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of feedService.
|
||||
* Getter of indexingService.
|
||||
*
|
||||
* @return feedService
|
||||
* @return indexingService
|
||||
*/
|
||||
public IndexingService getIndexingService() {
|
||||
return indexingService;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of luceneDirectory.
|
||||
*
|
||||
* @return the luceneDirectory
|
||||
*/
|
||||
public Directory getLuceneDirectory() {
|
||||
return luceneDirectory;
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,7 @@ import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.store.SimpleFSDirectory;
|
||||
@ -34,6 +35,11 @@ public class IndexingService extends AbstractScheduledService {
|
||||
*/
|
||||
private Directory directory;
|
||||
|
||||
/**
|
||||
* Directory reader.
|
||||
*/
|
||||
private DirectoryReader directoryReader;
|
||||
|
||||
/**
|
||||
* Lucene storage config.
|
||||
*/
|
||||
@ -62,10 +68,16 @@ public class IndexingService extends AbstractScheduledService {
|
||||
|
||||
@Override
|
||||
protected void shutDown() {
|
||||
Directory luceneIndex = AppContext.getInstance().getLuceneDirectory();
|
||||
if (luceneIndex != null) {
|
||||
if (directoryReader != null) {
|
||||
try {
|
||||
luceneIndex.close();
|
||||
directoryReader.close();
|
||||
} catch (IOException e) {
|
||||
log.error("Error closing the index reader", e);
|
||||
}
|
||||
}
|
||||
if (directory != null) {
|
||||
try {
|
||||
directory.close();
|
||||
} catch (IOException e) {
|
||||
log.error("Error closing Lucene index", e);
|
||||
}
|
||||
@ -105,4 +117,36 @@ public class IndexingService extends AbstractScheduledService {
|
||||
public Directory getDirectory() {
|
||||
return directory;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a valid directory reader.
|
||||
* Take care of reopening the reader if the index has changed
|
||||
* and closing the previous one.
|
||||
*
|
||||
* @return the directoryReader
|
||||
*/
|
||||
public DirectoryReader getDirectoryReader() {
|
||||
if (directoryReader == null) {
|
||||
if (!DirectoryReader.indexExists(directory)) {
|
||||
log.info("Lucene directory not yet created");
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
directoryReader = DirectoryReader.open(directory);
|
||||
} catch (IOException e) {
|
||||
log.error("Error creating the directory reader", e);
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
DirectoryReader newReader = DirectoryReader.openIfChanged(directoryReader);
|
||||
if (newReader != null) {
|
||||
directoryReader.close();
|
||||
directoryReader = newReader;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("Error while reopening the directory reader", e);
|
||||
}
|
||||
}
|
||||
return directoryReader;
|
||||
}
|
||||
}
|
||||
|
@ -37,7 +37,7 @@ public class LuceneUtil {
|
||||
config.setMergeScheduler(new SerialMergeScheduler());
|
||||
|
||||
// Creating index writer
|
||||
Directory directory = AppContext.getInstance().getLuceneDirectory();
|
||||
Directory directory = AppContext.getInstance().getIndexingService().getDirectory();
|
||||
IndexWriter indexWriter = null;
|
||||
try {
|
||||
indexWriter = new IndexWriter(directory, config);
|
||||
|
@ -1,7 +1,6 @@
|
||||
- New image rescale between thumbnail and original (client/server)
|
||||
- Batch to regenerate all thumbnails (server)
|
||||
- Delete files on FS when a file in database is deleted (server)
|
||||
- Batch to delete unused files on FS (server)
|
||||
- Special criteria to search inside OCR-ed content (eg. full:uranium) (server)
|
||||
- Special criteria to search on a specific time span (eg. at:2013-06) (server)
|
||||
- Show help on special criteria (client)
|
||||
- Disable Add/Edit button while uploading (client)
|
@ -1,7 +1,9 @@
|
||||
package com.sismics.docs.rest.resource;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.ResourceBundle;
|
||||
|
||||
import javax.ws.rs.GET;
|
||||
@ -19,11 +21,14 @@ import org.codehaus.jettison.json.JSONException;
|
||||
import org.codehaus.jettison.json.JSONObject;
|
||||
|
||||
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
||||
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
|
||||
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
|
||||
import com.sismics.docs.core.event.OcrFileAsyncEvent;
|
||||
import com.sismics.docs.core.model.context.AppContext;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.docs.core.util.ConfigUtil;
|
||||
import com.sismics.docs.core.util.DirectoryUtil;
|
||||
import com.sismics.docs.core.util.jpa.PaginatedList;
|
||||
import com.sismics.docs.core.util.jpa.PaginatedLists;
|
||||
import com.sismics.docs.core.util.jpa.SortCriteria;
|
||||
@ -177,12 +182,53 @@ public class AppResource extends BaseResource {
|
||||
}
|
||||
checkBaseFunction(BaseFunction.ADMIN);
|
||||
|
||||
JSONObject response = new JSONObject();
|
||||
try {
|
||||
AppContext.getInstance().getIndexingService().rebuildIndex();
|
||||
} catch (Exception e) {
|
||||
throw new ServerException("IndexingError", "Error rebuilding index", e);
|
||||
}
|
||||
|
||||
JSONObject response = new JSONObject();
|
||||
response.put("status", "ok");
|
||||
return Response.ok().entity(response).build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Destroy and rebuild Lucene index.
|
||||
*
|
||||
* @return Response
|
||||
* @throws JSONException
|
||||
*/
|
||||
@POST
|
||||
@Path("batch/clean_storage")
|
||||
@Produces(MediaType.APPLICATION_JSON)
|
||||
public Response batchCleanStorage() throws JSONException {
|
||||
if (!authenticate()) {
|
||||
throw new ForbiddenClientException();
|
||||
}
|
||||
checkBaseFunction(BaseFunction.ADMIN);
|
||||
|
||||
// Get all files
|
||||
FileDao fileDao = new FileDao();
|
||||
List<File> fileList = fileDao.findAll();
|
||||
Map<String, File> fileMap = new HashMap<>();
|
||||
for (File file : fileList) {
|
||||
fileMap.put(file.getId(), file);
|
||||
}
|
||||
|
||||
// Check if each stored file is valid
|
||||
java.io.File[] storedFileList = DirectoryUtil.getStorageDirectory().listFiles();
|
||||
for (java.io.File storedFile : storedFileList) {
|
||||
String fileName = storedFile.getName();
|
||||
if (fileName.endsWith("_thumb")) {
|
||||
fileName = fileName.replace("_thumb", "");
|
||||
}
|
||||
if (!fileMap.containsKey(fileName)) {
|
||||
storedFile.delete();
|
||||
}
|
||||
}
|
||||
|
||||
JSONObject response = new JSONObject();
|
||||
response.put("status", "ok");
|
||||
return Response.ok().entity(response).build();
|
||||
}
|
||||
|
@ -55,6 +55,12 @@ public class TestAppResource extends BaseJerseyTest {
|
||||
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
|
||||
response = appResource.post(ClientResponse.class);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
|
||||
// Clean storage
|
||||
appResource = resource().path("/app/batch/clean_storage");
|
||||
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
|
||||
response = appResource.post(ClientResponse.class);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -284,6 +284,15 @@ public class TestDocumentResource extends BaseJerseyTest {
|
||||
json = response.getEntity(JSONObject.class);
|
||||
Assert.assertEquals(document1Id, json.getString("id"));
|
||||
|
||||
// Search documents by query
|
||||
documentResource = resource().path("/document/list");
|
||||
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||
getParams = new MultivaluedMapImpl();
|
||||
getParams.putSingle("search", "super");
|
||||
response = documentResource.queryParams(getParams).get(ClientResponse.class);
|
||||
json = response.getEntity(JSONObject.class);
|
||||
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
|
||||
|
||||
// Get a document
|
||||
documentResource = resource().path("/document/" + document1Id);
|
||||
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
|
||||
|
@ -2,6 +2,7 @@ package com.sismics.docs.rest;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import javax.ws.rs.core.MediaType;
|
||||
|
||||
@ -12,6 +13,7 @@ import org.codehaus.jettison.json.JSONObject;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.google.common.io.ByteStreams;
|
||||
import com.sismics.docs.core.util.DirectoryUtil;
|
||||
import com.sismics.docs.rest.filter.CookieAuthenticationFilter;
|
||||
import com.sun.jersey.api.client.ClientResponse;
|
||||
import com.sun.jersey.api.client.ClientResponse.Status;
|
||||
@ -145,6 +147,12 @@ public class TestFileResource extends BaseJerseyTest {
|
||||
json = response.getEntity(JSONObject.class);
|
||||
Assert.assertEquals("ok", json.getString("status"));
|
||||
|
||||
// Check that files are deleted from FS
|
||||
java.io.File thumbnailFile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file1Id + "_thumb").toFile();
|
||||
java.io.File storedFile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file1Id).toFile();
|
||||
Assert.assertFalse(thumbnailFile.exists());
|
||||
Assert.assertFalse(storedFile.exists());
|
||||
|
||||
// Get all files from a document
|
||||
fileResource = resource().path("/file/list");
|
||||
fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken));
|
||||
|
Loading…
Reference in New Issue
Block a user