Delete files from storage when necessary,

batch to clean up storage for orphan files,
better Lucene directory reader management
This commit is contained in:
jendib 2013-08-18 00:53:01 +02:00
parent 00ed2e3c25
commit 6b5c1b2b51
10 changed files with 140 additions and 48 deletions

View File

@ -10,8 +10,6 @@ import java.util.Set;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField; import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField; import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsFilter; import org.apache.lucene.queries.TermsFilter;
@ -23,8 +21,6 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.sismics.docs.core.model.context.AppContext; import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.model.jpa.Document; import com.sismics.docs.core.model.jpa.Document;
@ -38,11 +34,6 @@ import com.sismics.docs.core.util.LuceneUtil.LuceneRunnable;
* @author bgamard * @author bgamard
*/ */
public class LuceneDao { public class LuceneDao {
/**
* Logger.
*/
private static final Logger log = LoggerFactory.getLogger(LuceneDao.class);
/** /**
* Destroy and rebuild index. * Destroy and rebuild index.
* *
@ -178,17 +169,12 @@ public class LuceneDao {
TermsFilter userFilter = new TermsFilter(terms); TermsFilter userFilter = new TermsFilter(terms);
// Search // Search
Set<String> documentIdList = new HashSet<String>(); IndexSearcher searcher = new IndexSearcher(AppContext.getInstance().getIndexingService().getDirectoryReader());
if (!DirectoryReader.indexExists(AppContext.getInstance().getLuceneDirectory())) {
log.warn("Lucene directory not yet initialized");
return documentIdList;
}
IndexReader reader = DirectoryReader.open(AppContext.getInstance().getLuceneDirectory());
IndexSearcher searcher = new IndexSearcher(reader);
TopDocs topDocs = searcher.search(query, userFilter, Integer.MAX_VALUE); TopDocs topDocs = searcher.search(query, userFilter, Integer.MAX_VALUE);
ScoreDoc[] docs = topDocs.scoreDocs; ScoreDoc[] docs = topDocs.scoreDocs;
// Extract document IDs // Extract document IDs
Set<String> documentIdList = new HashSet<String>();
for (int i = 0; i < docs.length; i++) { for (int i = 0; i < docs.length; i++) {
org.apache.lucene.document.Document document = searcher.doc(docs[i].doc); org.apache.lucene.document.Document document = searcher.doc(docs[i].doc);
String type = document.get("type"); String type = document.get("type");
@ -201,8 +187,6 @@ public class LuceneDao {
documentIdList.add(documentId); documentIdList.add(documentId);
} }
reader.close();
return documentIdList; return documentIdList;
} }

View File

@ -1,11 +1,15 @@
package com.sismics.docs.core.listener.async; package com.sismics.docs.core.listener.async;
import java.nio.file.Paths;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe; import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.dao.lucene.LuceneDao; import com.sismics.docs.core.dao.lucene.LuceneDao;
import com.sismics.docs.core.event.FileDeletedAsyncEvent; import com.sismics.docs.core.event.FileDeletedAsyncEvent;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.DirectoryUtil;
/** /**
* Listener on file deleted. * Listener on file deleted.
@ -30,10 +34,20 @@ public class FileDeletedAsyncListener {
log.info("File deleted event: " + fileDeletedAsyncEvent.toString()); log.info("File deleted event: " + fileDeletedAsyncEvent.toString());
} }
// TODO Delete the file from storage // Delete the file from storage
File file = fileDeletedAsyncEvent.getFile();
java.io.File thumbnailFile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId() + "_thumb").toFile();
java.io.File storedFile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
if (thumbnailFile.exists()) {
thumbnailFile.delete();
}
if (storedFile.exists()) {
storedFile.delete();
}
// Update Lucene index // Update Lucene index
LuceneDao luceneDao = new LuceneDao(); LuceneDao luceneDao = new LuceneDao();
luceneDao.deleteDocument(fileDeletedAsyncEvent.getFile().getId()); luceneDao.deleteDocument(file.getId());
} }
} }

View File

@ -7,8 +7,6 @@ import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import org.apache.lucene.store.Directory;
import com.google.common.eventbus.AsyncEventBus; import com.google.common.eventbus.AsyncEventBus;
import com.google.common.eventbus.EventBus; import com.google.common.eventbus.EventBus;
import com.sismics.docs.core.constant.ConfigType; import com.sismics.docs.core.constant.ConfigType;
@ -51,11 +49,6 @@ public class AppContext {
*/ */
private IndexingService indexingService; private IndexingService indexingService;
/**
* Lucene directory.
*/
private Directory luceneDirectory;
/** /**
* Asynchronous executors. * Asynchronous executors.
*/ */
@ -71,8 +64,6 @@ public class AppContext {
Config luceneStorageConfig = configDao.getById(ConfigType.LUCENE_DIRECTORY_STORAGE); Config luceneStorageConfig = configDao.getById(ConfigType.LUCENE_DIRECTORY_STORAGE);
indexingService = new IndexingService(luceneStorageConfig != null ? luceneStorageConfig.getValue() : null); indexingService = new IndexingService(luceneStorageConfig != null ? luceneStorageConfig.getValue() : null);
indexingService.startAndWait(); indexingService.startAndWait();
luceneDirectory = indexingService.getDirectory();
} }
/** /**
@ -165,20 +156,11 @@ public class AppContext {
} }
/** /**
* Getter of feedService. * Getter of indexingService.
* *
* @return feedService * @return indexingService
*/ */
public IndexingService getIndexingService() { public IndexingService getIndexingService() {
return indexingService; return indexingService;
} }
/**
* Getter of- luceneDirectory.
*
* @return the luceneDirectory
*/
public Directory getLuceneDirectory() {
return luceneDirectory;
}
} }

View File

@ -4,6 +4,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.store.SimpleFSDirectory;
@ -34,6 +35,11 @@ public class IndexingService extends AbstractScheduledService {
*/ */
private Directory directory; private Directory directory;
/**
* Directory reader.
*/
private DirectoryReader directoryReader;
/** /**
* Lucene storage config. * Lucene storage config.
*/ */
@ -62,10 +68,16 @@ public class IndexingService extends AbstractScheduledService {
@Override @Override
protected void shutDown() { protected void shutDown() {
Directory luceneIndex = AppContext.getInstance().getLuceneDirectory(); if (directoryReader != null) {
if (luceneIndex != null) {
try { try {
luceneIndex.close(); directoryReader.close();
} catch (IOException e) {
log.error("Error closing the index reader", e);
}
}
if (directory != null) {
try {
directory.close();
} catch (IOException e) { } catch (IOException e) {
log.error("Error closing Lucene index", e); log.error("Error closing Lucene index", e);
} }
@ -105,4 +117,36 @@ public class IndexingService extends AbstractScheduledService {
public Directory getDirectory() { public Directory getDirectory() {
return directory; return directory;
} }
/**
 * Returns a valid directory reader.
 * Opens the reader on first use, and on later calls reopens it if the index
 * has changed, closing the previous one.
 *
 * NOTE(review): this method reads and reassigns the directoryReader field
 * without any locking, and closes the previous reader on reopen — confirm
 * how callers use it concurrently.
 *
 * @return the directoryReader, or null if the index does not exist yet
 *         or if the reader could not be opened
 */
public DirectoryReader getDirectoryReader() {
    if (directoryReader == null) {
        // First use: nothing to open until an index has been written
        if (!DirectoryReader.indexExists(directory)) {
            log.info("Lucene directory not yet created");
            return null;
        }
        try {
            directoryReader = DirectoryReader.open(directory);
        } catch (IOException e) {
            log.error("Error creating the directory reader", e);
        }
        return directoryReader;
    }

    try {
        // openIfChanged returns null when the current reader is still up to date
        DirectoryReader newReader = DirectoryReader.openIfChanged(directoryReader);
        if (newReader != null) {
            // Install the new reader before closing the old one, so that a failure
            // while closing neither leaks the new reader nor keeps the stale one
            DirectoryReader previousReader = directoryReader;
            directoryReader = newReader;
            previousReader.close();
        }
    } catch (IOException e) {
        log.error("Error while reopening the directory reader", e);
    }
    return directoryReader;
}
} }

View File

@ -37,7 +37,7 @@ public class LuceneUtil {
config.setMergeScheduler(new SerialMergeScheduler()); config.setMergeScheduler(new SerialMergeScheduler());
// Creating index writer // Creating index writer
Directory directory = AppContext.getInstance().getLuceneDirectory(); Directory directory = AppContext.getInstance().getIndexingService().getDirectory();
IndexWriter indexWriter = null; IndexWriter indexWriter = null;
try { try {
indexWriter = new IndexWriter(directory, config); indexWriter = new IndexWriter(directory, config);

View File

@ -1,7 +1,6 @@
- New image rescale between thumbnail and original (client/server) - New image rescale between thumbnail and original (client/server)
- Batch to regenerate all thumbnails (server) - Batch to regenerate all thumbnails (server)
- Delete files on FS when a file in database is deleted (server)
- Batch to delete unused files on FS (server)
- Special criteria to search inside OCR-ed content (eg. full:uranium) (server) - Special criteria to search inside OCR-ed content (eg. full:uranium) (server)
- Special criteria to search on a specific time span (eg. at:2013-06) (server) - Special criteria to search on a specific time span (eg. at:2013-06) (server)
- Show help on special criteria (client) - Show help on special criteria (client)
- Disable Add/Edit button while uploading (client)

View File

@ -1,7 +1,9 @@
package com.sismics.docs.rest.resource; package com.sismics.docs.rest.resource;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.ResourceBundle; import java.util.ResourceBundle;
import javax.ws.rs.GET; import javax.ws.rs.GET;
@ -19,11 +21,14 @@ import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject; import org.codehaus.jettison.json.JSONObject;
import com.sismics.docs.core.dao.jpa.DocumentDao; import com.sismics.docs.core.dao.jpa.DocumentDao;
import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria; import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto; import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.event.OcrFileAsyncEvent; import com.sismics.docs.core.event.OcrFileAsyncEvent;
import com.sismics.docs.core.model.context.AppContext; import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.ConfigUtil; import com.sismics.docs.core.util.ConfigUtil;
import com.sismics.docs.core.util.DirectoryUtil;
import com.sismics.docs.core.util.jpa.PaginatedList; import com.sismics.docs.core.util.jpa.PaginatedList;
import com.sismics.docs.core.util.jpa.PaginatedLists; import com.sismics.docs.core.util.jpa.PaginatedLists;
import com.sismics.docs.core.util.jpa.SortCriteria; import com.sismics.docs.core.util.jpa.SortCriteria;
@ -177,12 +182,53 @@ public class AppResource extends BaseResource {
} }
checkBaseFunction(BaseFunction.ADMIN); checkBaseFunction(BaseFunction.ADMIN);
JSONObject response = new JSONObject();
try { try {
AppContext.getInstance().getIndexingService().rebuildIndex(); AppContext.getInstance().getIndexingService().rebuildIndex();
} catch (Exception e) { } catch (Exception e) {
throw new ServerException("IndexingError", "Error rebuilding index", e); throw new ServerException("IndexingError", "Error rebuilding index", e);
} }
JSONObject response = new JSONObject();
response.put("status", "ok");
return Response.ok().entity(response).build();
}
/**
* Destroy and rebuild Lucene index.
*
* @return Response
* @throws JSONException
*/
@POST
@Path("batch/clean_storage")
@Produces(MediaType.APPLICATION_JSON)
public Response batchCleanStorage() throws JSONException {
if (!authenticate()) {
throw new ForbiddenClientException();
}
checkBaseFunction(BaseFunction.ADMIN);
// Get all files
FileDao fileDao = new FileDao();
List<File> fileList = fileDao.findAll();
Map<String, File> fileMap = new HashMap<>();
for (File file : fileList) {
fileMap.put(file.getId(), file);
}
// Check if each stored file is valid
java.io.File[] storedFileList = DirectoryUtil.getStorageDirectory().listFiles();
for (java.io.File storedFile : storedFileList) {
String fileName = storedFile.getName();
if (fileName.endsWith("_thumb")) {
fileName = fileName.replace("_thumb", "");
}
if (!fileMap.containsKey(fileName)) {
storedFile.delete();
}
}
JSONObject response = new JSONObject();
response.put("status", "ok"); response.put("status", "ok");
return Response.ok().entity(response).build(); return Response.ok().entity(response).build();
} }

View File

@ -55,6 +55,12 @@ public class TestAppResource extends BaseJerseyTest {
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken)); appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
response = appResource.post(ClientResponse.class); response = appResource.post(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus())); Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
// Clean storage
appResource = resource().path("/app/batch/clean_storage");
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
response = appResource.post(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
} }
/** /**

View File

@ -284,6 +284,15 @@ public class TestDocumentResource extends BaseJerseyTest {
json = response.getEntity(JSONObject.class); json = response.getEntity(JSONObject.class);
Assert.assertEquals(document1Id, json.getString("id")); Assert.assertEquals(document1Id, json.getString("id"));
// Search documents by query
documentResource = resource().path("/document/list");
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
getParams = new MultivaluedMapImpl();
getParams.putSingle("search", "super");
response = documentResource.queryParams(getParams).get(ClientResponse.class);
json = response.getEntity(JSONObject.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
// Get a document // Get a document
documentResource = resource().path("/document/" + document1Id); documentResource = resource().path("/document/" + document1Id);
documentResource.addFilter(new CookieAuthenticationFilter(document1Token)); documentResource.addFilter(new CookieAuthenticationFilter(document1Token));

View File

@ -2,6 +2,7 @@ package com.sismics.docs.rest;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.InputStream; import java.io.InputStream;
import java.nio.file.Paths;
import javax.ws.rs.core.MediaType; import javax.ws.rs.core.MediaType;
@ -12,6 +13,7 @@ import org.codehaus.jettison.json.JSONObject;
import org.junit.Test; import org.junit.Test;
import com.google.common.io.ByteStreams; import com.google.common.io.ByteStreams;
import com.sismics.docs.core.util.DirectoryUtil;
import com.sismics.docs.rest.filter.CookieAuthenticationFilter; import com.sismics.docs.rest.filter.CookieAuthenticationFilter;
import com.sun.jersey.api.client.ClientResponse; import com.sun.jersey.api.client.ClientResponse;
import com.sun.jersey.api.client.ClientResponse.Status; import com.sun.jersey.api.client.ClientResponse.Status;
@ -145,6 +147,12 @@ public class TestFileResource extends BaseJerseyTest {
json = response.getEntity(JSONObject.class); json = response.getEntity(JSONObject.class);
Assert.assertEquals("ok", json.getString("status")); Assert.assertEquals("ok", json.getString("status"));
// Check that files are deleted from FS
java.io.File thumbnailFile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file1Id + "_thumb").toFile();
java.io.File storedFile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file1Id).toFile();
Assert.assertFalse(thumbnailFile.exists());
Assert.assertFalse(storedFile.exists());
// Get all files from a document // Get all files from a document
fileResource = resource().path("/file/list"); fileResource = resource().path("/file/list");
fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken)); fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken));