Make search for documents faster for large dataset (#698)

This commit is contained in:
Julien Kirch 2023-10-08 22:07:01 +02:00 committed by GitHub
parent ce30b1a6ff
commit a89543b555
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 109 additions and 38 deletions

View File

@ -4,13 +4,16 @@ import com.sismics.docs.core.constant.AuditLogType;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.AuditLogUtil;
import com.sismics.util.context.ThreadLocalContext;
import jakarta.persistence.EntityManager;
import jakarta.persistence.NoResultException;
import jakarta.persistence.Query;
import jakarta.persistence.TypedQuery;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
/**
@ -213,6 +216,24 @@ public class FileDao {
return q.getResultList();
}
/**
* Get files count by documents IDs.
*
* @param documentIds Documents IDs
* @return the number of files per document id
*/
public Map<String, Long> countByDocumentsIds(Iterable<String> documentIds) {
EntityManager em = ThreadLocalContext.get().getEntityManager();
Query q = em.createQuery("select f.documentId, count(*) from File f where f.documentId in :documentIds and f.latestVersion = true and f.deleteDate is null group by (f.documentId)");
q.setParameter("documentIds", documentIds);
Map<String, Long> result = new HashMap<>();
q.getResultList().forEach(o -> {
Object[] resultLine = (Object[]) o;
result.put((String) resultLine[0], (Long) resultLine[1]);
});
return result;
}
/**
* Get all files from a version.
*

View File

@ -26,9 +26,18 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.simple.SimpleQueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
@ -47,7 +56,12 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Timestamp;
import java.util.*;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
/**
* Lucene indexing handler.
@ -242,32 +256,27 @@ public class LuceneIndexingHandler implements IndexingHandler {
StringBuilder sb = new StringBuilder("select distinct d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, d.DOC_LANGUAGE_C c4, d.DOC_IDFILE_C, ");
sb.append(" s.count c5, ");
sb.append(" f.count c6, ");
sb.append(" rs2.RTP_ID_C c7, rs2.RTP_NAME_C, d.DOC_UPDATEDATE_D c8 ");
sb.append(" from T_DOCUMENT d ");
sb.append(" left join (SELECT count(s.SHA_ID_C) count, ac.ACL_SOURCEID_C " +
" FROM T_SHARE s, T_ACL ac " +
" WHERE ac.ACL_TARGETID_C = s.SHA_ID_C AND ac.ACL_DELETEDATE_D IS NULL AND " +
" s.SHA_DELETEDATE_D IS NULL group by ac.ACL_SOURCEID_C) s on s.ACL_SOURCEID_C = d.DOC_ID_C " +
" left join (SELECT count(f.FIL_ID_C) count, f.FIL_IDDOC_C " +
" FROM T_FILE f " +
" WHERE f.FIL_DELETEDATE_D is null group by f.FIL_IDDOC_C) f on f.FIL_IDDOC_C = d.DOC_ID_C ");
" s.SHA_DELETEDATE_D IS NULL group by ac.ACL_SOURCEID_C) s on s.ACL_SOURCEID_C = d.DOC_ID_C ");
sb.append(" left join (select rs.*, rs3.idDocument " +
"from T_ROUTE_STEP rs " +
"join (select r.RTE_IDDOCUMENT_C idDocument, rs.RTP_IDROUTE_C idRoute, min(rs.RTP_ORDER_N) minOrder from T_ROUTE_STEP rs join T_ROUTE r on r.RTE_ID_C = rs.RTP_IDROUTE_C and r.RTE_DELETEDATE_D is null where rs.RTP_DELETEDATE_D is null and rs.RTP_ENDDATE_D is null group by rs.RTP_IDROUTE_C, r.RTE_IDDOCUMENT_C) rs3 on rs.RTP_IDROUTE_C = rs3.idRoute and rs.RTP_ORDER_N = rs3.minOrder " +
"where rs.RTP_IDTARGET_C in (:targetIdList)) rs2 on rs2.idDocument = d.DOC_ID_C ");
// Add search criterias
if (criteria.getTargetIdList() != null) {
if (!SecurityUtil.skipAclCheck(criteria.getTargetIdList())) {
// Read permission is enough for searching
sb.append(" left join T_ACL a on a.ACL_TARGETID_C in (:targetIdList) and a.ACL_SOURCEID_C = d.DOC_ID_C and a.ACL_PERM_C = 'READ' and a.ACL_DELETEDATE_D is null ");
sb.append(" left join T_DOCUMENT_TAG dta on dta.DOT_IDDOCUMENT_C = d.DOC_ID_C and dta.DOT_DELETEDATE_D is null ");
sb.append(" left join T_ACL a2 on a2.ACL_TARGETID_C in (:targetIdList) and a2.ACL_SOURCEID_C = dta.DOT_IDTAG_C and a2.ACL_PERM_C = 'READ' and a2.ACL_DELETEDATE_D is null ");
criteriaList.add("(a.ACL_ID_C is not null or a2.ACL_ID_C is not null)");
}
parameterMap.put("targetIdList", criteria.getTargetIdList());
if (!SecurityUtil.skipAclCheck(criteria.getTargetIdList())) {
// Read permission is enough for searching
sb.append(" left join T_ACL a on a.ACL_TARGETID_C in (:targetIdList) and a.ACL_SOURCEID_C = d.DOC_ID_C and a.ACL_PERM_C = 'READ' and a.ACL_DELETEDATE_D is null ");
sb.append(" left join T_DOCUMENT_TAG dta on dta.DOT_IDDOCUMENT_C = d.DOC_ID_C and dta.DOT_DELETEDATE_D is null ");
sb.append(" left join T_ACL a2 on a2.ACL_TARGETID_C in (:targetIdList) and a2.ACL_SOURCEID_C = dta.DOT_IDTAG_C and a2.ACL_PERM_C = 'READ' and a2.ACL_DELETEDATE_D is null ");
criteriaList.add("(a.ACL_ID_C is not null or a2.ACL_ID_C is not null)");
}
parameterMap.put("targetIdList", criteria.getTargetIdList());
if (!Strings.isNullOrEmpty(criteria.getSearch()) || !Strings.isNullOrEmpty(criteria.getFullSearch())) {
documentSearchMap = search(criteria.getSearch(), criteria.getFullSearch());
if (documentSearchMap.isEmpty()) {
@ -312,7 +321,7 @@ public class LuceneIndexingHandler implements IndexingHandler {
criteriaList.add("(" + Joiner.on(" OR ").join(tagCriteriaList) + ")");
}
}
if (criteria.getExcludedTagIdList() != null && !criteria.getExcludedTagIdList().isEmpty()) {
if (!criteria.getExcludedTagIdList().isEmpty()) {
int index = 0;
for (List<String> tagIdList : criteria.getExcludedTagIdList()) {
List<String> tagCriteriaList = Lists.newArrayList();
@ -367,8 +376,6 @@ public class LuceneIndexingHandler implements IndexingHandler {
documentDto.setFileId((String) o[i++]);
Number shareCount = (Number) o[i++];
documentDto.setShared(shareCount != null && shareCount.intValue() > 0);
Number fileCount = (Number) o[i++];
documentDto.setFileCount(fileCount == null ? 0 : fileCount.intValue());
documentDto.setActiveRoute(o[i++] != null);
documentDto.setCurrentStepName((String) o[i++]);
documentDto.setUpdateTimestamp(((Timestamp) o[i]).getTime());

View File

@ -1 +1 @@
db.version=29
db.version=30

View File

@ -0,0 +1,2 @@
create index IDX_FIL_IDDOC_C ON T_FILE (FIL_IDDOC_C ASC);
update T_CONFIG set CFG_VALUE_C = '30' where CFG_ID_C = 'DB_VERSION';

View File

@ -1,3 +1,3 @@
api.current_version=${project.version}
api.min_version=1.0
db.version=29
db.version=30

View File

@ -7,10 +7,22 @@ import com.sismics.docs.core.constant.AclType;
import com.sismics.docs.core.constant.ConfigType;
import com.sismics.docs.core.constant.Constants;
import com.sismics.docs.core.constant.PermType;
import com.sismics.docs.core.dao.*;
import com.sismics.docs.core.dao.AclDao;
import com.sismics.docs.core.dao.ContributorDao;
import com.sismics.docs.core.dao.DocumentDao;
import com.sismics.docs.core.dao.FileDao;
import com.sismics.docs.core.dao.RelationDao;
import com.sismics.docs.core.dao.RouteStepDao;
import com.sismics.docs.core.dao.TagDao;
import com.sismics.docs.core.dao.UserDao;
import com.sismics.docs.core.dao.criteria.DocumentCriteria;
import com.sismics.docs.core.dao.criteria.TagCriteria;
import com.sismics.docs.core.dao.dto.*;
import com.sismics.docs.core.dao.dto.AclDto;
import com.sismics.docs.core.dao.dto.ContributorDto;
import com.sismics.docs.core.dao.dto.DocumentDto;
import com.sismics.docs.core.dao.dto.RelationDto;
import com.sismics.docs.core.dao.dto.RouteStepDto;
import com.sismics.docs.core.dao.dto.TagDto;
import com.sismics.docs.core.event.DocumentCreatedAsyncEvent;
import com.sismics.docs.core.event.DocumentDeletedAsyncEvent;
import com.sismics.docs.core.event.DocumentUpdatedAsyncEvent;
@ -38,6 +50,21 @@ import com.sismics.util.EmailUtil;
import com.sismics.util.JsonUtil;
import com.sismics.util.context.ThreadLocalContext;
import com.sismics.util.mime.MimeType;
import jakarta.json.Json;
import jakarta.json.JsonArrayBuilder;
import jakarta.json.JsonObjectBuilder;
import jakarta.ws.rs.Consumes;
import jakarta.ws.rs.DELETE;
import jakarta.ws.rs.FormParam;
import jakarta.ws.rs.GET;
import jakarta.ws.rs.NotFoundException;
import jakarta.ws.rs.POST;
import jakarta.ws.rs.PUT;
import jakarta.ws.rs.Path;
import jakarta.ws.rs.PathParam;
import jakarta.ws.rs.QueryParam;
import jakarta.ws.rs.core.Response;
import jakarta.ws.rs.core.StreamingOutput;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.glassfish.jersey.media.multipart.FormDataBodyPart;
@ -48,22 +75,25 @@ import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.DateTimeFormatterBuilder;
import org.joda.time.format.DateTimeParser;
import jakarta.json.Json;
import jakarta.json.JsonArrayBuilder;
import jakarta.json.JsonObjectBuilder;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Session;
import javax.mail.internet.MimeMessage;
import jakarta.ws.rs.*;
import jakarta.ws.rs.core.Response;
import jakarta.ws.rs.core.StreamingOutput;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.text.MessageFormat;
import java.util.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.UUID;
/**
* Document REST resources.
@ -443,11 +473,14 @@ public class DocumentResource extends BaseResource {
}
// Find the files of the documents
Iterable<String> documentsIds = CollectionUtils.collect(paginatedList.getResultList(), DocumentDto::getId);
FileDao fileDao = new FileDao();
List<File> filesList = null;
Map<String, Long> filesCountByDocument = null;
if (Boolean.TRUE == files) {
Iterable<String> documentsIds = CollectionUtils.collect(paginatedList.getResultList(), DocumentDto::getId);
FileDao fileDao = new FileDao();
filesList = fileDao.getByDocumentsIds(documentsIds);
} else {
filesCountByDocument = fileDao.countByDocumentsIds(documentsIds);
}
for (DocumentDto documentDto : paginatedList.getResultList()) {
@ -463,6 +496,16 @@ public class DocumentResource extends BaseResource {
.add("color", tagDto.getColor()));
}
Long filesCount;
Collection<File> filesOfDocument = null;
if (Boolean.TRUE == files) {
// Find files matching the document
filesOfDocument = CollectionUtils.select(filesList, file -> file.getDocumentId().equals(documentDto.getId()));
filesCount = (long) filesOfDocument.size();
} else {
filesCount = filesCountByDocument.getOrDefault(documentDto.getId(), 0L);
}
JsonObjectBuilder documentObjectBuilder = Json.createObjectBuilder()
.add("id", documentDto.getId())
.add("highlight", JsonUtil.nullable(documentDto.getHighlight()))
@ -475,12 +518,10 @@ public class DocumentResource extends BaseResource {
.add("shared", documentDto.getShared())
.add("active_route", documentDto.isActiveRoute())
.add("current_step_name", JsonUtil.nullable(documentDto.getCurrentStepName()))
.add("file_count", documentDto.getFileCount())
.add("file_count", filesCount)
.add("tags", tags);
if (Boolean.TRUE == files) {
JsonArrayBuilder filesArrayBuilder = Json.createArrayBuilder();
// Find files matching the document
Collection<File> filesOfDocument = CollectionUtils.select(filesList, file -> file.getDocumentId().equals(documentDto.getId()));
for (File fileDb : filesOfDocument) {
filesArrayBuilder.add(RestUtil.fileToJsonObjectBuilder(fileDb));
}

View File

@ -1,3 +1,3 @@
api.current_version=${project.version}
api.min_version=1.0
db.version=29
db.version=30