From 7f19f8c11260424d788700f06c2e975d08857ad9 Mon Sep 17 00:00:00 2001 From: jendib Date: Tue, 1 Mar 2016 01:01:10 +0100 Subject: [PATCH 1/3] #62: Migration to Lucene 5 (without rebuilding old index) --- docs-core/pom.xml | 5 - .../core/dao/lucene/DocsStandardAnalyzer.java | 135 ------------------ .../docs/core/dao/lucene/LuceneDao.java | 35 ++--- .../docs/core/service/IndexingService.java | 10 +- .../sismics/docs/core/util/LuceneUtil.java | 19 +-- docs-parent/pom.xml | 8 +- 6 files changed, 29 insertions(+), 183 deletions(-) delete mode 100644 docs-core/src/main/java/com/sismics/docs/core/dao/lucene/DocsStandardAnalyzer.java diff --git a/docs-core/pom.xml b/docs-core/pom.xml index 6db59fe5..9e12d3fb 100644 --- a/docs-core/pom.xml +++ b/docs-core/pom.xml @@ -93,11 +93,6 @@ lucene-queryparser - - org.apache.lucene - lucene-highlighter - - org.imgscalr imgscalr-lib diff --git a/docs-core/src/main/java/com/sismics/docs/core/dao/lucene/DocsStandardAnalyzer.java b/docs-core/src/main/java/com/sismics/docs/core/dao/lucene/DocsStandardAnalyzer.java deleted file mode 100644 index 753b80fb..00000000 --- a/docs-core/src/main/java/com/sismics/docs/core/dao/lucene/DocsStandardAnalyzer.java +++ /dev/null @@ -1,135 +0,0 @@ -package com.sismics.docs.core.dao.lucene; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; -import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.core.StopAnalyzer; -import org.apache.lucene.analysis.core.StopFilter; -import org.apache.lucene.analysis.standard.ClassicAnalyzer; -import org.apache.lucene.analysis.standard.ClassicTokenizer; -import org.apache.lucene.analysis.standard.StandardFilter; -import org.apache.lucene.analysis.standard.StandardTokenizer; -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.analysis.util.StopwordAnalyzerBase; -import org.apache.lucene.util.Version; - -import java.io.IOException; -import java.io.Reader; - -/** - * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link - * LowerCaseFilter} and {@link StopFilter}, using a list of - * English stop words. - * - * - *

You must specify the required {@link Version} - * compatibility when creating StandardAnalyzer: - *

- */ -public final class DocsStandardAnalyzer extends StopwordAnalyzerBase { - - /** Default maximum allowed token length */ - public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; - - private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; - - /** An unmodifiable set containing some common English words that are usually not - useful for searching. */ - public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; - - /** Builds an analyzer with the given stop words. - * @param matchVersion Lucene version to match See {@link - * above} - * @param stopWords stop words */ - public DocsStandardAnalyzer(Version matchVersion, CharArraySet stopWords) { - super(matchVersion, stopWords); - } - - /** Builds an analyzer with the default stop words ({@link - * #STOP_WORDS_SET}). - * @param matchVersion Lucene version to match See {@link - * above} - */ - public DocsStandardAnalyzer(Version matchVersion) { - this(matchVersion, STOP_WORDS_SET); - } - - /** Builds an analyzer with the stop words from the given reader. - * @see WordlistLoader#getWordSet(Reader, Version) - * @param matchVersion Lucene version to match See {@link - * above} - * @param stopwords Reader to read stop words from */ - public DocsStandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException { - this(matchVersion, loadStopwordSet(stopwords, matchVersion)); - } - - /** - * Set maximum allowed token length. If a token is seen - * that exceeds this length then it is discarded. This - * setting only takes effect the next time tokenStream or - * tokenStream is called. - */ - public void setMaxTokenLength(int length) { - maxTokenLength = length; - } - - /** - * @see #setMaxTokenLength - */ - public int getMaxTokenLength() { - return maxTokenLength; - } - - @Override - protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) { - final StandardTokenizer src = new StandardTokenizer(matchVersion, reader); - src.setMaxTokenLength(maxTokenLength); - TokenStream tok = new StandardFilter(matchVersion, src); - tok = new LowerCaseFilter(matchVersion, tok); - tok = new StopFilter(matchVersion, tok, stopwords); - return new TokenStreamComponents(src, tok) { - @Override - protected void setReader(final Reader reader) throws IOException { - src.setMaxTokenLength(DocsStandardAnalyzer.this.maxTokenLength); - super.setReader(reader); - } - }; - } - - @Override - protected Reader initReader(String fieldName, Reader reader) { - if (fieldName.equals("title") || fieldName.equals("description")) { - return new HTMLStripCharFilter(super.initReader(fieldName, reader)); - } - return super.initReader(fieldName, reader); - } -} diff --git a/docs-core/src/main/java/com/sismics/docs/core/dao/lucene/LuceneDao.java b/docs-core/src/main/java/com/sismics/docs/core/dao/lucene/LuceneDao.java index a96f807a..e144a559 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/dao/lucene/LuceneDao.java +++ b/docs-core/src/main/java/com/sismics/docs/core/dao/lucene/LuceneDao.java @@ -6,6 +6,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; @@ -19,7 +20,6 @@ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.util.Version; import com.sismics.docs.core.model.context.AppContext; import com.sismics.docs.core.model.jpa.Document; @@ -152,22 +152,23 @@ public class LuceneDao { fullSearchQuery = "\"" + QueryParserUtil.escape(fullSearchQuery) + "\""; // Build search query - StandardQueryParser qpHelper = new StandardQueryParser(new DocsStandardAnalyzer(Version.LUCENE_42)); + StandardQueryParser qpHelper = new StandardQueryParser(new StandardAnalyzer()); qpHelper.setPhraseSlop(100000); // PhraseQuery add terms // Search on documents and files - BooleanQuery query = new BooleanQuery(); - query.add(qpHelper.parse(searchQuery, "title"), Occur.SHOULD); - query.add(qpHelper.parse(searchQuery, "description"), Occur.SHOULD); - query.add(qpHelper.parse(searchQuery, "subject"), Occur.SHOULD); - query.add(qpHelper.parse(searchQuery, "identifier"), Occur.SHOULD); - query.add(qpHelper.parse(searchQuery, "publisher"), Occur.SHOULD); - query.add(qpHelper.parse(searchQuery, "format"), Occur.SHOULD); - query.add(qpHelper.parse(searchQuery, "source"), Occur.SHOULD); - query.add(qpHelper.parse(searchQuery, "type"), Occur.SHOULD); - query.add(qpHelper.parse(searchQuery, "coverage"), Occur.SHOULD); - query.add(qpHelper.parse(searchQuery, "rights"), Occur.SHOULD); - query.add(qpHelper.parse(fullSearchQuery, "content"), Occur.SHOULD); + BooleanQuery query = new BooleanQuery.Builder() + .add(qpHelper.parse(searchQuery, "title"), Occur.SHOULD) + .add(qpHelper.parse(searchQuery, "description"), Occur.SHOULD) + .add(qpHelper.parse(searchQuery, "subject"), Occur.SHOULD) + .add(qpHelper.parse(searchQuery, "identifier"), Occur.SHOULD) + .add(qpHelper.parse(searchQuery, "publisher"), Occur.SHOULD) + .add(qpHelper.parse(searchQuery, "format"), Occur.SHOULD) + .add(qpHelper.parse(searchQuery, "source"), Occur.SHOULD) + .add(qpHelper.parse(searchQuery, "type"), Occur.SHOULD) + .add(qpHelper.parse(searchQuery, "coverage"), Occur.SHOULD) + .add(qpHelper.parse(searchQuery, "rights"), Occur.SHOULD) + .add(qpHelper.parse(fullSearchQuery, "content"), Occur.SHOULD) + .build(); // Search DirectoryReader directoryReader = AppContext.getInstance().getIndexingService().getDirectoryReader(); @@ -183,7 +184,7 @@ public class LuceneDao { // Extract document IDs for (int i = 0; i < docs.length; i++) { org.apache.lucene.document.Document document = searcher.doc(docs[i].doc); - String type = document.get("type"); + String type = document.get("doctype"); String documentId = null; if (type.equals("document")) { documentId = document.get("id"); @@ -205,7 +206,7 @@ public class LuceneDao { private org.apache.lucene.document.Document getDocumentFromDocument(Document document) { org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document(); luceneDocument.add(new StringField("id", document.getId(), Field.Store.YES)); - luceneDocument.add(new StringField("type", "document", Field.Store.YES)); + luceneDocument.add(new StringField("doctype", "document", Field.Store.YES)); luceneDocument.add(new TextField("title", document.getTitle(), Field.Store.NO)); if (document.getDescription() != null) { luceneDocument.add(new TextField("description", document.getDescription(), Field.Store.NO)); @@ -248,7 +249,7 @@ public class LuceneDao { private org.apache.lucene.document.Document getDocumentFromFile(File file, Document document) { org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document(); luceneDocument.add(new StringField("id", file.getId(), Field.Store.YES)); - luceneDocument.add(new StringField("type", "file", Field.Store.YES)); + luceneDocument.add(new StringField("doctype", "file", Field.Store.YES)); luceneDocument.add(new StringField("document_id", file.getDocumentId(), Field.Store.YES)); if (file.getContent() != null) { luceneDocument.add(new TextField("content", file.getContent(), Field.Store.NO)); diff --git a/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java b/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java index 3efd055b..0ecf95b2 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java +++ b/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java @@ -8,7 +8,7 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.SimpleFSDirectory; -import org.apache.lucene.store.SimpleFSLockFactory; +import org.apache.lucene.store.SingleInstanceLockFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -59,7 +59,7 @@ public class IndexingService extends AbstractScheduledService { Path luceneDirectory = DirectoryUtil.getLuceneDirectory(); log.info("Using file Lucene storage: {}", luceneDirectory); try { - directory = new SimpleFSDirectory(luceneDirectory.toFile(), new SimpleFSLockFactory()); + directory = new SimpleFSDirectory(luceneDirectory, new SingleInstanceLockFactory()); } catch (IOException e) { log.error("Error initializing Lucene index", e); } @@ -127,10 +127,10 @@ public class IndexingService extends AbstractScheduledService { */ public DirectoryReader getDirectoryReader() { if (directoryReader == null) { - if (!DirectoryReader.indexExists(directory)) { - return null; - } try { + if (!DirectoryReader.indexExists(directory)) { + return null; + } directoryReader = DirectoryReader.open(directory); } catch (IOException e) { log.error("Error creating the directory reader", e); diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/LuceneUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/LuceneUtil.java index 0aba6e34..9fcb977f 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/LuceneUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/LuceneUtil.java @@ -1,16 +1,16 @@ package com.sismics.docs.core.util; -import com.sismics.docs.core.dao.lucene.DocsStandardAnalyzer; -import com.sismics.docs.core.model.context.AppContext; +import java.io.IOException; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; +import com.sismics.docs.core.model.context.AppContext; /** * Lucene utils. @@ -31,7 +31,7 @@ public class LuceneUtil { */ public static void handle(LuceneRunnable runnable) { // Standard analyzer - IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42, new DocsStandardAnalyzer(Version.LUCENE_42)); + IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer()); // Merge sequentially, because Lucene writing is already done asynchronously config.setMergeScheduler(new SerialMergeScheduler()); @@ -45,15 +45,6 @@ public class LuceneUtil { log.error("Cannot create IndexWriter", e); } - // Unlock index if needed - try { - if (IndexWriter.isLocked(directory)) { - IndexWriter.unlock(directory); - } - } catch (IOException e) { - log.error("Cannot unlock Lucene directory", e); - } - try { runnable.run(indexWriter); } catch (Exception e) { diff --git a/docs-parent/pom.xml b/docs-parent/pom.xml index 30badbc7..54ee0aa2 100644 --- a/docs-parent/pom.xml +++ b/docs-parent/pom.xml @@ -28,7 +28,7 @@ 1.4.191 2.22.1 0.3m - 4.2.0 + 5.5.0 4.2 2.0.0-RC3 1.54 @@ -339,12 +339,6 @@ ${org.apache.lucene.version}
- - org.apache.lucene - lucene-highlighter - ${org.apache.lucene.version} - - org.imgscalr imgscalr-lib From a7a6adfa3423f5d3645152908f7ea2eae1626d53 Mon Sep 17 00:00:00 2001 From: jendib Date: Tue, 1 Mar 2016 01:24:26 +0100 Subject: [PATCH 2/3] #62: Rebuild index if too old or corrupted --- .../docs/core/service/IndexingService.java | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java b/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java index 0ecf95b2..535fc26e 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java +++ b/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java @@ -4,11 +4,15 @@ import java.io.IOException; import java.nio.file.Path; import java.util.concurrent.TimeUnit; +import org.apache.lucene.index.CheckIndex; +import org.apache.lucene.index.CheckIndex.Status; +import org.apache.lucene.index.CheckIndex.Status.SegmentInfoStatus; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.store.SingleInstanceLockFactory; +import org.apache.lucene.util.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -64,6 +68,26 @@ public class IndexingService extends AbstractScheduledService { log.error("Error initializing Lucene index", e); } } + + // Check index version and rebuild it if necessary + log.info("Checking index health and version"); + try (CheckIndex checkIndex = new CheckIndex(directory)) { + Status status = checkIndex.checkIndex(); + if (status.clean) { + for (SegmentInfoStatus segmentInfo : status.segmentInfos) { + if (!segmentInfo.version.onOrAfter(Version.LATEST)) { + RebuildIndexAsyncEvent rebuildIndexAsyncEvent = new RebuildIndexAsyncEvent(); + AppContext.getInstance().getAsyncEventBus().post(rebuildIndexAsyncEvent); + break; + } + } + } else { + RebuildIndexAsyncEvent rebuildIndexAsyncEvent = new RebuildIndexAsyncEvent(); + AppContext.getInstance().getAsyncEventBus().post(rebuildIndexAsyncEvent); + } + } catch (IOException e) { + log.error("Error checking index", e); + } } @Override From 59682b5ba663cbc094279a3a18c76ecac2732d7b Mon Sep 17 00:00:00 2001 From: jendib Date: Tue, 1 Mar 2016 23:52:15 +0100 Subject: [PATCH 3/3] Closes #62: logs for index checking, explicit commit on close --- docs-core/pom.xml | 6 ++++ .../docs/core/service/IndexingService.java | 32 +++++++++++-------- .../sismics/docs/core/util/LuceneUtil.java | 5 ++- docs-parent/pom.xml | 6 ++++ 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/docs-core/pom.xml b/docs-core/pom.xml index 9e12d3fb..ed0673dc 100644 --- a/docs-core/pom.xml +++ b/docs-core/pom.xml @@ -93,6 +93,12 @@ lucene-queryparser + + + org.apache.lucene + lucene-backward-codecs + + org.imgscalr imgscalr-lib diff --git a/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java b/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java index 535fc26e..7f7fbeed 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java +++ b/docs-core/src/main/java/com/sismics/docs/core/service/IndexingService.java @@ -9,9 +9,9 @@ import org.apache.lucene.index.CheckIndex.Status; import org.apache.lucene.index.CheckIndex.Status.SegmentInfoStatus; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.NoLockFactory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.SimpleFSDirectory; -import org.apache.lucene.store.SingleInstanceLockFactory; import org.apache.lucene.util.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -63,29 +63,35 @@ public class IndexingService extends AbstractScheduledService { Path luceneDirectory = DirectoryUtil.getLuceneDirectory(); log.info("Using file Lucene storage: {}", luceneDirectory); try { - directory = new SimpleFSDirectory(luceneDirectory, new SingleInstanceLockFactory()); + directory = new SimpleFSDirectory(luceneDirectory, NoLockFactory.INSTANCE); } catch (IOException e) { log.error("Error initializing Lucene index", e); } } // Check index version and rebuild it if necessary - log.info("Checking index health and version"); - try (CheckIndex checkIndex = new CheckIndex(directory)) { - Status status = checkIndex.checkIndex(); - if (status.clean) { - for (SegmentInfoStatus segmentInfo : status.segmentInfos) { - if (!segmentInfo.version.onOrAfter(Version.LATEST)) { + try { + if (DirectoryReader.indexExists(directory)) { + log.info("Checking index health and version"); + try (CheckIndex checkIndex = new CheckIndex(directory)) { + Status status = checkIndex.checkIndex(); + if (status.clean) { + for (SegmentInfoStatus segmentInfo : status.segmentInfos) { + if (!segmentInfo.version.onOrAfter(Version.LATEST)) { + log.info("Index is old (" + segmentInfo.version + "), rebuilding"); + RebuildIndexAsyncEvent rebuildIndexAsyncEvent = new RebuildIndexAsyncEvent(); + AppContext.getInstance().getAsyncEventBus().post(rebuildIndexAsyncEvent); + break; + } + } + } else { + log.info("Index is dirty, rebuilding"); RebuildIndexAsyncEvent rebuildIndexAsyncEvent = new RebuildIndexAsyncEvent(); AppContext.getInstance().getAsyncEventBus().post(rebuildIndexAsyncEvent); - break; } } - } else { - RebuildIndexAsyncEvent rebuildIndexAsyncEvent = new RebuildIndexAsyncEvent(); - AppContext.getInstance().getAsyncEventBus().post(rebuildIndexAsyncEvent); } - } catch (IOException e) { + } catch (Exception e) { log.error("Error checking index", e); } } diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/LuceneUtil.java b/docs-core/src/main/java/com/sismics/docs/core/util/LuceneUtil.java index 9fcb977f..2754c533 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/LuceneUtil.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/LuceneUtil.java @@ -33,6 +33,9 @@ public class LuceneUtil { // Standard analyzer IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer()); + // Automatically commit when closing this writer + config.setCommitOnClose(true); + // Merge sequentially, because Lucene writing is already done asynchronously config.setMergeScheduler(new SerialMergeScheduler()); @@ -59,7 +62,7 @@ public class LuceneUtil { try { indexWriter.close(); } catch (IOException e) { - log.error("Cannot close IndexWriter", e); + log.error("Cannot commit and close IndexWriter", e); } } diff --git a/docs-parent/pom.xml b/docs-parent/pom.xml index 54ee0aa2..c90ce97c 100644 --- a/docs-parent/pom.xml +++ b/docs-parent/pom.xml @@ -339,6 +339,12 @@ ${org.apache.lucene.version} + + org.apache.lucene + lucene-backward-codecs + ${org.apache.lucene.version} + + org.imgscalr imgscalr-lib