Merge pull request #74 from sismics/lucene5

Migration to Lucene 5
Benjamin Gamard 2016-03-01 23:54:17 +01:00
commit 25a2144b31
6 changed files with 66 additions and 175 deletions

View File

@@ -93,9 +93,10 @@
<artifactId>lucene-queryparser</artifactId>
</dependency>
<!-- Only there to read old indexes and rebuild them -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<artifactId>lucene-backward-codecs</artifactId>
</dependency>
<dependency>

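A note on this swap: lucene-highlighter is dropped, and lucene-backward-codecs takes its place. The backward-codecs artifact ships the 4.x codecs so that a Lucene 5 runtime can still open segments written by the old index format, which the startup check in IndexingService below relies on. A minimal sketch of what the dependency enables (the "lucene" path is made up for illustration):

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NoLockFactory;
import org.apache.lucene.store.SimpleFSDirectory;

// Sketch: with lucene-backward-codecs on the classpath, a 5.x reader
// can still open an index written by Lucene 4.x.
public class OldIndexProbe {
    public static void main(String[] args) throws Exception {
        Directory dir = new SimpleFSDirectory(Paths.get("lucene"), NoLockFactory.INSTANCE);
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            System.out.println("Readable segments: " + reader.leaves().size());
        }
    }
}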
View File

@@ -1,135 +0,0 @@
package com.sismics.docs.core.dao.lucene;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
/**
* Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
* LowerCaseFilter} and {@link StopFilter}, using a list of
* English stop words.
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating StandardAnalyzer:
* <ul>
* <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
* from their combining characters. If you use a previous version number,
* you get the exact broken behavior for backwards compatibility.
* <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
* and StopFilter correctly handles Unicode 4.0 supplementary characters
* in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
* are the pre-3.1 implementations of StandardTokenizer and
* StandardAnalyzer.
* <li> As of 2.9, StopFilter preserves position increments
* <li> As of 2.4, Tokens incorrectly identified as acronyms
* are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
* </ul>
*/
public final class DocsStandardAnalyzer extends StopwordAnalyzerBase {
/** Default maximum allowed token length */
public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
/** An unmodifiable set containing some common English words that are usually not
useful for searching. */
public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
/** Builds an analyzer with the given stop words.
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopWords stop words */
public DocsStandardAnalyzer(Version matchVersion, CharArraySet stopWords) {
super(matchVersion, stopWords);
}
/** Builds an analyzer with the default stop words ({@link
* #STOP_WORDS_SET}).
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
*/
public DocsStandardAnalyzer(Version matchVersion) {
this(matchVersion, STOP_WORDS_SET);
}
/** Builds an analyzer with the stop words from the given reader.
* @see WordlistLoader#getWordSet(Reader, Version)
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopwords Reader to read stop words from */
public DocsStandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/**
* Set maximum allowed token length. If a token is seen
* that exceeds this length then it is discarded. This
* setting only takes effect the next time tokenStream is called.
*/
public void setMaxTokenLength(int length) {
maxTokenLength = length;
}
/**
* @see #setMaxTokenLength
*/
public int getMaxTokenLength() {
return maxTokenLength;
}
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
src.setMaxTokenLength(maxTokenLength);
TokenStream tok = new StandardFilter(matchVersion, src);
tok = new LowerCaseFilter(matchVersion, tok);
tok = new StopFilter(matchVersion, tok, stopwords);
return new TokenStreamComponents(src, tok) {
@Override
protected void setReader(final Reader reader) throws IOException {
src.setMaxTokenLength(DocsStandardAnalyzer.this.maxTokenLength);
super.setReader(reader);
}
};
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
if (fieldName.equals("title") || fieldName.equals("description")) {
return new HTMLStripCharFilter(super.initReader(fieldName, reader));
}
return super.initReader(fieldName, reader);
}
}

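The whole DocsStandardAnalyzer above is deleted: it was a fork of Lucene's StandardAnalyzer whose only real addition was stripping HTML from the title and description fields via initReader, and the commit replaces it with the stock StandardAnalyzer, so markup in those fields is now indexed as-is. Should the stripping ever be wanted back, the Lucene 5 equivalent is much shorter, because createComponents no longer takes a Reader or a Version. A sketch, not part of this commit:

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

// Sketch of a Lucene 5 analyzer reproducing the deleted behavior.
public final class HtmlStrippingAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        StandardTokenizer src = new StandardTokenizer();
        TokenStream tok = new StandardFilter(src);
        tok = new LowerCaseFilter(tok);
        tok = new StopFilter(tok, StandardAnalyzer.STOP_WORDS_SET);
        return new TokenStreamComponents(src, tok);
    }

    @Override
    protected Reader initReader(String fieldName, Reader reader) {
        // Strip HTML markup from the rich-text fields only
        if ("title".equals(fieldName) || "description".equals(fieldName)) {
            return new HTMLStripCharFilter(super.initReader(fieldName, reader));
        }
        return super.initReader(fieldName, reader);
    }
}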
View File

@@ -6,6 +6,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
@@ -19,7 +20,6 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Version;
import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.model.jpa.Document;
@@ -152,22 +152,23 @@ public class LuceneDao {
fullSearchQuery = "\"" + QueryParserUtil.escape(fullSearchQuery) + "\"";
// Build search query
StandardQueryParser qpHelper = new StandardQueryParser(new DocsStandardAnalyzer(Version.LUCENE_42));
StandardQueryParser qpHelper = new StandardQueryParser(new StandardAnalyzer());
qpHelper.setPhraseSlop(100000); // Huge slop: let phrase query terms appear far apart
// Search on documents and files
BooleanQuery query = new BooleanQuery();
query.add(qpHelper.parse(searchQuery, "title"), Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "description"), Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "subject"), Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "identifier"), Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "publisher"), Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "format"), Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "source"), Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "type"), Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "coverage"), Occur.SHOULD);
query.add(qpHelper.parse(searchQuery, "rights"), Occur.SHOULD);
query.add(qpHelper.parse(fullSearchQuery, "content"), Occur.SHOULD);
BooleanQuery query = new BooleanQuery.Builder()
.add(qpHelper.parse(searchQuery, "title"), Occur.SHOULD)
.add(qpHelper.parse(searchQuery, "description"), Occur.SHOULD)
.add(qpHelper.parse(searchQuery, "subject"), Occur.SHOULD)
.add(qpHelper.parse(searchQuery, "identifier"), Occur.SHOULD)
.add(qpHelper.parse(searchQuery, "publisher"), Occur.SHOULD)
.add(qpHelper.parse(searchQuery, "format"), Occur.SHOULD)
.add(qpHelper.parse(searchQuery, "source"), Occur.SHOULD)
.add(qpHelper.parse(searchQuery, "type"), Occur.SHOULD)
.add(qpHelper.parse(searchQuery, "coverage"), Occur.SHOULD)
.add(qpHelper.parse(searchQuery, "rights"), Occur.SHOULD)
.add(qpHelper.parse(fullSearchQuery, "content"), Occur.SHOULD)
.build();
// Search
DirectoryReader directoryReader = AppContext.getInstance().getIndexingService().getDirectoryReader();
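
BooleanQuery became immutable in Lucene 5, so the clause-by-clause add() calls are rewritten as a BooleanQuery.Builder chain ending in build(); the clauses themselves are unchanged, and the flexible StandardQueryParser now simply takes the stock StandardAnalyzer. A compact sketch of the pattern (the field names and two-clause shape are illustrative only):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;

public class QueryBuildSketch {
    public static Query build(String userQuery) throws QueryNodeException {
        StandardQueryParser parser = new StandardQueryParser(new StandardAnalyzer());
        // Each clause parses the user input against one field; any may match.
        return new BooleanQuery.Builder()
                .add(parser.parse(userQuery, "title"), Occur.SHOULD)
                .add(parser.parse(userQuery, "content"), Occur.SHOULD)
                .build();
    }
}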
@@ -183,7 +184,7 @@ public class LuceneDao {
// Extract document IDs
for (int i = 0; i < docs.length; i++) {
org.apache.lucene.document.Document document = searcher.doc(docs[i].doc);
String type = document.get("type");
String type = document.get("doctype");
String documentId = null;
if (type.equals("document")) {
documentId = document.get("id");
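
The internal field distinguishing documents from files is renamed from type to doctype, presumably because type is also one of the Dublin Core metadata fields parsed into the search query above; the rename keeps user searches on type from colliding with the internal marker. The renamed field could also filter result kinds, e.g. (a hypothetical use, not in this commit):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class DoctypeFilterSketch {
    // Hypothetical: restrict a search to document entries only.
    // Occur.FILTER (Lucene >= 5.1) requires a match but does not score it.
    public static Query onlyDocuments(Query query) {
        return new BooleanQuery.Builder()
                .add(query, Occur.MUST)
                .add(new TermQuery(new Term("doctype", "document")), Occur.FILTER)
                .build();
    }
}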
@@ -205,7 +206,7 @@
private org.apache.lucene.document.Document getDocumentFromDocument(Document document) {
org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document();
luceneDocument.add(new StringField("id", document.getId(), Field.Store.YES));
luceneDocument.add(new StringField("type", "document", Field.Store.YES));
luceneDocument.add(new StringField("doctype", "document", Field.Store.YES));
luceneDocument.add(new TextField("title", document.getTitle(), Field.Store.NO));
if (document.getDescription() != null) {
luceneDocument.add(new TextField("description", document.getDescription(), Field.Store.NO));
@@ -248,7 +249,7 @@
private org.apache.lucene.document.Document getDocumentFromFile(File file, Document document) {
org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document();
luceneDocument.add(new StringField("id", file.getId(), Field.Store.YES));
luceneDocument.add(new StringField("type", "file", Field.Store.YES));
luceneDocument.add(new StringField("doctype", "file", Field.Store.YES));
luceneDocument.add(new StringField("document_id", file.getDocumentId(), Field.Store.YES));
if (file.getContent() != null) {
luceneDocument.add(new TextField("content", file.getContent(), Field.Store.NO));

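Elsewhere in these two indexing methods nothing changes but the field name: StringField values (id, doctype, document_id) are indexed as single untokenized tokens for exact lookup, while TextField values (title, description, content) pass through the analyzer, and Field.Store.YES is kept only for the identifiers that must be read back from search hits.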
View File

@@ -4,11 +4,15 @@ import java.io.IOException;
import java.nio.file.Path;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.CheckIndex.Status;
import org.apache.lucene.index.CheckIndex.Status.SegmentInfoStatus;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NoLockFactory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.store.SimpleFSLockFactory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -59,11 +63,37 @@ public class IndexingService extends AbstractScheduledService {
Path luceneDirectory = DirectoryUtil.getLuceneDirectory();
log.info("Using file Lucene storage: {}", luceneDirectory);
try {
directory = new SimpleFSDirectory(luceneDirectory.toFile(), new SimpleFSLockFactory());
directory = new SimpleFSDirectory(luceneDirectory, NoLockFactory.INSTANCE);
} catch (IOException e) {
log.error("Error initializing Lucene index", e);
}
}
// Check index version and rebuild it if necessary
try {
if (DirectoryReader.indexExists(directory)) {
log.info("Checking index health and version");
try (CheckIndex checkIndex = new CheckIndex(directory)) {
Status status = checkIndex.checkIndex();
if (status.clean) {
for (SegmentInfoStatus segmentInfo : status.segmentInfos) {
if (!segmentInfo.version.onOrAfter(Version.LATEST)) {
log.info("Index is old (" + segmentInfo.version + "), rebuilding");
RebuildIndexAsyncEvent rebuildIndexAsyncEvent = new RebuildIndexAsyncEvent();
AppContext.getInstance().getAsyncEventBus().post(rebuildIndexAsyncEvent);
break;
}
}
} else {
log.info("Index is dirty, rebuilding");
RebuildIndexAsyncEvent rebuildIndexAsyncEvent = new RebuildIndexAsyncEvent();
AppContext.getInstance().getAsyncEventBus().post(rebuildIndexAsyncEvent);
}
}
}
} catch (Exception e) {
log.error("Error checking index", e);
}
}
@Override
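
Two things happen in this startup path. The directory now opens with NoLockFactory instead of SimpleFSLockFactory, so no file lock is ever taken; that is safe only as long as a single JVM writes the index (the removal of the unlock workaround in LuceneUtil below relies on the same assumption). Then the index is health-checked: CheckIndex inspects every segment, and either a dirty index or any segment written by a Lucene version older than Version.LATEST (5.5.0 here) posts a RebuildIndexAsyncEvent. This check is what lucene-backward-codecs is for: without it, a 4.x index could not even be read to discover that it is old.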
@@ -127,10 +157,10 @@ public class IndexingService extends AbstractScheduledService {
*/
public DirectoryReader getDirectoryReader() {
if (directoryReader == null) {
if (!DirectoryReader.indexExists(directory)) {
return null;
}
try {
if (!DirectoryReader.indexExists(directory)) {
return null;
}
directoryReader = DirectoryReader.open(directory);
} catch (IOException e) {
log.error("Error creating the directory reader", e);

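The indexExists test moves inside the try block because DirectoryReader.indexExists throws IOException as of Lucene 5; in 4.x it swallowed the error and returned false.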
View File

@@ -1,16 +1,16 @@
package com.sismics.docs.core.util;
import com.sismics.docs.core.dao.lucene.DocsStandardAnalyzer;
import com.sismics.docs.core.model.context.AppContext;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import com.sismics.docs.core.model.context.AppContext;
/**
* Lucene utils.
@@ -31,7 +31,10 @@ public class LuceneUtil {
*/
public static void handle(LuceneRunnable runnable) {
// Standard analyzer
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42, new DocsStandardAnalyzer(Version.LUCENE_42));
IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
// Automatically commit when closing this writer
config.setCommitOnClose(true);
// Merge sequentially, because Lucene writing is already done asynchronously
config.setMergeScheduler(new SerialMergeScheduler());
@@ -45,15 +48,6 @@ public class LuceneUtil {
log.error("Cannot create IndexWriter", e);
}
// Unlock index if needed
try {
if (IndexWriter.isLocked(directory)) {
IndexWriter.unlock(directory);
}
} catch (IOException e) {
log.error("Cannot unlock Lucene directory", e);
}
try {
runnable.run(indexWriter);
} catch (Exception e) {
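
The unlock workaround goes away because IndexWriter.isLocked and IndexWriter.unlock no longer exist in Lucene 5; lock handling is now the job of the Directory's LockFactory, which is why IndexingService switched to NoLockFactory. With commit-on-close (the Lucene 5 default, set explicitly here) the writer's close() also commits pending changes, hence the reworded log message below. A sketch of the resulting lifecycle, assuming a single writing JVM:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;

// Sketch: the Lucene 5 writer lifecycle with no manual lock handling.
public class WriterLifecycleSketch {
    public static void withWriter(Directory directory) throws Exception {
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        config.setCommitOnClose(true); // the 5.x default, restated for clarity
        try (IndexWriter writer = new IndexWriter(directory, config)) {
            // add, update or delete documents here; close() commits them
        }
    }
}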
@@ -68,7 +62,7 @@
try {
indexWriter.close();
} catch (IOException e) {
log.error("Cannot close IndexWriter", e);
log.error("Cannot commit and close IndexWriter", e);
}
}

View File

@@ -28,7 +28,7 @@
<com.h2database.h2.version>1.4.191</com.h2database.h2.version>
<org.glassfish.jersey.version>2.22.1</org.glassfish.jersey.version>
<org.mindrot.jbcrypt>0.3m</org.mindrot.jbcrypt>
<org.apache.lucene.version>4.2.0</org.apache.lucene.version>
<org.apache.lucene.version>5.5.0</org.apache.lucene.version>
<org.imgscalr.imgscalr-lib.version>4.2</org.imgscalr.imgscalr-lib.version>
<org.apache.pdfbox.pdfbox.version>2.0.0-RC3</org.apache.pdfbox.pdfbox.version>
<org.bouncycastle.bcprov-jdk15on.version>1.54</org.bouncycastle.bcprov-jdk15on.version>
@@ -341,7 +341,7 @@
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<artifactId>lucene-backward-codecs</artifactId>
<version>${org.apache.lucene.version}</version>
</dependency>
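
Both Lucene artifacts resolve through the shared ${org.apache.lucene.version} property, now bumped to 5.5.0, which keeps lucene-backward-codecs in lockstep with lucene-core; Lucene modules are expected to share a single version on one classpath.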