Tweak Lucene analyzer's definition of a whole word

When words were joined by '_' or '.', Lucene treated the whole sequence as
one word. Change this so that Lucene treats them as separate words. The new
analyzer wraps StandardAnalyzer and changes the behavior such that '_'
and '.' are treated as whitespace. A reindex is necessary.

Bug: issue 2822
Change-Id: Ibed25695bf8e60335a2486e5e988a7c67b3da37d
This commit is contained in:
Simon Lei
2014-08-19 13:58:05 -04:00
committed by Dave Borowitz
parent 85a8f993ed
commit 158a024f14
3 changed files with 101 additions and 6 deletions

View File

@@ -58,7 +58,6 @@ import com.google.inject.Provider;
import com.google.inject.assistedinject.Assisted;
import com.google.inject.assistedinject.AssistedInject;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
@@ -122,6 +121,8 @@ public class LuceneChangeIndex implements ChangeIndex {
private static final String ID_FIELD = ChangeField.LEGACY_ID.getName();
private static final ImmutableSet<String> FIELDS = ImmutableSet.of(
ADDED_FIELD, APPROVAL_FIELD, CHANGE_FIELD, DELETED_FIELD, ID_FIELD);
private static final Map<String, String> CUSTOM_CHAR_MAPPING = ImmutableMap.of(
"_", " ", ".", " ");
private static final Map<Schema<ChangeData>, Version> LUCENE_VERSIONS;
static {
@@ -173,8 +174,10 @@ public class LuceneChangeIndex implements ChangeIndex {
private long commitWithinMs;
private GerritIndexWriterConfig(Version version, Config cfg, String name) {
luceneConfig = new IndexWriterConfig(version,
new StandardAnalyzer(version, CharArraySet.EMPTY_SET));
CustomMappingAnalyzer analyzer =
new CustomMappingAnalyzer(new StandardAnalyzer(version,
CharArraySet.EMPTY_SET), CUSTOM_CHAR_MAPPING);
luceneConfig = new IndexWriterConfig(version, analyzer);
luceneConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
double m = 1 << 20;
luceneConfig.setRAMBufferSizeMB(cfg.getLong(
@@ -237,9 +240,9 @@ public class LuceneChangeIndex implements ChangeIndex {
Version luceneVersion = checkNotNull(
LUCENE_VERSIONS.get(schema),
"unknown Lucene version for index schema: %s", schema);
Analyzer analyzer =
new StandardAnalyzer(luceneVersion, CharArraySet.EMPTY_SET);
CustomMappingAnalyzer analyzer =
new CustomMappingAnalyzer(new StandardAnalyzer(luceneVersion,
CharArraySet.EMPTY_SET), CUSTOM_CHAR_MAPPING);
queryBuilder = new QueryBuilder(schema, analyzer);
GerritIndexWriterConfig openConfig =