Tweak Lucene analyzer's definition of a whole word

When words were linked with '_' or '.', Lucene would treat them as one
word. Change it so that Lucene treats them as separate words. The new
analyzer wraps StandardAnalyzer and changes the behavior such that '_'
and '.' are treated as whitespace. A reindex is necessary.

Bug: issue 2822
Change-Id: Ibed25695bf8e60335a2486e5e988a7c67b3da37d
This commit is contained in:
Simon Lei
2014-08-19 13:58:05 -04:00
committed by Dave Borowitz
parent 85a8f993ed
commit 158a024f14
3 changed files with 101 additions and 6 deletions

View File

@@ -0,0 +1,65 @@
// Copyright (C) 2014 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.gerrit.lucene;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import java.io.Reader;
import java.util.Map;
/**
 * An analyzer wrapper that applies custom char mappings to the input reader
 * handed to the wrapped analyzer.
 *
 * <p>The mappings are compiled into a single {@link NormalizeCharMap} when the
 * analyzer is constructed, so changes made to the supplied map afterwards have
 * no effect on this analyzer.
 *
 * <p>Example usage:
 *
 * <pre class="prettyprint">
 * {@code
 * Map<String,String> customMapping = new HashMap<>();
 * customMapping.put("_", " ");
 * customMapping.put(".", " ");
 *
 * CustomMappingAnalyzer analyzer =
 *     new CustomMappingAnalyzer(new StandardAnalyzer(version), customMapping);
 * }
 * </pre>
 */
public class CustomMappingAnalyzer extends AnalyzerWrapper {
  private final Analyzer delegate;
  private final NormalizeCharMap charMap;

  /**
   * @param delegate analyzer that performs the actual analysis; its reuse
   *     strategy is also used for this wrapper.
   * @param customMappings match string to replacement string pairs. Every
   *     entry is passed to {@link NormalizeCharMap.Builder#add} here, so a
   *     mapping that the builder rejects fails at construction time rather
   *     than on first use.
   */
  public CustomMappingAnalyzer(Analyzer delegate,
      Map<String, String> customMappings) {
    super(delegate.getReuseStrategy());
    this.delegate = delegate;

    // Build the char map once; it does not depend on the field or the reader,
    // so there is no need to rebuild it for every wrapReader call.
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    for (Map.Entry<String, String> e : customMappings.entrySet()) {
      builder.add(e.getKey(), e.getValue());
    }
    this.charMap = builder.build();
  }

  /** Returns the same delegate for every field. */
  @Override
  protected Analyzer getWrappedAnalyzer(String fieldName) {
    return delegate;
  }

  /**
   * Wraps {@code reader} in a {@link MappingCharFilter} that applies the
   * custom mappings; {@code fieldName} is not consulted.
   */
  @Override
  protected Reader wrapReader(String fieldName, Reader reader) {
    return new MappingCharFilter(charMap, reader);
  }
}

View File

@@ -58,7 +58,6 @@ import com.google.inject.Provider;
import com.google.inject.assistedinject.Assisted;
import com.google.inject.assistedinject.AssistedInject;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
@@ -122,6 +121,8 @@ public class LuceneChangeIndex implements ChangeIndex {
private static final String ID_FIELD = ChangeField.LEGACY_ID.getName();
private static final ImmutableSet<String> FIELDS = ImmutableSet.of(
ADDED_FIELD, APPROVAL_FIELD, CHANGE_FIELD, DELETED_FIELD, ID_FIELD);
private static final Map<String, String> CUSTOM_CHAR_MAPPING = ImmutableMap.of(
"_", " ", ".", " ");
private static final Map<Schema<ChangeData>, Version> LUCENE_VERSIONS;
static {
@@ -173,8 +174,10 @@ public class LuceneChangeIndex implements ChangeIndex {
private long commitWithinMs;
private GerritIndexWriterConfig(Version version, Config cfg, String name) {
luceneConfig = new IndexWriterConfig(version,
new StandardAnalyzer(version, CharArraySet.EMPTY_SET));
CustomMappingAnalyzer analyzer =
new CustomMappingAnalyzer(new StandardAnalyzer(version,
CharArraySet.EMPTY_SET), CUSTOM_CHAR_MAPPING);
luceneConfig = new IndexWriterConfig(version, analyzer);
luceneConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
double m = 1 << 20;
luceneConfig.setRAMBufferSizeMB(cfg.getLong(
@@ -237,9 +240,9 @@ public class LuceneChangeIndex implements ChangeIndex {
Version luceneVersion = checkNotNull(
LUCENE_VERSIONS.get(schema),
"unknown Lucene version for index schema: %s", schema);
Analyzer analyzer =
new StandardAnalyzer(luceneVersion, CharArraySet.EMPTY_SET);
CustomMappingAnalyzer analyzer =
new CustomMappingAnalyzer(new StandardAnalyzer(luceneVersion,
CharArraySet.EMPTY_SET), CUSTOM_CHAR_MAPPING);
queryBuilder = new QueryBuilder(schema, analyzer);
GerritIndexWriterConfig openConfig =

View File

@@ -14,12 +14,39 @@
package com.google.gerrit.server.query.change;
import static org.junit.Assert.assertTrue;
import com.google.gerrit.reviewdb.client.Change;
import com.google.gerrit.testutil.InMemoryModule;
import com.google.inject.Guice;
import com.google.inject.Injector;
import org.eclipse.jgit.internal.storage.dfs.InMemoryRepository;
import org.eclipse.jgit.junit.TestRepository;
import org.eclipse.jgit.revwalk.RevCommit;
import org.junit.Test;
/**
 * Query tests run with an injector created from {@link InMemoryModule}, plus
 * a full-text test for messages containing '_' and '.'.
 */
public class LuceneQueryChangesTest extends AbstractQueryChangesTest {
  protected Injector createInjector() {
    InMemoryModule module = new InMemoryModule();
    return Guice.createInjector(module);
  }

  @Test
  public void fullTextWithSpecialChars() throws Exception {
    TestRepository<InMemoryRepository> project = createProject("repo");
    Change underscoreChange = insertChangeWithMessage(project, "foo_bar_foo");
    Change dotChange = insertChangeWithMessage(project, "one.two.three");

    // A query ending in a truncated word ("ba") is expected to find nothing.
    assertTrue(query("message:foo_ba").isEmpty());

    // Words joined by '_' are found individually, joined, or space-separated.
    assertResultEquals(underscoreChange, queryOne("message:bar"));
    assertResultEquals(underscoreChange, queryOne("message:foo_bar"));
    assertResultEquals(underscoreChange, queryOne("message:foo bar"));

    // Same expectations for words joined by '.'.
    assertResultEquals(dotChange, queryOne("message:two"));
    assertResultEquals(dotChange, queryOne("message:one.two"));
    assertResultEquals(dotChange, queryOne("message:one two"));
  }

  /** Creates a commit with the given message in {@code repo} and inserts a change for it. */
  private Change insertChangeWithMessage(
      TestRepository<InMemoryRepository> repo, String message)
      throws Exception {
    RevCommit commit =
        repo.parseBody(repo.commit().message(message).create());
    return newChange(repo, commit, null, null, null).insert();
  }
}