Use Lucene's QueryBuilder for building full-text queries

FuzzyQuery was a mistake; it uses edit distance to find terms in the
index close to the provided search term. This produces bizarre results
for queries like "message:1234".

Instead, use Lucene's QueryBuilder with an analyzer to convert a
full-text search word/phrase into a phrase query.

Add some tests for full-text matching behavior on numbers, which
should hopefully not be too dependent on specific Lucene behavior.
Coincidentally, a copy-paste error in the byMessageExact test
prevented this poor behavior from showing up in tests sooner.

Change-Id: I384f74f1455d0433433a27f880204ac8ecbf93da
This commit is contained in:
Dave Borowitz
2013-12-27 14:48:26 -08:00
committed by Shawn Pearce
parent d8b4b55ea5
commit 569f2516db
5 changed files with 70 additions and 32 deletions

View File

@@ -54,6 +54,7 @@ import com.google.inject.Provider;
import com.google.inject.assistedinject.Assisted;
import com.google.inject.assistedinject.AssistedInject;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
@@ -157,6 +158,7 @@ public class LuceneChangeIndex implements ChangeIndex {
private final ChangeData.Factory changeDataFactory;
private final File dir;
private final Schema<ChangeData> schema;
private final QueryBuilder queryBuilder;
private final SubIndex openIndex;
private final SubIndex closedIndex;
@@ -186,6 +188,10 @@ public class LuceneChangeIndex implements ChangeIndex {
LUCENE_VERSIONS.get(schema),
"unknown Lucene version for index schema: %s", schema);
Analyzer analyzer =
new StandardAnalyzer(luceneVersion, CharArraySet.EMPTY_SET);
queryBuilder = new QueryBuilder(schema, analyzer);
IndexWriterConfig openConfig =
getIndexWriterConfig(luceneVersion, cfg, "changes_open");
IndexWriterConfig closedConfig =
@@ -298,7 +304,7 @@ public class LuceneChangeIndex implements ChangeIndex {
if (!Sets.intersection(statuses, CLOSED_STATUSES).isEmpty()) {
indexes.add(closedIndex);
}
return new QuerySource(indexes, QueryBuilder.toQuery(schema, p), limit,
return new QuerySource(indexes, queryBuilder.toQuery(p), limit,
ChangeQueryBuilder.hasNonTrivialSortKeyAfter(schema, p));
}

View File

@@ -33,9 +33,9 @@ import com.google.gerrit.server.query.QueryParseException;
import com.google.gerrit.server.query.change.ChangeData;
import com.google.gerrit.server.query.change.SortKeyPredicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PrefixQuery;
@@ -55,27 +55,34 @@ public class QueryBuilder {
return intTerm(ID_FIELD, cd.getId().get());
}
public static Query toQuery(Schema<ChangeData> schema, Predicate<ChangeData> p)
throws QueryParseException {
private final Schema<ChangeData> schema;
private final org.apache.lucene.util.QueryBuilder queryBuilder;
/**
 * Creates a builder that translates change predicates into Lucene queries.
 *
 * @param schema index schema, retained for later use by this builder.
 * @param analyzer analyzer handed to the wrapped Lucene query builder.
 */
public QueryBuilder(Schema<ChangeData> schema, Analyzer analyzer) {
  this.queryBuilder = new org.apache.lucene.util.QueryBuilder(analyzer);
  this.schema = schema;
}
public Query toQuery(Predicate<ChangeData> p) throws QueryParseException {
if (p instanceof AndPredicate) {
return and(schema, p);
return and(p);
} else if (p instanceof OrPredicate) {
return or(schema, p);
return or(p);
} else if (p instanceof NotPredicate) {
return not(schema, p);
return not(p);
} else if (p instanceof IndexPredicate) {
return fieldQuery(schema, (IndexPredicate<ChangeData>) p);
return fieldQuery((IndexPredicate<ChangeData>) p);
} else {
throw new QueryParseException("cannot create query for index: " + p);
}
}
private static Query or(Schema<ChangeData> schema, Predicate<ChangeData> p)
private Query or(Predicate<ChangeData> p)
throws QueryParseException {
try {
BooleanQuery q = new BooleanQuery();
for (int i = 0; i < p.getChildCount(); i++) {
q.add(toQuery(schema, p.getChild(i)), SHOULD);
q.add(toQuery(p.getChild(i)), SHOULD);
}
return q;
} catch (BooleanQuery.TooManyClauses e) {
@@ -83,7 +90,7 @@ public class QueryBuilder {
}
}
private static Query and(Schema<ChangeData> schema, Predicate<ChangeData> p)
private Query and(Predicate<ChangeData> p)
throws QueryParseException {
try {
BooleanQuery b = new BooleanQuery();
@@ -95,10 +102,10 @@ public class QueryBuilder {
if (n instanceof TimestampRangePredicate) {
b.add(notTimestamp((TimestampRangePredicate<ChangeData>) n), MUST);
} else {
not.add(toQuery(schema, n));
not.add(toQuery(n));
}
} else {
b.add(toQuery(schema, c), MUST);
b.add(toQuery(c), MUST);
}
}
for (Query q : not) {
@@ -110,7 +117,7 @@ public class QueryBuilder {
}
}
private static Query not(Schema<ChangeData> schema, Predicate<ChangeData> p)
private Query not(Predicate<ChangeData> p)
throws QueryParseException {
Predicate<ChangeData> n = p.getChild(0);
if (n instanceof TimestampRangePredicate) {
@@ -120,12 +127,12 @@ public class QueryBuilder {
// Lucene does not support negation, start with all and subtract.
BooleanQuery q = new BooleanQuery();
q.add(new MatchAllDocsQuery(), MUST);
q.add(toQuery(schema, n), MUST_NOT);
q.add(toQuery(n), MUST_NOT);
return q;
}
private static Query fieldQuery(Schema<ChangeData> schema,
IndexPredicate<ChangeData> p) throws QueryParseException {
private Query fieldQuery(IndexPredicate<ChangeData> p)
throws QueryParseException {
if (p.getType() == FieldType.INTEGER) {
return intQuery(p);
} else if (p.getType() == FieldType.TIMESTAMP) {
@@ -137,7 +144,7 @@ public class QueryBuilder {
} else if (p.getType() == FieldType.FULL_TEXT) {
return fullTextQuery(p);
} else if (p instanceof SortKeyPredicate) {
return sortKeyQuery(schema, (SortKeyPredicate) p);
return sortKeyQuery((SortKeyPredicate) p);
} else {
throw badFieldType(p.getType());
}
@@ -149,7 +156,7 @@ public class QueryBuilder {
return new Term(name, bytes);
}
private static Query intQuery(IndexPredicate<ChangeData> p)
private Query intQuery(IndexPredicate<ChangeData> p)
throws QueryParseException {
int value;
try {
@@ -162,7 +169,7 @@ public class QueryBuilder {
return new TermQuery(intTerm(p.getField().getName(), value));
}
private static Query sortKeyQuery(Schema<ChangeData> schema, SortKeyPredicate p) {
private Query sortKeyQuery(SortKeyPredicate p) {
long min = p.getMinValue(schema);
long max = p.getMaxValue(schema);
return NumericRangeQuery.newLongRange(
@@ -172,7 +179,7 @@ public class QueryBuilder {
false, false);
}
private static Query timestampQuery(IndexPredicate<ChangeData> p)
private Query timestampQuery(IndexPredicate<ChangeData> p)
throws QueryParseException {
if (p instanceof TimestampRangePredicate) {
TimestampRangePredicate<ChangeData> r =
@@ -186,7 +193,7 @@ public class QueryBuilder {
throw new QueryParseException("not a timestamp: " + p);
}
private static Query notTimestamp(TimestampRangePredicate<ChangeData> r)
private Query notTimestamp(TimestampRangePredicate<ChangeData> r)
throws QueryParseException {
if (r.getMinTimestamp().getTime() == 0) {
return NumericRangeQuery.newIntRange(
@@ -198,7 +205,7 @@ public class QueryBuilder {
throw new QueryParseException("cannot negate: " + r);
}
private static Query exactQuery(IndexPredicate<ChangeData> p) {
private Query exactQuery(IndexPredicate<ChangeData> p) {
if (p instanceof RegexPredicate<?>) {
return regexQuery(p);
} else {
@@ -206,7 +213,7 @@ public class QueryBuilder {
}
}
private static Query regexQuery(IndexPredicate<ChangeData> p) {
private Query regexQuery(IndexPredicate<ChangeData> p) {
String re = p.getValue();
if (re.startsWith("^")) {
re = re.substring(1);
@@ -217,12 +224,12 @@ public class QueryBuilder {
return new RegexpQuery(new Term(p.getField().getName(), re));
}
private static Query prefixQuery(IndexPredicate<ChangeData> p) {
private Query prefixQuery(IndexPredicate<ChangeData> p) {
return new PrefixQuery(new Term(p.getField().getName(), p.getValue()));
}
private static Query fullTextQuery(IndexPredicate<ChangeData> p) {
return new FuzzyQuery(new Term(p.getField().getName(), p.getValue()));
/**
 * Builds the Lucene query for a full-text predicate by passing the predicate's
 * value to the analyzer-backed Lucene query builder as a phrase query.
 *
 * @param p full-text index predicate supplying the field name and search text.
 * @return the query produced for the predicate's field; never null.
 * @throws QueryParseException if the predicate has no value, or if no query
 *     could be built from it.
 */
private Query fullTextQuery(IndexPredicate<ChangeData> p)
    throws QueryParseException {
  String value = p.getValue();
  if (value == null) {
    throw new QueryParseException(
        "Full-text search over empty string not supported");
  }
  Query query = queryBuilder.createPhraseQuery(p.getField().getName(), value);
  // createPhraseQuery yields null when analysis leaves no tokens (for example
  // a value made up only of punctuation); fail here with a parse error rather
  // than handing a null query on to the caller.
  if (query == null) {
    throw new QueryParseException(
        "Cannot create full-text query with value: " + value);
  }
  return query;
}
public static int toIndexTime(Timestamp ts) {
@@ -232,7 +239,4 @@ public class QueryBuilder {
public static IllegalArgumentException badFieldType(FieldType<?> t) {
return new IllegalArgumentException("unknown index field type " + t);
}
private QueryBuilder() {
}
}

View File

@@ -349,11 +349,26 @@ public abstract class AbstractQueryChangesTest {
RevCommit commit2 = repo.parseBody(repo.commit().message("two").create());
Change change2 = newChange(repo, commit2, null, null, null).insert();
assertTrue(query("topic:foo").isEmpty());
assertTrue(query("message:foo").isEmpty());
assertResultEquals(change1, queryOne("message:one"));
assertResultEquals(change2, queryOne("message:two"));
}
/**
 * Checks full-text matching of numeric words in commit messages: querying a
 * complete number returns its change, while querying "1234" returns nothing.
 */
@Test
public void fullTextWithNumbers() throws Exception {
  TestRepository<InMemoryRepository> testRepo = createProject("repo");

  // First change: message contains the numbers 12345 and 67890.
  RevCommit firstCommit =
      testRepo.parseBody(testRepo.commit().message("12345 67890").create());
  Change firstChange =
      newChange(testRepo, firstCommit, null, null, null).insert();

  // Second change: message differs from the first by one digit per number.
  RevCommit secondCommit =
      testRepo.parseBody(testRepo.commit().message("12346 67891").create());
  Change secondChange =
      newChange(testRepo, secondCommit, null, null, null).insert();

  // "1234" is not a whole word of either message, so nothing may match.
  assertTrue(query("message:1234").isEmpty());

  // Each complete number must select exactly the change that contains it.
  assertResultEquals(firstChange, queryOne("message:12345"));
  assertResultEquals(secondChange, queryOne("message:12346"));
}
@Test
public void byLabel() throws Exception {
accountManager.authenticate(AuthRequest.forUser("anotheruser"));

View File

@@ -12,6 +12,7 @@ java_library(
'//lib/guice:guice',
'//lib/jgit:jgit',
'//lib/log:api',
'//lib/lucene:analyzers-common',
'//lib/lucene:core',
'//lib/solr:solrj',
],

View File

@@ -45,7 +45,10 @@ import com.google.gwtorm.server.OrmException;
import com.google.gwtorm.server.ResultSet;
import com.google.inject.Provider;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
@@ -79,6 +82,7 @@ class SolrChangeIndex implements ChangeIndex, LifecycleListener {
private final CloudSolrServer openIndex;
private final CloudSolrServer closedIndex;
private final Schema<ChangeData> schema;
private final QueryBuilder queryBuilder;
SolrChangeIndex(
@GerritServerConfig Config cfg,
@@ -101,6 +105,14 @@ class SolrChangeIndex implements ChangeIndex, LifecycleListener {
throw new IllegalStateException("index.solr.url must be supplied");
}
// Version is only used to determine the list of stop words used by the
// analyzer, so use the latest version rather than trying to match the Solr
// server version.
@SuppressWarnings("deprecation")
Version v = Version.LUCENE_CURRENT;
queryBuilder = new QueryBuilder(
schema, new StandardAnalyzer(v, CharArraySet.EMPTY_SET));
base = Strings.nullToEmpty(base);
openIndex = new CloudSolrServer(url);
openIndex.setDefaultCollection(base + CHANGES_OPEN);
@@ -208,7 +220,7 @@ class SolrChangeIndex implements ChangeIndex, LifecycleListener {
if (!Sets.intersection(statuses, CLOSED_STATUSES).isEmpty()) {
indexes.add(closedIndex);
}
return new QuerySource(indexes, QueryBuilder.toQuery(schema, p), limit,
return new QuerySource(indexes, queryBuilder.toQuery(p), limit,
ChangeQueryBuilder.hasNonTrivialSortKeyAfter(schema, p));
}