Use Lucene's QueryBuilder for building full-text queries

FuzzyQuery was a mistake: it uses edit distance to match any term in
the index that is close to the provided search term. This produces
bizarre results for queries like "message:1234", which also match
changes whose messages merely contain a similar number (e.g. 1235).

Instead, use Lucene's QueryBuilder with an analyzer to convert a
full-text search word/phrase into a phrase query.

Add some tests for full-text matching behavior on numbers; these
should not depend too heavily on Lucene-specific behavior.
Coincidentally, a copy-paste error in the byMessageExact test had
prevented the poor fuzzy-matching behavior from showing up in tests
sooner.

Change-Id: I384f74f1455d0433433a27f880204ac8ecbf93da
This commit is contained in:
Dave Borowitz
2013-12-27 14:48:26 -08:00
committed by Shawn Pearce
parent d8b4b55ea5
commit 569f2516db
5 changed files with 70 additions and 32 deletions

View File

@@ -54,6 +54,7 @@ import com.google.inject.Provider;
import com.google.inject.assistedinject.Assisted;
import com.google.inject.assistedinject.AssistedInject;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
@@ -157,6 +158,7 @@ public class LuceneChangeIndex implements ChangeIndex {
private final ChangeData.Factory changeDataFactory;
private final File dir;
private final Schema<ChangeData> schema;
private final QueryBuilder queryBuilder;
private final SubIndex openIndex;
private final SubIndex closedIndex;
@@ -186,6 +188,10 @@ public class LuceneChangeIndex implements ChangeIndex {
LUCENE_VERSIONS.get(schema),
"unknown Lucene version for index schema: %s", schema);
Analyzer analyzer =
new StandardAnalyzer(luceneVersion, CharArraySet.EMPTY_SET);
queryBuilder = new QueryBuilder(schema, analyzer);
IndexWriterConfig openConfig =
getIndexWriterConfig(luceneVersion, cfg, "changes_open");
IndexWriterConfig closedConfig =
@@ -298,7 +304,7 @@ public class LuceneChangeIndex implements ChangeIndex {
if (!Sets.intersection(statuses, CLOSED_STATUSES).isEmpty()) {
indexes.add(closedIndex);
}
return new QuerySource(indexes, QueryBuilder.toQuery(schema, p), limit,
return new QuerySource(indexes, queryBuilder.toQuery(p), limit,
ChangeQueryBuilder.hasNonTrivialSortKeyAfter(schema, p));
}

View File

@@ -33,9 +33,9 @@ import com.google.gerrit.server.query.QueryParseException;
import com.google.gerrit.server.query.change.ChangeData;
import com.google.gerrit.server.query.change.SortKeyPredicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PrefixQuery;
@@ -55,27 +55,34 @@ public class QueryBuilder {
return intTerm(ID_FIELD, cd.getId().get());
}
public static Query toQuery(Schema<ChangeData> schema, Predicate<ChangeData> p)
throws QueryParseException {
private final Schema<ChangeData> schema;
private final org.apache.lucene.util.QueryBuilder queryBuilder;
public QueryBuilder(Schema<ChangeData> schema, Analyzer analyzer) {
this.schema = schema;
queryBuilder = new org.apache.lucene.util.QueryBuilder(analyzer);
}
public Query toQuery(Predicate<ChangeData> p) throws QueryParseException {
if (p instanceof AndPredicate) {
return and(schema, p);
return and(p);
} else if (p instanceof OrPredicate) {
return or(schema, p);
return or(p);
} else if (p instanceof NotPredicate) {
return not(schema, p);
return not(p);
} else if (p instanceof IndexPredicate) {
return fieldQuery(schema, (IndexPredicate<ChangeData>) p);
return fieldQuery((IndexPredicate<ChangeData>) p);
} else {
throw new QueryParseException("cannot create query for index: " + p);
}
}
private static Query or(Schema<ChangeData> schema, Predicate<ChangeData> p)
private Query or(Predicate<ChangeData> p)
throws QueryParseException {
try {
BooleanQuery q = new BooleanQuery();
for (int i = 0; i < p.getChildCount(); i++) {
q.add(toQuery(schema, p.getChild(i)), SHOULD);
q.add(toQuery(p.getChild(i)), SHOULD);
}
return q;
} catch (BooleanQuery.TooManyClauses e) {
@@ -83,7 +90,7 @@ public class QueryBuilder {
}
}
private static Query and(Schema<ChangeData> schema, Predicate<ChangeData> p)
private Query and(Predicate<ChangeData> p)
throws QueryParseException {
try {
BooleanQuery b = new BooleanQuery();
@@ -95,10 +102,10 @@ public class QueryBuilder {
if (n instanceof TimestampRangePredicate) {
b.add(notTimestamp((TimestampRangePredicate<ChangeData>) n), MUST);
} else {
not.add(toQuery(schema, n));
not.add(toQuery(n));
}
} else {
b.add(toQuery(schema, c), MUST);
b.add(toQuery(c), MUST);
}
}
for (Query q : not) {
@@ -110,7 +117,7 @@ public class QueryBuilder {
}
}
private static Query not(Schema<ChangeData> schema, Predicate<ChangeData> p)
private Query not(Predicate<ChangeData> p)
throws QueryParseException {
Predicate<ChangeData> n = p.getChild(0);
if (n instanceof TimestampRangePredicate) {
@@ -120,12 +127,12 @@ public class QueryBuilder {
// Lucene does not support negation, start with all and subtract.
BooleanQuery q = new BooleanQuery();
q.add(new MatchAllDocsQuery(), MUST);
q.add(toQuery(schema, n), MUST_NOT);
q.add(toQuery(n), MUST_NOT);
return q;
}
private static Query fieldQuery(Schema<ChangeData> schema,
IndexPredicate<ChangeData> p) throws QueryParseException {
private Query fieldQuery(IndexPredicate<ChangeData> p)
throws QueryParseException {
if (p.getType() == FieldType.INTEGER) {
return intQuery(p);
} else if (p.getType() == FieldType.TIMESTAMP) {
@@ -137,7 +144,7 @@ public class QueryBuilder {
} else if (p.getType() == FieldType.FULL_TEXT) {
return fullTextQuery(p);
} else if (p instanceof SortKeyPredicate) {
return sortKeyQuery(schema, (SortKeyPredicate) p);
return sortKeyQuery((SortKeyPredicate) p);
} else {
throw badFieldType(p.getType());
}
@@ -149,7 +156,7 @@ public class QueryBuilder {
return new Term(name, bytes);
}
private static Query intQuery(IndexPredicate<ChangeData> p)
private Query intQuery(IndexPredicate<ChangeData> p)
throws QueryParseException {
int value;
try {
@@ -162,7 +169,7 @@ public class QueryBuilder {
return new TermQuery(intTerm(p.getField().getName(), value));
}
private static Query sortKeyQuery(Schema<ChangeData> schema, SortKeyPredicate p) {
private Query sortKeyQuery(SortKeyPredicate p) {
long min = p.getMinValue(schema);
long max = p.getMaxValue(schema);
return NumericRangeQuery.newLongRange(
@@ -172,7 +179,7 @@ public class QueryBuilder {
false, false);
}
private static Query timestampQuery(IndexPredicate<ChangeData> p)
private Query timestampQuery(IndexPredicate<ChangeData> p)
throws QueryParseException {
if (p instanceof TimestampRangePredicate) {
TimestampRangePredicate<ChangeData> r =
@@ -186,7 +193,7 @@ public class QueryBuilder {
throw new QueryParseException("not a timestamp: " + p);
}
private static Query notTimestamp(TimestampRangePredicate<ChangeData> r)
private Query notTimestamp(TimestampRangePredicate<ChangeData> r)
throws QueryParseException {
if (r.getMinTimestamp().getTime() == 0) {
return NumericRangeQuery.newIntRange(
@@ -198,7 +205,7 @@ public class QueryBuilder {
throw new QueryParseException("cannot negate: " + r);
}
private static Query exactQuery(IndexPredicate<ChangeData> p) {
private Query exactQuery(IndexPredicate<ChangeData> p) {
if (p instanceof RegexPredicate<?>) {
return regexQuery(p);
} else {
@@ -206,7 +213,7 @@ public class QueryBuilder {
}
}
private static Query regexQuery(IndexPredicate<ChangeData> p) {
private Query regexQuery(IndexPredicate<ChangeData> p) {
String re = p.getValue();
if (re.startsWith("^")) {
re = re.substring(1);
@@ -217,12 +224,12 @@ public class QueryBuilder {
return new RegexpQuery(new Term(p.getField().getName(), re));
}
private static Query prefixQuery(IndexPredicate<ChangeData> p) {
private Query prefixQuery(IndexPredicate<ChangeData> p) {
return new PrefixQuery(new Term(p.getField().getName(), p.getValue()));
}
private static Query fullTextQuery(IndexPredicate<ChangeData> p) {
return new FuzzyQuery(new Term(p.getField().getName(), p.getValue()));
private Query fullTextQuery(IndexPredicate<ChangeData> p) {
return queryBuilder.createPhraseQuery(p.getField().getName(), p.getValue());
}
public static int toIndexTime(Timestamp ts) {
@@ -232,7 +239,4 @@ public class QueryBuilder {
public static IllegalArgumentException badFieldType(FieldType<?> t) {
return new IllegalArgumentException("unknown index field type " + t);
}
private QueryBuilder() {
}
}