Use Lucene's QueryBuilder for building full-text queries
FuzzyQuery was a mistake; it uses edit distance to find terms in the index close to the provided search term. This produces bizarre results for queries like "message:1234". Instead, use Lucene's QueryBuilder with an analyzer to convert a full-text search word/phrase into a phrase query. Add some tests for full-text matching behavior on numbers, which should hopefully not be too dependent on specific Lucene behavior. Coincidentally, a copy-paste error in the byMessageExact test prevented this poor behavior from showing up in tests sooner. Change-Id: I384f74f1455d0433433a27f880204ac8ecbf93da
This commit is contained in:

committed by
Shawn Pearce

parent
d8b4b55ea5
commit
569f2516db
@@ -54,6 +54,7 @@ import com.google.inject.Provider;
|
||||
import com.google.inject.assistedinject.Assisted;
|
||||
import com.google.inject.assistedinject.AssistedInject;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.document.Document;
|
||||
@@ -157,6 +158,7 @@ public class LuceneChangeIndex implements ChangeIndex {
|
||||
private final ChangeData.Factory changeDataFactory;
|
||||
private final File dir;
|
||||
private final Schema<ChangeData> schema;
|
||||
private final QueryBuilder queryBuilder;
|
||||
private final SubIndex openIndex;
|
||||
private final SubIndex closedIndex;
|
||||
|
||||
@@ -186,6 +188,10 @@ public class LuceneChangeIndex implements ChangeIndex {
|
||||
LUCENE_VERSIONS.get(schema),
|
||||
"unknown Lucene version for index schema: %s", schema);
|
||||
|
||||
Analyzer analyzer =
|
||||
new StandardAnalyzer(luceneVersion, CharArraySet.EMPTY_SET);
|
||||
queryBuilder = new QueryBuilder(schema, analyzer);
|
||||
|
||||
IndexWriterConfig openConfig =
|
||||
getIndexWriterConfig(luceneVersion, cfg, "changes_open");
|
||||
IndexWriterConfig closedConfig =
|
||||
@@ -298,7 +304,7 @@ public class LuceneChangeIndex implements ChangeIndex {
|
||||
if (!Sets.intersection(statuses, CLOSED_STATUSES).isEmpty()) {
|
||||
indexes.add(closedIndex);
|
||||
}
|
||||
return new QuerySource(indexes, QueryBuilder.toQuery(schema, p), limit,
|
||||
return new QuerySource(indexes, queryBuilder.toQuery(p), limit,
|
||||
ChangeQueryBuilder.hasNonTrivialSortKeyAfter(schema, p));
|
||||
}
|
||||
|
||||
|
@@ -33,9 +33,9 @@ import com.google.gerrit.server.query.QueryParseException;
|
||||
import com.google.gerrit.server.query.change.ChangeData;
|
||||
import com.google.gerrit.server.query.change.SortKeyPredicate;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.NumericRangeQuery;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
@@ -55,27 +55,34 @@ public class QueryBuilder {
|
||||
return intTerm(ID_FIELD, cd.getId().get());
|
||||
}
|
||||
|
||||
public static Query toQuery(Schema<ChangeData> schema, Predicate<ChangeData> p)
|
||||
throws QueryParseException {
|
||||
private final Schema<ChangeData> schema;
|
||||
private final org.apache.lucene.util.QueryBuilder queryBuilder;
|
||||
|
||||
public QueryBuilder(Schema<ChangeData> schema, Analyzer analyzer) {
|
||||
this.schema = schema;
|
||||
queryBuilder = new org.apache.lucene.util.QueryBuilder(analyzer);
|
||||
}
|
||||
|
||||
public Query toQuery(Predicate<ChangeData> p) throws QueryParseException {
|
||||
if (p instanceof AndPredicate) {
|
||||
return and(schema, p);
|
||||
return and(p);
|
||||
} else if (p instanceof OrPredicate) {
|
||||
return or(schema, p);
|
||||
return or(p);
|
||||
} else if (p instanceof NotPredicate) {
|
||||
return not(schema, p);
|
||||
return not(p);
|
||||
} else if (p instanceof IndexPredicate) {
|
||||
return fieldQuery(schema, (IndexPredicate<ChangeData>) p);
|
||||
return fieldQuery((IndexPredicate<ChangeData>) p);
|
||||
} else {
|
||||
throw new QueryParseException("cannot create query for index: " + p);
|
||||
}
|
||||
}
|
||||
|
||||
private static Query or(Schema<ChangeData> schema, Predicate<ChangeData> p)
|
||||
private Query or(Predicate<ChangeData> p)
|
||||
throws QueryParseException {
|
||||
try {
|
||||
BooleanQuery q = new BooleanQuery();
|
||||
for (int i = 0; i < p.getChildCount(); i++) {
|
||||
q.add(toQuery(schema, p.getChild(i)), SHOULD);
|
||||
q.add(toQuery(p.getChild(i)), SHOULD);
|
||||
}
|
||||
return q;
|
||||
} catch (BooleanQuery.TooManyClauses e) {
|
||||
@@ -83,7 +90,7 @@ public class QueryBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
private static Query and(Schema<ChangeData> schema, Predicate<ChangeData> p)
|
||||
private Query and(Predicate<ChangeData> p)
|
||||
throws QueryParseException {
|
||||
try {
|
||||
BooleanQuery b = new BooleanQuery();
|
||||
@@ -95,10 +102,10 @@ public class QueryBuilder {
|
||||
if (n instanceof TimestampRangePredicate) {
|
||||
b.add(notTimestamp((TimestampRangePredicate<ChangeData>) n), MUST);
|
||||
} else {
|
||||
not.add(toQuery(schema, n));
|
||||
not.add(toQuery(n));
|
||||
}
|
||||
} else {
|
||||
b.add(toQuery(schema, c), MUST);
|
||||
b.add(toQuery(c), MUST);
|
||||
}
|
||||
}
|
||||
for (Query q : not) {
|
||||
@@ -110,7 +117,7 @@ public class QueryBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
private static Query not(Schema<ChangeData> schema, Predicate<ChangeData> p)
|
||||
private Query not(Predicate<ChangeData> p)
|
||||
throws QueryParseException {
|
||||
Predicate<ChangeData> n = p.getChild(0);
|
||||
if (n instanceof TimestampRangePredicate) {
|
||||
@@ -120,12 +127,12 @@ public class QueryBuilder {
|
||||
// Lucene does not support negation, start with all and subtract.
|
||||
BooleanQuery q = new BooleanQuery();
|
||||
q.add(new MatchAllDocsQuery(), MUST);
|
||||
q.add(toQuery(schema, n), MUST_NOT);
|
||||
q.add(toQuery(n), MUST_NOT);
|
||||
return q;
|
||||
}
|
||||
|
||||
private static Query fieldQuery(Schema<ChangeData> schema,
|
||||
IndexPredicate<ChangeData> p) throws QueryParseException {
|
||||
private Query fieldQuery(IndexPredicate<ChangeData> p)
|
||||
throws QueryParseException {
|
||||
if (p.getType() == FieldType.INTEGER) {
|
||||
return intQuery(p);
|
||||
} else if (p.getType() == FieldType.TIMESTAMP) {
|
||||
@@ -137,7 +144,7 @@ public class QueryBuilder {
|
||||
} else if (p.getType() == FieldType.FULL_TEXT) {
|
||||
return fullTextQuery(p);
|
||||
} else if (p instanceof SortKeyPredicate) {
|
||||
return sortKeyQuery(schema, (SortKeyPredicate) p);
|
||||
return sortKeyQuery((SortKeyPredicate) p);
|
||||
} else {
|
||||
throw badFieldType(p.getType());
|
||||
}
|
||||
@@ -149,7 +156,7 @@ public class QueryBuilder {
|
||||
return new Term(name, bytes);
|
||||
}
|
||||
|
||||
private static Query intQuery(IndexPredicate<ChangeData> p)
|
||||
private Query intQuery(IndexPredicate<ChangeData> p)
|
||||
throws QueryParseException {
|
||||
int value;
|
||||
try {
|
||||
@@ -162,7 +169,7 @@ public class QueryBuilder {
|
||||
return new TermQuery(intTerm(p.getField().getName(), value));
|
||||
}
|
||||
|
||||
private static Query sortKeyQuery(Schema<ChangeData> schema, SortKeyPredicate p) {
|
||||
private Query sortKeyQuery(SortKeyPredicate p) {
|
||||
long min = p.getMinValue(schema);
|
||||
long max = p.getMaxValue(schema);
|
||||
return NumericRangeQuery.newLongRange(
|
||||
@@ -172,7 +179,7 @@ public class QueryBuilder {
|
||||
false, false);
|
||||
}
|
||||
|
||||
private static Query timestampQuery(IndexPredicate<ChangeData> p)
|
||||
private Query timestampQuery(IndexPredicate<ChangeData> p)
|
||||
throws QueryParseException {
|
||||
if (p instanceof TimestampRangePredicate) {
|
||||
TimestampRangePredicate<ChangeData> r =
|
||||
@@ -186,7 +193,7 @@ public class QueryBuilder {
|
||||
throw new QueryParseException("not a timestamp: " + p);
|
||||
}
|
||||
|
||||
private static Query notTimestamp(TimestampRangePredicate<ChangeData> r)
|
||||
private Query notTimestamp(TimestampRangePredicate<ChangeData> r)
|
||||
throws QueryParseException {
|
||||
if (r.getMinTimestamp().getTime() == 0) {
|
||||
return NumericRangeQuery.newIntRange(
|
||||
@@ -198,7 +205,7 @@ public class QueryBuilder {
|
||||
throw new QueryParseException("cannot negate: " + r);
|
||||
}
|
||||
|
||||
private static Query exactQuery(IndexPredicate<ChangeData> p) {
|
||||
private Query exactQuery(IndexPredicate<ChangeData> p) {
|
||||
if (p instanceof RegexPredicate<?>) {
|
||||
return regexQuery(p);
|
||||
} else {
|
||||
@@ -206,7 +213,7 @@ public class QueryBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
private static Query regexQuery(IndexPredicate<ChangeData> p) {
|
||||
private Query regexQuery(IndexPredicate<ChangeData> p) {
|
||||
String re = p.getValue();
|
||||
if (re.startsWith("^")) {
|
||||
re = re.substring(1);
|
||||
@@ -217,12 +224,12 @@ public class QueryBuilder {
|
||||
return new RegexpQuery(new Term(p.getField().getName(), re));
|
||||
}
|
||||
|
||||
private static Query prefixQuery(IndexPredicate<ChangeData> p) {
|
||||
private Query prefixQuery(IndexPredicate<ChangeData> p) {
|
||||
return new PrefixQuery(new Term(p.getField().getName(), p.getValue()));
|
||||
}
|
||||
|
||||
private static Query fullTextQuery(IndexPredicate<ChangeData> p) {
|
||||
return new FuzzyQuery(new Term(p.getField().getName(), p.getValue()));
|
||||
private Query fullTextQuery(IndexPredicate<ChangeData> p) {
|
||||
return queryBuilder.createPhraseQuery(p.getField().getName(), p.getValue());
|
||||
}
|
||||
|
||||
public static int toIndexTime(Timestamp ts) {
|
||||
@@ -232,7 +239,4 @@ public class QueryBuilder {
|
||||
public static IllegalArgumentException badFieldType(FieldType<?> t) {
|
||||
return new IllegalArgumentException("unknown index field type " + t);
|
||||
}
|
||||
|
||||
private QueryBuilder() {
|
||||
}
|
||||
}
|
||||
|
@@ -349,11 +349,26 @@ public abstract class AbstractQueryChangesTest {
|
||||
RevCommit commit2 = repo.parseBody(repo.commit().message("two").create());
|
||||
Change change2 = newChange(repo, commit2, null, null, null).insert();
|
||||
|
||||
assertTrue(query("topic:foo").isEmpty());
|
||||
assertTrue(query("message:foo").isEmpty());
|
||||
assertResultEquals(change1, queryOne("message:one"));
|
||||
assertResultEquals(change2, queryOne("message:two"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void fullTextWithNumbers() throws Exception {
|
||||
TestRepository<InMemoryRepository> repo = createProject("repo");
|
||||
RevCommit commit1 =
|
||||
repo.parseBody(repo.commit().message("12345 67890").create());
|
||||
Change change1 = newChange(repo, commit1, null, null, null).insert();
|
||||
RevCommit commit2 =
|
||||
repo.parseBody(repo.commit().message("12346 67891").create());
|
||||
Change change2 = newChange(repo, commit2, null, null, null).insert();
|
||||
|
||||
assertTrue(query("message:1234").isEmpty());
|
||||
assertResultEquals(change1, queryOne("message:12345"));
|
||||
assertResultEquals(change2, queryOne("message:12346"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void byLabel() throws Exception {
|
||||
accountManager.authenticate(AuthRequest.forUser("anotheruser"));
|
||||
|
@@ -12,6 +12,7 @@ java_library(
|
||||
'//lib/guice:guice',
|
||||
'//lib/jgit:jgit',
|
||||
'//lib/log:api',
|
||||
'//lib/lucene:analyzers-common',
|
||||
'//lib/lucene:core',
|
||||
'//lib/solr:solrj',
|
||||
],
|
||||
|
@@ -45,7 +45,10 @@ import com.google.gwtorm.server.OrmException;
|
||||
import com.google.gwtorm.server.ResultSet;
|
||||
import com.google.inject.Provider;
|
||||
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.SolrServer;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
@@ -79,6 +82,7 @@ class SolrChangeIndex implements ChangeIndex, LifecycleListener {
|
||||
private final CloudSolrServer openIndex;
|
||||
private final CloudSolrServer closedIndex;
|
||||
private final Schema<ChangeData> schema;
|
||||
private final QueryBuilder queryBuilder;
|
||||
|
||||
SolrChangeIndex(
|
||||
@GerritServerConfig Config cfg,
|
||||
@@ -101,6 +105,14 @@ class SolrChangeIndex implements ChangeIndex, LifecycleListener {
|
||||
throw new IllegalStateException("index.solr.url must be supplied");
|
||||
}
|
||||
|
||||
// Version is only used to determine the list of stop words used by the
|
||||
// analyzer, so use the latest version rather than trying to match the Solr
|
||||
// server version.
|
||||
@SuppressWarnings("deprecation")
|
||||
Version v = Version.LUCENE_CURRENT;
|
||||
queryBuilder = new QueryBuilder(
|
||||
schema, new StandardAnalyzer(v, CharArraySet.EMPTY_SET));
|
||||
|
||||
base = Strings.nullToEmpty(base);
|
||||
openIndex = new CloudSolrServer(url);
|
||||
openIndex.setDefaultCollection(base + CHANGES_OPEN);
|
||||
@@ -208,7 +220,7 @@ class SolrChangeIndex implements ChangeIndex, LifecycleListener {
|
||||
if (!Sets.intersection(statuses, CLOSED_STATUSES).isEmpty()) {
|
||||
indexes.add(closedIndex);
|
||||
}
|
||||
return new QuerySource(indexes, QueryBuilder.toQuery(schema, p), limit,
|
||||
return new QuerySource(indexes, queryBuilder.toQuery(p), limit,
|
||||
ChangeQueryBuilder.hasNonTrivialSortKeyAfter(schema, p));
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user