Use Lucene's QueryBuilder for building full-text queries
FuzzyQuery was a mistake; it uses edit distance to find terms in the index close to the provided search term. This produces bizarre results for queries like "message:1234". Instead, use Lucene's QueryBuilder with an analyzer to convert a full-text search word/phrase into a phrase query. Add some tests for full-text matching behavior on numbers, which should hopefully not be too dependent on specific Lucene behavior. Coincidentally, a copy-paste error in the byMessageExact test prevented this poor behavior from showing up in tests sooner. Change-Id: I384f74f1455d0433433a27f880204ac8ecbf93da
This commit is contained in:

committed by
Shawn Pearce

parent
d8b4b55ea5
commit
569f2516db
@@ -54,6 +54,7 @@ import com.google.inject.Provider;
|
|||||||
import com.google.inject.assistedinject.Assisted;
|
import com.google.inject.assistedinject.Assisted;
|
||||||
import com.google.inject.assistedinject.AssistedInject;
|
import com.google.inject.assistedinject.AssistedInject;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
@@ -157,6 +158,7 @@ public class LuceneChangeIndex implements ChangeIndex {
|
|||||||
private final ChangeData.Factory changeDataFactory;
|
private final ChangeData.Factory changeDataFactory;
|
||||||
private final File dir;
|
private final File dir;
|
||||||
private final Schema<ChangeData> schema;
|
private final Schema<ChangeData> schema;
|
||||||
|
private final QueryBuilder queryBuilder;
|
||||||
private final SubIndex openIndex;
|
private final SubIndex openIndex;
|
||||||
private final SubIndex closedIndex;
|
private final SubIndex closedIndex;
|
||||||
|
|
||||||
@@ -186,6 +188,10 @@ public class LuceneChangeIndex implements ChangeIndex {
|
|||||||
LUCENE_VERSIONS.get(schema),
|
LUCENE_VERSIONS.get(schema),
|
||||||
"unknown Lucene version for index schema: %s", schema);
|
"unknown Lucene version for index schema: %s", schema);
|
||||||
|
|
||||||
|
Analyzer analyzer =
|
||||||
|
new StandardAnalyzer(luceneVersion, CharArraySet.EMPTY_SET);
|
||||||
|
queryBuilder = new QueryBuilder(schema, analyzer);
|
||||||
|
|
||||||
IndexWriterConfig openConfig =
|
IndexWriterConfig openConfig =
|
||||||
getIndexWriterConfig(luceneVersion, cfg, "changes_open");
|
getIndexWriterConfig(luceneVersion, cfg, "changes_open");
|
||||||
IndexWriterConfig closedConfig =
|
IndexWriterConfig closedConfig =
|
||||||
@@ -298,7 +304,7 @@ public class LuceneChangeIndex implements ChangeIndex {
|
|||||||
if (!Sets.intersection(statuses, CLOSED_STATUSES).isEmpty()) {
|
if (!Sets.intersection(statuses, CLOSED_STATUSES).isEmpty()) {
|
||||||
indexes.add(closedIndex);
|
indexes.add(closedIndex);
|
||||||
}
|
}
|
||||||
return new QuerySource(indexes, QueryBuilder.toQuery(schema, p), limit,
|
return new QuerySource(indexes, queryBuilder.toQuery(p), limit,
|
||||||
ChangeQueryBuilder.hasNonTrivialSortKeyAfter(schema, p));
|
ChangeQueryBuilder.hasNonTrivialSortKeyAfter(schema, p));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -33,9 +33,9 @@ import com.google.gerrit.server.query.QueryParseException;
|
|||||||
import com.google.gerrit.server.query.change.ChangeData;
|
import com.google.gerrit.server.query.change.ChangeData;
|
||||||
import com.google.gerrit.server.query.change.SortKeyPredicate;
|
import com.google.gerrit.server.query.change.SortKeyPredicate;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
import org.apache.lucene.search.FuzzyQuery;
|
|
||||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||||
import org.apache.lucene.search.NumericRangeQuery;
|
import org.apache.lucene.search.NumericRangeQuery;
|
||||||
import org.apache.lucene.search.PrefixQuery;
|
import org.apache.lucene.search.PrefixQuery;
|
||||||
@@ -55,27 +55,34 @@ public class QueryBuilder {
|
|||||||
return intTerm(ID_FIELD, cd.getId().get());
|
return intTerm(ID_FIELD, cd.getId().get());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Query toQuery(Schema<ChangeData> schema, Predicate<ChangeData> p)
|
private final Schema<ChangeData> schema;
|
||||||
throws QueryParseException {
|
private final org.apache.lucene.util.QueryBuilder queryBuilder;
|
||||||
|
|
||||||
|
public QueryBuilder(Schema<ChangeData> schema, Analyzer analyzer) {
|
||||||
|
this.schema = schema;
|
||||||
|
queryBuilder = new org.apache.lucene.util.QueryBuilder(analyzer);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Query toQuery(Predicate<ChangeData> p) throws QueryParseException {
|
||||||
if (p instanceof AndPredicate) {
|
if (p instanceof AndPredicate) {
|
||||||
return and(schema, p);
|
return and(p);
|
||||||
} else if (p instanceof OrPredicate) {
|
} else if (p instanceof OrPredicate) {
|
||||||
return or(schema, p);
|
return or(p);
|
||||||
} else if (p instanceof NotPredicate) {
|
} else if (p instanceof NotPredicate) {
|
||||||
return not(schema, p);
|
return not(p);
|
||||||
} else if (p instanceof IndexPredicate) {
|
} else if (p instanceof IndexPredicate) {
|
||||||
return fieldQuery(schema, (IndexPredicate<ChangeData>) p);
|
return fieldQuery((IndexPredicate<ChangeData>) p);
|
||||||
} else {
|
} else {
|
||||||
throw new QueryParseException("cannot create query for index: " + p);
|
throw new QueryParseException("cannot create query for index: " + p);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query or(Schema<ChangeData> schema, Predicate<ChangeData> p)
|
private Query or(Predicate<ChangeData> p)
|
||||||
throws QueryParseException {
|
throws QueryParseException {
|
||||||
try {
|
try {
|
||||||
BooleanQuery q = new BooleanQuery();
|
BooleanQuery q = new BooleanQuery();
|
||||||
for (int i = 0; i < p.getChildCount(); i++) {
|
for (int i = 0; i < p.getChildCount(); i++) {
|
||||||
q.add(toQuery(schema, p.getChild(i)), SHOULD);
|
q.add(toQuery(p.getChild(i)), SHOULD);
|
||||||
}
|
}
|
||||||
return q;
|
return q;
|
||||||
} catch (BooleanQuery.TooManyClauses e) {
|
} catch (BooleanQuery.TooManyClauses e) {
|
||||||
@@ -83,7 +90,7 @@ public class QueryBuilder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query and(Schema<ChangeData> schema, Predicate<ChangeData> p)
|
private Query and(Predicate<ChangeData> p)
|
||||||
throws QueryParseException {
|
throws QueryParseException {
|
||||||
try {
|
try {
|
||||||
BooleanQuery b = new BooleanQuery();
|
BooleanQuery b = new BooleanQuery();
|
||||||
@@ -95,10 +102,10 @@ public class QueryBuilder {
|
|||||||
if (n instanceof TimestampRangePredicate) {
|
if (n instanceof TimestampRangePredicate) {
|
||||||
b.add(notTimestamp((TimestampRangePredicate<ChangeData>) n), MUST);
|
b.add(notTimestamp((TimestampRangePredicate<ChangeData>) n), MUST);
|
||||||
} else {
|
} else {
|
||||||
not.add(toQuery(schema, n));
|
not.add(toQuery(n));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
b.add(toQuery(schema, c), MUST);
|
b.add(toQuery(c), MUST);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (Query q : not) {
|
for (Query q : not) {
|
||||||
@@ -110,7 +117,7 @@ public class QueryBuilder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query not(Schema<ChangeData> schema, Predicate<ChangeData> p)
|
private Query not(Predicate<ChangeData> p)
|
||||||
throws QueryParseException {
|
throws QueryParseException {
|
||||||
Predicate<ChangeData> n = p.getChild(0);
|
Predicate<ChangeData> n = p.getChild(0);
|
||||||
if (n instanceof TimestampRangePredicate) {
|
if (n instanceof TimestampRangePredicate) {
|
||||||
@@ -120,12 +127,12 @@ public class QueryBuilder {
|
|||||||
// Lucene does not support negation, start with all and subtract.
|
// Lucene does not support negation, start with all and subtract.
|
||||||
BooleanQuery q = new BooleanQuery();
|
BooleanQuery q = new BooleanQuery();
|
||||||
q.add(new MatchAllDocsQuery(), MUST);
|
q.add(new MatchAllDocsQuery(), MUST);
|
||||||
q.add(toQuery(schema, n), MUST_NOT);
|
q.add(toQuery(n), MUST_NOT);
|
||||||
return q;
|
return q;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query fieldQuery(Schema<ChangeData> schema,
|
private Query fieldQuery(IndexPredicate<ChangeData> p)
|
||||||
IndexPredicate<ChangeData> p) throws QueryParseException {
|
throws QueryParseException {
|
||||||
if (p.getType() == FieldType.INTEGER) {
|
if (p.getType() == FieldType.INTEGER) {
|
||||||
return intQuery(p);
|
return intQuery(p);
|
||||||
} else if (p.getType() == FieldType.TIMESTAMP) {
|
} else if (p.getType() == FieldType.TIMESTAMP) {
|
||||||
@@ -137,7 +144,7 @@ public class QueryBuilder {
|
|||||||
} else if (p.getType() == FieldType.FULL_TEXT) {
|
} else if (p.getType() == FieldType.FULL_TEXT) {
|
||||||
return fullTextQuery(p);
|
return fullTextQuery(p);
|
||||||
} else if (p instanceof SortKeyPredicate) {
|
} else if (p instanceof SortKeyPredicate) {
|
||||||
return sortKeyQuery(schema, (SortKeyPredicate) p);
|
return sortKeyQuery((SortKeyPredicate) p);
|
||||||
} else {
|
} else {
|
||||||
throw badFieldType(p.getType());
|
throw badFieldType(p.getType());
|
||||||
}
|
}
|
||||||
@@ -149,7 +156,7 @@ public class QueryBuilder {
|
|||||||
return new Term(name, bytes);
|
return new Term(name, bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query intQuery(IndexPredicate<ChangeData> p)
|
private Query intQuery(IndexPredicate<ChangeData> p)
|
||||||
throws QueryParseException {
|
throws QueryParseException {
|
||||||
int value;
|
int value;
|
||||||
try {
|
try {
|
||||||
@@ -162,7 +169,7 @@ public class QueryBuilder {
|
|||||||
return new TermQuery(intTerm(p.getField().getName(), value));
|
return new TermQuery(intTerm(p.getField().getName(), value));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query sortKeyQuery(Schema<ChangeData> schema, SortKeyPredicate p) {
|
private Query sortKeyQuery(SortKeyPredicate p) {
|
||||||
long min = p.getMinValue(schema);
|
long min = p.getMinValue(schema);
|
||||||
long max = p.getMaxValue(schema);
|
long max = p.getMaxValue(schema);
|
||||||
return NumericRangeQuery.newLongRange(
|
return NumericRangeQuery.newLongRange(
|
||||||
@@ -172,7 +179,7 @@ public class QueryBuilder {
|
|||||||
false, false);
|
false, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query timestampQuery(IndexPredicate<ChangeData> p)
|
private Query timestampQuery(IndexPredicate<ChangeData> p)
|
||||||
throws QueryParseException {
|
throws QueryParseException {
|
||||||
if (p instanceof TimestampRangePredicate) {
|
if (p instanceof TimestampRangePredicate) {
|
||||||
TimestampRangePredicate<ChangeData> r =
|
TimestampRangePredicate<ChangeData> r =
|
||||||
@@ -186,7 +193,7 @@ public class QueryBuilder {
|
|||||||
throw new QueryParseException("not a timestamp: " + p);
|
throw new QueryParseException("not a timestamp: " + p);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query notTimestamp(TimestampRangePredicate<ChangeData> r)
|
private Query notTimestamp(TimestampRangePredicate<ChangeData> r)
|
||||||
throws QueryParseException {
|
throws QueryParseException {
|
||||||
if (r.getMinTimestamp().getTime() == 0) {
|
if (r.getMinTimestamp().getTime() == 0) {
|
||||||
return NumericRangeQuery.newIntRange(
|
return NumericRangeQuery.newIntRange(
|
||||||
@@ -198,7 +205,7 @@ public class QueryBuilder {
|
|||||||
throw new QueryParseException("cannot negate: " + r);
|
throw new QueryParseException("cannot negate: " + r);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query exactQuery(IndexPredicate<ChangeData> p) {
|
private Query exactQuery(IndexPredicate<ChangeData> p) {
|
||||||
if (p instanceof RegexPredicate<?>) {
|
if (p instanceof RegexPredicate<?>) {
|
||||||
return regexQuery(p);
|
return regexQuery(p);
|
||||||
} else {
|
} else {
|
||||||
@@ -206,7 +213,7 @@ public class QueryBuilder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query regexQuery(IndexPredicate<ChangeData> p) {
|
private Query regexQuery(IndexPredicate<ChangeData> p) {
|
||||||
String re = p.getValue();
|
String re = p.getValue();
|
||||||
if (re.startsWith("^")) {
|
if (re.startsWith("^")) {
|
||||||
re = re.substring(1);
|
re = re.substring(1);
|
||||||
@@ -217,12 +224,12 @@ public class QueryBuilder {
|
|||||||
return new RegexpQuery(new Term(p.getField().getName(), re));
|
return new RegexpQuery(new Term(p.getField().getName(), re));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query prefixQuery(IndexPredicate<ChangeData> p) {
|
private Query prefixQuery(IndexPredicate<ChangeData> p) {
|
||||||
return new PrefixQuery(new Term(p.getField().getName(), p.getValue()));
|
return new PrefixQuery(new Term(p.getField().getName(), p.getValue()));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Query fullTextQuery(IndexPredicate<ChangeData> p) {
|
private Query fullTextQuery(IndexPredicate<ChangeData> p) {
|
||||||
return new FuzzyQuery(new Term(p.getField().getName(), p.getValue()));
|
return queryBuilder.createPhraseQuery(p.getField().getName(), p.getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int toIndexTime(Timestamp ts) {
|
public static int toIndexTime(Timestamp ts) {
|
||||||
@@ -232,7 +239,4 @@ public class QueryBuilder {
|
|||||||
public static IllegalArgumentException badFieldType(FieldType<?> t) {
|
public static IllegalArgumentException badFieldType(FieldType<?> t) {
|
||||||
return new IllegalArgumentException("unknown index field type " + t);
|
return new IllegalArgumentException("unknown index field type " + t);
|
||||||
}
|
}
|
||||||
|
|
||||||
private QueryBuilder() {
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@@ -349,11 +349,26 @@ public abstract class AbstractQueryChangesTest {
|
|||||||
RevCommit commit2 = repo.parseBody(repo.commit().message("two").create());
|
RevCommit commit2 = repo.parseBody(repo.commit().message("two").create());
|
||||||
Change change2 = newChange(repo, commit2, null, null, null).insert();
|
Change change2 = newChange(repo, commit2, null, null, null).insert();
|
||||||
|
|
||||||
assertTrue(query("topic:foo").isEmpty());
|
assertTrue(query("message:foo").isEmpty());
|
||||||
assertResultEquals(change1, queryOne("message:one"));
|
assertResultEquals(change1, queryOne("message:one"));
|
||||||
assertResultEquals(change2, queryOne("message:two"));
|
assertResultEquals(change2, queryOne("message:two"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void fullTextWithNumbers() throws Exception {
|
||||||
|
TestRepository<InMemoryRepository> repo = createProject("repo");
|
||||||
|
RevCommit commit1 =
|
||||||
|
repo.parseBody(repo.commit().message("12345 67890").create());
|
||||||
|
Change change1 = newChange(repo, commit1, null, null, null).insert();
|
||||||
|
RevCommit commit2 =
|
||||||
|
repo.parseBody(repo.commit().message("12346 67891").create());
|
||||||
|
Change change2 = newChange(repo, commit2, null, null, null).insert();
|
||||||
|
|
||||||
|
assertTrue(query("message:1234").isEmpty());
|
||||||
|
assertResultEquals(change1, queryOne("message:12345"));
|
||||||
|
assertResultEquals(change2, queryOne("message:12346"));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void byLabel() throws Exception {
|
public void byLabel() throws Exception {
|
||||||
accountManager.authenticate(AuthRequest.forUser("anotheruser"));
|
accountManager.authenticate(AuthRequest.forUser("anotheruser"));
|
||||||
|
@@ -12,6 +12,7 @@ java_library(
|
|||||||
'//lib/guice:guice',
|
'//lib/guice:guice',
|
||||||
'//lib/jgit:jgit',
|
'//lib/jgit:jgit',
|
||||||
'//lib/log:api',
|
'//lib/log:api',
|
||||||
|
'//lib/lucene:analyzers-common',
|
||||||
'//lib/lucene:core',
|
'//lib/lucene:core',
|
||||||
'//lib/solr:solrj',
|
'//lib/solr:solrj',
|
||||||
],
|
],
|
||||||
|
@@ -45,7 +45,10 @@ import com.google.gwtorm.server.OrmException;
|
|||||||
import com.google.gwtorm.server.ResultSet;
|
import com.google.gwtorm.server.ResultSet;
|
||||||
import com.google.inject.Provider;
|
import com.google.inject.Provider;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
import org.apache.solr.client.solrj.SolrQuery;
|
import org.apache.solr.client.solrj.SolrQuery;
|
||||||
import org.apache.solr.client.solrj.SolrServer;
|
import org.apache.solr.client.solrj.SolrServer;
|
||||||
import org.apache.solr.client.solrj.SolrServerException;
|
import org.apache.solr.client.solrj.SolrServerException;
|
||||||
@@ -79,6 +82,7 @@ class SolrChangeIndex implements ChangeIndex, LifecycleListener {
|
|||||||
private final CloudSolrServer openIndex;
|
private final CloudSolrServer openIndex;
|
||||||
private final CloudSolrServer closedIndex;
|
private final CloudSolrServer closedIndex;
|
||||||
private final Schema<ChangeData> schema;
|
private final Schema<ChangeData> schema;
|
||||||
|
private final QueryBuilder queryBuilder;
|
||||||
|
|
||||||
SolrChangeIndex(
|
SolrChangeIndex(
|
||||||
@GerritServerConfig Config cfg,
|
@GerritServerConfig Config cfg,
|
||||||
@@ -101,6 +105,14 @@ class SolrChangeIndex implements ChangeIndex, LifecycleListener {
|
|||||||
throw new IllegalStateException("index.solr.url must be supplied");
|
throw new IllegalStateException("index.solr.url must be supplied");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Version is only used to determine the list of stop words used by the
|
||||||
|
// analyzer, so use the latest version rather than trying to match the Solr
|
||||||
|
// server version.
|
||||||
|
@SuppressWarnings("deprecation")
|
||||||
|
Version v = Version.LUCENE_CURRENT;
|
||||||
|
queryBuilder = new QueryBuilder(
|
||||||
|
schema, new StandardAnalyzer(v, CharArraySet.EMPTY_SET));
|
||||||
|
|
||||||
base = Strings.nullToEmpty(base);
|
base = Strings.nullToEmpty(base);
|
||||||
openIndex = new CloudSolrServer(url);
|
openIndex = new CloudSolrServer(url);
|
||||||
openIndex.setDefaultCollection(base + CHANGES_OPEN);
|
openIndex.setDefaultCollection(base + CHANGES_OPEN);
|
||||||
@@ -208,7 +220,7 @@ class SolrChangeIndex implements ChangeIndex, LifecycleListener {
|
|||||||
if (!Sets.intersection(statuses, CLOSED_STATUSES).isEmpty()) {
|
if (!Sets.intersection(statuses, CLOSED_STATUSES).isEmpty()) {
|
||||||
indexes.add(closedIndex);
|
indexes.add(closedIndex);
|
||||||
}
|
}
|
||||||
return new QuerySource(indexes, QueryBuilder.toQuery(schema, p), limit,
|
return new QuerySource(indexes, queryBuilder.toQuery(p), limit,
|
||||||
ChangeQueryBuilder.hasNonTrivialSortKeyAfter(schema, p));
|
ChangeQueryBuilder.hasNonTrivialSortKeyAfter(schema, p));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user