Index modified filenames with Lucene

Add a ChangeIndex interface and an implementation based on Apache
Lucene[1] to provide a secondary index, indexing things we won't or
can't index in the database.

As an example, index the list of modified files in the most recent
patch set of each change. Provide an EqualsFilePredicate for searching
on exact filenames[2], and teach the query builder to rewrite such
predicates as ChangeDataSources returning results from the index.

As this feature is still experimental but we want to avoid prolonged
feature branch development, protect it with an undocumented
index.enabled boolean in gerrit.config.

[1] http://lucene.apache.org/core/

[2] Uses the "file:" operator the same as RegexFilePredicate, but does
    not support regular expressions in the search context.

Change-Id: Ie14ebe062d991eb9626f7b5d78b2d193c1bcb33f
This commit is contained in:
Dave Borowitz
2013-05-17 14:05:25 -07:00
parent 363497ee67
commit 9161eda6d5
28 changed files with 1235 additions and 18 deletions

View File

@@ -0,0 +1,276 @@
// Copyright (C) 2013 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.package com.google.gerrit.server.git;
package com.google.gerrit.lucene;
import static com.google.gerrit.server.query.change.ChangeQueryBuilder.FIELD_CHANGE;
import com.google.common.collect.Lists;
import com.google.gerrit.extensions.events.LifecycleListener;
import com.google.gerrit.reviewdb.client.Change;
import com.google.gerrit.server.config.SitePaths;
import com.google.gerrit.server.index.ChangeField;
import com.google.gerrit.server.index.ChangeIndex;
import com.google.gerrit.server.index.FieldDef;
import com.google.gerrit.server.index.FieldDef.FillArgs;
import com.google.gerrit.server.index.FieldType;
import com.google.gerrit.server.index.IndexPredicate;
import com.google.gerrit.server.query.QueryParseException;
import com.google.gerrit.server.query.change.ChangeData;
import com.google.gerrit.server.query.change.ChangeDataSource;
import com.google.gwtorm.server.OrmException;
import com.google.gwtorm.server.ResultSet;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
/**
* Secondary index implementation using Apache Lucene.
* <p>
* Writes are managed using a single {@link IndexWriter} per process, committed
* aggressively. Reads use {@link SearcherManager} and periodically refresh,
* though there may be some lag between a committed write and it showing up to
* other threads' searchers.
*/
@Singleton
public class LuceneChangeIndex implements ChangeIndex, LifecycleListener {
private static final Logger log =
LoggerFactory.getLogger(LuceneChangeIndex.class);
private static final Version VERSION = Version.LUCENE_43;
private final FillArgs fillArgs;
private final Directory dir;
private final IndexWriter writer;
private final SearcherManager searcherManager;
@Inject
LuceneChangeIndex(SitePaths sitePaths,
FillArgs fillArgs) throws IOException {
this.fillArgs = fillArgs;
dir = FSDirectory.open(new File(sitePaths.index_dir, "changes"));
IndexWriterConfig writerConfig =
new IndexWriterConfig(VERSION, new StandardAnalyzer(VERSION));
writerConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
writer = new IndexWriter(dir, writerConfig);
searcherManager = new SearcherManager(writer, true, null);
}
@Override
public void start() {
// Do nothing.
}
@Override
public void stop() {
try {
searcherManager.close();
} catch (IOException e) {
log.warn("error closing Lucene searcher", e);
}
try {
writer.close(true);
} catch (IOException e) {
log.warn("error closing Lucene writer", e);
}
try {
dir.close();
} catch (IOException e) {
log.warn("error closing Lucene directory", e);
}
}
@Override
public void insert(ChangeData cd) throws IOException {
writer.addDocument(toDocument(cd));
commit();
}
@Override
public void replace(ChangeData cd) throws IOException {
writer.updateDocument(intTerm(FIELD_CHANGE, cd.getId().get()),
toDocument(cd));
commit();
}
@Override
public ChangeDataSource getSource(IndexPredicate<ChangeData> p)
throws QueryParseException {
if (p.getType() == FieldType.INTEGER) {
return intQuery(p);
} else if (p.getType() == FieldType.EXACT) {
return exactQuery(p);
} else {
throw badFieldType(p.getType());
}
}
public IndexWriter getWriter() {
return writer;
}
private void commit() throws IOException {
writer.commit();
searcherManager.maybeRefresh();
}
private Term intTerm(String name, int value) {
BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
NumericUtils.intToPrefixCodedBytes(value, 0, bytes);
return new Term(name, bytes);
}
private QuerySource intQuery(IndexPredicate<ChangeData> p)
throws QueryParseException {
int value;
try {
// Can't use IntPredicate because it and IndexPredicate are different
// subclasses of OperatorPredicate.
value = Integer.valueOf(p.getValue());
} catch (IllegalArgumentException e) {
throw new QueryParseException("not an integer: " + p.getValue());
}
return new QuerySource(new TermQuery(intTerm(p.getOperator(), value)));
}
private QuerySource exactQuery(IndexPredicate<ChangeData> p) {
return new QuerySource(new TermQuery(
new Term(p.getOperator(), p.getValue())));
}
private class QuerySource implements ChangeDataSource {
// TODO(dborowitz): Push limit down from predicate tree.
private static final int LIMIT = 1000;
private final Query query;
public QuerySource(Query query) {
this.query = query;
}
@Override
public int getCardinality() {
return 10; // TODO(dborowitz): estimate from Lucene?
}
@Override
public boolean hasChange() {
return false;
}
@Override
public ResultSet<ChangeData> read() throws OrmException {
try {
IndexSearcher searcher = searcherManager.acquire();
try {
ScoreDoc[] docs = searcher.search(query, LIMIT).scoreDocs;
List<ChangeData> result = Lists.newArrayListWithCapacity(docs.length);
for (ScoreDoc sd : docs) {
Document doc = searcher.doc(sd.doc);
Number v = doc.getField(FIELD_CHANGE).numericValue();
result.add(new ChangeData(new Change.Id(v.intValue())));
}
final List<ChangeData> r = Collections.unmodifiableList(result);
return new ResultSet<ChangeData>() {
@Override
public Iterator<ChangeData> iterator() {
return r.iterator();
}
@Override
public List<ChangeData> toList() {
return r;
}
@Override
public void close() {
// Do nothing.
}
};
} finally {
searcherManager.release(searcher);
}
} catch (IOException e) {
throw new OrmException(e);
}
}
}
private Document toDocument(ChangeData cd) throws IOException {
try {
Document result = new Document();
for (FieldDef<ChangeData, ?> f : ChangeField.ALL.values()) {
if (f.isRepeatable()) {
add(result, f, (Iterable<?>) f.get(cd, fillArgs));
} else {
add(result, f, Collections.singleton(f.get(cd, fillArgs)));
}
}
return result;
} catch (OrmException e) {
throw new IOException(e);
}
}
private void add(Document doc, FieldDef<ChangeData, ?> f,
Iterable<?> values) throws OrmException {
if (f.getType() == FieldType.INTEGER) {
for (Object value : values) {
doc.add(new IntField(f.getName(), (Integer) value, store(f)));
}
} else if (f.getType() == FieldType.EXACT) {
for (Object value : values) {
doc.add(new StringField(f.getName(), (String) value, store(f)));
}
} else {
throw badFieldType(f.getType());
}
}
private static Field.Store store(FieldDef<?, ?> f) {
return f.isStored() ? Field.Store.YES : Field.Store.NO;
}
private static IllegalArgumentException badFieldType(FieldType<?> t) {
return new IllegalArgumentException("unknown index field type " + t);
}
}

View File

@@ -0,0 +1,39 @@
// Copyright (C) 2013 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.package com.google.gerrit.server.git;
package com.google.gerrit.lucene;
import com.google.gerrit.lifecycle.LifecycleModule;
import com.google.gerrit.server.config.GerritServerConfig;
import com.google.gerrit.server.index.ChangeIndex;
import com.google.gerrit.server.index.ChangeIndexer;
import com.google.gerrit.server.index.ChangeIndexerImpl;
import com.google.inject.Injector;
import com.google.inject.Key;
import org.eclipse.jgit.lib.Config;
public class LuceneIndexModule extends LifecycleModule {
public static boolean isEnabled(Injector injector) {
return injector.getInstance(Key.get(Config.class, GerritServerConfig.class))
.getBoolean("index", null, "enabled", false);
}
@Override
protected void configure() {
bind(ChangeIndex.class).to(LuceneChangeIndex.class);
bind(ChangeIndexer.class).to(ChangeIndexerImpl.class);
listener().to(LuceneChangeIndex.class);
}
}