Factor out a class for efficient regex search over lists

RegexPathPredicate had some clever logic for avoiding matching regular
expressions over an entire search list when we can identify a common
prefix that all matching results must have. Factor this out into a new
class, RegexListSearcher, add tests, and use it from ListProjects.

Change-Id: Ie08ad8dcf09708a8aa3efcfbed6c4ee6879f80c7
This commit is contained in:
Dave Borowitz
2014-08-07 13:19:23 -07:00
parent 1e0e85045e
commit d4ab000a20
4 changed files with 208 additions and 82 deletions

View File

@@ -16,6 +16,7 @@ package com.google.gerrit.server.project;
import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
@@ -38,14 +39,12 @@ import com.google.gerrit.server.WebLinks;
import com.google.gerrit.server.account.GroupCache;
import com.google.gerrit.server.account.GroupControl;
import com.google.gerrit.server.git.GitRepositoryManager;
import com.google.gerrit.server.util.RegexListSearcher;
import com.google.gerrit.server.util.TreeFormatter;
import com.google.gson.reflect.TypeToken;
import com.google.inject.Inject;
import com.google.inject.Provider;
import dk.brics.automaton.RegExp;
import dk.brics.automaton.RunAutomaton;
import org.eclipse.jgit.errors.RepositoryNotFoundException;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.Ref;
@@ -462,27 +461,18 @@ public class ListProjects implements RestReadView<TopLevelResource> {
});
} else if (matchRegex != null) {
checkMatchOptions(matchPrefix == null && matchSubstring == null);
if (matchRegex.startsWith("^")) {
matchRegex = matchRegex.substring(1);
}
if (matchRegex.endsWith("$") && !matchRegex.endsWith("\\$")) {
matchRegex = matchRegex.substring(0, matchRegex.length() - 1);
}
if (matchRegex.equals(".*")) {
return projectCache.all();
}
RegexListSearcher<Project.NameKey> searcher;
try {
final RunAutomaton a =
new RunAutomaton(new RegExp(matchRegex).toAutomaton());
return Iterables.filter(projectCache.all(),
new Predicate<Project.NameKey>() {
public boolean apply(Project.NameKey in) {
return a.run(in.get());
}
});
searcher = new RegexListSearcher<Project.NameKey>(matchRegex) {
@Override
public String apply(Project.NameKey in) {
return in.get();
}
};
} catch (IllegalArgumentException e) {
throw new BadRequestException(e.getMessage());
}
return searcher.search(ImmutableList.copyOf(projectCache.all()));
} else {
return projectCache.all();
}

View File

@@ -16,76 +16,21 @@ package com.google.gerrit.server.query.change;
import com.google.gerrit.server.index.ChangeField;
import com.google.gerrit.server.index.RegexPredicate;
import com.google.gerrit.server.util.RegexListSearcher;
import com.google.gwtorm.server.OrmException;
import dk.brics.automaton.Automaton;
import dk.brics.automaton.RegExp;
import dk.brics.automaton.RunAutomaton;
import java.util.Collections;
import java.util.List;
class RegexPathPredicate extends RegexPredicate<ChangeData> {
private final RunAutomaton pattern;
private final String prefixBegin;
private final String prefixEnd;
private final int prefixLen;
private final boolean prefixOnly;
RegexPathPredicate(String fieldName, String re) {
super(ChangeField.PATH, re);
if (re.startsWith("^")) {
re = re.substring(1);
}
if (re.endsWith("$") && !re.endsWith("\\$")) {
re = re.substring(0, re.length() - 1);
}
Automaton automaton = new RegExp(re).toAutomaton();
prefixBegin = automaton.getCommonPrefix();
prefixLen = prefixBegin.length();
if (0 < prefixLen) {
char max = (char) (prefixBegin.charAt(prefixLen - 1) + 1);
prefixEnd = prefixBegin.substring(0, prefixLen - 1) + max;
prefixOnly = re.equals(prefixBegin + ".*");
} else {
prefixEnd = "";
prefixOnly = false;
}
pattern = prefixOnly ? null : new RunAutomaton(automaton);
}
@Override
public boolean match(ChangeData object) throws OrmException {
List<String> files = object.currentFilePaths();
if (files != null) {
int begin, end;
if (0 < prefixLen) {
begin = find(files, prefixBegin);
end = find(files, prefixEnd);
} else {
begin = 0;
end = files.size();
}
if (prefixOnly) {
return begin < end;
}
while (begin < end) {
if (pattern.run(files.get(begin++))) {
return true;
}
}
return false;
return RegexListSearcher.ofStrings(getValue()).hasMatch(files);
} else {
// The ChangeData can't do expensive lookups right now. Bypass
// them and include the result anyway. We might be able to do
@@ -95,11 +40,6 @@ class RegexPathPredicate extends RegexPredicate<ChangeData> {
}
}
private static int find(List<String> files, String p) {
int r = Collections.binarySearch(files, p);
return r < 0 ? -(r + 1) : r;
}
@Override
public int getCost() {
return 1;

View File

@@ -0,0 +1,112 @@
// Copyright (C) 2014 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.gerrit.server.util;
import static com.google.common.base.Preconditions.checkNotNull;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.primitives.Chars;
import dk.brics.automaton.Automaton;
import dk.brics.automaton.RegExp;
import dk.brics.automaton.RunAutomaton;
import java.util.Collections;
import java.util.List;
/**
 * Helper to search sorted lists for elements matching a regex.
 * <p>
 * Subclasses implement {@link #apply(Object)} to map each list element to the
 * string that the regex is matched against. Lists passed to
 * {@link #search(List)} and {@link #hasMatch(List)} must be sorted in
 * ascending natural order of those strings, because the common prefix of the
 * regex (if any) is located with binary search before the automaton is run.
 */
public abstract class RegexListSearcher<T> implements Function<T, String> {
  /**
   * Create a searcher over plain strings, where each element is its own key.
   *
   * @param re regular expression; see {@link #RegexListSearcher(String)}.
   * @return searcher matching {@code re} against the list elements themselves.
   */
  public static RegexListSearcher<String> ofStrings(String re) {
    return new RegexListSearcher<String>(re) {
      @Override
      public String apply(String in) {
        return in;
      }
    };
  }

  /** Compiled matcher, or null when a prefix range check alone is enough. */
  private final RunAutomaton pattern;

  /** Longest prefix shared by every string the regex accepts; may be empty. */
  private final String prefixBegin;

  /** Smallest string sorting after all strings starting with prefixBegin. */
  private final String prefixEnd;

  private final int prefixLen;

  /** True when the regex is exactly {@code prefixBegin + ".*"}. */
  private final boolean prefixOnly;

  /**
   * @param re regular expression. A leading {@code ^} and an unescaped
   *     trailing {@code $} are removed before compiling, since the automaton
   *     always matches against the whole string.
   * @throws IllegalArgumentException if the expression cannot be parsed, or
   *     if the last character of its common prefix cannot be incremented.
   */
  public RegexListSearcher(String re) {
    re = stripAnchors(re);

    Automaton automaton = new RegExp(re).toAutomaton();
    prefixBegin = automaton.getCommonPrefix();
    prefixLen = prefixBegin.length();

    if (0 < prefixLen) {
      // Bump the last character of the prefix to get an exclusive upper bound
      // for the range of elements that can start with the prefix.
      char max = Chars.checkedCast(prefixBegin.charAt(prefixLen - 1) + 1);
      prefixEnd = prefixBegin.substring(0, prefixLen - 1) + max;
      prefixOnly = re.equals(prefixBegin + ".*");
    } else {
      prefixEnd = "";
      prefixOnly = false;
    }

    pattern = prefixOnly ? null : new RunAutomaton(automaton);
  }

  /**
   * Find all elements whose string form matches the regex.
   *
   * @param list elements sorted in ascending order of {@link #apply(Object)}.
   * @return the matching elements, in list order; backed by {@code list}
   *     rather than copied.
   * @throws NullPointerException if {@code list} is null.
   */
  public Iterable<T> search(List<T> list) {
    checkNotNull(list);
    int begin, end;

    if (0 < prefixLen) {
      // Assumes many consecutive elements may have the same prefix, so the cost
      // of two binary searches is less than iterating to find the endpoints.
      begin = find(list, prefixBegin);
      end = find(list, prefixEnd);
    } else {
      begin = 0;
      end = list.size();
    }

    if (prefixOnly) {
      // Everything in the prefix range matches; no need to run the automaton.
      return begin < end ? list.subList(begin, end) : ImmutableList.<T> of();
    }

    return Iterables.filter(
        list.subList(begin, end),
        new Predicate<T>() {
          @Override
          public boolean apply(T in) {
            return pattern.run(RegexListSearcher.this.apply(in));
          }
        });
  }

  /**
   * @param list elements sorted in ascending order of {@link #apply(Object)}.
   * @return whether at least one element matches the regex.
   */
  public boolean hasMatch(List<T> list) {
    return !Iterables.isEmpty(search(list));
  }

  /**
   * Binary search for {@code p} among the string forms of {@code list}.
   *
   * @return index of {@code p} if present, otherwise its insertion point.
   */
  private int find(List<T> list, String p) {
    int r = Collections.binarySearch(Lists.transform(list, this), p);
    return r < 0 ? -(r + 1) : r;
  }

  /**
   * Remove a leading {@code ^} and an unescaped trailing {@code $}.
   * <p>
   * A trailing {@code $} is escaped only when preceded by an odd number of
   * backslashes; an even number means the backslashes escape each other and
   * the {@code $} is an anchor.
   */
  private static String stripAnchors(String re) {
    if (re.startsWith("^")) {
      re = re.substring(1);
    }
    if (re.endsWith("$")) {
      int backslashes = 0;
      for (int i = re.length() - 2; i >= 0 && re.charAt(i) == '\\'; i--) {
        backslashes++;
      }
      if (backslashes % 2 == 0) {
        re = re.substring(0, re.length() - 1);
      }
    }
    return re;
  }
}

View File

@@ -0,0 +1,84 @@
// Copyright (C) 2014 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.gerrit.server.util;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Ordering;
import org.junit.Test;
import java.util.List;
/** Tests for {@link RegexListSearcher} over sorted lists of strings. */
public class RegexListSearcherTest {
  private static final List<String> EMPTY = ImmutableList.of();

  /** Searching an empty list yields no results. */
  @Test
  public void emptyList() {
    assertSearchReturns(EMPTY, "pat", EMPTY);
  }

  /** hasMatch reports whether any element matches. */
  @Test
  public void hasMatch() {
    List<String> list = ImmutableList.of("bar", "foo", "quux");
    assertTrue(RegexListSearcher.ofStrings("foo").hasMatch(list));
    assertFalse(RegexListSearcher.ofStrings("xyz").hasMatch(list));
  }

  /** Leading/trailing anchors are optional; an escaped "$" is a literal. */
  @Test
  public void anchors() {
    List<String> list = ImmutableList.of("foo");
    assertSearchReturns(list, "^f.*", list);
    assertSearchReturns(list, "^f.*o$", list);
    assertSearchReturns(list, "f.*o$", list);
    // Same pattern with no anchors at all.
    assertSearchReturns(list, "f.*o", list);
    assertSearchReturns(EMPTY, "^.*\\$", list);
  }

  /** Patterns whose matches are not confined to one contiguous range. */
  @Test
  public void noCommonPrefix() {
    List<String> list = ImmutableList.of("bar", "foo", "quux");
    assertSearchReturns(ImmutableList.of("foo"), "f.*", list);
    assertSearchReturns(ImmutableList.of("foo"), ".*o.*", list);
    assertSearchReturns(ImmutableList.of("bar", "foo", "quux"), ".*[aou].*",
        list);
  }

  /** Patterns with a literal prefix shared by several adjacent elements. */
  @Test
  public void commonPrefix() {
    List<String> list = ImmutableList.of(
        "bar",
        "baz",
        "foo1",
        "foo2",
        "foo3",
        "quux");
    assertSearchReturns(ImmutableList.of("bar", "baz"), "b.*", list);
    assertSearchReturns(ImmutableList.of("foo1", "foo2"), "foo[12]", list);
    assertSearchReturns(ImmutableList.of("foo1", "foo2", "foo3"), "foo.*",
        list);
    assertSearchReturns(ImmutableList.of("quux"), "q.*", list);
  }

  /**
   * Assert that searching {@code inputs} for {@code re} returns exactly
   * {@code expected}, after checking that {@code inputs} is sorted.
   */
  private void assertSearchReturns(List<?> expected, String re,
      List<String> inputs) {
    assertTrue(Ordering.natural().isOrdered(inputs));
    assertEquals(expected,
        ImmutableList.copyOf(RegexListSearcher.ofStrings(re).search(inputs)));
  }
}