Files
gerrit/lib/asciidoctor/java/DocIndexer.java
Dave Borowitz 70aaf7eaa7 Update Lucene to 5.0.0
Use this version starting at the existing schema version 15 (which has
been in master for only a week or so).

Notable changes:

 - Can use the new IndexWriter#setCommitOnClose(boolean) method to
   simplify closing an index.

 - This means we no longer need to pass Version into the
   IndexWriterConfig constructor. According to [1], this was _only_ used to
   determine whether or not the index should be committed on close, as
   this behavior differed between versions. No more mapping schema
   versions to Lucene versions!

 - IndexWriters are now forced to use their configured Analyzer,
   removing the methods taking an Analyzer (which we weren't using
   anyway). This saves some code in AutoCommitWriter.

 - Lucene 5 cannot read indexes created by older versions without an
   additional jar in the classpath, so we need to add that.

The most annoying change is that sorting cannot be done on normal
numeric fields by default anymore[2]. This was inefficient anyway, as
Lucene had to seek and read all index field values before doing the
sorting. Switch to the newer DocValues API for strongly-typed sortable
fields. This introduces some medium-term ugliness as the sort spec
changes depending on the schema version.

Unfortunately we can only use DocValues on new index versions; older
versions need to use the new UninvertingReader API, which provides
FieldCache based sorting without a reindex. An overzealous check in a
static method in Lucene[3] means we need to temporarily fork
SearcherManager.java from Lucene in order to get this to work with the
NRT machinery.

Since we have to jump through significant hoops to get older index
versions readable by this version of Lucene, add a test specifically
for schema v14.

[1] https://issues.apache.org/jira/browse/LUCENE-5871
[2] https://issues.apache.org/jira/browse/LUCENE-5666
[3] https://issues.apache.org/jira/browse/LUCENE-6370

Change-Id: I843be2fb697779fc741e25459a2716280b2bd0b6
2015-03-25 12:21:59 -07:00

171 lines
5.7 KiB
Java

// Copyright (C) 2013 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import com.google.gerrit.server.documentation.Constants;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RAMDirectory;
import org.kohsuke.args4j.Argument;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.jar.JarEntry;
import java.util.jar.JarOutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
/**
 * Command-line tool that builds a Lucene index over a set of documentation
 * source files and packages the zipped index into a JAR.
 *
 * <p>Each non-empty input file becomes one Lucene document holding its body
 * text, its title, and the URL of the corresponding output file. The
 * in-memory index is zipped and stored as a single JAR entry named
 * {@code Constants.PACKAGE + "/" + Constants.INDEX_ZIP}.
 */
public class DocIndexer {
  /** Matches an AsciiDoc section header such as {@code "= Title"}; group 1 is the title text. */
  private static final Pattern SECTION_HEADER = Pattern.compile("^=+ (.*)");

  @Option(name = "-o", usage = "output JAR file")
  private String outFile;

  @Option(name = "--prefix", usage = "prefix for the html filepath")
  private String prefix = "";

  @Option(name = "--in-ext", usage = "extension for input files")
  private String inExt = ".txt";

  @Option(name = "--out-ext", usage = "extension for output files")
  private String outExt = ".html";

  @Argument(usage = "input files")
  private List<String> inputFiles = new ArrayList<>();

  /**
   * Parses the command line, builds the index and writes the output JAR.
   *
   * <p>On a command-line error (including a missing input file or a missing
   * {@code -o} option) the message and usage are printed to stderr and the
   * JVM exits with status 1.
   *
   * @param parameters raw command-line arguments.
   * @throws IOException if indexing or writing the JAR fails.
   */
  private void invoke(String... parameters) throws IOException {
    CmdLineParser parser = new CmdLineParser(this);
    try {
      parser.parseArgument(parameters);
      if (inputFiles.isEmpty()) {
        throw new CmdLineException(parser, "FAILED: input file missing");
      }
      // Without this check a missing -o surfaces later as a bare
      // NullPointerException from FileOutputStream.
      if (outFile == null) {
        throw new CmdLineException(parser, "FAILED: output file missing");
      }
    } catch (CmdLineException e) {
      System.err.println(e.getMessage());
      parser.printUsage(System.err);
      System.exit(1);
      return;
    }

    byte[] compressedIndex = zip(index());
    JarEntry entry = new JarEntry(
        String.format("%s/%s", Constants.PACKAGE, Constants.INDEX_ZIP));
    entry.setSize(compressedIndex.length);

    // try-with-resources so the output file handle is released even when
    // writing the entry fails.
    try (FileOutputStream out = new FileOutputStream(outFile);
        JarOutputStream jar = new JarOutputStream(out)) {
      jar.putNextEntry(entry);
      jar.write(compressedIndex);
      jar.closeEntry();
    }
  }

  /**
   * Builds an in-memory Lucene index containing one document per non-empty
   * input file.
   *
   * @return the directory holding the committed index.
   * @throws IOException if an input file cannot be read or indexing fails.
   */
  private RAMDirectory index() throws IOException,
      UnsupportedEncodingException, FileNotFoundException {
    RAMDirectory directory = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(
        new StandardAnalyzer(CharArraySet.EMPTY_SET));
    config.setOpenMode(OpenMode.CREATE);
    config.setCommitOnClose(true);

    try (IndexWriter iwriter = new IndexWriter(directory, config)) {
      for (String inputFile : inputFiles) {
        File file = new File(inputFile);
        if (file.length() == 0) {
          continue;
        }

        String title = readTitle(file);
        String outputFile = AsciiDoctor.mapInFileToOutFile(
            inputFile, inExt, outExt);

        // Read the body with the same explicit UTF-8 decoding used for the
        // title instead of the platform default charset.
        try (FileInputStream in = new FileInputStream(file);
            BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, "UTF-8"))) {
          Document doc = new Document();
          doc.add(new TextField(Constants.DOC_FIELD, reader));
          doc.add(new StringField(
              Constants.URL_FIELD, prefix + outputFile, Field.Store.YES));
          doc.add(new TextField(Constants.TITLE_FIELD, title, Field.Store.YES));
          // The reader is consumed inside addDocument, so it must stay open
          // until this call returns.
          iwriter.addDocument(doc);
        }
      }
    }
    return directory;
  }

  /**
   * Extracts the title of a documentation source file.
   *
   * <p>Generally the first line of the file is the title. In a few cases the
   * first line is a "[[tag]]" and the second line is the title. A leading
   * run of {@code =} characters followed by a space is stripped. If no title
   * line exists (for example the file consists only of a tag line), the
   * file's name is used so that indexing does not fail on a null title.
   *
   * @param file the input file, read as UTF-8.
   * @return the title text, never null.
   * @throws IOException if the file cannot be read.
   */
  private static String readTitle(File file) throws IOException {
    String title;
    try (FileInputStream in = new FileInputStream(file);
        BufferedReader titleReader = new BufferedReader(
            new InputStreamReader(in, "UTF-8"))) {
      title = titleReader.readLine();
      if (title != null && title.startsWith("[[")) {
        title = titleReader.readLine();
      }
    }
    if (title == null) {
      return file.getName();
    }
    Matcher matcher = SECTION_HEADER.matcher(title);
    return matcher.matches() ? matcher.group(1) : title;
  }

  /**
   * Packs every file of the given directory into an in-memory ZIP archive.
   *
   * @param dir the directory whose files are archived, one ZIP entry per file.
   * @return the bytes of the ZIP archive.
   * @throws IOException if a file cannot be read, is too large to buffer in
   *     a single array, or the archive cannot be written.
   */
  private byte[] zip(RAMDirectory dir) throws IOException {
    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    try (ZipOutputStream zip = new ZipOutputStream(buf)) {
      for (String name : dir.listAll()) {
        // NOTE(review): a null IOContext is passed, as before; confirm that
        // RAMDirectory ignores the context argument.
        try (IndexInput in = dir.openInput(name, null)) {
          long length = in.length();
          if (length > Integer.MAX_VALUE) {
            // A plain (int) cast would silently truncate the length.
            throw new IOException("index file too large to zip: " + name);
          }
          int len = (int) length;
          byte[] tmp = new byte[len];
          ZipEntry entry = new ZipEntry(name);
          entry.setSize(len);
          in.readBytes(tmp, 0, len);
          zip.putNextEntry(entry);
          zip.write(tmp, 0, len);
          zip.closeEntry();
        }
      }
    }
    // The archive is only complete once the ZipOutputStream has been closed.
    return buf.toByteArray();
  }

  /**
   * Program entry point; prints the error message and exits with status 1
   * if an I/O error occurs.
   *
   * @param args command-line arguments.
   */
  public static void main(String[] args) {
    try {
      new DocIndexer().invoke(args);
    } catch (IOException e) {
      System.err.println(e.getMessage());
      System.exit(1);
    }
  }
}