blob: 4114364e4d09a88f5c7d201cd00cd254216a1ac5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask.feeds;
import java.io.IOException;
import java.nio.file.Path;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * Parser for trec doc content, invoked on doc text excluding &lt;DOC&gt; and &lt;DOCNO&gt; which
 * are handled in TrecContentSource. Required to be stateless and hence thread safe.
 */
public abstract class TrecDocParser {

  /** Types of trec parse paths. */
  public enum ParsePathType {
    GOV2,
    FBIS,
    FT,
    FR94,
    LATIMES
  }

  /**
   * Matches one tag: a {@code '<'}, any run of characters other than {@code '>'}, and the closing
   * {@code '>'}. Compiled once and declared ahead of the static initializers below, which
   * instantiate subclasses, so that it is already set if any of that code strips tags.
   */
  private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

  /** trec parser type used for unknown extensions */
  public static final ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2;

  /** Parser instance registered for each path type; filled once by the static initializer. */
  static final Map<ParsePathType, TrecDocParser> pathType2parser =
      new EnumMap<>(ParsePathType.class);

  static {
    pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser());
    pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser());
    pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser());
    pathType2parser.put(ParsePathType.FT, new TrecFTParser());
    pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser());
  }

  /** Maps the upper-cased name of each {@link ParsePathType} constant to that constant. */
  static final Map<String, ParsePathType> pathName2Type = new HashMap<>();

  static {
    for (ParsePathType ppt : ParsePathType.values()) {
      pathName2Type.put(ppt.name().toUpperCase(Locale.ROOT), ppt);
    }
  }

  /**
   * Bound on the walk up from a file to its ancestors when looking for a known path type. Because
   * {@link #pathType(Path)} pre-increments its counter before comparing, at most {@code
   * MAX_PATH_LENGTH - 1} names (the file's own name included) are inspected.
   */
  private static final int MAX_PATH_LENGTH = 10;

  /**
   * Compute the path type of a file by inspecting name of file and its parents.
   *
   * <p>Names are compared case-insensitively (upper-cased with {@link Locale#ROOT}) against the
   * {@link ParsePathType} constant names, starting at the file itself and moving towards the root.
   *
   * @param f file whose type is wanted; may be null
   * @return the type of the nearest matching name, or {@link #DEFAULT_PATH_TYPE} if {@code f} is
   *     null, no inspected name matches, or the walk bound is reached
   */
  public static ParsePathType pathType(Path f) {
    int pathLength = 0;
    // getFileName() is null for a path with no name elements (e.g. a root), which ends the walk.
    while (f != null && f.getFileName() != null && ++pathLength < MAX_PATH_LENGTH) {
      String upperName = f.getFileName().toString().toUpperCase(Locale.ROOT);
      ParsePathType ppt = pathName2Type.get(upperName);
      if (ppt != null) {
        return ppt;
      }
      f = f.getParent();
    }
    return DEFAULT_PATH_TYPE;
  }

  /**
   * parse the text prepared in docBuf into a result DocData, no synchronization is required.
   *
   * @param docData reusable result
   * @param name name that should be set to the result
   * @param trecSrc calling trec content source
   * @param docBuf text to parse
   * @param pathType type of parsed file, or null if unknown - may be used by parsers to alter their
   *     behavior according to the file path type.
   * @return the parsed document data (defined by the concrete parser)
   * @throws IOException declared for concrete parsers that may fail while parsing
   */
  public abstract DocData parse(
      DocData docData,
      String name,
      TrecContentSource trecSrc,
      StringBuilder docBuf,
      ParsePathType pathType)
      throws IOException;

  /**
   * strip tags from <code>buf</code>: each tag is replaced by a single blank.
   *
   * @param buf text to strip; it is copied, not modified
   * @param start index in <code>buf</code> of the first character to keep; passed to {@link
   *     StringBuilder#substring(int)}, so it must lie within <code>buf</code>
   * @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is
   *     unmodified).
   */
  public static String stripTags(StringBuilder buf, int start) {
    return stripTags(buf.substring(start), 0);
  }

  /**
   * strip tags from input: each tag is replaced by a single blank.
   *
   * <p>A tag runs from a {@code '<'} to the next {@code '>'}, so a {@code '<'} nested inside a tag
   * is swallowed with it while an unmatched {@code '>'} is kept. For example {@code is it true
   * that<space>2<<second space>><almost last space>1<one more space>?} becomes {@code is it true
   * that 2 > 1 ?}.
   *
   * @param buf text to strip
   * @param start index of the first character to keep; values &le; 0 keep the whole input
   * @return the text from <code>start</code> on, with every tag replaced by a blank
   * @see #stripTags(StringBuilder, int)
   */
  public static String stripTags(String buf, int start) {
    if (start > 0) {
      buf = buf.substring(start);
    }
    return TAG_PATTERN.matcher(buf).replaceAll(" ");
  }

  /**
   * Extract from <code>buf</code> the text of interest within specified tags
   *
   * @param buf entire input text
   * @param startTag tag marking start of text of interest
   * @param endTag tag marking end of text of interest
   * @param maxPos if &ge; 0, both the start tag and the end tag must begin before this position,
   *     otherwise nothing is extracted; if negative there is no limit
   * @param noisePrefixes strings to skip over at the beginning of the text of interest, or null
   *     for none. They are applied in array order; each one found entirely between the current
   *     start and the end tag moves the start to just after it.
   * @return trimmed text of interest or null if not found
   */
  public static String extract(
      StringBuilder buf, String startTag, String endTag, int maxPos, String[] noisePrefixes) {
    int k1 = buf.indexOf(startTag);
    if (k1 < 0 || (maxPos >= 0 && k1 >= maxPos)) {
      return null; // no start tag, or start tag beyond the allowed range
    }
    k1 += startTag.length();
    int k2 = buf.indexOf(endTag, k1);
    if (k2 < 0 || (maxPos >= 0 && k2 >= maxPos)) {
      return null; // no end tag, or end tag beyond the allowed range
    }
    if (noisePrefixes != null) {
      for (String noise : noisePrefixes) {
        int k1a = buf.indexOf(noise, k1);
        // Skip the prefix only when it ends at or before the end tag; a prefix that overlaps the
        // end tag would otherwise push the start past the end and make substring() throw.
        if (k1a >= 0 && k1a + noise.length() <= k2) {
          k1 = k1a + noise.length();
        }
      }
    }
    return buf.substring(k1, k2).trim();
  }
}