| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.benchmark.byTask.feeds; |
| |
import java.io.IOException;
import java.nio.file.Path;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
| |
| /** |
| * Parser for trec doc content, invoked on doc text excluding <DOC> and <DOCNO> which |
| * are handled in TrecContentSource. Required to be stateless and hence thread safe. |
| */ |
| public abstract class TrecDocParser { |
| |
| /** Types of trec parse paths, */ |
| public enum ParsePathType { |
| GOV2, |
| FBIS, |
| FT, |
| FR94, |
| LATIMES |
| } |
| |
| /** trec parser type used for unknown extensions */ |
| public static final ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2; |
| |
| static final Map<ParsePathType, TrecDocParser> pathType2parser = new HashMap<>(); |
| |
| static { |
| pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser()); |
| pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser()); |
| pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser()); |
| pathType2parser.put(ParsePathType.FT, new TrecFTParser()); |
| pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser()); |
| } |
| |
| static final Map<String, ParsePathType> pathName2Type = new HashMap<>(); |
| |
| static { |
| for (ParsePathType ppt : ParsePathType.values()) { |
| pathName2Type.put(ppt.name().toUpperCase(Locale.ROOT), ppt); |
| } |
| } |
| |
| /** max length of walk up from file to its ancestors when looking for a known path type */ |
| private static final int MAX_PATH_LENGTH = 10; |
| |
| /** Compute the path type of a file by inspecting name of file and its parents */ |
| public static ParsePathType pathType(Path f) { |
| int pathLength = 0; |
| while (f != null && f.getFileName() != null && ++pathLength < MAX_PATH_LENGTH) { |
| ParsePathType ppt = pathName2Type.get(f.getFileName().toString().toUpperCase(Locale.ROOT)); |
| if (ppt != null) { |
| return ppt; |
| } |
| f = f.getParent(); |
| } |
| return DEFAULT_PATH_TYPE; |
| } |
| |
| /** |
| * parse the text prepared in docBuf into a result DocData, no synchronization is required. |
| * |
| * @param docData reusable result |
| * @param name name that should be set to the result |
| * @param trecSrc calling trec content source |
| * @param docBuf text to parse |
| * @param pathType type of parsed file, or null if unknown - may be used by parsers to alter their |
| * behavior according to the file path type. |
| */ |
| public abstract DocData parse( |
| DocData docData, |
| String name, |
| TrecContentSource trecSrc, |
| StringBuilder docBuf, |
| ParsePathType pathType) |
| throws IOException; |
| |
| /** |
| * strip tags from <code>buf</code>: each tag is replaced by a single blank. |
| * |
| * @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is |
| * unmodified). |
| */ |
| public static String stripTags(StringBuilder buf, int start) { |
| return stripTags(buf.substring(start), 0); |
| } |
| |
| /** |
| * strip tags from input. |
| * |
| * @see #stripTags(StringBuilder, int) |
| */ |
| public static String stripTags(String buf, int start) { |
| if (start > 0) { |
| buf = buf.substring(start); |
| } |
| return buf.replaceAll("<[^>]*>", " "); |
| } |
| |
| /** |
| * Extract from <code>buf</code> the text of interest within specified tags |
| * |
| * @param buf entire input text |
| * @param startTag tag marking start of text of interest |
| * @param endTag tag marking end of text of interest |
| * @param maxPos if ≥ 0 sets a limit on start of text of interest |
| * @return text of interest or null if not found |
| */ |
| public static String extract( |
| StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) { |
| int k1 = buf.indexOf(startTag); |
| if (k1 >= 0 && (maxPos < 0 || k1 < maxPos)) { |
| k1 += startTag.length(); |
| int k2 = buf.indexOf(endTag, k1); |
| if (k2 >= 0 && (maxPos < 0 || k2 < maxPos)) { // found end tag with allowed range |
| if (noisePrefixes != null) { |
| for (String noise : noisePrefixes) { |
| int k1a = buf.indexOf(noise, k1); |
| if (k1a >= 0 && k1a < k2) { |
| k1 = k1a + noise.length(); |
| } |
| } |
| } |
| return buf.substring(k1, k2).trim(); |
| } |
| } |
| return null; |
| } |
| |
| // public static void main(String[] args) { |
| // System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one |
| // more space>?",0)); |
| // } |
| |
| } |