| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.benchmark.utils; |
| |
| import java.io.BufferedReader; |
| import java.io.BufferedWriter; |
| import java.io.IOException; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.file.DirectoryStream; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.nio.file.Paths; |
| import java.nio.file.StandardCopyOption; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| import org.apache.lucene.util.IOUtils; |
| |
| /** |
| * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body |
| */ |
| public class ExtractReuters { |
| private Path reutersDir; |
| private Path outputDir; |
| |
| public ExtractReuters(Path reutersDir, Path outputDir) throws IOException { |
| this.reutersDir = reutersDir; |
| this.outputDir = outputDir; |
| System.out.println("Deleting all files in " + outputDir); |
| IOUtils.rm(outputDir); |
| } |
| |
| public void extract() throws IOException { |
| long count = 0; |
| Files.createDirectories(outputDir); |
| try (DirectoryStream<Path> stream = Files.newDirectoryStream(reutersDir, "*.sgm")) { |
| for (Path sgmFile : stream) { |
| extractFile(sgmFile); |
| count++; |
| } |
| } |
| if (count == 0) { |
| System.err.println("No .sgm files in " + reutersDir); |
| } |
| } |
| |
| Pattern EXTRACTION_PATTERN = |
| Pattern.compile("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>"); |
| |
| private static String[] META_CHARS = {"&", "<", ">", "\"", "'"}; |
| |
| private static String[] META_CHARS_SERIALIZATIONS = {"&", "<", ">", """, "'"}; |
| |
| /** Override if you wish to change what is extracted */ |
| protected void extractFile(Path sgmFile) { |
| try (BufferedReader reader = Files.newBufferedReader(sgmFile, StandardCharsets.ISO_8859_1)) { |
| StringBuilder buffer = new StringBuilder(1024); |
| StringBuilder outBuffer = new StringBuilder(1024); |
| |
| String line = null; |
| int docNumber = 0; |
| while ((line = reader.readLine()) != null) { |
| // when we see a closing reuters tag, flush the file |
| |
| if (line.indexOf("</REUTERS") == -1) { |
| // Replace the SGM escape sequences |
| |
| buffer.append(line).append(' '); // accumulate the strings for now, |
| // then apply regular expression to |
| // get the pieces, |
| } else { |
| // Extract the relevant pieces and write to a file in the output dir |
| Matcher matcher = EXTRACTION_PATTERN.matcher(buffer); |
| while (matcher.find()) { |
| for (int i = 1; i <= matcher.groupCount(); i++) { |
| if (matcher.group(i) != null) { |
| outBuffer.append(matcher.group(i)); |
| } |
| } |
| outBuffer.append(System.lineSeparator()).append(System.lineSeparator()); |
| } |
| String out = outBuffer.toString(); |
| for (int i = 0; i < META_CHARS_SERIALIZATIONS.length; i++) { |
| out = out.replaceAll(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]); |
| } |
| Path outFile = outputDir.resolve(sgmFile.getFileName() + "-" + (docNumber++) + ".txt"); |
| // System.out.println("Writing " + outFile); |
| try (BufferedWriter writer = Files.newBufferedWriter(outFile, StandardCharsets.UTF_8)) { |
| writer.write(out); |
| } |
| outBuffer.setLength(0); |
| buffer.setLength(0); |
| } |
| } |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| |
| public static void main(String[] args) throws Exception { |
| if (args.length != 2) { |
| usage("Wrong number of arguments (" + args.length + ")"); |
| return; |
| } |
| Path reutersDir = Paths.get(args[0]); |
| if (!Files.exists(reutersDir)) { |
| usage("Cannot find Path to Reuters SGM files (" + reutersDir + ")"); |
| return; |
| } |
| |
| // First, extract to a tmp directory and only if everything succeeds, rename |
| // to output directory. |
| Path outputDir = Paths.get(args[1] + "-tmp"); |
| Files.createDirectories(outputDir); |
| ExtractReuters extractor = new ExtractReuters(reutersDir, outputDir); |
| extractor.extract(); |
| // Now rename to requested output dir |
| Files.move(outputDir, Paths.get(args[1]), StandardCopyOption.ATOMIC_MOVE); |
| } |
| |
| private static void usage(String msg) { |
| System.err.println( |
| "Usage: " |
| + msg |
| + " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>"); |
| } |
| } |