lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.benchmark.utils;


 import java.io.IOException;
 import java.io.Writer;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.Properties;

 import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource;
 import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.util.IOUtils;

 /**
  * Extract the downloaded Wikipedia dump into separate files for indexing.
  *
  * <p>Every document produced by the configured {@link DocMaker} is written to its own
  * {@code <id>.txt} file. Files are spread over nested sub-directories of the output
  * directory, chosen from a running document counter (see {@link #directory(int, Path)}).
  */
 public class ExtractWikipedia {

   /** Root directory under which the extracted files are written. */
   private final Path outputDir;

   /** Number of documents written so far; static, hence shared by all instances. */
   static public int count = 0;

   /** Radix used to split the document counter into directory levels. */
   static final int BASE = 10;

   /** Source of the documents to extract. */
   protected DocMaker docMaker;

   /**
    * Creates an extractor and removes {@code outputDir} through {@link IOUtils#rm}.
    *
    * @param docMaker  supplies the documents to write out
    * @param outputDir root directory for the extracted files
    * @throws IOException if removing {@code outputDir} fails
    */
   public ExtractWikipedia(DocMaker docMaker, Path outputDir) throws IOException {
     this.outputDir = outputDir;
     this.docMaker = docMaker;
     System.out.println("Deleting all files in " + outputDir);
     IOUtils.rm(outputDir);
   }

   /**
    * Computes the directory that the document with the given ordinal belongs to.
    *
    * <p>For every decimal digit of {@code count} above the units digit, two path elements
    * are appended: the power of ten of that digit and the digit itself. For example,
    * {@code 123} maps to {@code <root>/100/1/10/2}; values below {@link #BASE} map to the
    * root itself.
    *
    * @param count     ordinal of the document
    * @param directory directory to start from, or {@code null} to start at the output root
    * @return the directory for the document; this method does not create it
    */
   public Path directory(int count, Path directory) {
     if (directory == null) {
       directory = outputDir;
     }
     if (count < BASE) {
       return directory;
     }
     // Use a long: the first power of BASE above count does not fit in an int once
     // count reaches 1_000_000_000, and an int would silently wrap around.
     long base = BASE;
     while (base <= count) {
       base *= BASE;
     }
     // Largest power of BASE that is <= count; always fits in an int.
     int divisor = (int) (base / BASE);
     directory = directory.resolve(Integer.toString(divisor));
     directory = directory.resolve(Integer.toString(count / divisor));
     // Recurse on the remaining lower-order digits.
     return directory(count % divisor, directory);
   }

   /**
    * Writes one document to {@code <id>.txt} in the directory selected by the current
    * value of {@link #count}, then advances the counter.
    *
    * <p>The file is UTF-8 encoded and contains the time, the title and the body, the
    * first two each followed by a blank line.
    *
    * @param id    document id, used as the file name
    * @param title document title
    * @param time  document date
    * @param body  document body
    * @throws IOException if the directory or the file cannot be written
    */
   public void create(String id, String title, String time, String body) throws IOException {
     Path d = directory(count++, null);
     Files.createDirectories(d);
     Path f = d.resolve(id + ".txt");

     // Built through a StringBuilder so that a null field is written as "null"
     // rather than failing inside Writer.write(String).
     StringBuilder contents = new StringBuilder();
     contents.append(time).append("\n\n");
     contents.append(title).append("\n\n");
     contents.append(body).append("\n");

     try (Writer writer = Files.newBufferedWriter(f, StandardCharsets.UTF_8)) {
       writer.write(contents.toString());
     }
   }

   /**
    * Writes every document supplied by the doc maker to disk and prints the elapsed time.
    *
    * @throws Exception if producing or writing a document fails
    */
   public void extract() throws Exception {
     System.out.println("Starting Extraction");
     long start = System.currentTimeMillis();
     try {
       Document doc;
       while ((doc = docMaker.makeDocument()) != null) {
         create(doc.get(DocMaker.ID_FIELD), doc.get(DocMaker.TITLE_FIELD),
             doc.get(DocMaker.DATE_FIELD), doc.get(DocMaker.BODY_FIELD));
       }
     } catch (NoMoreDataException ignored) {
       // Deliberately ignored: treated as the normal end of the input.
     }
     long finish = System.currentTimeMillis();
     System.out.println("Extraction took " + (finish - start) + " ms");
   }

   /**
    * Command-line entry point; see {@link #printUsage()} for the accepted options.
    *
    * <p>Usage is printed, and nothing is extracted, when the input option is absent, when
    * an option that needs a value is the last argument, or when the input file does not
    * exist.
    *
    * @param args command-line arguments
    * @throws Exception if the extraction fails
    */
   public static void main(String[] args) throws Exception {

     Path wikipedia = null;
     Path outputDir = Paths.get("enwiki");
     boolean keepImageOnlyDocs = true;
     for (int i = 0; i < args.length; i++) {
       String arg = args[i];
       boolean isInput = arg.equals("--input") || arg.equals("-i");
       boolean isOutput = arg.equals("--output") || arg.equals("-o");
       if (isInput || isOutput) {
         // Guard against the option being the last argument.
         if (i + 1 >= args.length) {
           System.err.println("Missing value for option " + arg);
           printUsage();
           return;
         }
         Path value = Paths.get(args[++i]);
         if (isInput) {
           wikipedia = value;
         } else {
           outputDir = value;
         }
       } else if (arg.equals("--discardImageOnlyDocs") || arg.equals("-d")) {
         keepImageOnlyDocs = false;
       }
     }

     // Validate the input before it is dereferenced or handed to the content source.
     if (wikipedia == null || !Files.exists(wikipedia)) {
       printUsage();
       return;
     }

     Properties properties = new Properties();
     properties.setProperty("docs.file", wikipedia.toAbsolutePath().toString());
     properties.setProperty("content.source.forever", "false");
     properties.setProperty("keep.image.only.docs", String.valueOf(keepImageOnlyDocs));
     Config config = new Config(properties);

     ContentSource source = new EnwikiContentSource();
     source.setConfig(config);

     DocMaker docMaker = new DocMaker();
     docMaker.setConfig(config, source);
     docMaker.resetInputs();

     System.out.println("Extracting Wikipedia to: " + outputDir + " using EnwikiContentSource");
     Files.createDirectories(outputDir);
     ExtractWikipedia extractor = new ExtractWikipedia(docMaker, outputDir);
     extractor.extract();
   }

   /** Prints the command-line synopsis to standard error. */
   private static void printUsage() {
     System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia --input|-i <Path to Wikipedia XML file> " +
             "[--output|-o <Output Path>] [--discardImageOnlyDocs|-d]");
     System.err.println("--discardImageOnlyDocs tells the extractor to skip Wiki docs that contain only images");
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.benchmark.utils;


	import java.io.IOException;
	import java.io.Writer;
	import java.nio.charset.StandardCharsets;
	import java.nio.file.Files;
	import java.nio.file.Path;
	import java.nio.file.Paths;
	import java.util.Properties;

	import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
	import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
	import org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource;
	import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
	import org.apache.lucene.benchmark.byTask.utils.Config;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.util.IOUtils;

	/**
	* Extract the downloaded Wikipedia dump into separate files for indexing.
	*/
	public class ExtractWikipedia {

	private Path outputDir;

	static public int count = 0;

	static final int BASE = 10;
	protected DocMaker docMaker;

	public ExtractWikipedia(DocMaker docMaker, Path outputDir) throws IOException {
	this.outputDir = outputDir;
	this.docMaker = docMaker;
	System.out.println("Deleting all files in " + outputDir);
	IOUtils.rm(outputDir);
	}

	public Path directory(int count, Path directory) {
	if (directory == null) {
	directory = outputDir;
	}
	int base = BASE;
	while (base <= count) {
	base *= BASE;
	}
	if (count < BASE) {
	return directory;
	}
	directory = directory.resolve(Integer.toString(base / BASE));
	directory = directory.resolve(Integer.toString(count / (base / BASE)));
	return directory(count % (base / BASE), directory);
	}

	public void create(String id, String title, String time, String body) throws IOException {

	Path d = directory(count++, null);
	Files.createDirectories(d);
	Path f = d.resolve(id + ".txt");

	StringBuilder contents = new StringBuilder();

	contents.append(time);
	contents.append("\n\n");
	contents.append(title);
	contents.append("\n\n");
	contents.append(body);
	contents.append("\n");

	try (Writer writer = Files.newBufferedWriter(f, StandardCharsets.UTF_8)) {
	writer.write(contents.toString());
	}
	}

	public void extract() throws Exception {
	Document doc = null;
	System.out.println("Starting Extraction");
	long start = System.currentTimeMillis();
	try {
	while ((doc = docMaker.makeDocument()) != null) {
	create(doc.get(DocMaker.ID_FIELD), doc.get(DocMaker.TITLE_FIELD), doc
	.get(DocMaker.DATE_FIELD), doc.get(DocMaker.BODY_FIELD));
	}
	} catch (NoMoreDataException e) {
	//continue
	}
	long finish = System.currentTimeMillis();
	System.out.println("Extraction took " + (finish - start) + " ms");
	}

	public static void main(String[] args) throws Exception {

	Path wikipedia = null;
	Path outputDir = Paths.get("enwiki");
	boolean keepImageOnlyDocs = true;
	for (int i = 0; i < args.length; i++) {
	String arg = args[i];
	if (arg.equals("--input") \|\| arg.equals("-i")) {
	wikipedia = Paths.get(args[i + 1]);
	i++;
	} else if (arg.equals("--output") \|\| arg.equals("-o")) {
	outputDir = Paths.get(args[i + 1]);
	i++;
	} else if (arg.equals("--discardImageOnlyDocs") \|\| arg.equals("-d")) {
	keepImageOnlyDocs = false;
	}
	}

	Properties properties = new Properties();
	properties.setProperty("docs.file", wikipedia.toAbsolutePath().toString());
	properties.setProperty("content.source.forever", "false");
	properties.setProperty("keep.image.only.docs", String.valueOf(keepImageOnlyDocs));
	Config config = new Config(properties);

	ContentSource source = new EnwikiContentSource();
	source.setConfig(config);

	DocMaker docMaker = new DocMaker();
	docMaker.setConfig(config, source);
	docMaker.resetInputs();
	if (Files.exists(wikipedia)) {
	System.out.println("Extracting Wikipedia to: " + outputDir + " using EnwikiContentSource");
	Files.createDirectories(outputDir);
	ExtractWikipedia extractor = new ExtractWikipedia(docMaker, outputDir);
	extractor.extract();
	} else {
	printUsage();
	}
	}

	private static void printUsage() {
	System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia --input\|-i <Path to Wikipedia XML file> " +
	"[--output\|-o <Output Path>] [--discardImageOnlyDocs\|-d]");
	System.err.println("--discardImageOnlyDocs tells the extractor to skip Wiki docs that contain only images");
	}

	}