lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.benchmark.byTask.feeds;

 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;
 import javax.xml.XMLConstants;
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.ThreadInterruptedException;
 import org.xml.sax.Attributes;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.xml.sax.SAXNotRecognizedException;
 import org.xml.sax.SAXNotSupportedException;
 import org.xml.sax.helpers.DefaultHandler;

 /**
  * A {@link ContentSource} which reads the English Wikipedia dump. You can read the .bz2 file
  * directly (it will be decompressed on the fly). Config properties:
  *
  * <ul>
  *   <li>keep.image.only.docs=false|true (default <b>true</b>). When false, pages whose title
  *       starts with {@code Image:} are dropped.
  *   <li>docs.file=&lt;path to the file&gt;
  * </ul>
  *
  * <p>The XML is parsed by a background daemon thread which hands documents over one at a time;
  * pages whose text starts with {@code #redirect} (in any letter case) are skipped.
  */
 public class EnwikiContentSource extends ContentSource {

   private static final SAXParserFactory SAX_PARSER_FACTORY = SAXParserFactory.newDefaultInstance();

   static {
     try {
       // Turn on the JAXP secure-processing feature for every parser created by this factory.
       SAX_PARSER_FACTORY.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
     } catch (SAXNotRecognizedException
         | SAXNotSupportedException
         | ParserConfigurationException e) {
       throw new Error(e);
     }
   }

   /**
    * SAX handler that doubles as the body of the parsing thread. {@link #run()} drives the SAX
    * parser; {@link #endElement} publishes each finished page through the single-slot {@code
    * tuple} field and {@link #next()} consumes it. Producer and consumer hand off with
    * wait/notify on this Parser's monitor.
    */
   private class Parser extends DefaultHandler implements Runnable {
     private Thread t;
     private boolean threadDone;
     private boolean stopped = false;
     private String[] tuple;
     private NoMoreDataException nmde;
     private final StringBuilder contents = new StringBuilder();
     private String title;
     private String body;
     private String time;
     private String id;

     /**
      * Returns the next parsed page, starting the parser thread first if none is running.
      *
      * <p>Blocks until a tuple is available, end of data was signalled, the parser thread has
      * exited, or the parser was stopped.
      *
      * @return an array of {@code LENGTH} strings indexed by {@code TITLE}, {@code DATE}, {@code
      *     BODY} and {@code ID}
      * @throws NoMoreDataException if the parser thread reported end of data, or if no tuple is
      *     available because the thread exited or the parser was stopped
      * @throws ThreadInterruptedException if the calling thread is interrupted while waiting
      */
     String[] next() throws NoMoreDataException {
       if (t == null) {
         threadDone = false;
         t = new Thread(this);
         t.setDaemon(true);
         t.start();
       }
       String[] result;
       synchronized (this) {
         while (tuple == null && nmde == null && !threadDone && !stopped) {
           try {
             wait();
           } catch (InterruptedException ie) {
             throw new ThreadInterruptedException(ie);
           }
         }
         if (tuple != null) {
           result = tuple;
           tuple = null;
           // Wake the parser thread, which may be waiting for the slot to become free.
           notify();
           return result;
         }
         if (nmde != null) {
           // Set to null so we will re-start thread in case
           // we are re-used:
           // NOTE(review): nmde itself is never cleared in this class, so every later call sees
           // it again and rethrows immediately even though a new thread gets started. Confirm
           // whether re-use after end of data is expected to work.
           t = null;
           throw nmde;
         }
         // The thread has exited yet did not hit end of
         // data, so this means it hit an exception.  We
         // throw NoMoreDataException here to force
         // benchmark to stop the current alg:
         throw new NoMoreDataException();
       }
     }

     /**
      * Reformats a dump timestamp into the {@code dd-MMM-yyyy HH:mm:ss.000} form.
      *
      * <p>The input is sliced by fixed offsets: year at [0,4), month number at [5,7), day at
      * [8,10) and time of day at [11,19). For example {@code 2004-10-12T19:45:32Z} becomes
      * {@code 12-OCT-2004 19:45:32.000}. The milliseconds are always written as {@code .000}.
      *
      * @param original timestamp text with at least 19 characters in the layout above
      * @return the reformatted date string
      * @throws IndexOutOfBoundsException if {@code original} is too short or the month is not in
      *     1..12
      * @throws NumberFormatException if the month field is not a number
      */
     String time(String original) {
       // The result is always 24 characters long, so size the buffer up front.
       StringBuilder buffer = new StringBuilder(24);

       buffer.append(original.substring(8, 10));
       buffer.append('-');
       buffer.append(months[Integer.parseInt(original.substring(5, 7)) - 1]);
       buffer.append('-');
       buffer.append(original.substring(0, 4));
       buffer.append(' ');
       buffer.append(original.substring(11, 19));
       buffer.append(".000");

       return buffer.toString();
     }

     /** Accumulates character data; the buffer is cleared when a tracked element starts. */
     @Override
     public void characters(char[] ch, int start, int length) {
       contents.append(ch, start, length);
     }

     /**
      * Stores the text of a finished title/timestamp/text/id element and, when a page ends,
      * publishes the assembled tuple to the consumer (blocking until the slot is free).
      */
     @Override
     public void endElement(String namespace, String simple, String qualified) throws SAXException {
       int elemType = getElementType(qualified);
       switch (elemType) {
         case PAGE:
           // Emit the page only if a body was kept (it is null for redirects and for pages
           // without a text element) and we are either keeping image docs or the title does
           // not start with "Image:".
           if (body != null && (keepImages || !title.startsWith("Image:"))) {
             String[] tmpTuple = new String[LENGTH];
             tmpTuple[TITLE] = title.replace('\t', ' ');
             tmpTuple[DATE] = time.replace('\t', ' ');
             // Plain char replacement yields the same text as replaceAll("[\t\n]", " ") without
             // compiling a regex for every page.
             tmpTuple[BODY] = body.replace('\t', ' ').replace('\n', ' ');
             tmpTuple[ID] = id;
             synchronized (this) {
               // Wait until the consumer has taken the previous tuple, unless we were stopped.
               while (tuple != null && !stopped) {
                 try {
                   wait();
                 } catch (InterruptedException ie) {
                   throw new ThreadInterruptedException(ie);
                 }
               }
               tuple = tmpTuple;
               notify();
             }
           }
           break;
         case BODY:
           body = contents.toString();
           // startsWith has no ignore-case option, so lower-case a short prefix (at most 10
           // chars, enough to cover "#redirect") and test that instead.
           String startsWith =
               body.substring(0, Math.min(10, contents.length())).toLowerCase(Locale.ROOT);
           if (startsWith.startsWith("#redirect")) {
             // Redirect pages are dropped: a null body suppresses the tuple at end of page.
             body = null;
           }
           break;
         case DATE:
           time = time(contents.toString());
           break;
         case TITLE:
           title = contents.toString();
           break;
         case ID:
           // the doc id is the first one in the page.  All other ids after that one can be ignored
           // according to the schema
           if (id == null) {
             id = contents.toString();
           }
           break;
         default:
           // this element should be discarded.
       }
     }

     /**
      * Body of the parser thread. Parses the current input stream and then either signals end of
      * data (when stopped, or when the inherited {@code forever} flag is false) or re-opens the
      * file and parses it again. On every exit path {@code threadDone} is set and a waiter is
      * notified.
      */
     @Override
     public void run() {

       try {
         SAXParser reader = SAX_PARSER_FACTORY.newSAXParser();
         while (!stopped) {
           final InputStream localFileIS = is;
           if (localFileIS != null) { // null means fileIS was closed on us
             try {
               // To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so
               // we simply provide reader.
               reader.parse(
                   new InputSource(IOUtils.getDecodingReader(localFileIS, StandardCharsets.UTF_8)),
                   this);
             } catch (IOException ioe) {
               synchronized (EnwikiContentSource.this) {
                 if (localFileIS == is) {
                   // The stream we were reading is still the current one: exception is real.
                   throw ioe;
                 }
                 // Otherwise fileIS was closed (or replaced) on us, so just fall through.
               }
             }
           }
           synchronized (this) {
             if (stopped || !forever) {
               nmde = new NoMoreDataException();
               notify();
               return;
             } else if (localFileIS == is) {
               // If file is not already re-opened then re-open it now
               is = openInputStream();
             }
           }
         }
       } catch (SAXException | IOException | ParserConfigurationException sae) {
         throw new RuntimeException(sae);
       } finally {
         synchronized (this) {
           threadDone = true;
           notify();
         }
       }
     }

     /** Resets per-page state when a page starts and clears the text buffer for tracked elements. */
     @Override
     public void startElement(
         String namespace, String simple, String qualified, Attributes attributes) {
       int elemType = getElementType(qualified);
       switch (elemType) {
         case PAGE:
           // A new page begins: forget everything collected for the previous one.
           title = null;
           body = null;
           time = null;
           id = null;
           break;
         case BODY: // intentional fall-through: all four elements just reset the text buffer.
         case DATE:
         case TITLE:
         case ID:
           contents.setLength(0);
           break;
         default:
           // this element should be discarded.
       }
     }

     /** Marks the parser as stopped, discards any pending tuple and wakes a waiting thread. */
     private void stop() {
       synchronized (this) {
         stopped = true;
         if (tuple != null) {
           tuple = null;
           notify();
         }
       }
     }
   }

   private static final Map<String, Integer> ELEMENTS = new HashMap<>();
   private static final int TITLE = 0;
   private static final int DATE = TITLE + 1;
   private static final int BODY = DATE + 1;
   private static final int ID = BODY + 1;
   private static final int LENGTH = ID + 1;
   // LENGTH is used as the size of the tuple, so whatever constants we need that
   // should not be part of the tuple, we should define them after LENGTH.
   private static final int PAGE = LENGTH + 1;

   private static final String[] months = {
     "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
   };

   static {
     // Map the XML element names we care about to their element-type constants.
     ELEMENTS.put("page", PAGE);
     ELEMENTS.put("text", BODY);
     ELEMENTS.put("timestamp", DATE);
     ELEMENTS.put("title", TITLE);
     ELEMENTS.put("id", ID);
   }

   /**
    * Returns the type of the element if defined, otherwise returns -1. This method is useful in
    * startElement and endElement, by not needing to compare the element qualified name over and
    * over.
    */
   private static final int getElementType(String elem) {
     return ELEMENTS.getOrDefault(elem, -1);
   }

   private Path file;
   private boolean keepImages = true;
   private InputStream is;
   private final Parser parser = new Parser();

   /** Stops the parser and closes the current input stream, if one is open. */
   @Override
   public void close() throws IOException {
     synchronized (this) {
       parser.stop();
       if (is != null) {
         is.close();
         is = null;
       }
     }
   }

   /**
    * Fills {@code docData} with the next page from the dump: the page id becomes the name, and the
    * body, date and title are set from the parsed tuple.
    *
    * @throws NoMoreDataException if the parser has no more pages to deliver
    */
   @Override
   public synchronized DocData getNextDocData(DocData docData)
       throws NoMoreDataException, IOException {
     String[] tuple = parser.next();
     docData.clear();
     docData.setName(tuple[ID]);
     docData.setBody(tuple[BODY]);
     docData.setDate(tuple[DATE]);
     docData.setTitle(tuple[TITLE]);
     return docData;
   }

   /** Resets the superclass state and opens a fresh input stream on the configured file. */
   @Override
   public void resetInputs() throws IOException {
     super.resetInputs();
     is = openInputStream();
   }

   /** Open the input stream. */
   protected InputStream openInputStream() throws IOException {
     return StreamUtils.inputStream(file);
   }

   /**
    * Reads {@code keep.image.only.docs} (default true) and {@code docs.file} from the config. The
    * file path is made absolute; when {@code docs.file} is not set the file is left unchanged.
    */
   @Override
   public void setConfig(Config config) {
     super.setConfig(config);
     keepImages = config.get("keep.image.only.docs", true);
     String fileName = config.get("docs.file", null);
     if (fileName != null) {
       file = Paths.get(fileName).toAbsolutePath();
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.benchmark.byTask.feeds;

	import java.io.IOException;
	import java.io.InputStream;
	import java.nio.charset.StandardCharsets;
	import java.nio.file.Path;
	import java.nio.file.Paths;
	import java.util.HashMap;
	import java.util.Locale;
	import java.util.Map;
	import javax.xml.XMLConstants;
	import javax.xml.parsers.ParserConfigurationException;
	import javax.xml.parsers.SAXParser;
	import javax.xml.parsers.SAXParserFactory;
	import org.apache.lucene.benchmark.byTask.utils.Config;
	import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
	import org.apache.lucene.util.IOUtils;
	import org.apache.lucene.util.ThreadInterruptedException;
	import org.xml.sax.Attributes;
	import org.xml.sax.InputSource;
	import org.xml.sax.SAXException;
	import org.xml.sax.SAXNotRecognizedException;
	import org.xml.sax.SAXNotSupportedException;
	import org.xml.sax.helpers.DefaultHandler;

	/**
	* A {@link ContentSource} which reads the English Wikipedia dump. You can read the .bz2 file
	* directly (it will be decompressed on the fly). Config properties:
	*
	* <ul>
	* <li>keep.image.only.docs=false\|true (default <b>true</b>).
	* <li>docs.file=<path to the file>
	* </ul>
	*/
	public class EnwikiContentSource extends ContentSource {

	private static final SAXParserFactory SAX_PARSER_FACTORY = SAXParserFactory.newDefaultInstance();

	static {
	try {
	SAX_PARSER_FACTORY.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
	} catch (SAXNotRecognizedException
	\| SAXNotSupportedException
	\| ParserConfigurationException e) {
	throw new Error(e);
	}
	}

	private class Parser extends DefaultHandler implements Runnable {
	private Thread t;
	private boolean threadDone;
	private boolean stopped = false;
	private String[] tuple;
	private NoMoreDataException nmde;
	private StringBuilder contents = new StringBuilder();
	private String title;
	private String body;
	private String time;
	private String id;

	String[] next() throws NoMoreDataException {
	if (t == null) {
	threadDone = false;
	t = new Thread(this);
	t.setDaemon(true);
	t.start();
	}
	String[] result;
	synchronized (this) {
	while (tuple == null && nmde == null && !threadDone && !stopped) {
	try {
	wait();
	} catch (InterruptedException ie) {
	throw new ThreadInterruptedException(ie);
	}
	}
	if (tuple != null) {
	result = tuple;
	tuple = null;
	notify();
	return result;
	}
	if (nmde != null) {
	// Set to null so we will re-start thread in case
	// we are re-used:
	t = null;
	throw nmde;
	}
	// The thread has exited yet did not hit end of
	// data, so this means it hit an exception. We
	// throw NoMorDataException here to force
	// benchmark to stop the current alg:
	throw new NoMoreDataException();
	}
	}

	String time(String original) {
	StringBuilder buffer = new StringBuilder();

	buffer.append(original.substring(8, 10));
	buffer.append('-');
	buffer.append(months[Integer.parseInt(original.substring(5, 7)) - 1]);
	buffer.append('-');
	buffer.append(original.substring(0, 4));
	buffer.append(' ');
	buffer.append(original.substring(11, 19));
	buffer.append(".000");

	return buffer.toString();
	}

	@Override
	public void characters(char[] ch, int start, int length) {
	contents.append(ch, start, length);
	}

	@Override
	public void endElement(String namespace, String simple, String qualified) throws SAXException {
	int elemType = getElementType(qualified);
	switch (elemType) {
	case PAGE:
	// the body must be null and we either are keeping image docs or the
	// title does not start with Image:
	if (body != null && (keepImages \|\| !title.startsWith("Image:"))) {
	String[] tmpTuple = new String[LENGTH];
	tmpTuple[TITLE] = title.replace('\t', ' ');
	tmpTuple[DATE] = time.replace('\t', ' ');
	tmpTuple[BODY] = body.replaceAll("[\t\n]", " ");
	tmpTuple[ID] = id;
	synchronized (this) {
	while (tuple != null && !stopped) {
	try {
	wait();
	} catch (InterruptedException ie) {
	throw new ThreadInterruptedException(ie);
	}
	}
	tuple = tmpTuple;
	notify();
	}
	}
	break;
	case BODY:
	body = contents.toString();
	// workaround that startswith doesn't have an ignore case option, get at least 20 chars.
	String startsWith =
	body.substring(0, Math.min(10, contents.length())).toLowerCase(Locale.ROOT);
	if (startsWith.startsWith("#redirect")) {
	body = null;
	}
	break;
	case DATE:
	time = time(contents.toString());
	break;
	case TITLE:
	title = contents.toString();
	break;
	case ID:
	// the doc id is the first one in the page. All other ids after that one can be ignored
	// according to the schema
	if (id == null) {
	id = contents.toString();
	}
	break;
	default:
	// this element should be discarded.
	}
	}

	@Override
	public void run() {

	try {
	SAXParser reader = SAX_PARSER_FACTORY.newSAXParser();
	while (!stopped) {
	final InputStream localFileIS = is;
	if (localFileIS != null) { // null means fileIS was closed on us
	try {
	// To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so
	// we simply provide reader.
	reader.parse(
	new InputSource(IOUtils.getDecodingReader(localFileIS, StandardCharsets.UTF_8)),
	this);
	} catch (IOException ioe) {
	synchronized (EnwikiContentSource.this) {
	if (localFileIS != is) {
	// fileIS was closed on us, so, just fall through
	} else
	// Exception is real
	throw ioe;
	}
	}
	}
	synchronized (this) {
	if (stopped \|\| !forever) {
	nmde = new NoMoreDataException();
	notify();
	return;
	} else if (localFileIS == is) {
	// If file is not already re-opened then re-open it now
	is = openInputStream();
	}
	}
	}
	} catch (SAXException \| IOException \| ParserConfigurationException sae) {
	throw new RuntimeException(sae);
	} finally {
	synchronized (this) {
	threadDone = true;
	notify();
	}
	}
	}

	@Override
	public void startElement(
	String namespace, String simple, String qualified, Attributes attributes) {
	int elemType = getElementType(qualified);
	switch (elemType) {
	case PAGE:
	title = null;
	body = null;
	time = null;
	id = null;
	break;
	// intentional fall-through.
	case BODY:
	case DATE:
	case TITLE:
	case ID:
	contents.setLength(0);
	break;
	default:
	// this element should be discarded.
	}
	}

	private void stop() {
	synchronized (this) {
	stopped = true;
	if (tuple != null) {
	tuple = null;
	notify();
	}
	}
	}
	}

	private static final Map<String, Integer> ELEMENTS = new HashMap<>();
	private static final int TITLE = 0;
	private static final int DATE = TITLE + 1;
	private static final int BODY = DATE + 1;
	private static final int ID = BODY + 1;
	private static final int LENGTH = ID + 1;
	// LENGTH is used as the size of the tuple, so whatever constants we need that
	// should not be part of the tuple, we should define them after LENGTH.
	private static final int PAGE = LENGTH + 1;

	private static final String[] months = {
	"JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
	};

	static {
	ELEMENTS.put("page", Integer.valueOf(PAGE));
	ELEMENTS.put("text", Integer.valueOf(BODY));
	ELEMENTS.put("timestamp", Integer.valueOf(DATE));
	ELEMENTS.put("title", Integer.valueOf(TITLE));
	ELEMENTS.put("id", Integer.valueOf(ID));
	}

	/**
	* Returns the type of the element if defined, otherwise returns -1. This method is useful in
	* startElement and endElement, by not needing to compare the element qualified name over and
	* over.
	*/
	private static final int getElementType(String elem) {
	Integer val = ELEMENTS.get(elem);
	return val == null ? -1 : val.intValue();
	}

	private Path file;
	private boolean keepImages = true;
	private InputStream is;
	private Parser parser = new Parser();

	@Override
	public void close() throws IOException {
	synchronized (EnwikiContentSource.this) {
	parser.stop();
	if (is != null) {
	is.close();
	is = null;
	}
	}
	}

	@Override
	public synchronized DocData getNextDocData(DocData docData)
	throws NoMoreDataException, IOException {
	String[] tuple = parser.next();
	docData.clear();
	docData.setName(tuple[ID]);
	docData.setBody(tuple[BODY]);
	docData.setDate(tuple[DATE]);
	docData.setTitle(tuple[TITLE]);
	return docData;
	}

	@Override
	public void resetInputs() throws IOException {
	super.resetInputs();
	is = openInputStream();
	}

	/** Open the input stream. */
	protected InputStream openInputStream() throws IOException {
	return StreamUtils.inputStream(file);
	}

	@Override
	public void setConfig(Config config) {
	super.setConfig(config);
	keepImages = config.get("keep.image.only.docs", true);
	String fileName = config.get("docs.file", null);
	if (fileName != null) {
	file = Paths.get(fileName).toAbsolutePath();
	}
	}
	}