src/main/java/org/apache/pirk/schema/query/QuerySchemaLoader.java - incubator-retired-pirk - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.pirk.schema.query;

 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedHashSet;
 import java.util.Map;
 import java.util.Set;

 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;

 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.pirk.utils.PIRException;
 import org.apache.pirk.utils.SystemConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;

 /**
  * Class to load any query schemas specified in the properties file, 'query.schemas'
  * <p>
  * Schemas should be specified as follows:
  *
  * <pre>
  * {@code
  * <schema>
  *    <schemaName> name of the schema </schemaName>
  *    <dataSchemaName> name of the data schema over which this query is run </dataSchemaName>
  *    <selectorName> name of the element in the data schema that will be the selector </selectorName>
  *    <elements>
  *       <name> element name of element in the data schema to include in the query response; just
  *              as with the data schema, the element name is case sensitive</name>
  *    </elements>
  *    <filter> (optional) name of the filter class to use to filter the data </filter>
  *    <filterNames> (optional)
  *       <name> element name of element in the data schema to apply pre-processing filters </name>
  *    </filterNames>
  *    <additional> (optional) additional fields for the query schema, in <key,value> pairs
  *       <field>
  *            <key> key corresponding the the field </key>
  *            <value> value corresponding to the field </value>
  *       </field>
  *    </additional>
  *   </schema>
  * }
  * </pre>
  * <p>
  * TODO: Allow the schema to specify how many array elements to return per element, if the element is an array type
  */
 public class QuerySchemaLoader
 {
   private static final Logger logger = LoggerFactory.getLogger(QuerySchemaLoader.class);

   static
   {
     logger.info("Loading query schemas: ");

     try
     {
       initialize();
     } catch (PIRException e)
     {
       logger.error(e.getLocalizedMessage());
     }
   }

   /* Kept for compatibility */
   /**
    * Initializes the static {@link QuerySchemaRegistry} with a list of query schema names.
    *
    * @throws PIRException
    *           - failed to initialize
    */
   public static void initialize() throws PIRException
   {
     initialize(false, null);
   }

   /* Kept for compatibility */
   /**
    * Initializes the static {@link QuerySchemaRegistry} with a list of available query schema names.
    *
    * @param hdfs
    *          If true, specifies that the query schema is an hdfs file; if false, that it is a regular file.
    * @param fs
    *          Used only when {@code hdfs} is true; the {@link FileSystem} handle for the hdfs in which the query schema exists
    * @throws PIRException
    *           - failed to initialize the query schemas because they could not be read or are invalid.
    */
   public static void initialize(boolean hdfs, FileSystem fs) throws PIRException
   {
     String querySchemas = SystemConfiguration.getProperty("query.schemas", "none");
     if (querySchemas.equals("none"))
     {
       logger.info("query.schemas = none");
       return;
     }

     String[] querySchemaFiles = querySchemas.split(",");
     try
     {
       for (String schemaFile : querySchemaFiles)
       {
         QuerySchema querySchema = readSchemaFile(schemaFile, fs, hdfs);
         QuerySchemaRegistry.put(querySchema);
       }
     } catch (IOException e)
     {
       throw new PIRException("Error reading query schema", e);
     }
   }

   private static QuerySchema readSchemaFile(String schemaFile, FileSystem fs, boolean hdfs) throws IOException, PIRException
   {
     logger.info("Loading query schemaFile = " + schemaFile);

     // Parse and load the schema file into a QuerySchema object.
     QuerySchemaLoader loader = new QuerySchemaLoader();
     InputStream is;
     if (hdfs)
     {
       logger.info("hdfs: filePath = " + schemaFile);
       is = fs.open(new Path(schemaFile));
     }
     else
     {
       logger.info("localFS: inputFile = " + schemaFile);
       is = new FileInputStream(schemaFile);
     }

     try
     {
       return loader.loadSchema(is);
     } finally
     {
       is.close();
     }
   }

   /**
    * Default constructor.
    */
   public QuerySchemaLoader()
   {

   }

   /**
    * Returns the query schema as defined in XML format on the given stream.
    *
    * @param stream
    *          The source of the XML query schema description.
    * @return The query schema.
    * @throws IOException
    *           A problem occurred reading from the given stream.
    * @throws PIRException
    *           The schema description is invalid.
    */
   public QuerySchema loadSchema(InputStream stream) throws IOException, PIRException
   {
     // Read in and parse the XML file.
     Document doc = parseXMLDocument(stream);

     // Used to build the final schema.
     QuerySchemaBuilder schemaBuilder = new QuerySchemaBuilder();

     // Extract the schemaName.
     String schemaName = extractValue(doc, "schemaName");
     schemaBuilder.setName(schemaName);
     logger.info("schemaName = " + schemaName);

     // Extract the dataSchemaName.
     String dataSchemaName = extractValue(doc, "dataSchemaName");
     schemaBuilder.setDataSchemaName(dataSchemaName);
     logger.info("dataSchemaName = " + dataSchemaName);

     // Extract the selectorName.
     String selectorName = extractValue(doc, "selectorName");
     schemaBuilder.setSelectorName(selectorName);
     logger.info("selectorName = " + selectorName);

     // Extract the query elements.
     NodeList elementsList = doc.getElementsByTagName("elements");
     if (elementsList.getLength() != 1)
     {
       throw new PIRException("elementsList.getLength() = " + elementsList.getLength() + " -- should be 1");
     }
     Element elements = (Element) elementsList.item(0);

     LinkedHashSet<String> elementNames = new LinkedHashSet<>();
     NodeList nList = elements.getElementsByTagName("name");
     for (int i = 0; i < nList.getLength(); i++)
     {
       Node nNode = nList.item(i);
       if (nNode.getNodeType() == Node.ELEMENT_NODE)
       {
         elementNames.add(nNode.getFirstChild().getNodeValue().trim());
       }
     }
     schemaBuilder.setQueryElementNames(elementNames);

     // Extract the filter, if it exists
     if (doc.getElementsByTagName("filter").item(0) != null)
     {
       schemaBuilder.setFilterTypeName(doc.getElementsByTagName("filter").item(0).getTextContent().trim());
     }

     // Create a filter over the query elements.
     schemaBuilder.setFilteredElementNames(extractFilteredElementNames(doc));

     // Extract the additional fields, if they exists
     Map<String,String> additionalFields = new HashMap<>();
     if (doc.getElementsByTagName("additional").item(0) != null)
     {
       NodeList fieldList = doc.getElementsByTagName("field");
       int numFields = fieldList.getLength();
       if (numFields == 0)
       {
         throw new PIRException("numFields = " + numFields + " -- should be at least one");
       }
       for (int i = 0; i < numFields; ++i)
       {
         Element fields = (Element) fieldList.item(i);
         NodeList kv = fields.getChildNodes();
         additionalFields.put(getNodeValue("key", kv), getNodeValue("value", kv));
       }
     }
     schemaBuilder.setAdditionalFields(additionalFields);

     // Create and return the query schema object.
     return schemaBuilder.build();
   }

   /**
    * Parses and normalizes the XML document available on the given stream.
    *
    * @param stream
    *          The input stream.
    * @return A Document representing the XML document.
    * @throws IOException
    *           - failed to read input
    * @throws PIRException
    *           - file could not be parsed
    */
   private Document parseXMLDocument(InputStream stream) throws IOException, PIRException
   {
     Document doc;
     try
     {
       DocumentBuilder dBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
       doc = dBuilder.parse(stream);
     } catch (ParserConfigurationException | SAXException e)
     {
       throw new PIRException("Schema parsing error", e);
     }
     doc.getDocumentElement().normalize();
     logger.info("Root element: " + doc.getDocumentElement().getNodeName());

     return doc;
   }

   /**
    * Returns the possibly empty set of element names over which the filter is applied, maintaining document order.
    *
    * @param doc
    *          An XML document specifying names upon which we will filter the query.
    * @return The set of names upon which we will filter the query.
    * @throws PIRException
    *           - Filter lists not found
    */
   private Set<String> extractFilteredElementNames(Document doc) throws PIRException
   {
     Set<String> filteredNamesSet = new HashSet<>();

     NodeList filterNamesList = doc.getElementsByTagName("filterNames");
     if (filterNamesList.getLength() != 0)
     {
       if (filterNamesList.getLength() > 1)
       {
         throw new PIRException("filterNamesList.getLength() = " + filterNamesList.getLength() + " -- should be 0 or 1");
       }

       // Extract element names from the list.
       NodeList filterNList = ((Element) filterNamesList.item(0)).getElementsByTagName("name");
       for (int i = 0; i < filterNList.getLength(); i++)
       {
         Node nNode = filterNList.item(i);
         if (nNode.getNodeType() == Node.ELEMENT_NODE)
         {
           // Pull the name and add to the set.
           String name = nNode.getFirstChild().getNodeValue().trim();
           filteredNamesSet.add(name);

           logger.info("filterName = " + name);
         }
       }
     }
     return filteredNamesSet;
   }

   /**
    * Extracts a top level, single value from the XML structure.
    *
    * Throws an exception if there is not exactly one tag with the given name.
    *
    * @param doc
    *          The XML document from which we extract data
    * @param tagName
    *          The name of the tag we wish to extract from the {@code doc}
    * @return The text content of the tag.
    * @throws PIRException
    *           - XML Document is Empty
    */
   private String extractValue(Document doc, String tagName) throws PIRException
   {
     NodeList itemList = doc.getElementsByTagName(tagName);
     if (itemList.getLength() != 1)
     {
       throw new PIRException("itemList.getLength() = " + itemList.getLength() + " -- should be 1");
     }
     return itemList.item(0).getTextContent().trim();
   }

   /**
    * Extracts the value corresponding to a given tag from the XML nodeList
    *
    * @param tagName
    *          The name of the tag for which to extract the value
    * @param nodes
    *          The NodeList
    * @return The given value
    */
   private String getNodeValue(String tagName, NodeList nodes)
   {
     String value = "";

     for (int x = 0; x < nodes.getLength(); x++)
     {
       Node node = nodes.item(x);
       if (node.getNodeName().equals(tagName))
       {
         value = node.getChildNodes().item(0).getNodeValue().trim();
       }
     }
     return value;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.pirk.schema.query;

	import java.io.FileInputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.LinkedHashSet;
	import java.util.Map;
	import java.util.Set;

	import javax.xml.parsers.DocumentBuilder;
	import javax.xml.parsers.DocumentBuilderFactory;
	import javax.xml.parsers.ParserConfigurationException;

	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.fs.Path;
	import org.apache.pirk.utils.PIRException;
	import org.apache.pirk.utils.SystemConfiguration;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.w3c.dom.Document;
	import org.w3c.dom.Element;
	import org.w3c.dom.Node;
	import org.w3c.dom.NodeList;
	import org.xml.sax.SAXException;

	/**
	* Class to load any query schemas specified in the properties file, 'query.schemas'
	* <p>
	* Schemas should be specified as follows:
	*
	* <pre>
	* {@code
	* <schema>
	* <schemaName> name of the schema </schemaName>
	* <dataSchemaName> name of the data schema over which this query is run </dataSchemaName>
	* <selectorName> name of the element in the data schema that will be the selector </selectorName>
	* <elements>
	* <name> element name of element in the data schema to include in the query response; just
	* as with the data schema, the element name is case sensitive</name>
	* </elements>
	* <filter> (optional) name of the filter class to use to filter the data </filter>
	* <filterNames> (optional)
	* <name> element name of element in the data schema to apply pre-processing filters </name>
	* </filterNames>
	* <additional> (optional) additional fields for the query schema, in <key,value> pairs
	* <field>
	* <key> key corresponding the the field </key>
	* <value> value corresponding to the field </value>
	* </field>
	* </additional>
	* </schema>
	* }
	* </pre>
	* <p>
	* TODO: Allow the schema to specify how many array elements to return per element, if the element is an array type
	*/
	public class QuerySchemaLoader
	{
	private static final Logger logger = LoggerFactory.getLogger(QuerySchemaLoader.class);

	static
	{
	logger.info("Loading query schemas: ");

	try
	{
	initialize();
	} catch (PIRException e)
	{
	logger.error(e.getLocalizedMessage());
	}
	}

	/* Kept for compatibility */
	/**
	* Initializes the static {@link QuerySchemaRegistry} with a list of query schema names.
	*
	* @throws PIRException
	* - failed to initialize
	*/
	public static void initialize() throws PIRException
	{
	initialize(false, null);
	}

	/* Kept for compatibility */
	/**
	* Initializes the static {@link QuerySchemaRegistry} with a list of available query schema names.
	*
	* @param hdfs
	* If true, specifies that the query schema is an hdfs file; if false, that it is a regular file.
	* @param fs
	* Used only when {@code hdfs} is true; the {@link FileSystem} handle for the hdfs in which the query schema exists
	* @throws PIRException
	* - failed to initialize the query schemas because they could not be read or are invalid.
	*/
	public static void initialize(boolean hdfs, FileSystem fs) throws PIRException
	{
	String querySchemas = SystemConfiguration.getProperty("query.schemas", "none");
	if (querySchemas.equals("none"))
	{
	logger.info("query.schemas = none");
	return;
	}

	String[] querySchemaFiles = querySchemas.split(",");
	try
	{
	for (String schemaFile : querySchemaFiles)
	{
	QuerySchema querySchema = readSchemaFile(schemaFile, fs, hdfs);
	QuerySchemaRegistry.put(querySchema);
	}
	} catch (IOException e)
	{
	throw new PIRException("Error reading query schema", e);
	}
	}

	private static QuerySchema readSchemaFile(String schemaFile, FileSystem fs, boolean hdfs) throws IOException, PIRException
	{
	logger.info("Loading query schemaFile = " + schemaFile);

	// Parse and load the schema file into a QuerySchema object.
	QuerySchemaLoader loader = new QuerySchemaLoader();
	InputStream is;
	if (hdfs)
	{
	logger.info("hdfs: filePath = " + schemaFile);
	is = fs.open(new Path(schemaFile));
	}
	else
	{
	logger.info("localFS: inputFile = " + schemaFile);
	is = new FileInputStream(schemaFile);
	}

	try
	{
	return loader.loadSchema(is);
	} finally
	{
	is.close();
	}
	}

	/**
	* Default constructor.
	*/
	public QuerySchemaLoader()
	{

	}

	/**
	* Returns the query schema as defined in XML format on the given stream.
	*
	* @param stream
	* The source of the XML query schema description.
	* @return The query schema.
	* @throws IOException
	* A problem occurred reading from the given stream.
	* @throws PIRException
	* The schema description is invalid.
	*/
	public QuerySchema loadSchema(InputStream stream) throws IOException, PIRException
	{
	// Read in and parse the XML file.
	Document doc = parseXMLDocument(stream);

	// Used to build the final schema.
	QuerySchemaBuilder schemaBuilder = new QuerySchemaBuilder();

	// Extract the schemaName.
	String schemaName = extractValue(doc, "schemaName");
	schemaBuilder.setName(schemaName);
	logger.info("schemaName = " + schemaName);

	// Extract the dataSchemaName.
	String dataSchemaName = extractValue(doc, "dataSchemaName");
	schemaBuilder.setDataSchemaName(dataSchemaName);
	logger.info("dataSchemaName = " + dataSchemaName);

	// Extract the selectorName.
	String selectorName = extractValue(doc, "selectorName");
	schemaBuilder.setSelectorName(selectorName);
	logger.info("selectorName = " + selectorName);

	// Extract the query elements.
	NodeList elementsList = doc.getElementsByTagName("elements");
	if (elementsList.getLength() != 1)
	{
	throw new PIRException("elementsList.getLength() = " + elementsList.getLength() + " -- should be 1");
	}
	Element elements = (Element) elementsList.item(0);

	LinkedHashSet<String> elementNames = new LinkedHashSet<>();
	NodeList nList = elements.getElementsByTagName("name");
	for (int i = 0; i < nList.getLength(); i++)
	{
	Node nNode = nList.item(i);
	if (nNode.getNodeType() == Node.ELEMENT_NODE)
	{
	elementNames.add(nNode.getFirstChild().getNodeValue().trim());
	}
	}
	schemaBuilder.setQueryElementNames(elementNames);

	// Extract the filter, if it exists
	if (doc.getElementsByTagName("filter").item(0) != null)
	{
	schemaBuilder.setFilterTypeName(doc.getElementsByTagName("filter").item(0).getTextContent().trim());
	}

	// Create a filter over the query elements.
	schemaBuilder.setFilteredElementNames(extractFilteredElementNames(doc));

	// Extract the additional fields, if they exists
	Map<String,String> additionalFields = new HashMap<>();
	if (doc.getElementsByTagName("additional").item(0) != null)
	{
	NodeList fieldList = doc.getElementsByTagName("field");
	int numFields = fieldList.getLength();
	if (numFields == 0)
	{
	throw new PIRException("numFields = " + numFields + " -- should be at least one");
	}
	for (int i = 0; i < numFields; ++i)
	{
	Element fields = (Element) fieldList.item(i);
	NodeList kv = fields.getChildNodes();
	additionalFields.put(getNodeValue("key", kv), getNodeValue("value", kv));
	}
	}
	schemaBuilder.setAdditionalFields(additionalFields);

	// Create and return the query schema object.
	return schemaBuilder.build();
	}

	/**
	* Parses and normalizes the XML document available on the given stream.
	*
	* @param stream
	* The input stream.
	* @return A Document representing the XML document.
	* @throws IOException
	* - failed to read input
	* @throws PIRException
	* - file could not be parsed
	*/
	private Document parseXMLDocument(InputStream stream) throws IOException, PIRException
	{
	Document doc;
	try
	{
	DocumentBuilder dBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
	doc = dBuilder.parse(stream);
	} catch (ParserConfigurationException \| SAXException e)
	{
	throw new PIRException("Schema parsing error", e);
	}
	doc.getDocumentElement().normalize();
	logger.info("Root element: " + doc.getDocumentElement().getNodeName());

	return doc;
	}

	/**
	* Returns the possibly empty set of element names over which the filter is applied, maintaining document order.
	*
	* @param doc
	* An XML document specifying names upon which we will filter the query.
	* @return The set of names upon which we will filter the query.
	* @throws PIRException
	* - Filter lists not found
	*/
	private Set<String> extractFilteredElementNames(Document doc) throws PIRException
	{
	Set<String> filteredNamesSet = new HashSet<>();

	NodeList filterNamesList = doc.getElementsByTagName("filterNames");
	if (filterNamesList.getLength() != 0)
	{
	if (filterNamesList.getLength() > 1)
	{
	throw new PIRException("filterNamesList.getLength() = " + filterNamesList.getLength() + " -- should be 0 or 1");
	}

	// Extract element names from the list.
	NodeList filterNList = ((Element) filterNamesList.item(0)).getElementsByTagName("name");
	for (int i = 0; i < filterNList.getLength(); i++)
	{
	Node nNode = filterNList.item(i);
	if (nNode.getNodeType() == Node.ELEMENT_NODE)
	{
	// Pull the name and add to the set.
	String name = nNode.getFirstChild().getNodeValue().trim();
	filteredNamesSet.add(name);

	logger.info("filterName = " + name);
	}
	}
	}
	return filteredNamesSet;
	}

	/**
	* Extracts a top level, single value from the XML structure.
	*
	* Throws an exception if there is not exactly one tag with the given name.
	*
	* @param doc
	* The XML document from which we extract data
	* @param tagName
	* The name of the tag we wish to extract from the {@code doc}
	* @return The text content of the tag.
	* @throws PIRException
	* - XML Document is Empty
	*/
	private String extractValue(Document doc, String tagName) throws PIRException
	{
	NodeList itemList = doc.getElementsByTagName(tagName);
	if (itemList.getLength() != 1)
	{
	throw new PIRException("itemList.getLength() = " + itemList.getLength() + " -- should be 1");
	}
	return itemList.item(0).getTextContent().trim();
	}

	/**
	* Extracts the value corresponding to a given tag from the XML nodeList
	*
	* @param tagName
	* The name of the tag for which to extract the value
	* @param nodes
	* The NodeList
	* @return The given value
	*/
	private String getNodeValue(String tagName, NodeList nodes)
	{
	String value = "";

	for (int x = 0; x < nodes.getLength(); x++)
	{
	Node node = nodes.item(x);
	if (node.getNodeName().equals(tagName))
	{
	value = node.getChildNodes().item(0).getNodeValue().trim();
	}
	}
	return value;
	}
	}