/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.vertica;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DatabaseMetaData;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
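
/**
 * Static helpers shared by the Vertica input and output formats: probing the
 * server version, validating and creating the output table, and computing
 * input splits.
 *
 * A minimal sketch of how a job might configure these helpers, assuming the
 * property constants on {@link VerticaConfiguration} are plain
 * {@link Configuration} string keys:
 *
 * <pre>
 * Configuration conf = job.getConfiguration();
 * conf.set(VerticaConfiguration.OUTPUT_TABLE_NAME_PROP, "mySchema.myTable");
 * conf.set(VerticaConfiguration.QUERY_PROP, "SELECT key, value FROM source");
 * </pre>
 */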
public class VerticaUtil {
  private static final Log LOG = LogFactory.getLog(VerticaUtil.class);
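
  /**
   * Probes the Vertica server version, encoded as major * 100 + minor (so
   * version 3.5 reports as 305). The connection obtained from
   * {@link VerticaConfiguration} is assumed to be managed by it.
   *
   * @param conf job configuration carrying the Vertica connection settings
   * @param output if true, use the output (write-side) connection settings
   * @return the encoded version number
   */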
  public static int verticaVersion(Configuration conf, boolean output)
      throws IOException {
    int ver = -1;
    try {
      VerticaConfiguration vtconfig = new VerticaConfiguration(conf);
      Connection conn = vtconfig.getConnection(output);
      DatabaseMetaData dbmd = conn.getMetaData();
      // encode major.minor as a single comparable integer, e.g. 3.5 -> 305
      ver = dbmd.getDatabaseMajorVersion() * 100;
      ver += dbmd.getDatabaseMinorVersion();
    } catch (ClassNotFoundException e) {
      // chain the cause so a missing driver is diagnosable from the stack
      throw new IOException(
          "Vertica Driver required to use Vertica Input or Output Formatters",
          e);
    } catch (SQLException e) {
      throw new IOException(e);
    }
    return ver;
  }
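
  /**
   * Verifies that the output table is configured and usable before a job
   * writes to it: truncates an existing table (3.5 and later) or drops and
   * recreates it (older servers) when drop-table is set, creates the schema
   * and table from the configured definition when missing, and asks Vertica
   * to build a temporary design for a newly created table.
   */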
  public static void checkOutputSpecs(Configuration conf) throws IOException {
    VerticaConfiguration vtconfig = new VerticaConfiguration(conf);

    String writerTable = vtconfig.getOutputTableName();
    if (writerTable == null)
      throw new IOException("Vertica output requires a table name defined by "
          + VerticaConfiguration.OUTPUT_TABLE_NAME_PROP);

    String[] def = vtconfig.getOutputTableDef();
    boolean dropTable = vtconfig.getDropTable();

    // the output table name may be qualified as "schema.table"
    String schema = null;
    String table = null;
    String[] schemaTable = writerTable.split("\\.");
    if (schemaTable.length == 2) {
      schema = schemaTable[0];
      table = schemaTable[1];
    } else {
      table = schemaTable[0];
    }
    Statement stmt = null;
    try {
      Connection conn = vtconfig.getConnection(true);
      DatabaseMetaData dbmd = conn.getMetaData();
      ResultSet rs = dbmd.getTables(null, schema, table, null);
      boolean tableExists = rs.next();

      stmt = conn.createStatement();
      if (tableExists && dropTable) {
        if (verticaVersion(conf, true) >= 305) {
          // 3.5 and later support TRUNCATE, which keeps the table definition
          stmt.execute("TRUNCATE TABLE " + writerTable);
        } else {
          // for versions before 3.5, drop the table if it exists;
          // if def is empty, grab the columns first to redefine the table
          if (def == null) {
            rs = dbmd.getColumns(null, schema, table, null);
            ArrayList<String> defs = new ArrayList<String>();
            while (rs.next())
              // COLUMN_NAME (4) and TYPE_NAME (6), per the JDBC
              // DatabaseMetaData.getColumns() column layout
              defs.add(rs.getString(4) + " " + rs.getString(6));
            def = defs.toArray(new String[0]);
          }
          stmt.execute("DROP TABLE " + writerTable + " CASCADE");
          tableExists = false; // force create below
        }
      }
      // create the table if it doesn't exist
      if (!tableExists) {
        if (def == null)
          throw new RuntimeException("Table " + writerTable
              + " does not exist and no table definition provided");
        if (schema != null) {
          rs = dbmd.getSchemas(null, schema);
          if (!rs.next())
            stmt.execute("CREATE SCHEMA " + schema);
        }
        StringBuilder tabledef = new StringBuilder("CREATE TABLE ")
            .append(writerTable).append(" (");
        for (String column : def)
          tabledef.append(column).append(",");
        tabledef.replace(tabledef.length() - 1, tabledef.length(), ")");
        stmt.execute(tabledef.toString());

        // TODO: create segmented projections
        stmt.execute("select implement_temp_design('" + writerTable + "')");
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    } finally {
      if (stmt != null)
        try {
          stmt.close();
        } catch (SQLException e) {
          throw new RuntimeException(e);
        }
    }
  }

  // TODO: catch when params required but missing
  // TODO: better error message when count query is bad
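  /**
   * Computes input splits in one of three ways: by running a parameter query
   * whose rows each become one split, by using explicitly configured input
   * parameters, or by wrapping the input query in a COUNT(*) and dividing the
   * result into limit/offset ranges, one per map task.
   */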
  public static List<InputSplit> getSplits(JobContext context)
      throws IOException {
    Configuration conf = context.getConfiguration();
    int numSplits = conf.getInt("mapreduce.job.maps", 1);
    LOG.debug("creating splits up to " + numSplits);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    int i = 0;
    long start = 0;
    long end = 0;
    // fall back to limit/offset splitting unless a params query or explicit
    // parameters produce splits below
    boolean limitOffset = true;

    // get the params query or the params
    VerticaConfiguration config = new VerticaConfiguration(conf);
    String inputQuery = config.getInputQuery();
    if (inputQuery == null)
      throw new IOException("Vertica input requires query defined by "
          + VerticaConfiguration.QUERY_PROP);

    String paramsQuery = config.getParamsQuery();
    Collection<List<Object>> params = config.getInputParameters();

    // TODO: limit needs order by unique key
    // TODO: what if there are more parameters than numsplits?

    // wrap the input query in a COUNT(*) used by the limit/offset strategy
    String countQuery = "SELECT COUNT(*) FROM (\n" + inputQuery + "\n) count";
    if (paramsQuery != null) {
      LOG.debug("creating splits using paramsQuery: " + paramsQuery);
      Connection conn = null;
      Statement stmt = null;
      try {
        conn = config.getConnection(false);
        stmt = conn.createStatement();
        ResultSet rs = stmt.executeQuery(paramsQuery);
        ResultSetMetaData rsmd = rs.getMetaData();
        // each row of the params query becomes one split, with the row's
        // columns as the bind parameters for the input query
        while (rs.next()) {
          limitOffset = false;
          List<Object> segmentParams = new ArrayList<Object>();
          for (int j = 1; j <= rsmd.getColumnCount(); j++) {
            segmentParams.add(rs.getObject(j));
          }
          splits.add(new VerticaInputSplit(inputQuery, segmentParams, start,
              end));
        }
      } catch (Exception e) {
        throw new IOException(e);
      } finally {
        try {
          if (stmt != null)
            stmt.close();
        } catch (SQLException e) {
          throw new IOException(e);
        }
      }
    } else if (params != null && params.size() > 0) {
      LOG.debug("creating splits using " + params.size() + " params");
      limitOffset = false;
      for (List<Object> segmentParams : params) {
        // if numSplits exceeds the number of params, each parameter set
        // could be subdivided with limit/offset ranges
        // TODO: write code to generate the start/end pairs for each group
        splits.add(new VerticaInputSplit(inputQuery, segmentParams, start,
            end));
      }
    }
    if (limitOffset) {
      LOG.debug("creating splits using limit and offset");
      Connection conn = null;
      Statement stmt = null;
      long count = 0;
      try {
        conn = config.getConnection(false);
        stmt = conn.createStatement();
        ResultSet rs = stmt.executeQuery(countQuery);
        rs.next();
        count = rs.getLong(1);
      } catch (Exception e) {
        throw new IOException(e);
      } finally {
        try {
          if (stmt != null)
            stmt.close();
        } catch (SQLException e) {
          throw new IOException(e);
        }
      }

      long splitSize = count / numSplits;
      end = splitSize;
      LOG.debug("creating " + numSplits + " splits for " + count + " records");
      for (i = 0; i < numSplits; i++) {
        // extend the last split to the full count so rows lost to integer
        // division are not dropped
        if (i == numSplits - 1)
          end = count;
        splits.add(new VerticaInputSplit(inputQuery, null, start, end));
        start += splitSize;
        end += splitSize;
      }
    }
LOG.debug("returning " + splits.size() + " final splits");
return splits;
}
}