| #!/usr/bin/env impala-python |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| # Functions for creating wide (i.e. many-column) tables. When run from the command line, |
| # specify either --get_columns to generate column descriptors, or --create_data to |
| # generate a CSV data file and prints a SQL load statement to incorporate |
| # into dataload SQL script generation. |
| |
| from datetime import datetime, timedelta |
| import itertools |
| import optparse |
| |
| parser = optparse.OptionParser() |
| parser.add_option("--get_columns", dest="get_columns", default=False, action="store_true") |
| parser.add_option("--create_data", dest="create_data", default=False, action="store_true") |
| parser.add_option("-n", "--num_columns", dest="num_columns", type="int") |
| parser.add_option("-o", "--output_file", dest="output_file") |
| parser.add_option("--num_rows", dest="num_rows", default=10) |
| |
| def get_columns(num_cols): |
| """Returns 'num_cols' column declarations, cycling through every column type, as a |
| a list of strings.""" |
| templates = [ |
| 'bool_col%i BOOLEAN', |
| 'tinyint_col%i TINYINT', |
| 'smallint_col%i SMALLINT', |
| 'int_col%i INT', |
| 'bigint_col%i BIGINT', |
| 'float_col%i FLOAT', |
| 'double_col%i DOUBLE', |
| 'string_col%i STRING', |
| ] |
| |
| iter = itertools.cycle(templates) |
| # Produces [bool_col1, tinyint_col1, ..., bool_col2, tinyint_col2, ...] |
| # The final list has 'num_cols' elements. |
| return [iter.next() % (i / len(templates) + 1) for i in xrange(num_cols)] |
| |
| # Data generators for different types. Each generator yields an infinite number of |
| # value strings suitable for writing to a CSV file. |
| |
| def bool_generator(): |
| """Generates True, False repeating""" |
| b = True |
| while True: |
| yield str(b) |
| b = not b |
| |
| def integer_generator(): |
| """Generates 0..4 repeating""" |
| i = 0 |
| while True: |
| yield str(i % 5) |
| i += 1 |
| |
| def floating_point_generator(): |
| """Generates 0, 1.1, ..., 4.4 repeating""" |
| i = 0 |
| while True: |
| yield str((i % 5) * 1.1) |
| i += 1 |
| |
| def quote(iter_fn): |
| """Returns a generator that returns quoted values of iter_fn.""" |
| def new_iter_fn(): |
| iter = iter_fn() |
| while True: |
| yield "'%s'" % iter.next() |
| return new_iter_fn |
| |
| def get_data(num_cols, num_rows, delimiter=',', quote_strings=False): |
| """Returns the data for the given number of rows and columns as a list of strings, each |
| of which is a row delimited by 'delimiter'.""" |
| generators = [ |
| bool_generator, # boolean |
| integer_generator, # tinyint |
| integer_generator, # smallint |
| integer_generator, # int |
| integer_generator, # bigint |
| floating_point_generator, # float |
| floating_point_generator, # double |
| quote(integer_generator) if quote_strings else integer_generator, # string |
| ] |
| # Create a generator instance for each column, cycling through the different types |
| iter = itertools.cycle(generators) |
| column_generators = [iter.next()() for i in xrange(num_cols)] |
| |
| # Populate each row using column_generators |
| rows = [] |
| for i in xrange(num_rows): |
| vals = [gen.next() for gen in column_generators] |
| rows.append(delimiter.join(vals)) |
| return rows |
| |
| if __name__ == "__main__": |
| (options, args) = parser.parse_args() |
| |
| if options.get_columns == options.create_data: |
| parser.error("Must specify either --get_columns or --create_data") |
| |
| if not options.num_columns: |
| parser.error("--num_columns option must be specified") |
| |
| if options.get_columns: |
| # Output column descriptors |
| print '\n'.join(get_columns(options.num_columns)) |
| |
| if options.create_data: |
| # Generate data locally, and output the SQL load command for use in dataload |
| if not options.output_file: |
| parser.error("--output_file option must be specified") |
| |
| with open(options.output_file, "w") as f: |
| for row in get_data(options.num_columns, options.num_rows): |
| f.write(row) |
| f.write('\n') |
| |
| print ("LOAD DATA LOCAL INPATH '%s' " |
| "OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};" |
| % options.output_file) |