# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# This module contains utility functions for creating test tables from Parquet, ORC
# and Iceberg data files, and helper functions for checking for strings in files and
# directories.

from __future__ import absolute_import, division, print_function

import os
import re
import tempfile
from subprocess import check_call

from tests.util.filesystem_utils import get_fs_path, WAREHOUSE_PREFIX


def create_iceberg_table_from_directory(impala_client, unique_database, table_name,
                                        file_format):
  """Utility function to create an Iceberg table from a directory. The directory must
  exist in $IMPALA_HOME/testdata/data/iceberg_test with the name 'table_name'."""
  # Only ORC and Parquet are tested/supported.
  assert file_format == "orc" or file_format == "parquet"
  local_dir = os.path.join(
    os.environ['IMPALA_HOME'], 'testdata/data/iceberg_test/{0}'.format(table_name))
  assert os.path.isdir(local_dir)
  # If using a warehouse prefix, rewrite the Iceberg metadata to use the prefix.
  if WAREHOUSE_PREFIX:
    # mktemp() returns a fresh path without creating it; 'cp -r' to a non-existent
    # destination then copies the directory itself to that path.
    tmp_dir = tempfile.mktemp(table_name)
    check_call(['cp', '-r', local_dir, tmp_dir])
    rewrite = os.path.join(
      os.environ['IMPALA_HOME'], 'testdata/bin/rewrite-iceberg-metadata.py')
    check_call([rewrite, WAREHOUSE_PREFIX, os.path.join(tmp_dir, 'metadata')])
    local_dir = tmp_dir
  # Copy the directory directly under the warehouse root (not into a database
  # directory); the external table below uses it as its location.
  hdfs_parent_dir = get_fs_path("/test-warehouse")
  hdfs_dir = os.path.join(hdfs_parent_dir, table_name)
  # Purge any existing files.
  check_call(['hdfs', 'dfs', '-rm', '-f', '-r', hdfs_dir])
  # Note: -d skips a staging copy.
  check_call(['hdfs', 'dfs', '-put', '-d', local_dir, hdfs_dir])
  # Create the external table.
  qualified_table_name = '{0}.{1}'.format(unique_database, table_name)
  impala_client.execute("""create external table {0} stored as iceberg location '{1}'
      tblproperties('write.format.default'='{2}', 'iceberg.catalog'=
      'hadoop.tables')""".format(qualified_table_name, hdfs_dir, file_format))
  # Set 'external.table.purge' so the data is cleaned up automatically on DROP TABLE.
  impala_client.execute("""alter table {0} set tblproperties ('external.table.purge'=
      'True');""".format(qualified_table_name))


def create_table_from_parquet(impala_client, unique_database, table_name):
  """Utility function to create a database table from a Parquet file. A Parquet file
  must exist in $IMPALA_HOME/testdata/data with the name 'table_name'.parquet."""
  create_table_from_file(impala_client, unique_database, table_name, 'parquet')


def create_table_from_orc(impala_client, unique_database, table_name):
  """Utility function to create a database table from an ORC file. An ORC file must
  exist in $IMPALA_HOME/testdata/data with the name 'table_name'.orc."""
  create_table_from_file(impala_client, unique_database, table_name, 'orc')


def create_table_from_file(impala_client, unique_database, table_name, file_format):
  """Creates a table in 'unique_database' from the file
  $IMPALA_HOME/testdata/data/<table_name>.<file_format>, inferring the schema from
  the file and then loading it with a LOAD DATA statement."""
  filename = '{0}.{1}'.format(table_name, file_format)
  local_file = os.path.join(os.environ['IMPALA_HOME'],
                            'testdata/data/{0}'.format(filename))
  assert os.path.isfile(local_file)
  # Put the file in the database's directory (not the table directory) so it is
  # available for a LOAD DATA statement.
  hdfs_file = get_fs_path(
    os.path.join("/test-warehouse", "{0}.db".format(unique_database), filename))
  # Note: -d skips a staging copy.
  check_call(['hdfs', 'dfs', '-put', '-f', '-d', local_file, hdfs_file])
  # Create the table with a schema inferred from the file, then load the file.
  qualified_table_name = '{0}.{1}'.format(unique_database, table_name)
  impala_client.execute('create table {0} like {1} "{2}" stored as {1}'.format(
    qualified_table_name, file_format, hdfs_file))
  impala_client.execute('load data inpath "{0}" into table {1}'.format(
    hdfs_file, qualified_table_name))
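

# Example usage (hypothetical file and fixture names): given a file
# $IMPALA_HOME/testdata/data/my_tbl.parquet, a test could call:
#
#   create_table_from_parquet(self.client, unique_database, 'my_tbl')
#
# which roughly issues:
#
#   CREATE TABLE <db>.my_tbl LIKE PARQUET "<hdfs_file>" STORED AS PARQUET
#   LOAD DATA INPATH "<hdfs_file>" INTO TABLE <db>.my_tbl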


def create_table_and_copy_files(impala_client, create_stmt, unique_database, table_name,
                                files):
  """Creates a table by formatting 'create_stmt' with the {db} and {tbl} placeholders,
  copies 'files' (paths relative to $IMPALA_HOME) into the table's directory, and
  refreshes the table so the new files are visible."""
  # Create the table.
  create_stmt = create_stmt.format(db=unique_database, tbl=table_name)
  impala_client.execute(create_stmt)
  hdfs_dir = get_fs_path(
    os.path.join("/test-warehouse", unique_database + ".db", table_name))
  copy_files_to_hdfs_dir(files, hdfs_dir)
  # Refresh the table metadata so Impala sees the new files.
  refresh_stmt = "refresh {0}.{1}".format(unique_database, table_name)
  impala_client.execute(refresh_stmt)
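

# Example usage (hypothetical statement and file names): the CREATE statement uses
# {db} and {tbl} placeholders, and the file paths are relative to $IMPALA_HOME:
#
#   create_table_and_copy_files(
#       self.client, 'create table {db}.{tbl} (i int) stored as parquet',
#       unique_database, 'my_tbl', ['testdata/data/my_file.parquet'])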


def copy_files_to_hdfs_dir(files, hdfs_dir):
  """Copies 'files' (paths relative to $IMPALA_HOME; a leading '/' is tolerated) into
  'hdfs_dir' with a single 'hdfs dfs -put' (-d skips a staging copy)."""
  # Build the list of absolute source paths.
  source_files = []
  for local_file in files:
    # Cut off a leading '/': os.path.join() discards everything before an absolute
    # second argument.
    local_file = local_file if local_file[0] != '/' else local_file[1:]
    local_file = os.path.join(os.environ['IMPALA_HOME'], local_file)
    assert os.path.isfile(local_file)
    source_files.append(local_file)
  # Issue a single put for all the files.
  check_call(['hdfs', 'dfs', '-put', '-f', '-d'] + source_files + [hdfs_dir])
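

# Example usage (hypothetical paths): copy two local files into an HDFS directory
# with a single put:
#
#   copy_files_to_hdfs_dir(['testdata/data/a.parq', 'testdata/data/b.parq'],
#                          get_fs_path('/test-warehouse/my_dir'))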


def grep_dir(dir, search, filename_search=""):
  '''Recursively search 'dir' for files whose paths match the 'filename_search' regex
  and whose contents match the 'search' regex, and return the matched lines grouped
  by file name.
  '''
  filename_matcher = re.compile(filename_search)
  matching_files = dict()
  for dir_name, _, file_names in os.walk(dir):
    for file_name in file_names:
      file_path = os.path.join(dir_name, file_name)
      # Skip symlinks and files whose path does not match 'filename_search'.
      if os.path.islink(file_path) or not filename_matcher.search(file_path):
        continue
      with open(file_path) as file:
        matching_lines = grep_file(file, search)
      if matching_lines:
        matching_files[file_name] = matching_lines
  return matching_files


def grep_file(file, search):
  '''Return lines in 'file' that match the 'search' regex. 'file' must already be
  opened.
  '''
  matcher = re.compile(search)
  matching_lines = list()
  for line in file:
    if matcher.search(line):
      matching_lines.append(line)
  return matching_lines
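

# Example usage (hypothetical directory and patterns): find ERROR lines in all
# '.log' files under the Impala log directory:
#
#   matches = grep_dir(os.path.join(os.environ['IMPALA_HOME'], 'logs'),
#                      search='ERROR', filename_search=r'\.log$')
#   for file_name, lines in matches.items():
#       print('{0}: {1} matching lines'.format(file_name, len(lines)))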


def assert_file_in_dir_contains(dir, search):
  '''Asserts that at least one file in 'dir' contains the 'search' term.'''
  results = grep_dir(dir, search)
  assert results, "%s should have a file containing '%s' but no file was found" \
      % (dir, search)


def assert_no_files_in_dir_contain(dir, search):
  '''Asserts that no file in 'dir' contains the 'search' term.'''
  results = grep_dir(dir, search)
  assert not results, \
      "%s should not have any file containing '%s' but a file was found" \
      % (dir, search)
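

# Example usage (hypothetical directory and search terms):
#
#   assert_file_in_dir_contains('/tmp/my_test_logs', 'query finished')
#   assert_no_files_in_dir_contain('/tmp/my_test_logs', 'Traceback')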