| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| |
| import os |
| import fnmatch |
| import re |
| |
| from tests.performance.query import Query |
| from tests.util.test_file_parser import parse_query_test_file |
| |
| class Workload(object): |
| """Represents a workload. |
| |
| A workload is the internal representation for the set of queries on a dataset. It |
| consists of the dataset name, and a mapping of query names to query strings. |
| |
| Args: |
| name (str): workload name. (Eg. tpch) |
| query_name_filters (list of str): List of regular expressions used for matching query |
| names |
| |
| Attributes: |
| name (str): workload name (Eg. tpch) |
| _query_map (dict): contains a query name -> string mapping; mapping of query name to |
| section (ex. "TPCH-Q10" -> "select * from...") |
| """ |
| |
| WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR'] |
| |
| def __init__(self, name, query_name_filters=None): |
| self._name = name |
| self._query_map = dict() |
| # Build the query name -> string mapping in the c'tor. We want to fail fast and early |
| # if the user input is bad. |
| self._validate_and_load(query_name_filters) |
| |
| @property |
| def name(self): |
| return self._name |
| |
| @property |
| def query_map(self): |
| return self._query_map |
| |
| def _validate_and_load(self, query_name_filters): |
| """Validates that the Workload is legal.""" |
| query_name_filters = map(str.strip, query_name_filters) if query_name_filters else [] |
| self._base_dir = os.path.join(Workload.WORKLOAD_DIR, self._name, 'queries') |
| # Check whether the workload name corresponds to an existing directory. |
| if not os.path.isdir(self._base_dir): |
| raise ValueError("Workload %s not found in %s" % (self._name, self._base_dir)) |
| sections = list() |
| # Parse all queries files for the given workload. |
| for file_name in self._list_query_files(): |
| sections.extend(parse_query_test_file(file_name)) |
| # If the user has specified query names, check whether all the user specified queries |
| # exist in the query files. |
| all_query_names = [s['QUERY_NAME'] for s in sections if s['QUERY_NAME'].strip()] |
| regex = re.compile(r'|'.join(['^%s$' % n for n in query_name_filters]), re.I) |
| matched_query_names = filter(lambda x: re.match(regex, x), all_query_names) |
| assert len(matched_query_names) > 0, "No matching queries found for %s" % self._name |
| # Filter the sections based on the queries the user wants. |
| sections = filter(lambda x: x['QUERY_NAME'] in matched_query_names, sections) |
| # Add the filtered queries to the query map |
| for section in sections: |
| self._query_map[section['QUERY_NAME']] = section['QUERY'] |
| |
| def _list_query_files(self): |
| """Return a list of all the .test files that contain queries""" |
| query_files = list() |
| for root, dirs, file_names in os.walk(self._base_dir): |
| for file_name in fnmatch.filter(file_names, '*.test'): |
| query_files.append(os.path.join(root, file_name)) |
| assert len(query_files) > 0, "No Query Files found in %s" % self._base_dir |
| return query_files |
| |
| def construct_queries(self, test_vector, scale_factor): |
| """Transform a query map into a list of query objects. |
| |
| Transform all the queries in the workload's query map to query objects based on the |
| input test vector and scale factor. |
| |
| Args: |
| test_vector (?): query vector |
| scale_factor (str): eg. "300gb" |
| |
| Returns: |
| (list of Query): these will be consumed by ? |
| """ |
| |
| queries = list() |
| for query_name, query_str in self._query_map.iteritems(): |
| queries.append(Query(name=query_name, |
| query_str=query_str, |
| workload=self._name, |
| scale_factor=scale_factor, |
| test_vector=test_vector)) |
| return queries |
| |