#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
This script downloads and parses the AWS EC2 pricing data from
https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/index.json
and writes Python modules with constants describing EC2's instance sizes and regions.
Use it as follows (run it from the root of the repository):
$ python contrib/scrape-ec2-sizes.py
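
It writes the following modules (paths relative to the repository root):
- libcloud/compute/constants/ec2_instance_types.py
- libcloud/compute/constants/ec2_region_details_complete.py
- libcloud/compute/constants/ec2_region_details_partial.py

The generated files are plain Python constant modules. As an illustrative
example (after the script has been run; ``m5.large`` is just one instance type):

>>> from libcloud.compute.constants.ec2_instance_types import INSTANCE_TYPES
>>> INSTANCE_TYPES['m5.large']['ram']  # RAM in MB
8192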
"""
import re
import os
import json
import atexit

import requests
import tqdm  # pylint: disable=import-error
import ijson  # pylint: disable=import-error

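# Local cache path for the downloaded pricing JSON; override it with the
# TMP_JSON environment variable.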
FILEPATH = os.environ.get('TMP_JSON', '/tmp/ec.json')
URL = "https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/index.json"
IGNORED_FIELDS = ['locationType', 'operatingSystem']
REG1_STORAGE = re.compile(r'(\d+) x ([0-9,]+)')
REG2_STORAGE = re.compile(r'(\d+) GB.*?')
REG_BANDWIDTH = re.compile(r'\D*(\d+)\D*')
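# Illustrative attribute values these patterns are meant to match:
#   REG1_STORAGE:  "2 x 1,900 NVMe SSD"  -> disk count and per-disk size in GB
#   REG2_STORAGE:  "900 GB NVMe SSD"     -> single disk size in GB
#   REG_BANDWIDTH: "Up to 10 Gigabit"    -> nominal bandwidth figure
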
# From <https://aws.amazon.com/marketplace/help/200777880>
REGION_DETAILS = {
# America
'US East (N. Virginia)': {
'id': 'us-east-1',
'endpoint': 'ec2.us-east-1.amazonaws.com',
'api_name': 'ec2_us_east',
'country': 'USA',
'signature_version': '2',
},
'US East (Ohio)': {
'id': 'us-east-2',
'endpoint': 'ec2.us-east-2.amazonaws.com',
'api_name': 'ec2_us_east_ohio',
'country': 'USA',
'signature_version': '4',
},
'US West (N. California)': {
'id': 'us-west-1',
'endpoint': 'ec2.us-west-1.amazonaws.com',
'api_name': 'ec2_us_west',
'country': 'USA',
'signature_version': '2',
},
'US West (Oregon)': {
'id': 'us-west-2',
'endpoint': 'ec2.us-west-2.amazonaws.com',
'api_name': 'ec2_us_west_oregon',
        'country': 'USA',
'signature_version': '2',
},
'Canada (Central)': {
'id': 'ca-central-1',
'endpoint': 'ec2.ca-central-1.amazonaws.com',
'api_name': 'ec2_ca_central_1',
'country': 'Canada',
'signature_version': '4',
},
'South America (Sao Paulo)': {
'id': 'sa-east-1',
'endpoint': 'ec2.sa-east-1.amazonaws.com',
'api_name': 'ec2_sa_east',
'country': 'Brazil',
'signature_version': '2',
},
'AWS GovCloud (US)': {
'id': 'us-gov-west-1',
'endpoint': 'ec2.us-gov-west-1.amazonaws.com',
'api_name': 'ec2_us_govwest',
        'country': 'USA',
'signature_version': '2',
},
# EU
'eu-west-1': {
'id': 'eu-west-1',
'endpoint': 'ec2.eu-west-1.amazonaws.com',
'api_name': 'ec2_eu_west',
'country': 'Ireland',
'signature_version': '2',
},
'EU (Ireland)': { # Duplicate from AWS' JSON
'id': 'eu-west-1',
'endpoint': 'ec2.eu-west-1.amazonaws.com',
'api_name': 'ec2_eu_west',
'country': 'Ireland',
'signature_version': '2',
},
'EU (London)': {
'id': 'eu-west-2',
'endpoint': 'ec2.eu-west-2.amazonaws.com',
'api_name': 'ec2_eu_west_london',
'country': 'United Kingdom',
'signature_version': '4',
},
'EU (Paris)': {
'id': 'eu-west-3',
'endpoint': 'ec2.eu-west-3.amazonaws.com',
'api_name': 'ec2_eu_west_paris',
'country': 'France',
'signature_version': '4',
},
'EU (Frankfurt)': {
'id': 'eu-central-1',
'endpoint': 'ec2.eu-central-1.amazonaws.com',
'api_name': 'ec2_eu_central',
        'country': 'Germany',
'signature_version': '4',
},
'EU (Stockholm)': {
'id': 'eu-north-1',
'endpoint': 'ec2.eu-north-1.amazonaws.com',
'api_name': 'ec2_eu_north_stockholm',
        'country': 'Sweden',
'signature_version': '4',
},
# Asia
'Asia Pacific (Mumbai)': {
'id': 'ap-south-1',
'endpoint': 'ec2.ap-south-1.amazonaws.com',
'api_name': 'ec2_ap_south_1',
'country': 'India',
'signature_version': '4',
},
'Asia Pacific (Singapore)': {
'id': 'ap-southeast-1',
'endpoint': 'ec2.ap-southeast-1.amazonaws.com',
'api_name': 'ec2_ap_southeast',
'country': 'Singapore',
'signature_version': '2',
},
'Asia Pacific (Sydney)': {
'id': 'ap-southeast-2',
'endpoint': 'ec2.ap-southeast-2.amazonaws.com',
'api_name': 'ec2_ap_southeast_2',
'country': 'Australia',
'signature_version': '2',
},
'Asia Pacific (Tokyo)': {
'id': 'ap-northeast-1',
'endpoint': 'ec2.ap-northeast-1.amazonaws.com',
'api_name': 'ec2_ap_northeast',
'country': 'Japan',
'signature_version': '2',
},
'Asia Pacific (Seoul)': {
'id': 'ap-northeast-2',
'endpoint': 'ec2.ap-northeast-2.amazonaws.com',
'api_name': 'ec2_ap_northeast',
'country': 'South Korea',
'signature_version': '4',
},
'Asia Pacific (Osaka-Local)': {
'id': 'ap-northeast-3',
'endpoint': 'ec2.ap-northeast-3.amazonaws.com',
'api_name': 'ec2_ap_northeast',
'country': 'Japan',
'signature_version': '4',
},
# Not in JSON
'China (Beijing)': {
'id': 'cn-north-1',
'endpoint': 'ec2.cn-north-1.amazonaws.com.cn',
'api_name': 'ec2_cn_north',
'country': 'China',
'signature_version': '4',
},
'China (Ningxia)': {
'id': 'cn-northwest-1',
'endpoint': 'ec2.cn-northwest-1.amazonaws.com.cn',
'api_name': 'ec2_cn_northwest',
'country': 'China',
'signature_version': '4',
},
}

FILE_HEADER = """
# File generated by contrib/scrape-ec2-sizes.py script - DO NOT EDIT manually
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""".strip()
def download_json():
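    """
    Return an open file object for the pricing JSON, downloading it to
    FILEPATH first unless a cached copy already exists. An atexit handler
    removes a partially downloaded file so an interrupted run does not
    leave a corrupt cache behind.
    """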
if os.path.isfile(FILEPATH):
print("Using data from existing cached file %s" % (FILEPATH))
return open(FILEPATH, 'r')
def remove_partial_cached_file():
if os.path.isfile(FILEPATH):
os.remove(FILEPATH)
# File not cached locally, download data and cache it
with requests.get(URL, stream=True) as response:
atexit.register(remove_partial_cached_file)
total_size_in_bytes = int(response.headers.get('content-length', 0))
progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
chunk_size = 10 * 1024 * 1024
with open(FILEPATH, 'wb') as fp:
            # NOTE: We could use shutil.copyfileobj(response.raw, fp, chunk_size)
            # here, but we read the (massive) response with iter_content() and a
            # large chunk size instead so the progress bar can be updated as the
            # data arrives.
for chunk_data in response.iter_content(chunk_size):
progress_bar.update(len(chunk_data))
fp.write(chunk_data)
progress_bar.close()
atexit.unregister(remove_partial_cached_file)
    return open(FILEPATH, 'r')


def get_json():
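    """
    Return a tuple of (open file object with the pricing JSON, boolean
    indicating whether the data came from an existing cached file).
    """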
if not os.path.isfile(FILEPATH):
return download_json(), False
print("Using data from existing cached file %s" % (FILEPATH))
    return open(FILEPATH, 'r'), True


def filter_extras(extras):
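    """
    Return a copy of a product's attributes without the per-offer keys
    (capacity status, tenancy, usage type, etc.) that are not meaningful
    as instance size metadata.
    """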
return {
key: extras[key] for key in extras
if key not in [
'capacitystatus', 'ebsOptimized', 'operation', 'licenseModel',
'preInstalledSw', 'tenancy', 'usagetype'
]
    }


def parse():
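    """
    Stream-parse the pricing JSON with ijson and return a tuple of
    (sizes, regions): instance size details keyed by instance type, and
    region details (including the instance types offered in each region)
    keyed by region id.
    """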
# Set vars
sizes = {}
regions = {r['id']: r for r in REGION_DETAILS.values()}
for region_id in regions:
regions[region_id]['instance_types'] = []
# Parse
json_file, from_file = get_json()
products_data = ijson.items(json_file, 'products')
try:
products_data = next(products_data)
except ijson.common.IncompleteJSONError as e:
        # This likely indicates that the cached file is incomplete or corrupt,
        # so we delete it and re-download the data.
if from_file:
os.remove(FILEPATH)
json_file, from_file = get_json()
products_data = ijson.items(json_file, 'products')
products_data = next(products_data)
else:
raise e
for sku in products_data:
if products_data[sku].get('productFamily', "unknown") != "Compute Instance":
continue
location = products_data[sku]['attributes'].pop('location')
if location not in REGION_DETAILS:
continue
# Get region & size ID
region_id = REGION_DETAILS[location]['id']
instance_type = products_data[sku]['attributes']['instanceType']
# Add size to region
if instance_type not in regions[region_id]['instance_types']:
regions[region_id]['instance_types'].append(instance_type)
# Parse sizes
if instance_type not in sizes:
for field in IGNORED_FIELDS:
products_data[sku]['attributes'].pop(field, None)
# Compute RAM
ram = int(float(products_data[sku]['attributes']['memory'].split()[0]
.replace(',', '')) * 1024)
            # Compute bandwidth
bw_match = REG_BANDWIDTH.match(products_data[sku]['attributes']['networkPerformance'])
if bw_match is not None:
bandwidth = int(bw_match.groups()[0])
else:
bandwidth = None
sizes[instance_type] = {
'id': instance_type,
'name': instance_type,
'ram': ram,
'bandwidth': bandwidth,
'extra': filter_extras(products_data[sku]['attributes']),
}
if products_data[sku]['attributes'].get('storage') != "EBS only":
match = REG1_STORAGE.match(products_data[sku]['attributes']['storage'])
if match:
disk_number, disk_size = match.groups()
else:
match = REG2_STORAGE.match(products_data[sku]['attributes']['storage'])
if match:
disk_number, disk_size = 1, match.groups()[0]
else:
disk_number, disk_size = 0, '0'
disk_number, disk_size = int(disk_number), int(disk_size.replace(',', ''))
sizes[instance_type]['disk'] = disk_number * disk_size
else:
sizes[instance_type]['disk'] = 0
# Sort
for region in regions:
regions[region]['instance_types'] = sorted(regions[region]['instance_types'])
    return sizes, regions


def dump():
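    """
    Scrape the data and write the generated constants modules
    (ec2_instance_types.py, ec2_region_details_complete.py and
    ec2_region_details_partial.py) under libcloud/compute/constants/.
    """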
print("Scraping size data, this may take up to 10-15 minutes...")
sizes, regions = parse()
separators = (',', ': ')
# 1. Write file with instance types constants
file_path = "libcloud/compute/constants/ec2_instance_types.py"
with open(file_path, "w") as fp:
fp.write(FILE_HEADER + "\n")
fp.write("\n")
fp.write("INSTANCE_TYPES = " + json.dumps(sizes, indent=4, sort_keys=True,
separators=separators).replace('null', 'None'))
print("")
print("Data written to %s" % (file_path))
print("")
# 2. Write file with full details for each region
file_path = "libcloud/compute/constants/ec2_region_details_complete.py"
with open(file_path, "w") as fp:
fp.write(FILE_HEADER + "\n")
fp.write("\n")
fp.write("REGION_DETAILS = " + json.dumps(regions, indent=4, sort_keys=True,
separators=separators).replace('null', 'None'))
print("Data written to %s" % (file_path))
print("")
# 3. Write file with partial region details (everything except instance_types attribute)
regions_partial = {}
keys_to_keep = ["api_name", "country", "id", "endpoint", "signature_version"]
for region_name, region_details in regions.items():
regions_partial[region_name] = {}
for key, value in region_details.items():
if key not in keys_to_keep:
continue
regions_partial[region_name][key] = value
file_path = "libcloud/compute/constants/ec2_region_details_partial.py"
with open(file_path, "w") as fp:
fp.write(FILE_HEADER + "\n")
fp.write("\n")
fp.write("REGION_DETAILS = " + json.dumps(regions_partial, indent=4, sort_keys=True,
separators=separators).replace('null', 'None'))
print("Data written to %s" % (file_path))
print("")
if __name__ == '__main__':
dump()