#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
This script downloads and parses AWS EC2 from
https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/index.json.
It writes a Python module with constants about EC2's sizes and regions.
Use it as following (run it in the root of the repo directory):
$ python contrib/scrape-ec2-sizes.py
"""
import re
import os
import json
import atexit

import requests
import tqdm  # pylint: disable=import-error
import ijson  # pylint: disable=import-error

FILEPATH = os.environ.get("TMP_JSON", "/tmp/ec.json")
URL = "https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/index.json"
IGNORED_FIELDS = ["locationType", "operatingSystem"]
REG1_STORAGE = re.compile(r"(\d+) x ([0-9,]+)")
REG2_STORAGE = re.compile(r"(\d+) GB.*?")
REG_BANDWIDTH = re.compile(r"\D*(\d+)\D*")
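# Illustrative matches for the regexes above, based on the string formats the
# parser below expects from the pricing feed (example values assumed):
#   REG1_STORAGE:  "2 x 1,920 SSD"    -> groups ("2", "1,920")  (count x size)
#   REG2_STORAGE:  "900 GB NVMe SSD"  -> groups ("900",)        (single disk)
#   REG_BANDWIDTH: "Up to 10 Gigabit" -> groups ("10",)         (Gbit/s)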
# From <https://aws.amazon.com/marketplace/help/200777880>
REGION_DETAILS = {
# America
"US East (N. Virginia)": {
"id": "us-east-1",
"endpoint": "ec2.us-east-1.amazonaws.com",
"api_name": "ec2_us_east",
"country": "USA",
"signature_version": "2",
},
"US East (Ohio)": {
"id": "us-east-2",
"endpoint": "ec2.us-east-2.amazonaws.com",
"api_name": "ec2_us_east_ohio",
"country": "USA",
"signature_version": "4",
},
"US West (N. California)": {
"id": "us-west-1",
"endpoint": "ec2.us-west-1.amazonaws.com",
"api_name": "ec2_us_west",
"country": "USA",
"signature_version": "2",
},
"US West (Oregon)": {
"id": "us-west-2",
"endpoint": "ec2.us-west-2.amazonaws.com",
"api_name": "ec2_us_west_oregon",
"country": "US",
"signature_version": "2",
},
"Canada (Central)": {
"id": "ca-central-1",
"endpoint": "ec2.ca-central-1.amazonaws.com",
"api_name": "ec2_ca_central_1",
"country": "Canada",
"signature_version": "4",
},
"South America (Sao Paulo)": {
"id": "sa-east-1",
"endpoint": "ec2.sa-east-1.amazonaws.com",
"api_name": "ec2_sa_east",
"country": "Brazil",
"signature_version": "2",
},
"AWS GovCloud (US)": {
"id": "us-gov-west-1",
"endpoint": "ec2.us-gov-west-1.amazonaws.com",
"api_name": "ec2_us_govwest",
"country": "US",
"signature_version": "2",
},
# EU
"eu-west-1": {
"id": "eu-west-1",
"endpoint": "ec2.eu-west-1.amazonaws.com",
"api_name": "ec2_eu_west",
"country": "Ireland",
"signature_version": "2",
},
"EU (Ireland)": { # Duplicate from AWS' JSON
"id": "eu-west-1",
"endpoint": "ec2.eu-west-1.amazonaws.com",
"api_name": "ec2_eu_west",
"country": "Ireland",
"signature_version": "2",
},
"EU (London)": {
"id": "eu-west-2",
"endpoint": "ec2.eu-west-2.amazonaws.com",
"api_name": "ec2_eu_west_london",
"country": "United Kingdom",
"signature_version": "4",
},
"EU (Milan)": {
"id": "eu-south-1",
"endpoint": "ec2.eu-south-1.amazonaws.com",
"api_name": "ec2_eu_south",
"country": "Italy",
"signature_version": "4",
},
"EU (Paris)": {
"id": "eu-west-3",
"endpoint": "ec2.eu-west-3.amazonaws.com",
"api_name": "ec2_eu_west_paris",
"country": "France",
"signature_version": "4",
},
"EU (Frankfurt)": {
"id": "eu-central-1",
"endpoint": "ec2.eu-central-1.amazonaws.com",
"api_name": "ec2_eu_central",
"country": "Frankfurt",
"signature_version": "4",
},
"EU (Stockholm)": {
"id": "eu-north-1",
"endpoint": "ec2.eu-north-1.amazonaws.com",
"api_name": "ec2_eu_north_stockholm",
"country": "Stockholm",
"signature_version": "4",
},
# Asia
"Asia Pacific (Mumbai)": {
"id": "ap-south-1",
"endpoint": "ec2.ap-south-1.amazonaws.com",
"api_name": "ec2_ap_south_1",
"country": "India",
"signature_version": "4",
},
"Asia Pacific (Singapore)": {
"id": "ap-southeast-1",
"endpoint": "ec2.ap-southeast-1.amazonaws.com",
"api_name": "ec2_ap_southeast",
"country": "Singapore",
"signature_version": "2",
},
"Asia Pacific (Sydney)": {
"id": "ap-southeast-2",
"endpoint": "ec2.ap-southeast-2.amazonaws.com",
"api_name": "ec2_ap_southeast_2",
"country": "Australia",
"signature_version": "2",
},
"Asia Pacific (Tokyo)": {
"id": "ap-northeast-1",
"endpoint": "ec2.ap-northeast-1.amazonaws.com",
"api_name": "ec2_ap_northeast",
"country": "Japan",
"signature_version": "2",
},
"Asia Pacific (Seoul)": {
"id": "ap-northeast-2",
"endpoint": "ec2.ap-northeast-2.amazonaws.com",
"api_name": "ec2_ap_northeast",
"country": "South Korea",
"signature_version": "4",
},
"Asia Pacific (Osaka-Local)": {
"id": "ap-northeast-3",
"endpoint": "ec2.ap-northeast-3.amazonaws.com",
"api_name": "ec2_ap_northeast",
"country": "Japan",
"signature_version": "4",
},
"Asia Pacific (Hong Kong)": {
"id": "ap-east-1",
"endpoint": "ec2.ap-east-1.amazonaws.com",
"api_name": "ec2_ap_east",
"country": "Hong Kong",
"signature_version": "2",
},
# Not in JSON
"China (Beijing)": {
"id": "cn-north-1",
"endpoint": "ec2.cn-north-1.amazonaws.com.cn",
"api_name": "ec2_cn_north",
"country": "China",
"signature_version": "4",
},
"China (Ningxia)": {
"id": "cn-northwest-1",
"endpoint": "ec2.cn-northwest-1.amazonaws.com.cn",
"api_name": "ec2_cn_northwest",
"country": "China",
"signature_version": "4",
},
}
FILE_HEADER = """
# File generated by contrib/scrape-ec2-sizes.py script - DO NOT EDIT manually
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""".strip()
def download_json():
if os.path.isfile(FILEPATH):
print("Using data from existing cached file %s" % (FILEPATH))
return open(FILEPATH, "r")
def remove_partial_cached_file():
if os.path.isfile(FILEPATH):
os.remove(FILEPATH)
# File not cached locally, download data and cache it
with requests.get(URL, stream=True) as response:
atexit.register(remove_partial_cached_file)
total_size_in_bytes = int(response.headers.get("content-length", 0))
progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
chunk_size = 10 * 1024 * 1024
with open(FILEPATH, "wb") as fp:
            # NOTE: We use response.iter_content() with a large chunk size
            # rather than shutil.copyfileobj() so we can update the progress
            # bar as the (very large) file downloads. The copyfileobj
            # equivalent, without progress reporting, would be:
            #   shutil.copyfileobj(response.raw, fp, chunk_size)
            for chunk_data in response.iter_content(chunk_size):
                progress_bar.update(len(chunk_data))
                fp.write(chunk_data)
progress_bar.close()
atexit.unregister(remove_partial_cached_file)
    return open(FILEPATH, "r")


def get_json():
if not os.path.isfile(FILEPATH):
return download_json(), False
print("Using data from existing cached file %s" % (FILEPATH))
    return open(FILEPATH, "r"), True


def filter_extras(extras):
return {
key: extras[key]
for key in extras
if key
not in [
"capacitystatus",
"ebsOptimized",
"operation",
"licenseModel",
"preInstalledSw",
"tenancy",
"usagetype",
]
}
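# Illustrative call (attribute names as they appear in the feed are assumed):
#   filter_extras({"vcpu": "4", "ecu": "8", "tenancy": "Shared"})
#   -> {"vcpu": "4", "ecu": "8"}  ("tenancy" is in the exclusion list above)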


def parse():
# Set vars
sizes = {}
regions = {r["id"]: r for r in REGION_DETAILS.values()}
for region_id in regions:
regions[region_id]["instance_types"] = []
# Parse
json_file, from_file = get_json()
products_data = ijson.items(json_file, "products")
try:
products_data = next(products_data)
except ijson.common.IncompleteJSONError as e:
        # This likely indicates that the cached file is incomplete or
        # corrupt, so we delete it and re-download the data
if from_file:
os.remove(FILEPATH)
json_file, from_file = get_json()
products_data = ijson.items(json_file, "products")
products_data = next(products_data)
else:
raise e
for sku in products_data:
if products_data[sku].get("productFamily", "unknown") != "Compute Instance":
continue
location = products_data[sku]["attributes"].pop("location")
if location not in REGION_DETAILS:
continue
# Get region & size ID
region_id = REGION_DETAILS[location]["id"]
instance_type = products_data[sku]["attributes"]["instanceType"]
# Add size to region
if instance_type not in regions[region_id]["instance_types"]:
regions[region_id]["instance_types"].append(instance_type)
# Parse sizes
if instance_type not in sizes:
for field in IGNORED_FIELDS:
products_data[sku]["attributes"].pop(field, None)
# Compute RAM
ram = int(
float(
products_data[sku]["attributes"]["memory"]
.split()[0]
.replace(",", "")
)
* 1024
)
            # Compute bandwidth
bw_match = REG_BANDWIDTH.match(
products_data[sku]["attributes"]["networkPerformance"]
)
if bw_match is not None:
bandwidth = int(bw_match.groups()[0])
else:
bandwidth = None
sizes[instance_type] = {
"id": instance_type,
"name": instance_type,
"ram": ram,
"bandwidth": bandwidth,
"extra": filter_extras(products_data[sku]["attributes"]),
}
if products_data[sku]["attributes"].get("storage") != "EBS only":
match = REG1_STORAGE.match(products_data[sku]["attributes"]["storage"])
if match:
disk_number, disk_size = match.groups()
else:
match = REG2_STORAGE.match(
products_data[sku]["attributes"]["storage"]
)
if match:
disk_number, disk_size = 1, match.groups()[0]
else:
disk_number, disk_size = 0, "0"
disk_number, disk_size = (
int(disk_number),
int(disk_size.replace(",", "")),
)
sizes[instance_type]["disk"] = disk_number * disk_size
else:
sizes[instance_type]["disk"] = 0
products_data[sku]["attributes"]
# Sort
for region in regions:
regions[region]["instance_types"] = sorted(regions[region]["instance_types"])
return sizes, regions
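# parse() returns (sizes, regions): sizes maps instance type -> size dict,
# regions maps region id -> region details plus a sorted "instance_types"
# list, e.g. (illustrative): regions["us-east-1"]["instance_types"][0]
# might be "a1.2xlarge".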


def dump():
    print("Scraping size data, this may take 10-15 minutes...")
sizes, regions = parse()
separators = (",", ": ")
# 1. Write file with instance types constants
file_path = "libcloud/compute/constants/ec2_instance_types.py"
with open(file_path, "w") as fp:
fp.write(FILE_HEADER + "\n")
fp.write("\n")
fp.write(
"INSTANCE_TYPES = "
+ json.dumps(
sizes, indent=4, sort_keys=True, separators=separators
).replace("null", "None")
)
print("")
print("Data written to %s" % (file_path))
print("")
# 2. Write file with full details for each region
file_path = "libcloud/compute/constants/ec2_region_details_complete.py"
with open(file_path, "w") as fp:
fp.write(FILE_HEADER + "\n")
fp.write("\n")
fp.write(
"REGION_DETAILS = "
+ json.dumps(
regions, indent=4, sort_keys=True, separators=separators
).replace("null", "None")
)
print("Data written to %s" % (file_path))
print("")
# 3. Write file with partial region details (everything except instance_types attribute)
regions_partial = {}
keys_to_keep = ["api_name", "country", "id", "endpoint", "signature_version"]
for region_name, region_details in regions.items():
regions_partial[region_name] = {}
for key, value in region_details.items():
if key not in keys_to_keep:
continue
regions_partial[region_name][key] = value
file_path = "libcloud/compute/constants/ec2_region_details_partial.py"
with open(file_path, "w") as fp:
fp.write(FILE_HEADER + "\n")
fp.write("\n")
fp.write(
"REGION_DETAILS = "
+ json.dumps(
regions_partial, indent=4, sort_keys=True, separators=separators
).replace("null", "None")
)
print("Data written to %s" % (file_path))
print("")
if __name__ == "__main__":
dump()