| #!/usr/bin/env python |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| """ |
This script downloads and parses AWS EC2 pricing data from
| https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/index.json. |
| It writes a Python module with constants about EC2's sizes and regions. |
| |
Use it as follows (run it from the root of the repo directory):
| $ python contrib/scrape-ec2-sizes.py |
| """ |
| |
| import os |
| import re |
| import json |
| import atexit |
| |
| import tqdm # pylint: disable=import-error |
| import ijson # pylint: disable=import-error |
| import requests |
| |
| FILEPATH = os.environ.get("TMP_JSON", "/tmp/ec.json") |
| URL = "https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/index.json" |
| IGNORED_FIELDS = ["locationType", "operatingSystem"] |
| REG1_STORAGE = re.compile(r"(\d+) x ([0-9,]+)") |
| REG2_STORAGE = re.compile(r"(\d+) GB.*?") |
| REG_BANDWIDTH = re.compile(r"\D*(\d+)\D*") |
| # From <https://aws.amazon.com/marketplace/help/200777880> |
| REGION_DETAILS = { |
| # America |
| "US East (N. Virginia)": { |
| "id": "us-east-1", |
| "endpoint": "ec2.us-east-1.amazonaws.com", |
| "api_name": "ec2_us_east", |
| "country": "USA", |
| "signature_version": "2", |
| }, |
| "US East (Ohio)": { |
| "id": "us-east-2", |
| "endpoint": "ec2.us-east-2.amazonaws.com", |
| "api_name": "ec2_us_east_ohio", |
| "country": "USA", |
| "signature_version": "4", |
| }, |
| "US West (N. California)": { |
| "id": "us-west-1", |
| "endpoint": "ec2.us-west-1.amazonaws.com", |
| "api_name": "ec2_us_west", |
| "country": "USA", |
| "signature_version": "2", |
| }, |
| "US West (Oregon)": { |
| "id": "us-west-2", |
| "endpoint": "ec2.us-west-2.amazonaws.com", |
| "api_name": "ec2_us_west_oregon", |
| "country": "US", |
| "signature_version": "2", |
| }, |
| "Canada (Central)": { |
| "id": "ca-central-1", |
| "endpoint": "ec2.ca-central-1.amazonaws.com", |
| "api_name": "ec2_ca_central_1", |
| "country": "Canada", |
| "signature_version": "4", |
| }, |
| "South America (Sao Paulo)": { |
| "id": "sa-east-1", |
| "endpoint": "ec2.sa-east-1.amazonaws.com", |
| "api_name": "ec2_sa_east", |
| "country": "Brazil", |
| "signature_version": "2", |
| }, |
| "AWS GovCloud (US)": { |
| "id": "us-gov-west-1", |
| "endpoint": "ec2.us-gov-west-1.amazonaws.com", |
| "api_name": "ec2_us_govwest", |
| "country": "US", |
| "signature_version": "2", |
| }, |
| # Africa |
| "af-south-1": { |
| "id": "af-south-1", |
| "endpoint": "ec2.af-south-1.amazonaws.com", |
| "api_name": "ec2_af_south", |
| "country": "South Africa", |
| "signature_version": "4", |
| }, |
| # EU |
| "eu-west-1": { |
| "id": "eu-west-1", |
| "endpoint": "ec2.eu-west-1.amazonaws.com", |
| "api_name": "ec2_eu_west", |
| "country": "Ireland", |
| "signature_version": "2", |
| }, |
| "EU (Ireland)": { # Duplicate from AWS' JSON |
| "id": "eu-west-1", |
| "endpoint": "ec2.eu-west-1.amazonaws.com", |
| "api_name": "ec2_eu_west", |
| "country": "Ireland", |
| "signature_version": "2", |
| }, |
| "EU (London)": { |
| "id": "eu-west-2", |
| "endpoint": "ec2.eu-west-2.amazonaws.com", |
| "api_name": "ec2_eu_west_london", |
| "country": "United Kingdom", |
| "signature_version": "4", |
| }, |
| "EU (Milan)": { |
| "id": "eu-south-1", |
| "endpoint": "ec2.eu-south-1.amazonaws.com", |
| "api_name": "ec2_eu_south", |
| "country": "Italy", |
| "signature_version": "4", |
| }, |
| "EU (Paris)": { |
| "id": "eu-west-3", |
| "endpoint": "ec2.eu-west-3.amazonaws.com", |
| "api_name": "ec2_eu_west_paris", |
| "country": "France", |
| "signature_version": "4", |
| }, |
| "EU (Frankfurt)": { |
| "id": "eu-central-1", |
| "endpoint": "ec2.eu-central-1.amazonaws.com", |
| "api_name": "ec2_eu_central", |
| "country": "Frankfurt", |
| "signature_version": "4", |
| }, |
| "EU (Stockholm)": { |
| "id": "eu-north-1", |
| "endpoint": "ec2.eu-north-1.amazonaws.com", |
| "api_name": "ec2_eu_north_stockholm", |
| "country": "Stockholm", |
| "signature_version": "4", |
| }, |
| # Asia |
| "Asia Pacific (Mumbai)": { |
| "id": "ap-south-1", |
| "endpoint": "ec2.ap-south-1.amazonaws.com", |
| "api_name": "ec2_ap_south_1", |
| "country": "India", |
| "signature_version": "4", |
| }, |
| "Asia Pacific (Singapore)": { |
| "id": "ap-southeast-1", |
| "endpoint": "ec2.ap-southeast-1.amazonaws.com", |
| "api_name": "ec2_ap_southeast", |
| "country": "Singapore", |
| "signature_version": "2", |
| }, |
| "Asia Pacific (Sydney)": { |
| "id": "ap-southeast-2", |
| "endpoint": "ec2.ap-southeast-2.amazonaws.com", |
| "api_name": "ec2_ap_southeast_2", |
| "country": "Australia", |
| "signature_version": "2", |
| }, |
| "Asia Pacific (Tokyo)": { |
| "id": "ap-northeast-1", |
| "endpoint": "ec2.ap-northeast-1.amazonaws.com", |
| "api_name": "ec2_ap_northeast", |
| "country": "Japan", |
| "signature_version": "2", |
| }, |
| "Asia Pacific (Seoul)": { |
| "id": "ap-northeast-2", |
| "endpoint": "ec2.ap-northeast-2.amazonaws.com", |
| "api_name": "ec2_ap_northeast", |
| "country": "South Korea", |
| "signature_version": "4", |
| }, |
| "Asia Pacific (Osaka-Local)": { |
| "id": "ap-northeast-3", |
| "endpoint": "ec2.ap-northeast-3.amazonaws.com", |
| "api_name": "ec2_ap_northeast", |
| "country": "Japan", |
| "signature_version": "4", |
| }, |
| "Asia Pacific (Hong Kong)": { |
| "id": "ap-east-1", |
| "endpoint": "ec2.ap-east-1.amazonaws.com", |
| "api_name": "ec2_ap_east", |
| "country": "Hong Kong", |
| "signature_version": "2", |
| }, |
| # Not in JSON |
| "China (Beijing)": { |
| "id": "cn-north-1", |
| "endpoint": "ec2.cn-north-1.amazonaws.com.cn", |
| "api_name": "ec2_cn_north", |
| "country": "China", |
| "signature_version": "4", |
| }, |
| "China (Ningxia)": { |
| "id": "cn-northwest-1", |
| "endpoint": "ec2.cn-northwest-1.amazonaws.com.cn", |
| "api_name": "ec2_cn_northwest", |
| "country": "China", |
| "signature_version": "4", |
| }, |
| } |
| |
| FILE_HEADER = """ |
| # File generated by contrib/scrape-ec2-sizes.py script - DO NOT EDIT manually |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| """.strip() |
| |
| |
| def download_json(): |
| if os.path.isfile(FILEPATH): |
| print("Using data from existing cached file %s" % (FILEPATH)) |
| return open(FILEPATH) |
| |
| def remove_partial_cached_file(): |
| if os.path.isfile(FILEPATH): |
| os.remove(FILEPATH) |
| |
| # File not cached locally, download data and cache it |
| with requests.get(URL, stream=True) as response: |
| atexit.register(remove_partial_cached_file) |
| |
| total_size_in_bytes = int(response.headers.get("content-length", 0)) |
| progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) |
| |
| chunk_size = 10 * 1024 * 1024 |
| |
| with open(FILEPATH, "wb") as fp: |
| # NOTE: We use shutil.copyfileobj with large chunk size instead of |
| # response.iter_content with large chunk size since data we |
| # download is massive and copyfileobj is more efficient. |
| # shutil.copyfileobj(response.raw, fp, 10 * 1024 * 1024) |
| for chunk_data in response.iter_content(chunk_size): |
| progress_bar.update(len(chunk_data)) |
| fp.write(chunk_data) |
| |
| progress_bar.close() |
| atexit.unregister(remove_partial_cached_file) |
| |
| return open(FILEPATH) |
| |
| |
| def get_json(): |
| if not os.path.isfile(FILEPATH): |
| return download_json(), False |
| |
| print("Using data from existing cached file %s" % (FILEPATH)) |
| return open(FILEPATH), True |
| |
| |
| def filter_extras(extras): |
| return { |
| key: extras[key] |
| for key in extras |
| if key |
| not in [ |
| "capacitystatus", |
| "ebsOptimized", |
| "operation", |
| "licenseModel", |
| "preInstalledSw", |
| "tenancy", |
| "usagetype", |
| ] |
| } |
| |
| |
| def parse(): |
| # Set vars |
| sizes = {} |
| regions = {r["id"]: r for r in REGION_DETAILS.values()} |
| for region_id in regions: |
| regions[region_id]["instance_types"] = [] |
| # Parse |
| json_file, from_file = get_json() |
| products_data = ijson.items(json_file, "products") |
| |
| try: |
| products_data = next(products_data) |
| except ijson.common.IncompleteJSONError as e: |
| # This likely indicates that the cached file is incomplete or corrupt so we delete it and re |
| # download data |
| if from_file: |
| os.remove(FILEPATH) |
| json_file, from_file = get_json() |
| products_data = ijson.items(json_file, "products") |
| products_data = next(products_data) |
| else: |
| raise e |
| |
| for sku in products_data: |
| if products_data[sku].get("productFamily", "unknown") != "Compute Instance": |
| continue |
| location = products_data[sku]["attributes"].pop("location") |
| if location not in REGION_DETAILS: |
| continue |
| # Get region & size ID |
| region_id = REGION_DETAILS[location]["id"] |
| instance_type = products_data[sku]["attributes"]["instanceType"] |
| # Add size to region |
| if instance_type not in regions[region_id]["instance_types"]: |
| regions[region_id]["instance_types"].append(instance_type) |
| # Parse sizes |
| if instance_type not in sizes: |
| for field in IGNORED_FIELDS: |
| products_data[sku]["attributes"].pop(field, None) |
| # Compute RAM |
| ram = int( |
| float(products_data[sku]["attributes"]["memory"].split()[0].replace(",", "")) * 1024 |
| ) |
| # Compute bandwdith |
| bw_match = REG_BANDWIDTH.match(products_data[sku]["attributes"]["networkPerformance"]) |
| if bw_match is not None: |
| bandwidth = int(bw_match.groups()[0]) |
| else: |
| bandwidth = None |
| sizes[instance_type] = { |
| "id": instance_type, |
| "name": instance_type, |
| "ram": ram, |
| "bandwidth": bandwidth, |
| "extra": filter_extras(products_data[sku]["attributes"]), |
| } |
| if products_data[sku]["attributes"].get("storage") != "EBS only": |
| match = REG1_STORAGE.match(products_data[sku]["attributes"]["storage"]) |
| if match: |
| disk_number, disk_size = match.groups() |
| else: |
| match = REG2_STORAGE.match(products_data[sku]["attributes"]["storage"]) |
| if match: |
| disk_number, disk_size = 1, match.groups()[0] |
| else: |
| disk_number, disk_size = 0, "0" |
| disk_number, disk_size = ( |
| int(disk_number), |
| int(disk_size.replace(",", "")), |
| ) |
| sizes[instance_type]["disk"] = disk_number * disk_size |
| else: |
| sizes[instance_type]["disk"] = 0 |
| products_data[sku]["attributes"] |
| # Sort |
| for region in regions: |
| regions[region]["instance_types"] = sorted(regions[region]["instance_types"]) |
| return sizes, regions |
| |
| |
| def dump(): |
| print("Scraping size data, this may take up to 10-15 minutes...") |
| |
| sizes, regions = parse() |
| |
| separators = (",", ": ") |
| |
| # 1. Write file with instance types constants |
| file_path = "libcloud/compute/constants/ec2_instance_types.py" |
| with open(file_path, "w") as fp: |
| fp.write(FILE_HEADER + "\n") |
| fp.write("\n") |
| fp.write( |
| "INSTANCE_TYPES = " |
| + json.dumps(sizes, indent=4, sort_keys=True, separators=separators).replace( |
| "null", "None" |
| ) |
| ) |
| |
| print("") |
| print("Data written to %s" % (file_path)) |
| print("") |
| |
| # 2. Write file with full details for each region |
| file_path = "libcloud/compute/constants/ec2_region_details_complete.py" |
| with open(file_path, "w") as fp: |
| fp.write(FILE_HEADER + "\n") |
| fp.write("\n") |
| fp.write( |
| "REGION_DETAILS = " |
| + json.dumps(regions, indent=4, sort_keys=True, separators=separators).replace( |
| "null", "None" |
| ) |
| ) |
| |
| print("Data written to %s" % (file_path)) |
| print("") |
| |
| # 3. Write file with partial region details (everything except instance_types attribute) |
| regions_partial = {} |
| keys_to_keep = ["api_name", "country", "id", "endpoint", "signature_version"] |
| |
| for region_name, region_details in regions.items(): |
| regions_partial[region_name] = {} |
| |
| for key, value in region_details.items(): |
| if key not in keys_to_keep: |
| continue |
| |
| regions_partial[region_name][key] = value |
| |
| file_path = "libcloud/compute/constants/ec2_region_details_partial.py" |
| |
| with open(file_path, "w") as fp: |
| fp.write(FILE_HEADER + "\n") |
| fp.write("\n") |
| fp.write( |
| "REGION_DETAILS = " |
| + json.dumps(regions_partial, indent=4, sort_keys=True, separators=separators).replace( |
| "null", "None" |
| ) |
| ) |
| |
| print("Data written to %s" % (file_path)) |
| print("") |
| |
| |
| if __name__ == "__main__": |
| dump() |