blob: 6a029ea87ff86796ae184127738a007c85850f4a [file] [log] [blame]
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
import re
import copy
import json
import time
import atexit
from collections import OrderedDict, defaultdict
import tqdm # pylint: disable=import-error
import ijson # pylint: disable=import-error
import requests
# Buffer size for ijson.parse() function. Larger buffer size results in increased memory
# consumption, but faster parsing.
IJSON_BUF_SIZE = 10 * 65536
# same URL as the one used by scrape-ec2-sizes.py, now it has official data on pricing
URL = "https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/index.json"
# Splits a string into runs of digits, runs of letters (plus "-" and "_"), and
# everything else - used to build natural-sort keys (see sort_key_by_numeric_other)
RE_NUMERIC_OTHER = re.compile(r"(?:([0-9]+)|([-A-Z_a-z]+)|([^-0-9A-Z_a-z]+))")
# Absolute path to libcloud's bundled pricing.json, resolved relative to this script
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
PRICING_FILE_PATH = os.path.join(BASE_PATH, "../libcloud/data/pricing.json")
PRICING_FILE_PATH = os.path.abspath(PRICING_FILE_PATH)
# Where the (very large) downloaded offer file is cached between runs;
# override with the TMP_JSON environment variable
FILEPATH = os.environ.get("TMP_JSON", "/tmp/ec.json")
# Instance size name fragments, listed in ascending size order so sorting can
# rank them by this index instead of alphabetically
INSTANCE_SIZES = [
    "micro",
    "small",
    "medium",
    "large",
    "xlarge",
    "x-large",
    "extra-large",
]
def download_json():
    """Download the AWS pricing offer file to ``FILEPATH`` (or reuse the cache).

    :return: Tuple ``(path, from_cache)`` where ``path`` is the on-disk
        location of the JSON offer file and ``from_cache`` indicates whether a
        previously downloaded copy was reused.
    """
    if os.path.isfile(FILEPATH):
        # Bug fix: use %M (minutes), not %I (12-hour clock), in the mtime string
        mtime_str = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(os.path.getmtime(FILEPATH)))
        print("Using data from existing cached file {} (mtime={})".format(FILEPATH, mtime_str))
        # Bug fix: return the file path (not an open file object) so the return
        # type matches the download branch below - callers open the path themselves
        return FILEPATH, True

    def remove_partial_cached_file():
        # Don't leave a truncated download behind if we exit mid-transfer
        if os.path.isfile(FILEPATH):
            os.remove(FILEPATH)

    # File not cached locally, download data and cache it
    with requests.get(URL, stream=True) as response:
        atexit.register(remove_partial_cached_file)
        total_size_in_bytes = int(response.headers.get("content-length", 0))
        progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
        # Large chunks keep Python-level loop overhead low; the file is massive
        chunk_size = 10 * 1024 * 1024
        with open(FILEPATH, "wb") as fp:
            for chunk_data in response.iter_content(chunk_size):
                progress_bar.update(len(chunk_data))
                fp.write(chunk_data)
        progress_bar.close()
    # Download completed successfully - keep the cached file around on exit
    atexit.unregister(remove_partial_cached_file)
    return FILEPATH, False
def get_json():
    """Return ``(path, from_cache)`` for the pricing JSON, downloading if needed.

    :return: Tuple of the on-disk path to the offer file and a bool that is
        True when a previously cached copy was reused.
    """
    if not os.path.isfile(FILEPATH):
        # No cached copy - download_json() fetches the file and caches it
        return download_json()[0], False
    # Bug fix: use %M (minutes), not %I (12-hour clock), in the mtime string
    mtime_str = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(os.path.getmtime(FILEPATH)))
    print("Using data from existing cached file {} (mtime={})".format(FILEPATH, mtime_str))
    return FILEPATH, True
# Prices and sizes are in different dicts and categorized by sku
def get_all_prices():
    """Stream-parse the offer file and collect the on-demand price per SKU.

    :return: ``{sku: {"unit": <str>, "price": <str>}}`` built from the
        ``terms.OnDemand`` section of the AWS offer JSON. Prices are kept as
        raw strings; callers convert with ``float()``.
    """
    # return variable
    # prices = {sku : {price: int, unit: string}}
    prices = {}
    current_sku = ""
    current_rate_code = ""
    # Offer-term code that appears in the "<sku>.<sku>.<code>" keys of the
    # on-demand terms section; the prefixes matched below embed it
    amazonEC2_offer_code = "JRTCKXETXF"
    json_file, from_file = get_json()
    with open(json_file) as f:
        print("Starting to parse pricing data, this could take up to 15 minutes...")
        parser = ijson.parse(f, buf_size=IJSON_BUF_SIZE)
        # use parser because file is very large
        for prefix, event, value in tqdm.tqdm(parser):
            # The "products" section is handled separately by
            # scrape_ec2_pricing(); only the "terms" section matters here
            if "products" in prefix:
                continue
            if (prefix, event) == ("terms.OnDemand", "map_key"):
                # A new SKU entry starts
                current_sku = value
                prices[current_sku] = {}
            elif (prefix, event) == (
                f"terms.OnDemand.{current_sku}.{current_sku}.{amazonEC2_offer_code}.priceDimensions",
                "map_key",
            ):
                # Remember the rate code so the unit/price prefixes below match
                current_rate_code = value
            elif (prefix, event) == (
                f"terms.OnDemand.{current_sku}.{current_sku}.{amazonEC2_offer_code}.priceDimensions"
                f".{current_rate_code}.unit",
                "string",
            ):
                prices[current_sku]["unit"] = value
            elif (prefix, event) == (
                f"terms.OnDemand.{current_sku}.{current_sku}.{amazonEC2_offer_code}.priceDimensions"
                f".{current_rate_code}.pricePerUnit.USD",
                "string",
            ):
                prices[current_sku]["price"] = value
    return prices
# For each combination of location - size - os the file has a different sku.
# For each sku we have a price
def scrape_ec2_pricing():
    """Build per-OS pricing dicts keyed by instance size and region.

    :return: Dict with keys ``ec2_linux``, ``ec2_windows``, ``ec2_rhel``,
        ``ec2_suse`` and ``ec2_rhel_ha``, each mapping
        ``size -> {region_code: price_float_or_"n/a"}``.
    """
    skus = {}
    prices = get_all_prices()
    # Second pass over the same file: this time only the "products" section is
    # consumed (the attributes per SKU)
    json_file, from_file = get_json()
    with open(json_file) as f:
        print("Starting to parse pricing data, this could take up to 15 minutes...")
        # use parser because file is very large
        parser = ijson.parse(f, buf_size=IJSON_BUF_SIZE)
        current_sku = ""
        for prefix, event, value in tqdm.tqdm(parser):
            # "products" precedes "terms" in the offer file, so parsing can
            # stop as soon as the "terms" section starts
            if "terms" in prefix:
                break
            if (prefix, event) == ("products", "map_key"):
                # A new product (SKU) entry starts
                current_sku = value
                skus[current_sku] = {"sku": value}
            elif (prefix, event) == (f"products.{current_sku}.productFamily", "string"):
                skus[current_sku]["family"] = value
            elif (prefix, event) == (
                f"products.{current_sku}.attributes.location",
                "string",
            ):
                # Human-readable region name, e.g. "US East (N. Virginia)"
                skus[current_sku]["locationName"] = value
            elif (prefix, event) == (
                f"products.{current_sku}.attributes.locationType",
                "string",
            ):
                skus[current_sku]["locationType"] = value
            elif (prefix, event) == (
                f"products.{current_sku}.attributes.instanceType",
                "string",
            ):
                skus[current_sku]["size"] = value
            elif (prefix, event) == (
                f"products.{current_sku}.attributes.operatingSystem",
                "string",
            ):
                skus[current_sku]["os"] = value
            elif (prefix, event) == (
                f"products.{current_sku}.attributes.usagetype",
                "string",
            ):
                skus[current_sku]["usage_type"] = value
            elif (prefix, event) == (
                f"products.{current_sku}.attributes.preInstalledSw",
                "string",
            ):
                skus[current_sku]["preInstalledSw"] = value
            elif (prefix, event) == (
                f"products.{current_sku}.attributes.regionCode",
                "string",
            ):
                # Machine-readable region code, e.g. "us-east-1"
                skus[current_sku]["location"] = value
            # only get prices of compute instances atm
            elif (prefix, event) == (f"products.{current_sku}", "end_map"):
                # Product entry finished - drop it unless it is a compute
                # instance or dedicated host
                if (
                    "Compute Instance" not in skus[current_sku]["family"]
                    and "Dedicated Host" not in skus[current_sku]["family"]
                ):
                    del skus[current_sku]
    # One dict per supported OS; defaultdict(OrderedDict) lets sizes be added
    # without explicit existence checks
    ec2_linux = defaultdict(OrderedDict)
    ec2_windows = defaultdict(OrderedDict)
    ec2_rhel = defaultdict(OrderedDict)
    ec2_rhel_ha = defaultdict(OrderedDict)
    ec2_suse = defaultdict(OrderedDict)
    os_map = {
        "Linux": ec2_linux,
        "Windows": ec2_windows,
        "RHEL": ec2_rhel,
        "SUSE": ec2_suse,
        "Red Hat Enterprise Linux with HA": ec2_rhel_ha,
    }
    for sku in skus:
        if skus[sku]["locationType"] != "AWS Region":
            continue
        # skip any SQL
        if skus[sku]["preInstalledSw"] != "NA":
            continue
        # NOTE(review): this assignment shadows the imported "os" module for
        # the rest of this function (the module is not used here, so it works)
        os = skus[sku]["os"]
        if os == "NA":
            continue
        os_dict = os_map.get(os)
        # new OS, until it is documented skip it
        if os_dict is None:
            print(f"Unexpected OS {os}")
            continue
        size = skus[sku]["size"]
        location = skus[sku]["location"]
        # size is first seen
        if not os_dict.get(size):
            os_dict[size] = {}
        # if price already exists pick the BoxUsage usage type which means on demand
        if os_dict.get(size, {}).get(location) and "BoxUsage" not in skus[sku]["usage_type"]:
            continue
        # if price is not a number then label it as not available
        try:
            price = float(prices[sku]["price"])
            os_dict[size][location] = price
        except ValueError:
            os_dict[size][location] = "n/a"
        except KeyError:
            # size is available only reserved
            # NOTE(review): this deletes the whole size entry, including any
            # region prices already collected for it from earlier SKUs; a later
            # SKU with the same size re-creates it. Confirm this is intentional.
            del os_dict[size]
    return {
        "ec2_linux": ec2_linux,
        "ec2_windows": ec2_windows,
        "ec2_rhel": ec2_rhel,
        "ec2_suse": ec2_suse,
        "ec2_rhel_ha": ec2_rhel_ha,
    }
def update_pricing_file(pricing_file_path, pricing_data):
    """Merge *pricing_data* into the ``compute`` section of the pricing file.

    The file is rewritten (deterministically sorted, with a refreshed
    ``updated`` timestamp) only when the merge actually changes something.
    """
    with open(pricing_file_path) as fp:
        data = json.loads(fp.read())

    snapshot = copy.deepcopy(data)
    data["compute"].update(pricing_data)
    if snapshot == data:
        # Nothing has changed, bail out early and don't update "updated" attribute
        print("Nothing has changed, skipping update.")
        return

    data["updated"] = int(time.time())
    # Always sort the pricing info so successive runs produce stable diffs
    serialized = json.dumps(sort_nested_dict(data), indent=4)
    # Strip any trailing whitespace json.dumps may leave on individual lines
    serialized = "\n".join(line.rstrip() for line in serialized.splitlines())

    with open(pricing_file_path, "w") as fp:
        fp.write(serialized)
def sort_nested_dict(value):
    """
    Recursively sort a nested dict.

    :param value: Dictionary to sort; nested dicts are sorted recursively.
    :return: ``OrderedDict`` with keys at every level ordered by
        ``sort_key_by_numeric_other``.
    """
    result = OrderedDict()
    # Bug fix (readability): iterate under a distinct name instead of
    # shadowing the "value" parameter as the original did
    for key, child in sorted(value.items(), key=sort_key_by_numeric_other):
        # OrderedDict is a dict subclass, so isinstance(dict) covers both
        if isinstance(child, dict):
            result[key] = sort_nested_dict(child)
        else:
            result[key] = child
    return result
def sort_key_by_numeric_other(key_value):
    """
    Split key into numeric, alpha and other part and sort accordingly.

    Intended for use as a ``key=`` function over ``dict.items()`` tuples: only
    the first element (the key string) is inspected.
    """
    key = key_value[0]
    parts = []
    for numeric, alpha, other in RE_NUMERIC_OTHER.findall(key):
        # Digit runs compare numerically; segments without digits rank first
        rank = int(numeric) if numeric else -1
        # Known instance-size words sort by their position in INSTANCE_SIZES
        # (smallest to largest) rather than alphabetically
        if alpha in INSTANCE_SIZES:
            alpha = INSTANCE_SIZES.index(alpha)
        parts.append((rank, str(alpha), other))
    return tuple(parts)
def main():
    """Entry point: scrape EC2 pricing data and update libcloud's pricing.json."""
    print(
        # Bug fix: corrected "bandwith" typo in the user-facing message
        "Scraping EC2 pricing data (if this runs for the first time "
        "it has to download a 3GB file, depending on your bandwidth "
        "it might take a while)...."
    )
    pricing_data = scrape_ec2_pricing()
    update_pricing_file(pricing_file_path=PRICING_FILE_PATH, pricing_data=pricing_data)
    print("Pricing data updated")
# Run only when executed as a script, not when imported
if __name__ == "__main__":
    main()