Update the scrape-ec2-sizes script to use a larger chunk size, since the
file we are downloading is massive (multiple hundreds of MBs).

Also update the script to make sure we clean up the locally cached file in
case it is corrupted or incomplete.
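
For reference, a minimal standalone sketch of the download/recovery flow this
patch implements (URL and FILEPATH below are placeholder values, not the
module's real constants):

    import os
    import shutil

    import requests
    import ijson

    URL = "https://example.com/offers/AmazonEC2/current/index.json"  # placeholder
    FILEPATH = "/tmp/ec2-offers.json"  # placeholder

    def download_json():
        if os.path.isfile(FILEPATH):
            return open(FILEPATH, "r")

        # Stream straight to disk with a 10 MiB buffer - copyfileobj avoids the
        # per-chunk overhead of response.iter_content on a very large file
        with requests.get(URL, stream=True) as response:
            with open(FILEPATH, "wb") as fp:
                shutil.copyfileobj(response.raw, fp, 10 * 1024 * 1024)

        return open(FILEPATH, "r")

    def get_json():
        # Second value tells the caller whether the data came from the local cache
        if not os.path.isfile(FILEPATH):
            return download_json(), False

        return open(FILEPATH, "r"), True

    def parse_products():
        json_file, from_file = get_json()

        try:
            return next(ijson.items(json_file, "products"))
        except ijson.common.IncompleteJSONError:
            # A truncated cached file raises IncompleteJSONError - drop the cache
            # and retry once with a fresh download before giving up
            if not from_file:
                raise

            os.remove(FILEPATH)
            json_file, _ = get_json()
            return next(ijson.items(json_file, "products"))
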
diff --git a/contrib/scrape-ec2-sizes.py b/contrib/scrape-ec2-sizes.py
index 7872ab8..c202903 100755
--- a/contrib/scrape-ec2-sizes.py
+++ b/contrib/scrape-ec2-sizes.py
@@ -28,6 +28,8 @@
 import re
 import os
 import json
+import shutil
+import atexit
 
 import requests
 import ijson  # pylint: disable=import-error
@@ -195,22 +197,25 @@
 
 
 def download_json():
-    response = requests.get(URL, stream=True)
-    try:
+    if os.path.isfile(FILEPATH):
         return open(FILEPATH, 'r')
-    except IOError:
-        with open(FILEPATH, 'wb') as fo:
-            for chunk in response.iter_content(chunk_size=2**20):
-                if chunk:
-                    fo.write(chunk)
+
+    # File not cached locally, download data and cache it
+    with requests.get(URL, stream=True) as response:
+        with open(FILEPATH, 'wb') as fp:
+            # NOTE: We use shutil.copyfileobj with a large chunk size instead
+            # of response.iter_content since the data we download is massive
+            # and copyfileobj is more efficient.
+            shutil.copyfileobj(response.raw, fp, 10 * 1024 * 1024)
+
     return open(FILEPATH, 'r')
 
 
 def get_json():
-    try:
-        return open(FILEPATH, 'r')
-    except IOError:
-        return download_json()
+    if not os.path.isfile(FILEPATH):
+        return download_json(), False
+
+    return open(FILEPATH, 'r'), True
 
 
 def filter_extras(extras):
@@ -230,9 +235,22 @@
     for region_id in regions:
         regions[region_id]['instance_types'] = []
     # Parse
-    json_file = get_json()
+    json_file, from_file = get_json()
     products_data = ijson.items(json_file, 'products')
-    products_data = next(products_data)
+
+    try:
+        products_data = next(products_data)
+    except ijson.common.IncompleteJSONError as e:
+        # This likely indicates that the cached file is incomplete or corrupt,
+        # so we delete it and re-download the data
+        if from_file:
+            os.remove(FILEPATH)
+            json_file, from_file = get_json()
+            products_data = ijson.items(json_file, 'products')
+            products_data = next(products_data)
+        else:
+            raise e
+
     for sku in products_data:
         if products_data[sku]['productFamily'] != "Compute Instance":
             continue
diff --git a/tox.ini b/tox.ini
index 7f5caf9..3a6d498 100644
--- a/tox.ini
+++ b/tox.ini
@@ -152,7 +152,9 @@
 basepython: python3.7
 deps = requests
        ijson
-commands = bash -c 'python contrib/scrape-ec2-sizes.py > libcloud/compute/constants.py'
+commands = 
+    bash -c 'echo "Scrapping EC2 sizes, this may take up to 5 minutes more since the actual JSON data we download and scrape is very large"'
+    bash -c 'python contrib/scrape-ec2-sizes.py > libcloud/compute/constants.py'
 
 [testenv:pylint]
 deps = -r{toxinidir}/requirements-tests.txt