Add support for ephemeral storage in Azure clusters (#333)

* Add support for ephemeral storage in Azure clusters

Enables the use of either the local temporary disk or the NVME disks in
Lsv2 VMs in Azure. This can be a cost-effective option for short-lived
dev/test clusters.
diff --git a/README.md b/README.md
index 5ce4b68..0b27230 100644
--- a/README.md
+++ b/README.md
@@ -157,13 +157,15 @@
 * `cluster_user` should be set to the name of the administrative user
 * `proxy_hostname` (optional) is the name of the machine which has access to the cluster VNET
 
-Under the `azure` section, edit following values as per your configuration
+Under the `azure` section, edit the following values as per your configuration:
 * `resource_group` to provide the resource-group name for the cluster deployment. A new resource group with
   this name will be created if it doesn't already exist
 * `vnet` to provide the name of the VNET that your cluster nodes should use. A new VNET with this name will be
   created if it doesn't already exist
 * `subnet` to provide a name for the subnet within which the cluster resources will be deployed
 * `numnodes` to change the cluster size in terms of number of nodes deployed
+* `data_disk_count` to specify how many persistent data disks are attached to each node for use by HDFS.
+   If you would prefer to use ephemeral storage for Azure clusters, please follow [these steps](docs/azure-ephemeral-disks.md).
 * `vm_sku` to specify the VM size to use. You can choose from the
   [available VM sizes](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes-general).
 * `use_adlsg2` to use Azure Data Lake Storage(ADLS) Gen2 as datastore for Accumulo
diff --git a/ansible/roles/azure/tasks/create_vmss.yml b/ansible/roles/azure/tasks/create_vmss.yml
index 50dfeff..00eb9cd 100644
--- a/ansible/roles/azure/tasks/create_vmss.yml
+++ b/ansible/roles/azure/tasks/create_vmss.yml
@@ -114,7 +114,7 @@
 - name: Create luns dictionary
   set_fact:
     luns_dict: "{{ luns_dict | default ([]) + [{ 'lun': item, 'disk_size_gb': disk_size_gb , 'caching': None } ] }}"
-  with_sequence: start=0 end={{numdisks-1}}
+  with_sequence: start=0 end={{ data_disk_count-1 if data_disk_count > 0 else 0 }}
 
 - name: Set single placement group to correct value
   set_fact:
@@ -146,7 +146,7 @@
       publisher: OpenLogic
       sku: 7.5
       version: latest
-    data_disks: "{{ luns_dict }}"
+    data_disks: "{{ luns_dict if data_disk_count > 0 else omit }}"
   tags: create_vmss
 
 # SECTION 4: Automatically populate entries in the hosts file and in the muchos.props file, based on the VMSS node details
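
To make the effect of the change above concrete, here is a small illustrative Python sketch (not part of Muchos; the helper name `build_luns_dict` is hypothetical) of the `data_disks` list that the "Create luns dictionary" loop builds for a given `data_disk_count`; the `disk_size_gb` default of 128 is just the example value from muchos.props.example.

```python
# Illustrative sketch of what the "Create luns dictionary" loop produces.
# disk_size_gb = 128 mirrors the example default in muchos.props.example.
def build_luns_dict(data_disk_count, disk_size_gb=128):
    return [
        {"lun": lun, "disk_size_gb": disk_size_gb, "caching": None}
        for lun in range(data_disk_count)
    ]

# Default configuration (data_disk_count = 3) attaches LUNs 0, 1 and 2:
print(build_luns_dict(3))
# [{'lun': 0, 'disk_size_gb': 128, 'caching': None},
#  {'lun': 1, 'disk_size_gb': 128, 'caching': None},
#  {'lun': 2, 'disk_size_gb': 128, 'caching': None}]

# When data_disk_count = 0 (ephemeral storage), whatever the loop builds is
# irrelevant: the VMSS task passes "omit" for data_disks, so no managed
# data disks are attached.
```
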
diff --git a/ansible/roles/common/tasks/azure.yml b/ansible/roles/common/tasks/azure.yml
index 16f34e9..2329f39 100644
--- a/ansible/roles/common/tasks/azure.yml
+++ b/ansible/roles/common/tasks/azure.yml
@@ -16,9 +16,9 @@
 #
 - name: Find luns
   find:
-    paths: "/dev/disk/azure/scsi1"
-    patterns: "lun*"
-    file_type: link
+    paths: "{{ azure_disk_device_path }}"
+    patterns: "{{ azure_disk_device_pattern }}"
+    file_type: any
   register: files_matched
 - name: Create xfs filesytems
   filesystem:
@@ -41,13 +41,21 @@
     fstype: xfs
     state: mounted
   with_indexed_items: "{{ disk_uuids.results }}"
+- name: Set temp storage folder ownership
+  file:
+    path: '{{ mount_root }}'
+    state: directory
+    owner: "{{ cluster_user }}"
+    group: "{{ cluster_group }}"
+  when: mount_root == '/mnt/resource'
 - name: Set mount point ownership
   file:
     path: '{{ mount_root }}{{ item.0 + 1 }}'
     state: directory
     owner: "{{ cluster_user }}"
     group: "{{ cluster_group }}"
-  with_indexed_items: "{{ files_matched.files }}"
+  with_indexed_items:
+    - "{{ files_matched.files }}"
 - name: Create directory to mount Azure File share
   file:
     path: "{{ azure_fileshare_mount }}"
diff --git a/conf/muchos.props.example b/conf/muchos.props.example
index 927d22d..e994cee 100644
--- a/conf/muchos.props.example
+++ b/conf/muchos.props.example
@@ -117,9 +117,17 @@
 # https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes-general
 vm_sku = Standard_D8s_v3
 # Each VM will be provisioned with the following type of managed disk
+# The azure_disk_device* parameters below specify the Linux device path and name pattern Muchos uses to discover the disks used for storage
+# The default values below are for using Azure managed disks
+azure_disk_device_path = /dev/disk/azure/scsi1
+azure_disk_device_pattern = lun*
+# If using Azure Lsv2 VMs which have NVME disks for ephemeral storage, use the parameters below instead of the defaults
+# azure_disk_device_path = /dev
+# azure_disk_device_pattern = nvme*n1
+# Type of the data disks attached to the VMSS: 'Standard_LRS' for HDD, 'StandardSSD_LRS' for Standard SSD, 'Premium_LRS' for Premium SSD
 managed_disk_type = Standard_LRS
 # Number of managed disks provisioned on each VM
-numdisks = 3
+data_disk_count = 3
 # The size of each managed disk provisioned
 disk_size_gb = 128
 # Location to mount managed disks in each VM
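
As an illustration of how the settings above might be consumed, here is a hedged sketch using plain `configparser` with the same defaults the Muchos config code applies (`/dev/disk/azure/scsi1` and `lun*`). This is not the actual Muchos API; the real accessors live in `lib/muchos/config/azure.py`, shown later in this diff.

```python
# Illustrative sketch only: reading the new azure settings with configparser.
from configparser import ConfigParser

props = """
[azure]
vm_sku = Standard_D8s_v3
data_disk_count = 3
disk_size_gb = 128
"""

config = ConfigParser()
config.read_string(props)

data_disk_count = config.getint("azure", "data_disk_count")
device_path = config.get("azure", "azure_disk_device_path",
                         fallback="/dev/disk/azure/scsi1")
device_pattern = config.get("azure", "azure_disk_device_pattern",
                            fallback="lun*")

print(data_disk_count, device_path, device_pattern)
# 3 /dev/disk/azure/scsi1 lun*
```
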
diff --git a/docs/azure-ephemeral-disks.md b/docs/azure-ephemeral-disks.md
new file mode 100644
index 0000000..98e2135
--- /dev/null
+++ b/docs/azure-ephemeral-disks.md
@@ -0,0 +1,25 @@
+Using ephemeral storage within clusters deployed by Muchos for Azure
+--------------------------------------------------------------------
+
+By default for Azure-based clusters, Muchos creates 3 data disks of 128 GiB each, attached to every VM. These
+[managed disks](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/managed-disks-overview) provide
+persistent storage which ensures that the data in HDFS is safe and consistent even if the VMs are deallocated (stopped).
+
+However, if you'd like to use only ephemeral / temporary disk storage for HDFS, be aware that temp storage
+will result in data loss across VM deallocate-start cycles. If that behavior is acceptable
+for your dev/test scenario, there are two options for using ephemeral storage within Azure:
+* Use the temporary SSD disk which is available on most VM types. This disk tends to be relatively small; refer to the
+[Azure VM sizes](https://docs.microsoft.com/en-us/azure/virtual-machines/dv3-dsv3-series) page for details on temp storage sizes
+* Use the [Lsv2 series VMs](https://docs.microsoft.com/en-us/azure/virtual-machines/lsv2-series) which offer larger amounts of NVME-based temp storage
+
+To use "regular" (non-NVME) temporary storage, change the following within the `azure` section of muchos.props:
+* `data_disk_count` needs to be set to 0
+* `mount_root` within the `azure` section needs to be set to `/mnt/resource`
+
+If you'd like larger NVME temporary disks, another option is to use the storage-optimized Lsv2 VM types in Azure. To use the
+NVME disks available in these VMs, change the following within the `azure` section of muchos.props:
+* `vm_sku` needs to be set to one of the sizes from [this page](https://docs.microsoft.com/en-us/azure/virtual-machines/lsv2-series), for example Standard_L8s_v2
+* `data_disk_count` needs to be set to 0
+* `mount_root` within the `azure` section should be set to `/var/data` (which is also the default)
+* `azure_disk_device_path` should be set to `/dev`
+* `azure_disk_device_pattern` should be set to `nvme*n1`
\ No newline at end of file
diff --git a/lib/muchos/config/azure.py b/lib/muchos/config/azure.py
index aed1da8..b07fba6 100644
--- a/lib/muchos/config/azure.py
+++ b/lib/muchos/config/azure.py
@@ -54,23 +54,60 @@
     def data_dirs_common(self, nodeType):
         data_dirs = []
 
-        num_disks = int(self.get("azure", "numdisks"))
+        num_disks = self.data_disk_count()
+
+        # Check if using temp storage (non-NVME) for HDFS
+        if num_disks == 0 and self.mount_root() == "/mnt/resource":
+            data_dirs.append(self.mount_root())
+            return data_dirs
+
+        # Check if using Lsv2 NVME temp storage for HDFS
+        lsv2_vm_disk_map = {"Standard_L8s_v2": 1,
+                            "Standard_L16s_v2": 2,
+                            "Standard_L32s_v2": 4,
+                            "Standard_L48s_v2": 6,
+                            "Standard_L64s_v2": 8,
+                            "Standard_L80s_v2": 10}
+
+        if num_disks == 0 and self.vm_sku() in lsv2_vm_disk_map:
+            # pretend that we have N data disks - in this case those are NVME temp disks
+            num_disks = lsv2_vm_disk_map[self.vm_sku()]
+
+        # Persistent data disks attached to VMs
         range_var = num_disks + 1
         for diskNum in range(1, range_var):
-            data_dirs.append(self.get("azure", "mount_root") +
-                                str(diskNum))
+            data_dirs.append(self.mount_root() + str(diskNum))
 
         return data_dirs
 
     def metrics_drive_ids(self):
         drive_ids = []
-        range_var = int(self.get("azure", "numdisks")) + 1
+        range_var = self.data_disk_count() + 1
         for i in range(1, range_var):
             drive_ids.append(self.get("azure", "metrics_drive_root") +
                                 str(i))
         return drive_ids
 
     @ansible_host_var
+    def vm_sku(self):
+        return self.get('azure', 'vm_sku')
+
+    @ansible_host_var
+    @is_valid(is_type(int))
+    def data_disk_count(self):
+        return self.getint('azure', 'data_disk_count')
+
+    @ansible_host_var
+    @default('/dev/disk/azure/scsi1')
+    def azure_disk_device_path(self):
+        return self.get('azure', 'azure_disk_device_path')
+
+    @ansible_host_var
+    @default('lun*')
+    def azure_disk_device_pattern(self):
+        return self.get('azure', 'azure_disk_device_pattern')
+
+    @ansible_host_var
     @default(None)
     def azure_fileshare_mount(self):
         return self.get('azure', 'azure_fileshare_mount')
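
To illustrate the behavior of `data_dirs_common` above, here is a minimal standalone sketch (not the Muchos class itself; the function name `data_dirs` is hypothetical) that reproduces the directory selection for the three configurations described in docs/azure-ephemeral-disks.md. The SKU-to-NVME-disk mapping is copied from the code above.

```python
# Standalone illustration of the data directory selection logic.
LSV2_VM_DISK_MAP = {
    "Standard_L8s_v2": 1,
    "Standard_L16s_v2": 2,
    "Standard_L32s_v2": 4,
    "Standard_L48s_v2": 6,
    "Standard_L64s_v2": 8,
    "Standard_L80s_v2": 10,
}

def data_dirs(vm_sku, data_disk_count, mount_root):
    # Temp (non-NVME) storage: HDFS uses the resource disk mount directly
    if data_disk_count == 0 and mount_root == "/mnt/resource":
        return [mount_root]
    num_disks = data_disk_count
    # Lsv2 NVME temp storage: treat the NVME devices as the "data disks"
    if num_disks == 0 and vm_sku in LSV2_VM_DISK_MAP:
        num_disks = LSV2_VM_DISK_MAP[vm_sku]
    return [mount_root + str(i) for i in range(1, num_disks + 1)]

print(data_dirs("Standard_D8s_v3", 3, "/var/data"))      # managed disks
# ['/var/data1', '/var/data2', '/var/data3']
print(data_dirs("Standard_D8s_v3", 0, "/mnt/resource"))  # temp SSD disk
# ['/mnt/resource']
print(data_dirs("Standard_L8s_v2", 0, "/var/data"))      # Lsv2 NVME
# ['/var/data1']
```
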