blob: 16eed151a9813c831444bfd252a83ed9fb38dfda [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[general]
# Cluster type (Azure, ec2, or existing)
cluster_type = ec2
# Cluster user name (install command will SSH to cluster using this user)
# Leave default below if launching cluster in AWS
cluster_user = centos
# Cluster user group
cluster_group = %(cluster_user)s
# Cluster user home directory
user_home = /home/%(cluster_user)s
# Install directory where Hadoop, Accumulo, etc will be installed
install_dir = %(user_home)s/install
# Hostname of proxy node that Muchos will use to direct installation of cluster. Will be given
# public IP if launching in EC2. If not launching in EC2, node must have public IP that can be reached
# from your machine. Hostname can be chosen from "nodes" section below.
proxy_hostname = leader1
# If set, a SOCKS proxy will be created on the specified port when connecting to proxy using 'muchos ssh <cluster>'
#proxy_socks_port = 38585
# Accumulo Instance name
accumulo_instance = muchos
# Accumluo Password
accumulo_password = secret
# Software versions (make sure you have a corresponding entry for the checksum in conf/checksums)
hadoop_version = 3.2.1
zookeeper_version = 3.5.8
spark_version = 2.4.5
fluo_version = 1.2.0
fluo_yarn_version = 1.0.0
accumulo_version = 2.0.0
# Specifies if software should be downloaded. If 'False', tarballs of the software above should be in conf/upload/
download_software = True
# Install Hub (for GitHub)
install_hub = True
# The java package to install
java_package=java-1.8.0-openjdk-devel
# The package to use for java 11
# java_package=java-11-openjdk-devel
# Please read the High-Availability section of the README before switching to 'True'
hdfs_ha = False
# Give a logical name for the cluster, all one word no special characters. Required to support HDFS HA.
nameservice_id = muchoshacluster
# number of accumulo tablet servers to run per host. This is only effective when systemd is set to 'True'. Needs changes in
# accumulo-service script inorder to support non-systemd cases.
num_tservers = 1
# If accumulo services are to be run under systemd, set this to 'True'
use_systemd = False
[ec2]
# AWS machine image to use. The default below is for a CentOS 7 image (in us-east-1).
# You may need to change this value if a new image has been released or you are running in a different region.
# Before using this AMI, subscribe to it on the CentOS product page below or launching will fail:
# https://aws.amazon.com/marketplace/pp/B00O7WM7QW
aws_ami = ami-0affd4508a5d2481b
# Type of AWS instance launched by default
default_instance_type = m5d.large
# Type of AWS instance launched for any node running 'worker' service
# Leave default below to use same instance type set by 'default_instance_type' property
worker_instance_type = %(default_instance_type)s
# Enable template mode by selecting a template from conf/templates, in order to leverage your own
# custom EC2 launch requests (optional). See conf/templates/README.md for more information
#cluster_template = example
# VPC to launch instances in (optional)
#vpc_id = vpc-xxxxx
# VPC Subnet to launch instances in (optional)
#subnet_id = subnet-xxxxxx
# Security group ID to launch in (optional)
#security_group_id = sg-xxxxxx
# Name of public key that will be loaded by Amazon on to your EC2 instances.
# You can upload and name your public key using the EC2 Management Console.
# Only the user with this key will be able to SSH to the cluster.
# Name below should be your 'Key pair name' in EC2 and not name of your public key file.
key_name = my_aws_key
# Type of filesystem to format instance storage as.
fstype = ext3
# Force formatting of instance devices, even when it has an existing filesystem.
force_format = no
# Tags to add instances
#instance_tags = key1:value1,key2:value2
# Nodes will be given public IP addresses if true
associate_public_ip = true
# Path to file containing user data that will be executed at launch
#user_data_path = /path/to/user_data
# Shutdown instances after a delay (in minutes). If 0, no shutdown will occur.
shutdown_delay_minutes = 0
# Shutdown behavior of EC2 instances: terminate or stop
shutdown_behavior = stop
[azure]
# Name of the Azure resource group to use. All resources are created within this
# resource group. If a resource group with this name already exists, it will be
# used, otherwise a new resource group with this name will be created and used.
resource_group = accumulo-rg
# Name of the Azure Virtual Network (VNET) to use. If a VNET with this name
# already exists, it will be used; else a new VNET with this name will be created.
vnet = vnet1
# The CIDR prefix used to create the virtual network (VNET).
vnet_cidr = 10.0.0.0/8
# A single subnet is created within the VNET and given the following name.
subnet = subnet1
# The CIDR prefix used for the single subnet within the virtual network.
subnet_cidr = 10.1.0.0/16
# Size of the cluster to provision.
# A virtual machine scale set (VMSS) with these many VMs will be created.
# The minimum allowed size for this is 3 nodes for non-HA & 4 nodes for HA setup
numnodes = 8
# The size of each virtual machine. See the following link for other sizes:
# https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes-general
vm_sku = Standard_D8s_v3
# Each VM will be provisioned with the following type of managed disk
# The azure_disk_device* parameters below specify the Linux device paths Muchos looks for when selecting disks for storage
# The default values below are for using Azure managed disks
azure_disk_device_path = /dev/disk/azure/scsi1
azure_disk_device_pattern = lun*
# If using Azure Lsv2 VMs which have NVME disks for ephemeral storage, use the parameters below instead of the defaults
# azure_disk_device_path = /dev
# azure_disk_device_pattern = nvme*n1
# Type of the data disk attached to the VMSS. 'Standard_LRS' for HDD, 'Premium_LRS' for SSD, 'StandardSSD_LRS' for Standard SSD
managed_disk_type = Standard_LRS
# Number of managed disks provisioned on each VM
data_disk_count = 3
# The size of each managed disk provisioned
disk_size_gb = 128
# Location to mount managed disks in each VM
mount_root = /var/data
# Location where the metrics data will be written
metrics_drive_root = var-data
# Optional proxy VM. If not set, the first node of the cluster will be selected as the proxy.
azure_proxy_host =
location = westus2
# Enable ADLS Gen2 storage configuration. Muchos parameters instance_volumes_input, instance_volumes_adls & adls_storage_type is not required if use_adlsg2 is false.
use_adlsg2 = False
# Storage accounts can be auto generated or manually specified. "|" is used as separator between manual and auto generated storage account names and must be specified
# Manual and Auto generated names are mutually exclusive
#
# Specifying storage accounts manually:
# |abfss://<container-name>@<storage-account-name>.<domain-name>/<folder-name>". Use comma to specify multiple entries
# Example:|abfss://accumulodata@shnawastore1.dfs.core.windows.net/accumulo,abfss://accumulodata@shnawastore2.dfs.core.windows.net/accumulo
#
# Specifying auto-generated storage accounts:
# <Number-of-Storage-Accounts>,<domain-name>|
# Example: 3,dfs.core.windows.net|
instance_volumes_input = 1,dfs.core.windows.net|
# Do not update "instance_volumes_adls", it will be populated dynamically during launch phase of muchos
instance_volumes_adls =
# Type of storage for ADLS Gen2 storage accounts
adls_storage_type = Standard_LRS
# Specify user assigned identity name. "{{ vmss_name }}-ua-msi" will be created if value is not provided
user_assigned_identity =
# Do not update "azure_tenant_id", it will be populated dynamically during launch phase of muchos
azure_tenant_id =
# Do not update "azure_client_id", it will be populated dynamically during launch phase of muchos
azure_client_id =
# Do not update "principal_id", it will be populated dynamically during launch phase of muchos when "use_hdfs = False"
principal_id =
# Optional Azure fileshare to mount on all nodes.
# Path and credentials must be updated to enable this.
#azure_fileshare_mount = /mnt/azure-fileshare
#azure_fileshare = //fileshare-to-mount.file.core.windows.net/path
#azure_fileshare_username = fs_username
#azure_fileshare_password = fs_password
# Optional integration with Azure Log Analytics
# Workspace ID and key must be updated to enable this.
# For details on how to get a workspace ID and key, see the link below
# https://docs.microsoft.com/en-us/azure/azure-monitor/learn/quick-collect-linux-computer#obtain-workspace-id-and-key
az_oms_integration_needed = False
#az_logs_id =
#az_logs_key =
[existing]
# Root of data dirs
mount_root = /var/data
# Data directories on all nodes
data_dirs = /var/data1,/var/data2,/var/data3
# Identifies drives for metrics
metrics_drive_ids = var-data1,var-data2,var-data3
[performance]
# Automatically tune Accumulo, Yarn, and Fluo performance setting by selecting or
# creating a performance profile. Try not to use more memory than each node has
# and leave some space for the OS.
profile=perf-small
# Below are different performance profiles that can be selected. Each profile
# has the same properties with different values.
[perf-small]
# Amount of JVM heap for each tserver
accumulo_tserv_mem=2G
# Amount of data cache for each tserver. Only applies when using Accumulo 1.x
accumulo_dcache_size=768M
# Amount of index cache for each tserver. Only applies when using Accumulo 1.x
accumulo_icache_size=256M
# In memory map size for each tserver. Only applies when using Accumulo 1.x
accumulo_imap_size=512M
# Amount of JVM heap for each Fluo worker
fluo_worker_mem_mb=2048
# Determines the gap between the Yarn memory limit and the java -Xmx setting.
# For example if fluo_worker_mem_mb is set to 2048 and twill_reserve_mem_mb is
# set to 256, then for workers the java -Xmx setting will be set to 2048-256.
# If yarn is killing worker processes because they are using too much memory,
# then consider increasing this setting.
twill_reserve_mem_mb=256
# Number of threads for each Flup worker
fluo_worker_threads=20
# Number of worker to run per node
fluo_worker_instances_multiplier=1
# Max amount of memory for YARN per node
yarn_nm_mem_mb=4096
[perf-medium]
accumulo_tserv_mem=3G
# Accumulo configs below only apply when using Accumulo 1.x
accumulo_dcache_size=1536M
accumulo_icache_size=512M
accumulo_imap_size=512M
fluo_worker_mem_mb=4096
twill_reserve_mem_mb=512
fluo_worker_threads=64
fluo_worker_instances_multiplier=1
yarn_nm_mem_mb=8192
[perf-large]
accumulo_tserv_mem=4G
# Accumulo configs below only apply when using Accumulo 1.x
accumulo_dcache_size=2G
accumulo_icache_size=1G
accumulo_imap_size=512M
fluo_worker_mem_mb=4096
twill_reserve_mem_mb=512
fluo_worker_threads=64
fluo_worker_instances_multiplier=2
yarn_nm_mem_mb=16384
[azd16s]
accumulo_tserv_mem=4G
accumulo_dcache_size=2G
accumulo_icache_size=1G
accumulo_imap_size=512M
fluo_worker_mem_mb=4096
twill_reserve_mem_mb=512
fluo_worker_threads=64
fluo_worker_instances_multiplier=2
yarn_nm_mem_mb=16384
[azd8s]
accumulo_tserv_mem=4G
accumulo_dcache_size=2G
accumulo_icache_size=1G
accumulo_imap_size=512M
fluo_worker_mem_mb=4096
twill_reserve_mem_mb=512
fluo_worker_threads=64
fluo_worker_instances_multiplier=2
yarn_nm_mem_mb=16384
[ansible-vars]
# This section is used to override Ansible variables. Any variable set below will be placed in the hosts file created by Muchos.
# Expected format: variable = value
[nodes]
# If cluster_type=existing, the list of nodes below needs to manually populated with a list of cluster nodes in the following format:
# <Hostname> = <Service1>[,<Service2>,<Service3>]
# Where:
# Hostname = Must be unique. Will be used for hostname in EC2 or should match hostname on your own cluster
# Service = Service to run on node (possible values: zookeeper, namenode, resourcemanager, accumulomaster, client, swarmmanager,
# mesosmaster, worker, fluo, metrics, spark). The following services are required: namenode, resourcemanager,
# accumulomaster, zookeeper & worker
# If cluster_type=azure, the list of nodes below is auto-generated by the launch action e.g. "muchos launch --cluster accumuloclstr"
# For the 'azure' cluster type, it is perfectly normal if the auto-generated list of node names is not sequential
leader1 = namenode,resourcemanager,accumulomaster,zookeeper
leader2 = metrics
worker1 = worker,swarmmanager
worker2 = worker
worker3 = worker
worker4 = worker