diff --git a/README.md b/README.md
index 4339ae88..4dd1fcc8 100644
--- a/README.md
+++ b/README.md
@@ -164,6 +164,9 @@ Under the `azure` section, edit following values as per your configuration
* `numnodes` to change the cluster size in terms of number of nodes deployed
* `vm_sku` to specify the VM size to use. You can choose from the
[available VM sizes](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes-general).
+* `use_adlsg2` to use Azure Data Lake Storage (ADLS) Gen2 as the datastore for Accumulo. See the
+  [ADLS Gen2 documentation](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) and
+  [Setup ADLS Gen2 as datastore for Accumulo](https://accumulo.apache.org/blog/2019/10/15/accumulo-adlsgen2-notes.html).
Within Azure the `nodes` section is auto populated with the hostnames and their default roles.
diff --git a/ansible/accumulo.yml b/ansible/accumulo.yml
index 2af9d670..2352c855 100644
--- a/ansible/accumulo.yml
+++ b/ansible/accumulo.yml
@@ -27,6 +27,16 @@
- import_tasks: roles/accumulo/tasks/init-accumulo.yml
handlers:
- import_tasks: roles/accumulo/handlers/init-accumulo.yml
+- hosts: all:!{{ azure_proxy_host }}
+ tasks:
+ - import_tasks: roles/accumulo/tasks/add-adlsgen2.yml
+ when: accumulo_major_version == '2' and use_adlsg2 == True
+- hosts: accumulomaster[0]
+ tasks:
+ - import_tasks: roles/accumulo/tasks/init-adlsgen2.yml
+ when: accumulo_major_version == '2' and use_adlsg2 == True
+ handlers:
+ - import_tasks: roles/accumulo/handlers/init-adlsgen2.yml
- hosts: accumulo
tasks:
- name: "start accumulo 1.0"
diff --git a/ansible/roles/accumulo/handlers/init-adlsgen2.yml b/ansible/roles/accumulo/handlers/init-adlsgen2.yml
new file mode 100644
index 00000000..06f67b5b
--- /dev/null
+++ b/ansible/roles/accumulo/handlers/init-adlsgen2.yml
@@ -0,0 +1,19 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+- name: "Initialize Apache Accumulo on ADLS Gen2 volume"
+ command: "{{ accumulo_home }}/bin/accumulo init --add-volumes"
diff --git a/ansible/roles/accumulo/tasks/add-adlsgen2.yml b/ansible/roles/accumulo/tasks/add-adlsgen2.yml
new file mode 100644
index 00000000..8056f2dc
--- /dev/null
+++ b/ansible/roles/accumulo/tasks/add-adlsgen2.yml
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+- name: Add ADLS Gen2 volume
+ lineinfile:
+ path: "{{ accumulo_home }}/conf/accumulo.properties"
+ regexp: '^instance.volumes='
+ line: "instance.volumes={{ hdfs_root }}/accumulo,{{ instance_volumes_preferred }}"
diff --git a/ansible/roles/accumulo/tasks/init-adlsgen2.yml b/ansible/roles/accumulo/tasks/init-adlsgen2.yml
new file mode 100644
index 00000000..505b23d4
--- /dev/null
+++ b/ansible/roles/accumulo/tasks/init-adlsgen2.yml
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+- name: "determine if accumulo needs to be initialized on adlsgen2"
+  command: "{{ hadoop_home }}/bin/hdfs dfs -stat {{ instance_volumes_preferred.split(',')[0] }}"  # instance_volumes_preferred is a comma-separated string; stat its first entry
+  register: adlsgen2_stat
+  changed_when: adlsgen2_stat.rc != 0  # a failed stat reports "changed", which fires the notify below
+  failed_when: adlsgen2_stat.rc != 0 and 'No such file or directory' not in adlsgen2_stat.stderr  # only unexpected stat errors fail the play
+  notify: Initialize Apache Accumulo on ADLS Gen2 volume
diff --git a/ansible/roles/accumulo/templates/accumulo-env.sh b/ansible/roles/accumulo/templates/accumulo-env.sh
index a6a1bc65..083007bf 100755
--- a/ansible/roles/accumulo/templates/accumulo-env.sh
+++ b/ansible/roles/accumulo/templates/accumulo-env.sh
@@ -41,6 +41,10 @@ export HADOOP_HOME={{ hadoop_home }}
export HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop"
CLASSPATH="${conf}:${lib}/*:${HADOOP_CONF_DIR}:${ZOOKEEPER_HOME}/*:${HADOOP_HOME}/share/hadoop/client/*"
+{% if use_adlsg2 == True %}
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/lib/*"
+{% endif %}
export CLASSPATH
JAVA_OPTS=("${ACCUMULO_JAVA_OPTS[@]}"
@@ -50,6 +54,9 @@ JAVA_OPTS=("${ACCUMULO_JAVA_OPTS[@]}"
'-XX:OnOutOfMemoryError=kill -9 %p'
'-XX:-OmitStackTraceInFastThrow'
'-Djava.net.preferIPv4Stack=true'
+{% if use_adlsg2 == True %}
+ '-Dorg.wildfly.openssl.path=/usr/lib64'
+{% endif %}
"-Daccumulo.native.lib.path=${lib}/native")
case "$cmd" in
diff --git a/ansible/roles/accumulo/templates/accumulo.properties b/ansible/roles/accumulo/templates/accumulo.properties
index 895cc993..eac3ddf8 100644
--- a/ansible/roles/accumulo/templates/accumulo.properties
+++ b/ansible/roles/accumulo/templates/accumulo.properties
@@ -42,3 +42,9 @@ tserver.server.threads.minimum=64
## The maximum size for each write-ahead log
tserver.walog.max.size=512M
+
+{% if use_adlsg2 == True %}
+general.volume.chooser=org.apache.accumulo.server.fs.PreferredVolumeChooser
+general.custom.volume.preferred.default={{ instance_volumes_preferred }}
+general.custom.volume.preferred.logger={{ hdfs_root }}/accumulo
+{% endif %}
diff --git a/ansible/roles/azure/tasks/create_adlsgen2.yml b/ansible/roles/azure/tasks/create_adlsgen2.yml
new file mode 100644
index 00000000..cd674dd7
--- /dev/null
+++ b/ansible/roles/azure/tasks/create_adlsgen2.yml
@@ -0,0 +1,235 @@
+---
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# These Ansible tasks only run on the client machine where Muchos runs
+# At a high level, the various sections in this file do the following:
+# 1. Create an Azure ADLS Gen2 storage account.
+# 2. Create User Assigned Identity.
+# 3. Assign roles to storage accounts.
+# 4. Create filesystem/container in storage accounts.
+# 5. Update tenant_id, client_id and instance_volumes_preferred in muchos.props.
+# 6. Assign User Assigned Identity to VMSS.
+
+- name: Generate MD5 checksum based on resource_group name, vmss_name and location
+  shell: echo -n {{ (resource_group + vmss_name + location) | quote }}|md5sum|tr -cd "[:alnum:]"|cut -c 1-16|tr '[:upper:]' '[:lower:]'
+  register: StorageAccountMD5  # .stdout holds the first 16 alphanumeric characters of the digest, lowercased
+
+- name: Generate random names for storage account names
+ set_fact:
+ StorageAccountName: "{{ StorageAccountMD5.stdout + 99|random(seed=resource_group)|string + 99|random(seed=vmss_name)|string + 9|random(seed=location)|string }}"
+
+- name: Initialize instance variables
+ set_fact:
+ InstanceVolumesAuto: []
+ InstanceVolumesManual: []
+
+- name: Validate instance_volumes_input
+ fail: msg="Variable instance_volumes_input incorrectly specified, Both Manual and Auto cannot be specified at same time"
+ when: instance_volumes_input.split('|')[0].split(',') != [''] and instance_volumes_input.split('|')[1].split(',') != ['']
+
+- name: Assign manual or autogenerated volumes
+ set_fact:
+ InstanceVolumesTemp: "{{ instance_volumes_input.split('|')[0].split(',')|list if instance_volumes_input.split('|')[0].split(',') != [''] else instance_volumes_input.split('|')[1].split(',')|list }}"
+
+- name: Retrieve sequence end number to get the number of storage accounts
+ set_fact:
+ InstanceVolumesEndSequence: "{{ '1' if instance_volumes_input.split('|')[0].split(',') == [''] else InstanceVolumesTemp[0]|int }}"
+
+- name: Generate names for Storage Accounts
+ set_fact:
+ InstanceVolumesAuto: "{{ InstanceVolumesAuto + ['abfss://'+'accumulodata'+'@'+StorageAccountName+item+'.'+InstanceVolumesTemp[1]+'/accumulo'] }}"
+ with_sequence: start=1 end={{ InstanceVolumesEndSequence|int }}
+ when: InstanceVolumesTemp[0]|int != 0
+
+- name: Retrieve ABFSS values when specified manually
+ set_fact:
+ InstanceVolumesManual: "{{ InstanceVolumesManual + [ item ] }}"
+ loop:
+ "{{ InstanceVolumesTemp }}"
+ when: item.split('://')[0] == 'abfss' and instance_volumes_input.split('|')[0].split(',') == ['']
+
+# This is final list of instance volumes
+- name: Assign variables for autogeneration or manual for storage account creation
+ set_fact:
+ InstanceVolumes: "{{ InstanceVolumesManual if instance_volumes_input.split('|')[0].split(',') == [''] else InstanceVolumesAuto }}"
+
+- name: Update instance_volumes_preferred in muchos.props
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ regexp: '^instance_volumes_preferred\s*=\s*|^[#]instance_volumes_preferred\s*=\s*'
+ line: "instance_volumes_preferred = {{ InstanceVolumes|join(',') }}"
+
+# Not registering variable because storage values are not visible immediately
+- name: Create ADLS Gen2 storage account using REST API
+  azure_rm_resource:
+    resource_group: "{{ resource_group }}"
+    provider: Storage
+    resource_type: storageAccounts
+    resource_name: "{{ item.split('@')[1].split('.')[0] }}"  # account name is the text between '@' and the first '.'
+    api_version: '2019-04-01'
+    idempotency: true
+    state: present
+    body:
+      sku:
+        name: "{{ adls_storage_type }}"
+      kind: StorageV2
+      properties:
+        isHnsEnabled: true
+      location: "{{ location }}"
+  loop:
+    "{{ InstanceVolumes }}"
+
+# Creating User Assigned identity with vmss_name suffixed by ua-msi if not specified in muchos.props
+# Not registering variable because user identity values are not visible immediately
+- name: Create User Assigned Identity
+ azure_rm_resource:
+ resource_group: "{{ resource_group }}"
+ provider: ManagedIdentity
+ resource_type: userAssignedIdentities
+ resource_name: "{{ user_assigned_identity if user_assigned_identity !='' else vmss_name + '-ua-msi' }}"
+ api_version: '2018-11-30'
+ idempotency: yes
+ state: present
+ body:
+ location: "{{ location }}"
+
+# Retrieving facts about User Assigned Identity
+- name: Get facts for User Assigned Identity
+  azure_rm_resource_facts:
+    resource_group: "{{ resource_group }}"
+    provider: ManagedIdentity
+    resource_type: userAssignedIdentities
+    resource_name: "{{ user_assigned_identity if user_assigned_identity !='' else vmss_name + '-ua-msi' }}"
+    api_version: '2018-11-30'
+  register: UserAssignedIdentityInfo
+  retries: 20
+  delay: 15
+  until: UserAssignedIdentityInfo.response|default([])|map(attribute='properties')|map(attribute='principalId')|join('')|length > 0  # keep retrying until a non-empty principalId comes back
+
+- name: Update principal_id in muchos.props
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ regexp: '^principal_id\s*=\s*|^[#]principal_id\s*=\s*'
+ line: "principal_id = {{ UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|join('') }}"
+
+# This will be used to assign the MSI for VMSS
+- name: Format User Assigned Identity for API
+ set_fact:
+ UserAssignedIdentityArr: "{{ UserAssignedIdentityInfo.response|default({})|map(attribute='id')|map('regex_replace','^(.*)$','{\"\\1\":{}}')|list}}"
+
+# Retrieve facts about role assignment
+- name: Get role definition id for "Storage Blob Data Contributor"
+ azure_rm_resource_facts:
+ resource_group: "{{ resource_group }}"
+ provider: Authorization
+ resource_type: roleDefinitions
+ resource_name: ba92f5b4-2d11-453d-a403-e96b0029c9fe
+ api_version: '2015-07-01'
+ register: RoleDefinitionInfo
+
+# Retrieve storage account information.
+- name: Check if the storage accounts is visible
+ azure_rm_storageaccount_facts:
+ resource_group: "{{ resource_group }}"
+ name: "{{ item.split('@')[1].split('.')[0] }}"
+ register: StorageAccountsInfo
+ retries: 20
+ delay: 15
+ until: StorageAccountsInfo.storageaccounts|sum(start=[])|map(attribute='id')|join('') is defined
+ loop:
+ "{{ InstanceVolumes }}"
+
+# Retrieve the ids of the storage accounts created -- used for role assignments
+- name: Get the id of storage accounts created
+ set_fact:
+ StorageAccountsId: "{{StorageAccountsInfo.results|map(attribute='ansible_facts')|map(attribute='azure_storageaccounts')|sum(start=[])|map(attribute='id')|list|unique }}"
+
+# Adding this module since role assignment fails if it already exists.
+- name: Get facts about role assignment
+ azure_rm_roleassignment_facts:
+ scope: "{{ item }}"
+ assignee: "{{ UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|list|join('') }}"
+ role_definition_id: "{{ RoleDefinitionInfo.response|map(attribute='id')|list|join('') }}"
+ register: RoleAssignmentResults
+ retries: 20
+ delay: 15
+ until: UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|join('') is defined and RoleDefinitionInfo.response|map(attribute='id')|join('') is defined
+ loop:
+ "{{ StorageAccountsId }}"
+
+- name: Set fact for getting storage accounts that have assigned roles
+  set_fact:
+    StorageAccountRoles: "{{ ((StorageAccountRoles | default([])) + (item | map(attribute='scope') | list)) | unique }}"  # accumulate scopes across iterations instead of keeping only the last account's
+  no_log: True
+  loop:
+    "{{RoleAssignmentResults.results|map(attribute='roleassignments')|list }}"
+
+# This retry logic is needed due to race condition between storage account create complete and role assignment
+- name: Create a role assignment
+ azure_rm_roleassignment:
+ scope: "{{ item }}"
+ assignee_object_id: "{{ UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|list|join('') }}"
+ role_definition_id: "{{ RoleDefinitionInfo.response|map(attribute='id')|list|join('') }}"
+ state: present
+ retries: 30
+ delay: 15
+ register: roleassignresult
+ until: roleassignresult is succeeded
+ loop:
+ "{{ StorageAccountsId }}"
+ when: item not in StorageAccountRoles
+
+# This retry logic is needed due to race condition between storage account creation and creating filesystem
+- name: Create container/Filesystem on ADLS Gen2
+ azure_rm_storageblob:
+ resource_group: "{{ resource_group }}"
+ storage_account_name: "{{ item.split('@')[1].split('.')[0] }}"
+ container: "{{ item.split('@')[0].split('://')[1] }}"
+ retries: 30
+ delay: 15
+ register: createfsresult
+ until: createfsresult is succeeded and (createfsresult.changed == False or (createfsresult.changed == True and createfsresult.container|length > 0))
+ loop:
+ "{{ InstanceVolumes }}"
+
+# Retrieve tenantId for core-site.xml
+- name: Update tenantId in muchos.props
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ regexp: '^azure_tenant_id\s*=\s*|^[#]azure_tenant_id\s*=\s*'
+ line: "azure_tenant_id = {{ UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='tenantId')|list|join('') }}"
+
+# Retrieve clientId for core-site.xml
+- name: Update clientid in muchos.props
+ lineinfile:
+ path: "{{ deploy_path }}/conf/muchos.props"
+ regexp: '^azure_client_id\s*=\s*|^[#]azure_client_id\s*=\s*'
+ line: "azure_client_id = {{ UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='clientId')|list|join('') }}"
+
+- name: Assign User Assigned Identity to VMSS
+ azure_rm_resource:
+ resource_group: "{{ resource_group }}"
+ provider: Compute
+ resource_type: virtualMachineScaleSets
+ resource_name: "{{ vmss_name }}"
+ api_version: '2019-03-01'
+ body:
+ location: "{{ location }}"
+ identity:
+ type: UserAssigned
+ userAssignedIdentities: "{{ UserAssignedIdentityArr|join('') }}"
diff --git a/ansible/roles/azure/tasks/main.yml b/ansible/roles/azure/tasks/main.yml
index 6ec80d75..a846779c 100644
--- a/ansible/roles/azure/tasks/main.yml
+++ b/ansible/roles/azure/tasks/main.yml
@@ -19,3 +19,5 @@
# tasks file for azure
- import_tasks: create_vmss.yml
+- import_tasks: create_adlsgen2.yml
+ when: use_adlsg2 == True
diff --git a/ansible/roles/hadoop-ha/tasks/main.yml b/ansible/roles/hadoop-ha/tasks/main.yml
index 7f456c81..dd92ae19 100644
--- a/ansible/roles/hadoop-ha/tasks/main.yml
+++ b/ansible/roles/hadoop-ha/tasks/main.yml
@@ -54,3 +54,11 @@
replace: "export HADOOP_LOG_DIR={{ worker_data_dirs[0] }}/logs/hadoop"
- name: "Create hadoop log dir"
file: path={{ worker_data_dirs[0] }}/logs/hadoop state=directory
+- name: Insert HADOOP_OPTIONAL_TOOLS & HADOOP_OPTS in hadoop-env.sh
+ blockinfile:
+ path: "{{ hadoop_home }}/etc/hadoop/hadoop-env.sh"
+ insertafter: EOF
+ block: |
+ export HADOOP_OPTIONAL_TOOLS=hadoop-azure
+ export HADOOP_OPTS="-Dorg.wildfly.openssl.path=/usr/lib64 ${HADOOP_OPTS}"
+ when: hadoop_major_version == '3' and use_adlsg2 == True
diff --git a/ansible/roles/hadoop-ha/templates/core-site.xml b/ansible/roles/hadoop-ha/templates/core-site.xml
index dd54827d..d717c5c2 100644
--- a/ansible/roles/hadoop-ha/templates/core-site.xml
+++ b/ansible/roles/hadoop-ha/templates/core-site.xml
@@ -38,4 +38,34 @@
ha.zookeeper.quorum
{{ zookeeper_connect }}
+{% if use_adlsg2 == True %}
+
+ fs.azure.account.auth.type
+ OAuth
+
+
+ fs.azure.account.oauth.provider.type
+ org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider
+
+
+ fs.azure.account.oauth2.msi.tenant
+ {{ azure_tenant_id}}
+
+
+ fs.azure.account.oauth2.client.id
+ {{ azure_client_id }}
+
+
+ fs.azure.use.upn
+ true
+
+
+ fs.azure.identity.transformer.service.principal.substitution.list
+ *
+
+
+ fs.azure.identity.transformer.service.principal.id
+ {{ principal_id }}
+
+{% endif %}
diff --git a/ansible/roles/hadoop-ha/templates/mapred-site.xml b/ansible/roles/hadoop-ha/templates/mapred-site.xml
index c6be0cec..c3def168 100644
--- a/ansible/roles/hadoop-ha/templates/mapred-site.xml
+++ b/ansible/roles/hadoop-ha/templates/mapred-site.xml
@@ -54,4 +54,10 @@
HADOOP_MAPRED_HOME={{ hadoop_home }}
{% endif %}
+{% if use_adlsg2 == True %}
+
+ mapreduce.application.classpath
+ $HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/client/*
+
+{% endif %}
diff --git a/ansible/roles/hadoop-ha/templates/yarn-site.xml b/ansible/roles/hadoop-ha/templates/yarn-site.xml
index 85033a6b..eb45896f 100644
--- a/ansible/roles/hadoop-ha/templates/yarn-site.xml
+++ b/ansible/roles/hadoop-ha/templates/yarn-site.xml
@@ -93,4 +93,10 @@
twill.java.reserved.memory.mb
{{ twill_reserve_mem_mb }}
+ {% if use_adlsg2 == True %}
+
+ yarn.application.classpath
+ ${HADOOP_HOME}/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/lib/*,${HADOOP_HOME}/share/hadoop/common/lib/*,${HADOOP_HOME}/share/hadoop/yarn/*,${HADOOP_HOME}/share/hadoop/yarn/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/*,${HADOOP_HOME}/share/hadoop/common/*,${HADOOP_HOME}/share/hadoop/mapreduce/*,${HADOOP_HOME}/share/hadoop/mapreduce/lib/*,${HADOOP_HOME}/share/hadoop/client/*
+
+ {% endif %}
diff --git a/ansible/roles/hadoop/tasks/main.yml b/ansible/roles/hadoop/tasks/main.yml
index d0219b31..a6733a9c 100644
--- a/ansible/roles/hadoop/tasks/main.yml
+++ b/ansible/roles/hadoop/tasks/main.yml
@@ -55,3 +55,11 @@
- name: "Create hadoop log dir"
file: path={{ worker_data_dirs[0] }}/logs/hadoop state=directory
+- name: Insert HADOOP_OPTIONAL_TOOLS & HADOOP_OPTS in hadoop-env.sh
+ blockinfile:
+ path: "{{ hadoop_home }}/etc/hadoop/hadoop-env.sh"
+ insertafter: EOF
+ block: |
+ export HADOOP_OPTIONAL_TOOLS=hadoop-azure
+ export HADOOP_OPTS="-Dorg.wildfly.openssl.path=/usr/lib64 ${HADOOP_OPTS}"
+ when: hadoop_major_version == '3' and use_adlsg2 == True
diff --git a/ansible/roles/hadoop/templates/core-site.xml b/ansible/roles/hadoop/templates/core-site.xml
index 56232aa7..c5f15974 100644
--- a/ansible/roles/hadoop/templates/core-site.xml
+++ b/ansible/roles/hadoop/templates/core-site.xml
@@ -36,4 +36,34 @@
dfs.domain.socket.path
/var/lib/hadoop-hdfs/dn_socket
+{% if use_adlsg2 == True %}
+
+ fs.azure.account.auth.type
+ OAuth
+
+
+ fs.azure.account.oauth.provider.type
+ org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider
+
+
+ fs.azure.account.oauth2.msi.tenant
+ {{ azure_tenant_id}}
+
+
+ fs.azure.account.oauth2.client.id
+ {{ azure_client_id }}
+
+
+ fs.azure.use.upn
+ true
+
+
+ fs.azure.identity.transformer.service.principal.substitution.list
+ *
+
+
+ fs.azure.identity.transformer.service.principal.id
+ {{ principal_id }}
+
+{% endif %}
diff --git a/ansible/roles/hadoop/templates/mapred-site.xml b/ansible/roles/hadoop/templates/mapred-site.xml
index a95eb775..7ecf751d 100644
--- a/ansible/roles/hadoop/templates/mapred-site.xml
+++ b/ansible/roles/hadoop/templates/mapred-site.xml
@@ -56,4 +56,10 @@
HADOOP_MAPRED_HOME={{ hadoop_home }}
{% endif %}
+{% if use_adlsg2 == True %}
+
+ mapreduce.application.classpath
+ $HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/client/*
+
+{% endif %}
diff --git a/ansible/roles/hadoop/templates/yarn-site.xml b/ansible/roles/hadoop/templates/yarn-site.xml
index ac62174d..847f98b0 100644
--- a/ansible/roles/hadoop/templates/yarn-site.xml
+++ b/ansible/roles/hadoop/templates/yarn-site.xml
@@ -82,4 +82,10 @@
twill.java.reserved.memory.mb
{{ twill_reserve_mem_mb }}
+ {% if use_adlsg2 == True %}
+
+ yarn.application.classpath
+ ${HADOOP_HOME}/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/lib/*,${HADOOP_HOME}/share/hadoop/common/lib/*,${HADOOP_HOME}/share/hadoop/yarn/*,${HADOOP_HOME}/share/hadoop/yarn/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/*,${HADOOP_HOME}/share/hadoop/common/*,${HADOOP_HOME}/share/hadoop/mapreduce/*,${HADOOP_HOME}/share/hadoop/mapreduce/lib/*,${HADOOP_HOME}/share/hadoop/client/*
+
+ {% endif %}
diff --git a/conf/muchos.props.example b/conf/muchos.props.example
index b34a4374..41cb5d5d 100644
--- a/conf/muchos.props.example
+++ b/conf/muchos.props.example
@@ -129,6 +129,31 @@ metrics_drive_root = var-data
# Optional proxy VM. If not set, the first node of the cluster will be selected as the proxy.
azure_proxy_host =
location = westus2
+# Enable ADLS Gen2 storage configuration. Muchos parameters instance_volumes_input, instance_volumes_preferred & adls_storage_type are not required if use_adlsg2 is False.
+use_adlsg2 = False
+# Storage accounts can be auto generated or manually specified. "|" is used as separator between manual and auto generated storage account names and must be specified
+# Manual and Auto generated names are mutually exclusive
+#
+# Specifying storage accounts manually:
+# |abfss://<container-name>@<storage-account-name>.<domain-name>/<path>. Use comma to specify multiple entries
+# Example:|abfss://accumulodata@shnawastore1.dfs.core.windows.net/accumulo,abfss://accumulodata@shnawastore2.dfs.core.windows.net/accumulo
+#
+# Specifying auto-generated storage accounts:
+# <number-of-storage-accounts>,<domain-name>|
+# Example: 3,dfs.core.windows.net|
+instance_volumes_input = 1,dfs.core.windows.net|
+# Do not update "instance_volumes_preferred", it will be populated dynamically during launch phase of muchos
+instance_volumes_preferred =
+# Type of storage for ADLS Gen2 storage accounts
+adls_storage_type = Standard_LRS
+# Specify user assigned identity name. "{{ vmss_name }}-ua-msi" will be created if value is not provided
+user_assigned_identity =
+# Do not update "azure_tenant_id", it will be populated dynamically during launch phase of muchos
+azure_tenant_id =
+# Do not update "azure_client_id", it will be populated dynamically during launch phase of muchos
+azure_client_id =
+# Do not update "principal_id", it will be populated dynamically during launch phase of muchos when "use_adlsg2 = True"
+principal_id =
# Optional Azure fileshare to mount on all nodes.
# Path and credentials must be updated to enable this.
#azure_fileshare_mount = /mnt/azure-fileshare
diff --git a/lib/muchos/config/azure.py b/lib/muchos/config/azure.py
index fe93b55f..86c584ce 100644
--- a/lib/muchos/config/azure.py
+++ b/lib/muchos/config/azure.py
@@ -104,4 +104,29 @@ def logs_id(self):
@ansible_host_var(name='az_logs_key')
@default(None)
def logs_key(self):
- return self.get('azure', 'az_logs_key')
\ No newline at end of file
+ return self.get('azure', 'az_logs_key')
+
+ @ansible_host_var(name='use_adlsg2')
+ @default(None)
+ def use_adlsg2(self):
+ return self.get('azure', 'use_adlsg2')
+
+ @ansible_host_var(name='azure_tenant_id')
+ @default(None)
+ def azure_tenant_id(self):
+ return self.get('azure', 'azure_tenant_id')
+
+ @ansible_host_var(name='azure_client_id')
+ @default(None)
+ def azure_client_id(self):
+ return self.get('azure', 'azure_client_id')
+
+ @ansible_host_var(name='principal_id')
+ @default(None)
+ def principal_id(self):
+ return self.get('azure', 'principal_id')
+
+ @ansible_host_var(name='instance_volumes_preferred')
+ @default(None)
+ def instance_volumes_preferred(self):
+ return self.get('azure', 'instance_volumes_preferred')