diff --git a/README.md b/README.md index 4339ae88..4dd1fcc8 100644 --- a/README.md +++ b/README.md @@ -164,6 +164,9 @@ Under the `azure` section, edit following values as per your configuration * `numnodes` to change the cluster size in terms of number of nodes deployed * `vm_sku` to specify the VM size to use. You can choose from the [available VM sizes](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes-general). +* `use_adlsg2` to use Azure Data Lake Storage(ADLS) Gen2 as datastore for Accumulo + [ADLS Gen2 Doc](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction). + [Setup ADLS Gen2 as datastore for Accumulo](https://accumulo.apache.org/blog/2019/10/15/accumulo-adlsgen2-notes.html). Within Azure the `nodes` section is auto populated with the hostnames and their default roles. diff --git a/ansible/accumulo.yml b/ansible/accumulo.yml index 2af9d670..2352c855 100644 --- a/ansible/accumulo.yml +++ b/ansible/accumulo.yml @@ -27,6 +27,16 @@ - import_tasks: roles/accumulo/tasks/init-accumulo.yml handlers: - import_tasks: roles/accumulo/handlers/init-accumulo.yml +- hosts: all:!{{ azure_proxy_host }} + tasks: + - import_tasks: roles/accumulo/tasks/add-adlsgen2.yml + when: accumulo_major_version == '2' and use_adlsg2 == True +- hosts: accumulomaster[0] + tasks: + - import_tasks: roles/accumulo/tasks/init-adlsgen2.yml + when: accumulo_major_version == '2' and use_adlsg2 == True + handlers: + - import_tasks: roles/accumulo/handlers/init-adlsgen2.yml - hosts: accumulo tasks: - name: "start accumulo 1.0" diff --git a/ansible/roles/accumulo/handlers/init-adlsgen2.yml b/ansible/roles/accumulo/handlers/init-adlsgen2.yml new file mode 100644 index 00000000..06f67b5b --- /dev/null +++ b/ansible/roles/accumulo/handlers/init-adlsgen2.yml @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +- name: "Initialize Apache Accumulo on ADLS Gen2 volume" + command: "{{ accumulo_home }}/bin/accumulo init --add-volumes" diff --git a/ansible/roles/accumulo/tasks/add-adlsgen2.yml b/ansible/roles/accumulo/tasks/add-adlsgen2.yml new file mode 100644 index 00000000..8056f2dc --- /dev/null +++ b/ansible/roles/accumulo/tasks/add-adlsgen2.yml @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +- name: Add ADLS Gen2 volume + lineinfile: + path: "{{ accumulo_home }}/conf/accumulo.properties" + regexp: '^instance.volumes=' + line: "instance.volumes={{ hdfs_root }}/accumulo,{{ instance_volumes_preferred }}" diff --git a/ansible/roles/accumulo/tasks/init-adlsgen2.yml b/ansible/roles/accumulo/tasks/init-adlsgen2.yml new file mode 100644 index 00000000..505b23d4 --- /dev/null +++ b/ansible/roles/accumulo/tasks/init-adlsgen2.yml @@ -0,0 +1,22 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +- name: "determine if accumulo needs to be initialized on adlsgen2" + command: "{{ hadoop_home }}/bin/hdfs dfs -stat {{ instance_volumes_preferred.split(',')[0] }}" # instance_volumes_preferred is a comma-separated string; stat only its first volume + register: adlsgen2_stat + changed_when: adlsgen2_stat.rc != 0 + failed_when: adlsgen2_stat.rc != 0 and 'No such file or directory' not in adlsgen2_stat.stderr + notify: Initialize Apache Accumulo on ADLS Gen2 volume diff --git a/ansible/roles/accumulo/templates/accumulo-env.sh b/ansible/roles/accumulo/templates/accumulo-env.sh index a6a1bc65..083007bf 100755 --- a/ansible/roles/accumulo/templates/accumulo-env.sh +++ b/ansible/roles/accumulo/templates/accumulo-env.sh @@ -41,6 +41,10 @@ export HADOOP_HOME={{ hadoop_home }} export HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop" CLASSPATH="${conf}:${lib}/*:${HADOOP_CONF_DIR}:${ZOOKEEPER_HOME}/*:${HADOOP_HOME}/share/hadoop/client/*" +{% if use_adlsg2 == True %} +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/lib/*" +{% endif %} export CLASSPATH JAVA_OPTS=("${ACCUMULO_JAVA_OPTS[@]}" @@ -50,6 +54,9 @@ JAVA_OPTS=("${ACCUMULO_JAVA_OPTS[@]}" '-XX:OnOutOfMemoryError=kill -9 %p' '-XX:-OmitStackTraceInFastThrow' '-Djava.net.preferIPv4Stack=true' +{% if use_adlsg2 == True %} + '-Dorg.wildfly.openssl.path=/usr/lib64' +{% endif %} "-Daccumulo.native.lib.path=${lib}/native") case "$cmd" in diff --git a/ansible/roles/accumulo/templates/accumulo.properties b/ansible/roles/accumulo/templates/accumulo.properties index 895cc993..eac3ddf8 100644 --- a/ansible/roles/accumulo/templates/accumulo.properties +++ b/ansible/roles/accumulo/templates/accumulo.properties @@ -42,3 +42,9 @@ tserver.server.threads.minimum=64 ## The maximum size for each write-ahead log tserver.walog.max.size=512M + +{% if use_adlsg2 == True %} +general.volume.chooser=org.apache.accumulo.server.fs.PreferredVolumeChooser +general.custom.volume.preferred.default={{ instance_volumes_preferred }} +general.custom.volume.preferred.logger={{ 
hdfs_root }}/accumulo +{% endif %} diff --git a/ansible/roles/azure/tasks/create_adlsgen2.yml b/ansible/roles/azure/tasks/create_adlsgen2.yml new file mode 100644 index 00000000..cd674dd7 --- /dev/null +++ b/ansible/roles/azure/tasks/create_adlsgen2.yml @@ -0,0 +1,235 @@ +--- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# These Ansible tasks only run on the client machine where Muchos runs. +# At a high level, the various sections in this file do the following: +# 1. Create an Azure ADLS Gen2 storage account. +# 2. Create User Assigned Identity. +# 3. Assign roles to storage accounts. +# 4. Create filesystem/container in storage accounts. +# 5. Update azure_tenant_id, azure_client_id, principal_id and instance_volumes_preferred in muchos.props. +# 6. Assign User Assigned Identity to VMSS. 
+ +- name: Generate MD5 checksum based on resource_group name, vmss_name and cluster name + shell: echo -n {{ resource_group + vmss_name + location }}|md5sum|tr -cd "[:alnum:]"|cut -c 1-16|tr '[:upper:]' '[:lower:]' + register: StorageAccountMD5 + +- name: Generate random names for storage account names + set_fact: + StorageAccountName: "{{ StorageAccountMD5.stdout + 99|random(seed=resource_group)|string + 99|random(seed=vmss_name)|string + 9|random(seed=location)|string }}" + +- name: Initialize instance variables + set_fact: + InstanceVolumesAuto: [] + InstanceVolumesManual: [] + +- name: Validate instance_volumes_input + fail: msg="Variable instance_volumes_input incorrectly specified, Both Manual and Auto cannot be specified at same time" + when: instance_volumes_input.split('|')[0].split(',') != [''] and instance_volumes_input.split('|')[1].split(',') != [''] + +- name: Assign manual or autogenerated volumes + set_fact: + InstanceVolumesTemp: "{{ instance_volumes_input.split('|')[0].split(',')|list if instance_volumes_input.split('|')[0].split(',') != [''] else instance_volumes_input.split('|')[1].split(',')|list }}" + +- name: Retrieve sequence end number to get the number of storage accounts + set_fact: + InstanceVolumesEndSequence: "{{ '1' if instance_volumes_input.split('|')[0].split(',') == [''] else InstanceVolumesTemp[0]|int }}" + +- name: Generate names for Storage Accounts + set_fact: + InstanceVolumesAuto: "{{ InstanceVolumesAuto + ['abfss://'+'accumulodata'+'@'+StorageAccountName+item+'.'+InstanceVolumesTemp[1]+'/accumulo'] }}" + with_sequence: start=1 end={{ InstanceVolumesEndSequence|int }} + when: InstanceVolumesTemp[0]|int != 0 + +- name: Retrieve ABFSS values when specified manually + set_fact: + InstanceVolumesManual: "{{ InstanceVolumesManual + [ item ] }}" + loop: + "{{ InstanceVolumesTemp }}" + when: item.split('://')[0] == 'abfss' and instance_volumes_input.split('|')[0].split(',') == [''] + +# This is final list of instance volumes +- 
name: Assign variables for autogeneration or manual for storage account creation + set_fact: + InstanceVolumes: "{{ InstanceVolumesManual if instance_volumes_input.split('|')[0].split(',') == [''] else InstanceVolumesAuto }}" + +- name: Update instance_volumes_preferred in muchos.props + lineinfile: + path: "{{ deploy_path }}/conf/muchos.props" + regexp: '^instance_volumes_preferred\s*=\s*|^[#]instance_volumes_preferred\s*=\s*' + line: "instance_volumes_preferred = {{ InstanceVolumes|join(',') }}" + +# Not registering variable because storage values are not visible immediately +- name: Create ADLS Gen2 storage acount using REST API + azure_rm_resource: + resource_group: "{{ resource_group }}" + provider: Storage + resource_type: storageAccounts + resource_name: "{{ item.split('@')[1].split('.')[0] }}" + api_version: '2019-04-01' + idempotency: yes + state: present + body: + sku: + name: "{{ adls_storage_type }}" + kind: StorageV2 + properties: + isHnsEnabled: yes + location: "{{ location }}" + loop: + "{{ InstanceVolumes }}" + +# Creating User Assigned identity with vmss_name suffixed by ua-msi if not specified in muchos.props +# Not registering variable because user identity values are not visible immediately +- name: Create User Assigned Identity + azure_rm_resource: + resource_group: "{{ resource_group }}" + provider: ManagedIdentity + resource_type: userAssignedIdentities + resource_name: "{{ user_assigned_identity if user_assigned_identity !='' else vmss_name + '-ua-msi' }}" + api_version: '2018-11-30' + idempotency: yes + state: present + body: + location: "{{ location }}" + +# Retrieving facts about User Assigned Identity +- name: Get facts for User Assigned Identity + azure_rm_resource_facts: + resource_group: "{{ resource_group }}" + provider: ManagedIdentity + resource_type: userAssignedIdentities + resource_name: "{{ user_assigned_identity if user_assigned_identity !='' else vmss_name + '-ua-msi' }}" + api_version: '2018-11-30' + register: 
UserAssignedIdentityInfo + retries: 20 + delay: 15 + until: UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|join('') is defined + +- name: Update principal_id in muchos.props + lineinfile: + path: "{{ deploy_path }}/conf/muchos.props" + regexp: '^principal_id\s*=\s*|^[#]principal_id\s*=\s*' + line: "principal_id = {{ UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|join('') }}" + +# This will be used to assign the MSI for VMSS +- name: Format User Assigned Identity for API + set_fact: + UserAssignedIdentityArr: "{{ UserAssignedIdentityInfo.response|default({})|map(attribute='id')|map('regex_replace','^(.*)$','{\"\\1\":{}}')|list}}" + +# Retrieve facts about the role definition +- name: Get role definition id for "Storage Blob Data Contributor" + azure_rm_resource_facts: + resource_group: "{{ resource_group }}" + provider: Authorization + resource_type: roleDefinitions + resource_name: ba92f5b4-2d11-453d-a403-e96b0029c9fe + api_version: '2015-07-01' + register: RoleDefinitionInfo + +# Retrieve storage account information. +- name: Check if the storage accounts is visible + azure_rm_storageaccount_facts: + resource_group: "{{ resource_group }}" + name: "{{ item.split('@')[1].split('.')[0] }}" + register: StorageAccountsInfo + retries: 20 + delay: 15 + until: StorageAccountsInfo.storageaccounts|sum(start=[])|map(attribute='id')|join('') is defined + loop: + "{{ InstanceVolumes }}" + +# Retrieve the ids of the storage accounts created -- used for role assignments +- name: Get the id of storage accounts created + set_fact: + StorageAccountsId: "{{StorageAccountsInfo.results|map(attribute='ansible_facts')|map(attribute='azure_storageaccounts')|sum(start=[])|map(attribute='id')|list|unique }}" + +# Adding this module since role assignment fails if it already exists. 
+- name: Get facts about role assignment + azure_rm_roleassignment_facts: + scope: "{{ item }}" + assignee: "{{ UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|list|join('') }}" + role_definition_id: "{{ RoleDefinitionInfo.response|map(attribute='id')|list|join('') }}" + register: RoleAssignmentResults + retries: 20 + delay: 15 + until: UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|join('') is defined and RoleDefinitionInfo.response|map(attribute='id')|join('') is defined + loop: + "{{ StorageAccountsId }}" + +- name: Set fact for getting storage accounts that have assigned roles + set_fact: + StorageAccountRoles: "{{ item|map(attribute='scope')|list|unique }}" + no_log: True + loop: + "{{RoleAssignmentResults.results|map(attribute='roleassignments')|list }}" + +# This retry logic is needed due to race condition between storage account create complete and role assignment +- name: Create a role assignment + azure_rm_roleassignment: + scope: "{{ item }}" + assignee_object_id: "{{ UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='principalId')|list|join('') }}" + role_definition_id: "{{ RoleDefinitionInfo.response|map(attribute='id')|list|join('') }}" + state: present + retries: 30 + delay: 15 + register: roleassignresult + until: roleassignresult is succeeded + loop: + "{{ StorageAccountsId }}" + when: item not in StorageAccountRoles + +# This retry logic is needed due to race condition between storage account creation and creating filesystem +- name: Create container/Filesystem on ADLS Gen2 + azure_rm_storageblob: + resource_group: "{{ resource_group }}" + storage_account_name: "{{ item.split('@')[1].split('.')[0] }}" + container: "{{ item.split('@')[0].split('://')[1] }}" + retries: 30 + delay: 15 + register: createfsresult + until: createfsresult is succeeded and (createfsresult.changed == False or (createfsresult.changed == True and 
createfsresult.container|length > 0)) + loop: + "{{ InstanceVolumes }}" + +# Retrieve tenantId for core-site.xml +- name: Update tenantId in muchos.props + lineinfile: + path: "{{ deploy_path }}/conf/muchos.props" + regexp: '^azure_tenant_id\s*=\s*|^[#]azure_tenant_id\s*=\s*' + line: "azure_tenant_id = {{ UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='tenantId')|list|join('') }}" + +# Retrieve clientId for core-site.xml +- name: Update clientid in muchos.props + lineinfile: + path: "{{ deploy_path }}/conf/muchos.props" + regexp: '^azure_client_id\s*=\s*|^[#]azure_client_id\s*=\s*' + line: "azure_client_id = {{ UserAssignedIdentityInfo.response|map(attribute='properties')|map(attribute='clientId')|list|join('') }}" + +- name: Assign User Assigned Identity to VMSS + azure_rm_resource: + resource_group: "{{ resource_group }}" + provider: Compute + resource_type: virtualMachineScaleSets + resource_name: "{{ vmss_name }}" + api_version: '2019-03-01' + body: + location: "{{ location }}" + identity: + type: UserAssigned + userAssignedIdentities: "{{ UserAssignedIdentityArr|join('') }}" diff --git a/ansible/roles/azure/tasks/main.yml b/ansible/roles/azure/tasks/main.yml index 6ec80d75..a846779c 100644 --- a/ansible/roles/azure/tasks/main.yml +++ b/ansible/roles/azure/tasks/main.yml @@ -19,3 +19,5 @@ # tasks file for azure - import_tasks: create_vmss.yml +- import_tasks: create_adlsgen2.yml + when: use_adlsg2 == True diff --git a/ansible/roles/hadoop-ha/tasks/main.yml b/ansible/roles/hadoop-ha/tasks/main.yml index 7f456c81..dd92ae19 100644 --- a/ansible/roles/hadoop-ha/tasks/main.yml +++ b/ansible/roles/hadoop-ha/tasks/main.yml @@ -54,3 +54,11 @@ replace: "export HADOOP_LOG_DIR={{ worker_data_dirs[0] }}/logs/hadoop" - name: "Create hadoop log dir" file: path={{ worker_data_dirs[0] }}/logs/hadoop state=directory +- name: Insert HADOOP_OPTIONAL_TOOLS & HADOOP_OPTS in hadoop-env.sh + blockinfile: + path: "{{ hadoop_home 
}}/etc/hadoop/hadoop-env.sh" + insertafter: EOF + block: | + export HADOOP_OPTIONAL_TOOLS=hadoop-azure + export HADOOP_OPTS="-Dorg.wildfly.openssl.path=/usr/lib64 ${HADOOP_OPTS}" + when: hadoop_major_version == '3' and use_adlsg2 == True diff --git a/ansible/roles/hadoop-ha/templates/core-site.xml b/ansible/roles/hadoop-ha/templates/core-site.xml index dd54827d..d717c5c2 100644 --- a/ansible/roles/hadoop-ha/templates/core-site.xml +++ b/ansible/roles/hadoop-ha/templates/core-site.xml @@ -38,4 +38,34 @@ ha.zookeeper.quorum {{ zookeeper_connect }} +{% if use_adlsg2 == True %} + + fs.azure.account.auth.type + OAuth + + + fs.azure.account.oauth.provider.type + org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider + + + fs.azure.account.oauth2.msi.tenant + {{ azure_tenant_id}} + + + fs.azure.account.oauth2.client.id + {{ azure_client_id }} + + + fs.azure.use.upn + true + + + fs.azure.identity.transformer.service.principal.substitution.list + * + + + fs.azure.identity.transformer.service.principal.id + {{ principal_id }} + +{% endif %} diff --git a/ansible/roles/hadoop-ha/templates/mapred-site.xml b/ansible/roles/hadoop-ha/templates/mapred-site.xml index c6be0cec..c3def168 100644 --- a/ansible/roles/hadoop-ha/templates/mapred-site.xml +++ b/ansible/roles/hadoop-ha/templates/mapred-site.xml @@ -54,4 +54,10 @@ HADOOP_MAPRED_HOME={{ hadoop_home }} {% endif %} +{% if use_adlsg2 == True %} + + mapreduce.application.classpath + $HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/client/* + +{% endif %} diff --git a/ansible/roles/hadoop-ha/templates/yarn-site.xml b/ansible/roles/hadoop-ha/templates/yarn-site.xml 
index 85033a6b..eb45896f 100644 --- a/ansible/roles/hadoop-ha/templates/yarn-site.xml +++ b/ansible/roles/hadoop-ha/templates/yarn-site.xml @@ -93,4 +93,10 @@ twill.java.reserved.memory.mb {{ twill_reserve_mem_mb }} + {% if use_adlsg2 == True %} + + yarn.application.classpath + ${HADOOP_HOME}/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/lib/*,${HADOOP_HOME}/share/hadoop/common/lib/*,${HADOOP_HOME}/share/hadoop/yarn/*,${HADOOP_HOME}/share/hadoop/yarn/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/*,${HADOOP_HOME}/share/hadoop/common/*,${HADOOP_HOME}/share/hadoop/mapreduce/*,${HADOOP_HOME}/share/hadoop/mapreduce/lib/*,${HADOOP_HOME}/share/hadoop/client/* + + {% endif %} diff --git a/ansible/roles/hadoop/tasks/main.yml b/ansible/roles/hadoop/tasks/main.yml index d0219b31..a6733a9c 100644 --- a/ansible/roles/hadoop/tasks/main.yml +++ b/ansible/roles/hadoop/tasks/main.yml @@ -55,3 +55,11 @@ - name: "Create hadoop log dir" file: path={{ worker_data_dirs[0] }}/logs/hadoop state=directory +- name: Insert HADOOP_OPTIONAL_TOOLS & HADOOP_OPTS in hadoop-env.sh + blockinfile: + path: "{{ hadoop_home }}/etc/hadoop/hadoop-env.sh" + insertafter: EOF + block: | + export HADOOP_OPTIONAL_TOOLS=hadoop-azure + export HADOOP_OPTS="-Dorg.wildfly.openssl.path=/usr/lib64 ${HADOOP_OPTS}" + when: hadoop_major_version == '3' and use_adlsg2 == True diff --git a/ansible/roles/hadoop/templates/core-site.xml b/ansible/roles/hadoop/templates/core-site.xml index 56232aa7..c5f15974 100644 --- a/ansible/roles/hadoop/templates/core-site.xml +++ b/ansible/roles/hadoop/templates/core-site.xml @@ -36,4 +36,34 @@ dfs.domain.socket.path /var/lib/hadoop-hdfs/dn_socket +{% if use_adlsg2 == True %} + + fs.azure.account.auth.type + OAuth + + + fs.azure.account.oauth.provider.type + org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider + + + fs.azure.account.oauth2.msi.tenant + {{ azure_tenant_id}} + + + fs.azure.account.oauth2.client.id + {{ azure_client_id }} + + + fs.azure.use.upn + true + + + 
fs.azure.identity.transformer.service.principal.substitution.list + * + + + fs.azure.identity.transformer.service.principal.id + {{ principal_id }} + +{% endif %} diff --git a/ansible/roles/hadoop/templates/mapred-site.xml b/ansible/roles/hadoop/templates/mapred-site.xml index a95eb775..7ecf751d 100644 --- a/ansible/roles/hadoop/templates/mapred-site.xml +++ b/ansible/roles/hadoop/templates/mapred-site.xml @@ -56,4 +56,10 @@ HADOOP_MAPRED_HOME={{ hadoop_home }} {% endif %} +{% if use_adlsg2 == True %} + + mapreduce.application.classpath + $HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/client/* + +{% endif %} diff --git a/ansible/roles/hadoop/templates/yarn-site.xml b/ansible/roles/hadoop/templates/yarn-site.xml index ac62174d..847f98b0 100644 --- a/ansible/roles/hadoop/templates/yarn-site.xml +++ b/ansible/roles/hadoop/templates/yarn-site.xml @@ -82,4 +82,10 @@ twill.java.reserved.memory.mb {{ twill_reserve_mem_mb }} + {% if use_adlsg2 == True %} + + yarn.application.classpath + ${HADOOP_HOME}/share/hadoop/tools/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/lib/*,${HADOOP_HOME}/share/hadoop/common/lib/*,${HADOOP_HOME}/share/hadoop/yarn/*,${HADOOP_HOME}/share/hadoop/yarn/lib/*,${HADOOP_HOME}/share/hadoop/hdfs/*,${HADOOP_HOME}/share/hadoop/common/*,${HADOOP_HOME}/share/hadoop/mapreduce/*,${HADOOP_HOME}/share/hadoop/mapreduce/lib/*,${HADOOP_HOME}/share/hadoop/client/* + + {% endif %} diff --git a/conf/muchos.props.example b/conf/muchos.props.example index b34a4374..41cb5d5d 100644 --- a/conf/muchos.props.example +++ b/conf/muchos.props.example @@ -129,6 +129,31 @@ metrics_drive_root = var-data # Optional 
proxy VM. If not set, the first node of the cluster will be selected as the proxy. azure_proxy_host = location = westus2 +# Enable ADLS Gen2 storage configuration. Muchos parameters instance_volumes_input, instance_volumes_preferred & adls_storage_type are not required if use_adlsg2 is false. +use_adlsg2 = False +# Storage accounts can be auto-generated or manually specified. "|" is used as separator between manual and auto-generated storage account names and must be specified +# Manual and auto-generated names are mutually exclusive +# +# Specifying storage accounts manually: +# |abfss://<container-name>@<storage-account-name>.<domain-name>/<path>. Use comma to specify multiple entries +# Example:|abfss://accumulodata@shnawastore1.dfs.core.windows.net/accumulo,abfss://accumulodata@shnawastore2.dfs.core.windows.net/accumulo +# +# Specifying auto-generated storage accounts: +# <number-of-storage-accounts>,<domain-name>| +# Example: 3,dfs.core.windows.net| +instance_volumes_input = 1,dfs.core.windows.net| +# Do not update "instance_volumes_preferred", it will be populated dynamically during launch phase of muchos +instance_volumes_preferred = +# Type of storage for ADLS Gen2 storage accounts +adls_storage_type = Standard_LRS +# Specify user assigned identity name. "{{ vmss_name }}-ua-msi" will be created if value is not provided +user_assigned_identity = +# Do not update "azure_tenant_id", it will be populated dynamically during launch phase of muchos +azure_tenant_id = +# Do not update "azure_client_id", it will be populated dynamically during launch phase of muchos +azure_client_id = +# Do not update "principal_id", it will be populated dynamically during launch phase of muchos when "use_hdfs = False" +principal_id = # Optional Azure fileshare to mount on all nodes. # Path and credentials must be updated to enable this. 
#azure_fileshare_mount = /mnt/azure-fileshare diff --git a/lib/muchos/config/azure.py b/lib/muchos/config/azure.py index fe93b55f..86c584ce 100644 --- a/lib/muchos/config/azure.py +++ b/lib/muchos/config/azure.py @@ -104,4 +104,29 @@ def logs_id(self): @ansible_host_var(name='az_logs_key') @default(None) def logs_key(self): - return self.get('azure', 'az_logs_key') \ No newline at end of file + return self.get('azure', 'az_logs_key') + + @ansible_host_var(name='use_adlsg2') + @default(None) + def use_adlsg2(self): + return self.get('azure', 'use_adlsg2') + + @ansible_host_var(name='azure_tenant_id') + @default(None) + def azure_tenant_id(self): + return self.get('azure', 'azure_tenant_id') + + @ansible_host_var(name='azure_client_id') + @default(None) + def azure_client_id(self): + return self.get('azure', 'azure_client_id') + + @ansible_host_var(name='principal_id') + @default(None) + def principal_id(self): + return self.get('azure', 'principal_id') + + @ansible_host_var(name='instance_volumes_preferred') + @default(None) + def instance_volumes_preferred(self): + return self.get('azure', 'instance_volumes_preferred')