Bugfixes and improved version of login checks #83

Merged
Merged 7 commits on Mar 21, 2019
1 change: 0 additions & 1 deletion group_vars/all/vars.yml
@@ -60,7 +60,6 @@ auth_users:
uid: 1008
pub_keys: |
ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBDvx1ebTndL/HitD30uNpvESXWUAxT3j0e0CzrBUZ8fHDv+vZTbWBRtWbnLgCnVDPa3GclA1lpnvJD9JBjBhUa8= ger@ger-pc

robin:
comment: 'Robin Teeninga'
uid: 1009
2 changes: 1 addition & 1 deletion group_vars/talos-cluster/vars.yml
@@ -12,7 +12,7 @@ vcompute_real_memory: 7822
vcompute_max_cpus_per_node: "{{ vcompute_sockets * vcompute_cores_per_socket - 2 }}"
vcompute_max_mem_per_node: "{{ vcompute_real_memory - vcompute_sockets * vcompute_cores_per_socket * 512 }}"
vcompute_local_disk: 0
vcompute_features: 'tmp02'
vcompute_features: 'tmp08'
vcompute_ethernet_interfaces:
- 'eth0'
- 'eth1'
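Side note on the vcompute_features change above: the feature string tmp08 is advertised by the compute nodes and matches the LFS name that job_submit.lua (further below) appends to a job's features. A minimal sketch, assuming a node actually advertises tmp08 and using a hypothetical job script my_job.sh, of how a user could request that feature explicitly:

# Request nodes that advertise the 'tmp08' feature; my_job.sh is a placeholder job script.
sbatch --constraint=tmp08 my_job.sh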
9 changes: 6 additions & 3 deletions roles/logins/files/login_checks.sh
@@ -85,9 +85,12 @@ login_actions () {
# but in the first case there are no SLURM related environment variables defined.
#

# SOURCE_HPC_ENV variable checking disabled (it is not set ) Egon 30-10-2018
#if [ ${TERM} == 'dumb' ] && [ -z ${SOURCE_HPC_ENV} ]; then
if [ ${TERM} == 'dumb' ]; then
#
# ToDo: fix this. As of CentOS 7.x, interactive sessions that eventually report ${TERM} == 'bash'
# still report ${TERM} == 'dumb' at the point where this script is executed in the PAM stack :(.
# This makes it impossible to tell an SFTP session apart from a Bash session.
#
if [ ${TERM} == 'dumb' ] && [ -z "${SOURCE_HPC_ENV:-}" ]; then
$LOGGER "debug: exiting because of dumb terminal"
exit 0
fi
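A minimal sketch of a slightly more defensive form of the guard added above, assuming the same variables; quoting ${TERM}, using the POSIX '=' operator and defaulting unset variables keeps the test from failing under 'set -u' or a non-bash /bin/sh:

if [ "${TERM:-}" = 'dumb' ] && [ -z "${SOURCE_HPC_ENV:-}" ]; then
    $LOGGER "debug: exiting because of dumb terminal"
    exit 0
fi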
12 changes: 11 additions & 1 deletion roles/logins/tasks/main.yml
@@ -63,7 +63,17 @@
dest: "/etc/pam-script.d/{{ item }}"
owner: root
group: root
state: link
#
# Login checks currently disabled,
# because error handling/reporting no longer works on CentOS >= 7.x,
# due to changes in the PAM stack.
# Login checks were only used to create Slurm accounts in the Slurm accounting DB.
# This functionality has been relocated to the Slurm job_submit.lua plugin,
# which now automatically creates accounts, users and associations of Slurm users to Slurm accounts
# upon job submission when they do not already exist.
#
# state: link
state: absent
with_items:
- login_checks.sh_ses_open
when: inventory_hostname in groups['cluster']
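The relocation described in the comment above only works when the Lua job submit plugin is actually enabled. A rough check, assuming the common /etc/slurm/slurm.conf location (site configs may differ):

# The relocated account creation requires the Lua job_submit plugin.
grep -i '^JobSubmitPlugins' /etc/slurm/slurm.conf   # expected: JobSubmitPlugins=lua
# Confirm the old PAM hook is gone after this task ran.
ls -l /etc/pam-script.d/login_checks.sh_ses_open 2>/dev/null || echo 'login_checks.sh_ses_open removed'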
4 changes: 2 additions & 2 deletions roles/slurm-client/tasks/main.yml
@@ -67,9 +67,9 @@
group: root
mode: '0750'
- name: /var/spool/slurmd
owner: slurm
owner: root
group: root
mode: '0750'
mode: '0755'

- name: Deploy slurm.conf
template:
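For reference, a shell sketch (not part of the playbook) of the /var/spool/slurmd change above; slurmd normally runs as root on the compute nodes, so the spool directory no longer needs to be owned by the slurm user:

# Create the slurmd spool directory with the new ownership and mode from the task above.
install -d -o root -g root -m 0755 /var/spool/slurmd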
149 changes: 122 additions & 27 deletions roles/slurm/files/job_submit.lua
@@ -51,12 +51,112 @@ QOS_TIME_LIMITS = {
--
--DEFAULT_WALLTIME = '1'

--
-- Check if the user submitting the job is associated to a Slurm account in the Slurm accounting database and
-- create the relevant Slurm account and/or Slurm user and/or association if it does not already exist.
--
function ensure_user_has_slurm_association(uid, user, group)
--
-- Skip root user.
--
if uid == 0 then
return true
end

slurm.log_debug("Checking assoc for user %s (uid=%u) in account for group %s...", user, uid, group)
if association_exists(user, group) then
slurm.log_debug("Association of user %s to account %s already exists.", user, group)
return true
else
if account_exists(group) then
slurm.log_debug("Account %s already exists.", group)
else
slurm.log_info("Account %s does not exist; creating one...", group)
if not create_account(group) then
return false
end
end
slurm.log_info("Association of user %s to account %s does not exist; creating one...", user, group)
if not create_association(user,group) then
return false
end
end
return true
end

function account_exists(group)
--
-- Unfortunately, filehandles returned by io.popen() don't have a way to return their exit statuses in Lua <= 5.2.
-- Should be reasonably safe here, since if we erroneously conclude the association doesn't exist,
-- then we'll just try to add it.
-- http://lua-users.org/lists/lua-l/2012-01/msg00364.html
--
local query = io.popen(string.format(
"sacctmgr --parsable2 --noheader list accounts format=account account='%s'", group))
for line in query:lines() do
if line == group then
return true
end
end
return false
end

function create_account(group)
local retval = os.execute(string.format(
"sacctmgr -i create account '%s' descr=scientists org=various parent=users fairshare=parent", group))
if retval ~= 0 then
slurm.log_error("Failed to create account %s (exit status = %d).", group, retval)
slurm.log_user("Failed to create account %s (exit status = %d). Contact an admin.", group, retval)
return false
else
slurm.log_info("Created account for group %s.", group)
return true
end
end

function association_exists(user, group)
--
-- Unfortunately, filehandles returned by io.popen() don't have a way to return their exit statuses in Lua <= 5.2.
-- Should be reasonably safe here, since if we erroneously conclude the association doesn't exist,
-- then we'll just try to add it.
-- http://lua-users.org/lists/lua-l/2012-01/msg00364.html
--
local query = io.popen(string.format(
"sacctmgr --parsable2 --noheader list associations format=user,account user='%s' account='%s'", user, group))
for line in query:lines() do
if line == user .. '|' .. group then
return true
end
end
return false
end

function create_association(user,group)
local retval = os.execute(string.format(
"sacctmgr -i create user name='%s' account='%s' fairshare=parent", user, group))
if retval ~= 0 then
slurm.log_error("Failed to create association of user %s to account %s (exit status = %d).", user, group, retval)
slurm.log_user("Failed to create association of user %s to account %s (exit status = %d). Contact an admin.", user, group, retval)
return false
else
slurm.log_info("Created association of user %s to account %s.", user, group)
return true
end
end

function slurm_job_submit(job_desc, part_list, submit_uid)
--
-- Get details for the user who is trying to submit a job.
--
submit_user = posix.getpasswd(submit_uid)

--
-- Force jobs to share nodes when they don't consume all resources on a node.
--
if job_desc.shared == 0 then
job_desc.shared = 1
end

--
-- Check if the job does have a time limit specified.
-- For some reason (bug?), the nil value is passed as 4294967294.
@@ -87,6 +187,8 @@ function slurm_job_submit(job_desc, part_list, submit_uid)
--slurm.log_debug("Path to job *.err = %s.", tostring(job_desc.std_err))
--slurm.log_debug("Job's working dir = %s.", tostring(job_desc.work_dir))
local job_metadata = {job_desc.std_out, job_desc.std_err, job_desc.work_dir}
local group = nil
local lfs = nil
for inx,job_metadata_value in ipairs(job_metadata) do
if string.match(tostring(job_metadata_value), '^/home/') then
slurm.log_error(
@@ -99,31 +201,20 @@ function slurm_job_submit(job_desc, part_list, submit_uid)
"Rejecting job named %s from user %s (uid=%u).", tostring(job_desc.name), tostring(submit_user.name), job_desc.user_id)
return slurm.ERROR
end
local entitlement, group, lfs = string.match(tostring(job_metadata_value), '^/groups/([^/-]+)-([^/]+)/(tmp%d%d)/?')
if lfs == nil then
-- Temporary workaround for tmp02, which uses a symlink in /groups/..., that is resolved to the physical path by SLURM.
entitlement, group, lfs = string.match(tostring(job_metadata_value), '^/target/gpfs2/groups/([^/-]+)-([^/]+)/(tmp%d%d)/?')
end
if entitlement ~= nil and group ~= nill and lfs ~= nil then
slurm.log_debug("Found entitlement '%s' and LFS '%s' in job's metadata.", tostring(entitlement), tostring(lfs))
group, lfs = string.match(tostring(job_metadata_value), '^/groups/([^/]+)/(tmp%d%d)/?')
if group ~= nil and lfs ~= nil then
slurm.log_debug("Found group '%s' and LFS '%s' in job's metadata.", tostring(group), tostring(lfs))
if job_desc.features == nil or job_desc.features == '' then
job_desc.features = entitlement .. '&' .. lfs
slurm.log_debug("Job had no features yet; Assigned entitlement and LFS as first features: %s.", tostring(job_desc.features))
job_desc.features = lfs
slurm.log_debug("Job had no features yet; Assigned LFS as first feature: %s.", tostring(job_desc.features))
else
if not string.match(tostring(job_desc.features), entitlement) then
job_desc.features = job_desc.features .. '&' .. entitlement
slurm.log_debug("Appended entitlement %s to job's features.", tostring(entitlement))
else
slurm.log_debug("Job's features already contained entitlement %s.", tostring(entitlement))
end
if not string.match(tostring(job_desc.features), lfs) then
job_desc.features = job_desc.features .. '&' .. lfs
slurm.log_debug("Appended LFS %s to job's features.", tostring(lfs))
else
slurm.log_debug("Job's features already contained LFS %s.", tostring(lfs))
end
end
slurm.log_info("Job's features now contains: %s.", tostring(job_desc.features))
else
slurm.log_error(
"Job's working dir, *.err file or *.out file is not located in /groups/${group}/tmp*/...\n" ..
Expand All @@ -138,6 +229,20 @@ function slurm_job_submit(job_desc, part_list, submit_uid)
return slurm.ERROR
end
end
slurm.log_debug("Job's features contains: %s.", tostring(job_desc.features))
--
-- Check if the user submitting the job is associated to a Slurm account in the Slurm accounting database and
-- create the relevant Slurm account and/or Slurm user and/or association if it does not already exist.
-- Note: the group that was found last while parsing the job_metadata above is used as the Slurm account.
--
if not ensure_user_has_slurm_association(submit_uid, tostring(submit_user.name), tostring(group)) then
slurm.log_error("Failed to create association in the Slurm accounting database for user %s in account/group %s", tostring(submit_user.name), tostring(group))
slurm.log_error("Rejecting job named %s from user %s (uid=%u).", tostring(job_desc.name), tostring(submit_user.name), job_desc.user_id)
slurm.log_user(
"Failed to create association in the Slurm accounting database. Contact an admin.\n" ..
"Rejecting job named %s from user %s (uid=%u).", tostring(job_desc.name), tostring(submit_user.name), job_desc.user_id)
return slurm.ERROR
end

--
-- Process final list of features:
Expand All @@ -161,7 +266,7 @@ function slurm_job_submit(job_desc, part_list, submit_uid)
job_desc.qos = 'ds'
end
end

--
-- Make sure we have a sanity checked base-QoS.
--
@@ -218,16 +323,6 @@ function slurm_job_submit(job_desc, part_list, submit_uid)
slurm.log_info("Assigned QoS %s to job named %s from user %s (uid=%u).", new_qos, job_desc.name, tostring(submit_user.name), job_desc.user_id)
end

--
-- Check if the user submitting the job is associated to a Slurm account in the Slurm accounting database and
-- create the relevant Slurm account and/or Slurm user and/or association if it does not already exist.
-- Skip this check for the root user.
--
if job_desc.user_id ~= 0 then
--submit_user_primary_group = posix.getgroup(submit_user.gid).name
--ensure_assoc_exists(submit_user.name, entitlement .. '-' .. group)
end

return slurm.SUCCESS

end
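The Lua helpers above shell out to sacctmgr; a sketch of the equivalent manual commands can help when debugging the plugin (examplegroup and exampleuser are placeholders):

# Queries mirroring account_exists() and association_exists().
sacctmgr --parsable2 --noheader list accounts format=account account='examplegroup'
sacctmgr --parsable2 --noheader list associations format=user,account user='exampleuser' account='examplegroup'
# Creation commands mirroring create_account() and create_association().
sacctmgr -i create account 'examplegroup' descr=scientists org=various parent=users fairshare=parent
sacctmgr -i create user name='exampleuser' account='examplegroup' fairshare=parent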
2 changes: 1 addition & 1 deletion roles/slurm/tasks/main.yml
@@ -162,7 +162,7 @@
- name: Make services reload their configs.
command: systemctl daemon-reload

- name: Make sure servcies are started.
- name: Make sure services are started.
systemd:
name: "{{item}}"
state: restarted
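A rough manual equivalent of the two tasks above; the actual service list is not visible in this hunk, so slurmd is used here purely as an example:

# Reload unit files, then restart a Slurm service by hand.
systemctl daemon-reload
systemctl restart slurmd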