diff options
Diffstat (limited to 'roles/cuda/tasks')
-rw-r--r-- | roles/cuda/tasks/configure_apt.yml | 15 | ||||
-rw-r--r-- | roles/cuda/tasks/configure_yum.yml | 28 | ||||
-rw-r--r-- | roles/cuda/tasks/cuda_init.yml | 48 | ||||
-rw-r--r-- | roles/cuda/tasks/main.yml | 54 |
4 files changed, 145 insertions, 0 deletions
diff --git a/roles/cuda/tasks/configure_apt.yml b/roles/cuda/tasks/configure_apt.yml new file mode 100644 index 0000000..53a38a5 --- /dev/null +++ b/roles/cuda/tasks/configure_apt.yml @@ -0,0 +1,15 @@ +--- +# tasks file for ansible-role-cuda +- name: Trust packaging key for Nvidia repositories (apt) + apt_key: + data: "{{ lookup('file', 'files/nvidia_packaging_key.asc') }}" + id: "{{ cuda_packaging_key_id }}" + state: present + +- name: Configure Nvidia repository (apt) + apt_repository: + repo: "deb {{ cuda_repo_url }}/{{ cuda_repo_subfolder }}/x86_64 /" + filename: nvidia + state: present + +# vim:ft=ansible: diff --git a/roles/cuda/tasks/configure_yum.yml b/roles/cuda/tasks/configure_yum.yml new file mode 100644 index 0000000..e888468 --- /dev/null +++ b/roles/cuda/tasks/configure_yum.yml @@ -0,0 +1,28 @@ +--- +# tasks file for ansible-role-cuda +- name: Upload packaging key for Nvidia repositories + copy: + src: nvidia_packaging_key.asc + dest: "{{ cuda_rpm_key_path }}" + mode: 0644 + +- name: Trust packaging key for Nvidia repositories (rpm) + rpm_key: + key: "{{ cuda_rpm_key_path }}" + state: present + +- name: Remove trust for old Nvidia packaging key + rpm_key: + key: 5C37D3BE + state: absent + +- name: Configure Nvidia repository (yum) + yum_repository: + name: nvidia + description: Official Nvidia repository + baseurl: "{{ cuda_repo_url }}/{{ cuda_repo_subfolder }}/x86_64/" + gpgkey: "file://{{ cuda_rpm_key_path }}" + gpgcheck: yes + enabled: yes + +# vim:ft=ansible: diff --git a/roles/cuda/tasks/cuda_init.yml b/roles/cuda/tasks/cuda_init.yml new file mode 100644 index 0000000..ff54994 --- /dev/null +++ b/roles/cuda/tasks/cuda_init.yml @@ -0,0 +1,48 @@ +--- +- name: template in cuda_init.sh used during boot + template: + src: cuda_init.sh.j2 + dest: /usr/local/bin/cuda_init.sh + mode: 0755 + notify: + - Initialize the GPUs + +- name: lineinfile/make sure cuda_init.sh script is absent from rc.local + lineinfile: + dest: /etc/rc.local + insertafter: "^touch /var/lock/subsys/local" + regexp: "^/bin/bash /usr/local/bin/cuda_init.sh$" + line: "/bin/bash /usr/local/bin/cuda_init.sh" + state: absent + +- name: template in cuda_init.service systemd script + template: + src: cuda_init.service.j2 + dest: /etc/systemd/system/cuda_init.service + mode: 0644 + notify: + - reload systemd unit files + - Restart cuda_init service + when: ansible_service_mgr == "systemd" + +- name: enable the cuda_init systemd service + service: + name: cuda_init + enabled: yes + when: ansible_service_mgr == "systemd" + +- name: check if cuda_gpu_name0 ( /dev/nvidia0 ) exists + stat: + path: "{{ cuda_gpu_name0 }}" + register: reg_cuda_gpu_name0 + check_mode: no + failed_when: false + +- name: Initialize the GPUs - run cuda_init.sh if there is no /dev/nvidia0 + command: /bin/bash /usr/local/bin/cuda_init.sh + when: + - reg_cuda_gpu_name0.stat.exists is defined + - reg_cuda_gpu_name0.stat.exists == False + - cuda_init_restart_service + +# vim:ft=ansible: diff --git a/roles/cuda/tasks/main.yml b/roles/cuda/tasks/main.yml new file mode 100644 index 0000000..f292f67 --- /dev/null +++ b/roles/cuda/tasks/main.yml @@ -0,0 +1,54 @@ +--- +# tasks file for ansible-role-cuda +- name: "Gather OS specific variables" + include_vars: "{{ item }}" + with_first_found: + - "{{ ansible_distribution|lower }}-{{ ansible_distribution_version }}.yml" + - "{{ ansible_distribution|lower }}-{{ ansible_distribution_major_version }}.yml" + - "{{ ansible_distribution|lower }}.yml" + - "{{ ansible_os_family|lower }}.yml" + +- block: + - include_tasks: configure_yum.yml + when: ansible_pkg_mgr == 'yum' or ansible_pkg_mgr == 'dnf' + + - include_tasks: configure_apt.yml + when: ansible_pkg_mgr == 'apt' + + - name: Install kernel development files + package: name=kernel-devel state=present + register: result + + - name: Synchronize kernel and kernel-devel packages + package: name=kernel state=latest + when: (result | changed) + + - name: Install CUDA and related packages (1.5-2GB download, also restarts if cuda_restart_node_on_install is set to True) + package: + name: "{{ item }}" + state: present + with_items: "{{ cuda_packages }}" + register: cuda_packages_installation + notify: + - ZZ CUDA Restart server + - ZZ CUDA Wait for server to restart + + - name: Template CUDA paths to user environments + template: + src: cuda.sh.j2 + dest: /etc/profile.d/cuda.sh + mode: 0755 + when: cuda_bash_profile + + - include_tasks: cuda_init.yml + when: cuda_init == True + + # This is here because if we in the same playbook try to start slurmd without + # having run the cuda_init.sh script then slurmd doesn't start and the play fails. + # todo: reload nvidia modules/etc instead of restart + - name: flush the handlers - so that the node is rebooted after CUDA is installed and that the GPUs are initialized before we start slurm + meta: flush_handlers + + when: gpu == True + +# vim:ft=ansible: |