summaryrefslogtreecommitdiffstats
path: root/roles/cuda/tasks
diff options
context:
space:
mode:
Diffstat (limited to 'roles/cuda/tasks')
-rw-r--r--roles/cuda/tasks/configure_apt.yml15
-rw-r--r--roles/cuda/tasks/configure_yum.yml28
-rw-r--r--roles/cuda/tasks/cuda_init.yml48
-rw-r--r--roles/cuda/tasks/main.yml54
4 files changed, 145 insertions, 0 deletions
diff --git a/roles/cuda/tasks/configure_apt.yml b/roles/cuda/tasks/configure_apt.yml
new file mode 100644
index 0000000..53a38a5
--- /dev/null
+++ b/roles/cuda/tasks/configure_apt.yml
@@ -0,0 +1,15 @@
+---
+# tasks file for ansible-role-cuda
+- name: Trust packaging key for Nvidia repositories (apt)
+ apt_key:
+ data: "{{ lookup('file', 'files/nvidia_packaging_key.asc') }}"
+ id: "{{ cuda_packaging_key_id }}"
+ state: present
+
+- name: Configure Nvidia repository (apt)
+ apt_repository:
+ repo: "deb {{ cuda_repo_url }}/{{ cuda_repo_subfolder }}/x86_64 /"
+ filename: nvidia
+ state: present
+
+# vim:ft=ansible:
diff --git a/roles/cuda/tasks/configure_yum.yml b/roles/cuda/tasks/configure_yum.yml
new file mode 100644
index 0000000..e888468
--- /dev/null
+++ b/roles/cuda/tasks/configure_yum.yml
@@ -0,0 +1,28 @@
+---
+# tasks file for ansible-role-cuda
+- name: Upload packaging key for Nvidia repositories
+ copy:
+ src: nvidia_packaging_key.asc
+ dest: "{{ cuda_rpm_key_path }}"
+ mode: 0644
+
+- name: Trust packaging key for Nvidia repositories (rpm)
+ rpm_key:
+ key: "{{ cuda_rpm_key_path }}"
+ state: present
+
+- name: Remove trust for old Nvidia packaging key
+ rpm_key:
+ key: 5C37D3BE
+ state: absent
+
+- name: Configure Nvidia repository (yum)
+ yum_repository:
+ name: nvidia
+ description: Official Nvidia repository
+ baseurl: "{{ cuda_repo_url }}/{{ cuda_repo_subfolder }}/x86_64/"
+ gpgkey: "file://{{ cuda_rpm_key_path }}"
+ gpgcheck: yes
+ enabled: yes
+
+# vim:ft=ansible:
diff --git a/roles/cuda/tasks/cuda_init.yml b/roles/cuda/tasks/cuda_init.yml
new file mode 100644
index 0000000..ff54994
--- /dev/null
+++ b/roles/cuda/tasks/cuda_init.yml
@@ -0,0 +1,48 @@
+---
+- name: template in cuda_init.sh used during boot
+ template:
+ src: cuda_init.sh.j2
+ dest: /usr/local/bin/cuda_init.sh
+ mode: 0755
+ notify:
+ - Initialize the GPUs
+
+- name: lineinfile/make sure cuda_init.sh script is absent from rc.local
+ lineinfile:
+ dest: /etc/rc.local
+ insertafter: "^touch /var/lock/subsys/local"
+ regexp: "^/bin/bash /usr/local/bin/cuda_init.sh$"
+ line: "/bin/bash /usr/local/bin/cuda_init.sh"
+ state: absent
+
+- name: template in cuda_init.service systemd script
+ template:
+ src: cuda_init.service.j2
+ dest: /etc/systemd/system/cuda_init.service
+ mode: 0644
+ notify:
+ - reload systemd unit files
+ - Restart cuda_init service
+ when: ansible_service_mgr == "systemd"
+
+- name: enable the cuda_init systemd service
+ service:
+ name: cuda_init
+ enabled: yes
+ when: ansible_service_mgr == "systemd"
+
+- name: check if cuda_gpu_name0 ( /dev/nvidia0 ) exists
+ stat:
+ path: "{{ cuda_gpu_name0 }}"
+ register: reg_cuda_gpu_name0
+ check_mode: no
+ failed_when: false
+
+- name: Initialize the GPUs - run cuda_init.sh if there is no /dev/nvidia0
+ command: /bin/bash /usr/local/bin/cuda_init.sh
+ when:
+ - reg_cuda_gpu_name0.stat.exists is defined
+ - reg_cuda_gpu_name0.stat.exists == False
+ - cuda_init_restart_service
+
+# vim:ft=ansible:
diff --git a/roles/cuda/tasks/main.yml b/roles/cuda/tasks/main.yml
new file mode 100644
index 0000000..f292f67
--- /dev/null
+++ b/roles/cuda/tasks/main.yml
@@ -0,0 +1,54 @@
+---
+# tasks file for ansible-role-cuda
+- name: "Gather OS specific variables"
+ include_vars: "{{ item }}"
+ with_first_found:
+ - "{{ ansible_distribution|lower }}-{{ ansible_distribution_version }}.yml"
+ - "{{ ansible_distribution|lower }}-{{ ansible_distribution_major_version }}.yml"
+ - "{{ ansible_distribution|lower }}.yml"
+ - "{{ ansible_os_family|lower }}.yml"
+
+- block:
+ - include_tasks: configure_yum.yml
+ when: ansible_pkg_mgr == 'yum' or ansible_pkg_mgr == 'dnf'
+
+ - include_tasks: configure_apt.yml
+ when: ansible_pkg_mgr == 'apt'
+
+ - name: Install kernel development files
+ package: name=kernel-devel state=present
+ register: result
+
+ - name: Synchronize kernel and kernel-devel packages
+ package: name=kernel state=latest
+ when: (result | changed)
+
+ - name: Install CUDA and related packages (1.5-2GB download, also restarts if cuda_restart_node_on_install is set to True)
+ package:
+ name: "{{ item }}"
+ state: present
+ with_items: "{{ cuda_packages }}"
+ register: cuda_packages_installation
+ notify:
+ - ZZ CUDA Restart server
+ - ZZ CUDA Wait for server to restart
+
+ - name: Template CUDA paths to user environments
+ template:
+ src: cuda.sh.j2
+ dest: /etc/profile.d/cuda.sh
+ mode: 0755
+ when: cuda_bash_profile
+
+ - include_tasks: cuda_init.yml
+ when: cuda_init == True
+
+ # This is here because if we in the same playbook try to start slurmd without
+ # having run the cuda_init.sh script then slurmd doesn't start and the play fails.
+ # todo: reload nvidia modules/etc instead of restart
+ - name: flush the handlers - so that the node is rebooted after CUDA is installed and that the GPUs are initialized before we start slurm
+ meta: flush_handlers
+
+ when: gpu == True
+
+# vim:ft=ansible: