From a6d5c625956a5051b7bbd9fc48430b9df11084ee Mon Sep 17 00:00:00 2001 From: Michael Gugino Date: Fri, 8 Dec 2017 18:30:09 -0500 Subject: Refactor node upgrade to include less serial tasks This commit moves the pulling of images, packages, and updating config files into a non-serialized play. The serialized play is now in charge of marking unschedulable, draining, stopping and restarting services, and marking schedulable. If rpm install / container download takes 60s per host, this will save 3 hours and 10 minutes at 200 hosts per cluster and forks of 20 hosts. --- .../openshift-cluster/upgrades/upgrade_nodes.yml | 33 ++++-- roles/openshift_node/handlers/main.yml | 6 + roles/openshift_node/tasks/dnsmasq.yml | 43 ------- roles/openshift_node/tasks/dnsmasq_install.yml | 43 +++++++ roles/openshift_node/tasks/docker/upgrade.yml | 27 ----- roles/openshift_node/tasks/main.yml | 1 + roles/openshift_node/tasks/upgrade.yml | 127 +++------------------ .../tasks/upgrade/containerized_node_upgrade.yml | 11 -- roles/openshift_node/tasks/upgrade/restart.yml | 9 ++ roles/openshift_node/tasks/upgrade_pre.yml | 118 +++++++++++++++++++ 10 files changed, 218 insertions(+), 200 deletions(-) create mode 100644 roles/openshift_node/tasks/dnsmasq_install.yml delete mode 100644 roles/openshift_node/tasks/docker/upgrade.yml create mode 100644 roles/openshift_node/tasks/upgrade_pre.yml diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml index f7a85545b..a3cb1d0f9 100644 --- a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml +++ b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml @@ -1,16 +1,25 @@ --- +- name: Prepull images and rpms before doing rolling restart + hosts: oo_nodes_to_upgrade:!oo_masters_to_config + roles: + - role: openshift_facts + tasks: + - include_role: + name: openshift_node + tasks_from: upgrade_pre.yml + vars: + openshift_node_upgrade_in_progress: True + - name: Drain and upgrade nodes hosts: oo_nodes_to_upgrade:!oo_masters_to_config # This var must be set with -e on invocation, as it is not a per-host inventory var # and is evaluated early. Values such as "20%" can also be used. serial: "{{ openshift_upgrade_nodes_serial | default(1) }}" max_fail_percentage: "{{ openshift_upgrade_nodes_max_fail_percentage | default(0) }}" - + roles: + - lib_openshift + - openshift_facts pre_tasks: - - name: Load lib_openshift modules - import_role: - name: lib_openshift - # TODO: To better handle re-trying failed upgrades, it would be nice to check if the node # or docker actually needs an upgrade before proceeding. Perhaps best to save this until # we merge upgrade functionality into the base roles and a normal config.yml playbook run. @@ -33,18 +42,12 @@ retries: 60 delay: 60 - roles: - - openshift_facts post_tasks: - include_role: name: openshift_node tasks_from: upgrade.yml vars: openshift_node_upgrade_in_progress: True - - include_role: - name: openshift_excluder - vars: - r_openshift_excluder_action: enable - name: Set node schedulability oc_adm_manage_node: node: "{{ openshift.node.nodename | lower }}" @@ -55,3 +58,11 @@ register: node_schedulable until: node_schedulable|succeeded when: node_unschedulable|changed + +- name: Re-enable excluders + hosts: oo_nodes_to_upgrade:!oo_masters_to_config + tasks: + - include_role: + name: openshift_excluder + vars: + r_openshift_excluder_action: enable diff --git a/roles/openshift_node/handlers/main.yml b/roles/openshift_node/handlers/main.yml index 170a3dc6e..1d9797f84 100644 --- a/roles/openshift_node/handlers/main.yml +++ b/roles/openshift_node/handlers/main.yml @@ -4,11 +4,15 @@ name: NetworkManager state: restarted enabled: True + when: + - (not skip_node_svc_handlers | default(False) | bool) - name: restart dnsmasq systemd: name: dnsmasq state: restarted + when: + - (not skip_node_svc_handlers | default(False) | bool) - name: restart openvswitch systemd: @@ -47,3 +51,5 @@ - name: reload systemd units command: systemctl daemon-reload + when: + - (not skip_node_svc_handlers | default(False) | bool) diff --git a/roles/openshift_node/tasks/dnsmasq.yml b/roles/openshift_node/tasks/dnsmasq.yml index f210a3a21..31ca46ec0 100644 --- a/roles/openshift_node/tasks/dnsmasq.yml +++ b/roles/openshift_node/tasks/dnsmasq.yml @@ -1,43 +1,4 @@ --- -- name: Check for NetworkManager service - command: > - systemctl show NetworkManager - register: nm_show - changed_when: false - ignore_errors: True - -- name: Set fact using_network_manager - set_fact: - network_manager_active: "{{ True if 'ActiveState=active' in nm_show.stdout else False }}" - -- name: Install dnsmasq - package: name=dnsmasq state=installed - when: not openshift.common.is_atomic | bool - register: result - until: result | success - -- name: ensure origin/node directory exists - file: - state: directory - path: "{{ item }}" - owner: root - group: root - mode: '0700' - with_items: - - /etc/origin - - /etc/origin/node - -# this file is copied to /etc/dnsmasq.d/ when the node starts and is removed -# when the node stops. A dbus-message is sent to dnsmasq to add the same entries -# so that dnsmasq doesn't need to be restarted. Once we can use dnsmasq 2.77 or -# newer we can use --server-file option to update the servers dynamically and -# reload them by sending dnsmasq a SIGHUP. We write the file in case someone else -# triggers a restart of dnsmasq but not a node restart. -- name: Install node-dnsmasq.conf - template: - src: node-dnsmasq.conf.j2 - dest: /etc/origin/node/node-dnsmasq.conf - - name: Install dnsmasq configuration template: src: origin-dns.conf.j2 @@ -63,7 +24,3 @@ # Dynamic NetworkManager based dispatcher - include_tasks: dnsmasq/network-manager.yml when: network_manager_active | bool - -# Relies on ansible in order to configure static config -- include_tasks: dnsmasq/no-network-manager.yml - when: not network_manager_active | bool diff --git a/roles/openshift_node/tasks/dnsmasq_install.yml b/roles/openshift_node/tasks/dnsmasq_install.yml new file mode 100644 index 000000000..9f66bf12d --- /dev/null +++ b/roles/openshift_node/tasks/dnsmasq_install.yml @@ -0,0 +1,43 @@ +--- +- name: Check for NetworkManager service + command: > + systemctl show NetworkManager + register: nm_show + changed_when: false + ignore_errors: True + +- name: Set fact using_network_manager + set_fact: + network_manager_active: "{{ True if 'ActiveState=active' in nm_show.stdout else False }}" + +- name: Install dnsmasq + package: name=dnsmasq state=installed + when: not openshift.common.is_atomic | bool + register: result + until: result | success + +- name: ensure origin/node directory exists + file: + state: directory + path: "{{ item }}" + owner: root + group: root + mode: '0700' + with_items: + - /etc/origin + - /etc/origin/node + +# this file is copied to /etc/dnsmasq.d/ when the node starts and is removed +# when the node stops. A dbus-message is sent to dnsmasq to add the same entries +# so that dnsmasq doesn't need to be restarted. Once we can use dnsmasq 2.77 or +# newer we can use --server-file option to update the servers dynamically and +# reload them by sending dnsmasq a SIGHUP. We write the file in case someone else +# triggers a restart of dnsmasq but not a node restart. +- name: Install node-dnsmasq.conf + template: + src: node-dnsmasq.conf.j2 + dest: /etc/origin/node/node-dnsmasq.conf + +# Relies on ansible in order to configure static config +- include_tasks: dnsmasq/no-network-manager.yml + when: not network_manager_active | bool diff --git a/roles/openshift_node/tasks/docker/upgrade.yml b/roles/openshift_node/tasks/docker/upgrade.yml deleted file mode 100644 index bbe9c71f5..000000000 --- a/roles/openshift_node/tasks/docker/upgrade.yml +++ /dev/null @@ -1,27 +0,0 @@ ---- -# input variables: -# - openshift_service_type -# - openshift.common.is_containerized -# - docker_version -# - skip_docker_restart - -- name: Check Docker image count - shell: "docker images -aq | wc -l" - register: docker_image_count - -- debug: var=docker_image_count.stdout - -- service: - name: docker - state: stopped - register: l_openshift_node_upgrade_docker_stop_result - until: not l_openshift_node_upgrade_docker_stop_result | failed - retries: 3 - delay: 30 - -- name: Upgrade Docker - package: name=docker{{ '-' + docker_version }} state=present - register: result - until: result | success - -# starting docker happens back in ../main.yml where it calls ../restart.yml diff --git a/roles/openshift_node/tasks/main.yml b/roles/openshift_node/tasks/main.yml index 32c5f495f..946deb4d3 100644 --- a/roles/openshift_node/tasks/main.yml +++ b/roles/openshift_node/tasks/main.yml @@ -6,6 +6,7 @@ - deployment_type == 'openshift-enterprise' - not openshift_use_crio +- include_tasks: dnsmasq_install.yml - include_tasks: dnsmasq.yml - name: setup firewall diff --git a/roles/openshift_node/tasks/upgrade.yml b/roles/openshift_node/tasks/upgrade.yml index 9f333645a..ede73f22c 100644 --- a/roles/openshift_node/tasks/upgrade.yml +++ b/roles/openshift_node/tasks/upgrade.yml @@ -10,8 +10,6 @@ # tasks file for openshift_node_upgrade -- include_tasks: registry_auth.yml - - name: Stop node and openvswitch services service: name: "{{ item }}" @@ -21,58 +19,6 @@ - openvswitch failed_when: false -- name: Stop additional containerized services - service: - name: "{{ item }}" - state: stopped - with_items: - - "{{ openshift_service_type }}-master-controllers" - - "{{ openshift_service_type }}-master-api" - - etcd_container - failed_when: false - when: openshift.common.is_containerized | bool - -- name: Pre-pull node image - command: > - docker pull {{ openshift.node.node_image }}:{{ openshift_image_tag }} - register: pull_result - changed_when: "'Downloaded newer image' in pull_result.stdout" - when: openshift.common.is_containerized | bool - -- name: Pre-pull openvswitch image - command: > - docker pull {{ openshift.node.ovs_image }}:{{ openshift_image_tag }} - register: pull_result - changed_when: "'Downloaded newer image' in pull_result.stdout" - when: - - openshift.common.is_containerized | bool - - openshift_use_openshift_sdn | bool - -- include_tasks: docker/upgrade.yml - vars: - # We will restart Docker ourselves after everything is ready: - skip_docker_restart: True - when: - - l_docker_upgrade is defined - - l_docker_upgrade | bool - -- include_tasks: "{{ node_config_hook }}" - when: node_config_hook is defined - -- include_tasks: upgrade/rpm_upgrade.yml - vars: - component: "node" - openshift_version: "{{ openshift_pkg_version | default('') }}" - when: not openshift.common.is_containerized | bool - -- name: Remove obsolete docker-sdn-ovs.conf - file: - path: "/etc/systemd/system/docker.service.d/docker-sdn-ovs.conf" - state: absent - -- include_tasks: upgrade/containerized_node_upgrade.yml - when: openshift.common.is_containerized | bool - - name: Ensure containerized services stopped before Docker restart service: name: "{{ item }}" @@ -86,6 +32,17 @@ failed_when: false when: openshift.common.is_containerized | bool +- service: + name: docker + state: stopped + register: l_openshift_node_upgrade_docker_stop_result + until: not l_openshift_node_upgrade_docker_stop_result | failed + retries: 3 + delay: 30 + when: + - l_docker_upgrade is defined + - l_docker_upgrade | bool + - name: Stop rpm based services service: name: "{{ item }}" @@ -96,56 +53,19 @@ failed_when: false when: not openshift.common.is_containerized | bool +- include_tasks: "{{ node_config_hook }}" + when: node_config_hook is defined + # https://bugzilla.redhat.com/show_bug.cgi?id=1513054 - name: Clean up dockershim data file: path: "/var/lib/dockershim/sandbox/" state: absent -- name: Upgrade openvswitch - package: - name: openvswitch - state: latest - when: not openshift.common.is_containerized | bool - register: result - until: result | success - -- name: Update oreg value - yedit: - src: "{{ openshift.common.config_base }}/node/node-config.yaml" - key: 'imageConfig.format' - value: "{{ oreg_url | default(oreg_url_node) }}" - when: oreg_url is defined or oreg_url_node is defined - -# https://docs.openshift.com/container-platform/3.4/admin_guide/overcommit.html#disabling-swap-memory -- name: Check for swap usage - command: grep "^[^#].*swap" /etc/fstab - # grep: match any lines which don't begin with '#' and contain 'swap' - changed_when: false - failed_when: false - register: swap_result - - # Disable Swap Block -- block: - - - name: Disable swap - command: swapoff --all - - - name: Remove swap entries from /etc/fstab - replace: - dest: /etc/fstab - regexp: '(^[^#].*swap.*)' - replace: '# \1' - backup: yes - - - name: Add notice about disabling swap - lineinfile: - dest: /etc/fstab - line: '# OpenShift-Ansible Installer disabled swap per overcommit guidelines' - state: present - +- name: Disable swap + command: swapoff --all when: - - swap_result.stdout_lines | length > 0 + - openshift_node_upgrade_swap_result | default(False) | bool - openshift_disable_swap | default(true) | bool # End Disable Swap Block @@ -155,17 +75,6 @@ - ansible_selinux is defined - ansible_selinux.status == 'enabled' -- name: Apply 3.6 dns config changes - yedit: - src: /etc/origin/node/node-config.yaml - key: "{{ item.key }}" - value: "{{ item.value }}" - with_items: - - key: "dnsBindAddress" - value: "127.0.0.1:53" - - key: "dnsRecursiveResolvConf" - value: "/etc/origin/node/resolv.conf" - # Restart all services - include_tasks: upgrade/restart.yml @@ -182,3 +91,5 @@ delay: 5 - include_tasks: dnsmasq.yml + +- meta: flush_handlers diff --git a/roles/openshift_node/tasks/upgrade/containerized_node_upgrade.yml b/roles/openshift_node/tasks/upgrade/containerized_node_upgrade.yml index 245de60a7..8e547351b 100644 --- a/roles/openshift_node/tasks/upgrade/containerized_node_upgrade.yml +++ b/roles/openshift_node/tasks/upgrade/containerized_node_upgrade.yml @@ -1,14 +1,3 @@ --- -# This is a hack to allow us to use systemd_units.yml, but skip the handlers which -# restart services. We will unconditionally restart all containerized services -# because we have to unconditionally restart Docker: -- set_fact: - skip_node_svc_handlers: True - - name: Update systemd units include_tasks: ../systemd_units.yml - -# This is a no-op because of skip_node_svc_handlers, but lets us trigger it before end of -# play when the node has already been marked schedulable again. (this would look strange -# in logs otherwise) -- meta: flush_handlers diff --git a/roles/openshift_node/tasks/upgrade/restart.yml b/roles/openshift_node/tasks/upgrade/restart.yml index 65c301783..717cfa712 100644 --- a/roles/openshift_node/tasks/upgrade/restart.yml +++ b/roles/openshift_node/tasks/upgrade/restart.yml @@ -13,6 +13,15 @@ - name: Reload systemd to ensure latest unit files command: systemctl daemon-reload +- name: Restart support services + service: + name: "{{ item }}" + state: restarted + enabled: True + with_items: + - NetworkManager + - dnsmasq + - name: Restart container runtime service: name: "{{ openshift_docker_service_name }}" diff --git a/roles/openshift_node/tasks/upgrade_pre.yml b/roles/openshift_node/tasks/upgrade_pre.yml new file mode 100644 index 000000000..5d7961a24 --- /dev/null +++ b/roles/openshift_node/tasks/upgrade_pre.yml @@ -0,0 +1,118 @@ +--- +# This is a hack to allow us to update various components without restarting +# services. This will persist into the upgrade play as well, so everything +# needs to be restarted by hand. +- set_fact: + skip_node_svc_handlers: True + +- include_tasks: registry_auth.yml + +- name: Check Docker image count + shell: "docker images -aq | wc -l" + register: docker_image_count + when: + - l_docker_upgrade is defined + - l_docker_upgrade | bool + +- debug: var=docker_image_count.stdout + when: + - l_docker_upgrade is defined + - l_docker_upgrade | bool + +- name: Upgrade Docker + package: name=docker{{ '-' + docker_version }} state=present + register: result + until: result | success + when: + - l_docker_upgrade is defined + - l_docker_upgrade | bool + +- name: Pre-pull node image + command: > + docker pull {{ openshift.node.node_image }}:{{ openshift_image_tag }} + register: pull_result + changed_when: "'Downloaded newer image' in pull_result.stdout" + when: openshift.common.is_containerized | bool + +- name: Pre-pull openvswitch image + command: > + docker pull {{ openshift.node.ovs_image }}:{{ openshift_image_tag }} + register: pull_result + changed_when: "'Downloaded newer image' in pull_result.stdout" + when: + - openshift.common.is_containerized | bool + - openshift_use_openshift_sdn | bool + +- include_tasks: upgrade/rpm_upgrade.yml + vars: + component: "node" + openshift_version: "{{ openshift_pkg_version | default('') }}" + when: not openshift.common.is_containerized | bool + +- name: Remove obsolete docker-sdn-ovs.conf + file: + path: "/etc/systemd/system/docker.service.d/docker-sdn-ovs.conf" + state: absent + +- include_tasks: upgrade/containerized_node_upgrade.yml + when: openshift.common.is_containerized | bool + +- name: Upgrade openvswitch + package: + name: openvswitch + state: latest + when: not openshift.common.is_containerized | bool + register: result + until: result | success + +- name: Update oreg value + yedit: + src: "{{ openshift.common.config_base }}/node/node-config.yaml" + key: 'imageConfig.format' + value: "{{ oreg_url | default(oreg_url_node) }}" + when: oreg_url is defined or oreg_url_node is defined + +# https://docs.openshift.com/container-platform/3.4/admin_guide/overcommit.html#disabling-swap-memory +- name: Check for swap usage + command: grep "^[^#].*swap" /etc/fstab + # grep: match any lines which don't begin with '#' and contain 'swap' + changed_when: false + failed_when: false + register: swap_result + +# Set this fact here so we can use it during the next play, which is serial. +- name: set_fact swap_result + set_fact: + openshift_node_upgrade_swap_result: "{{ swap_result.stdout_lines | length > 0 | bool }}" + +# Disable Swap Block (pre) +- block: + - name: Remove swap entries from /etc/fstab + replace: + dest: /etc/fstab + regexp: '(^[^#].*swap.*)' + replace: '# \1' + backup: yes + + - name: Add notice about disabling swap + lineinfile: + dest: /etc/fstab + line: '# OpenShift-Ansible Installer disabled swap per overcommit guidelines' + state: present + when: + - openshift_node_upgrade_swap_result | default(False) | bool + - openshift_disable_swap | default(true) | bool + # End Disable Swap Block + +- name: Apply 3.6 dns config changes + yedit: + src: /etc/origin/node/node-config.yaml + key: "{{ item.key }}" + value: "{{ item.value }}" + with_items: + - key: "dnsBindAddress" + value: "127.0.0.1:53" + - key: "dnsRecursiveResolvConf" + value: "/etc/origin/node/resolv.conf" + +- include_tasks: dnsmasq_install.yml -- cgit v1.2.3