10 files changed, 325 insertions, 0 deletions
diff --git a/playbooks/common/openshift-cluster/upgrades/etcd/backup.yml b/playbooks/common/openshift-cluster/upgrades/etcd/backup.yml
new file mode 100644
index 000000000..d0eadf1fc
--- /dev/null
+++ b/playbooks/common/openshift-cluster/upgrades/etcd/backup.yml
@@ -0,0 +1,94 @@
+---
+- name: Backup etcd
+  hosts: etcd_hosts_to_backup
+  vars:
+    embedded_etcd: "{{ groups.oo_etcd_to_config | default([]) | length == 0 }}"  # true when oo_etcd_to_config is empty or undefined
+    timestamp: "{{ lookup('pipe', 'date +%Y%m%d%H%M%S') }}"  # YYYYmmddHHMMSS suffix of the backup directory name
+    etcdctl_command: "{{ 'etcdctl' if (not openshift.common.is_containerized | bool) or (embedded_etcd | bool) else 'docker exec etcd_container etcdctl' }}"  # host binary unless etcd runs in etcd_container; | bool guards against string-valued facts
+  roles:
+  - openshift_facts
+  tasks:
+  # Record the etcd role for this host in openshift_facts when it is not there yet.
+  - openshift_facts:
+      role: etcd
+      local_facts: {}
+    when: "'etcd' not in openshift"
+
+  - stat:
+      path: /var/lib/openshift
+    register: var_lib_openshift
+  - stat:
+      path: /var/lib/origin
+    register: var_lib_origin
+  - name: Create origin symlink if necessary
+    file: {src: /var/lib/openshift/, dest: /var/lib/origin, state: link}
+    when: var_lib_openshift.stat.exists and not var_lib_origin.stat.exists
+
+  # TODO: replace shell module with command and update later checks
+  # Backups are assumed to always go to the data dir, so free space is measured there.
+  - name: Check available disk space for etcd backup
+    shell: df --output=avail -k {{ openshift.common.data_dir }} | tail -n 1
+    register: avail_disk  # stdout: available space in 1K blocks (df -k)
+
+  # TODO: replace shell module with command and update later checks
+  - name: Check current embedded etcd disk usage
+    shell: du -k {{ openshift.etcd.etcd_data_dir }} | tail -n 1 | cut -f1
+    register: etcd_disk_usage  # stdout: first field of the last `du -k` line, in 1K blocks
+    when: embedded_etcd | bool
+
+  - name: Abort if insufficient disk space for etcd backup
+    fail:
+      msg: >
+        {{ etcd_disk_usage.stdout }} Kb disk space required for etcd backup,
+        {{ avail_disk.stdout }} Kb available.
+    when: (embedded_etcd | bool) and (etcd_disk_usage.stdout|int > avail_disk.stdout|int)  # usage is only measured for embedded etcd
+
+  # For non-containerized and non-embedded etcd we should already have the
+  # correct version of etcd installed, so nothing is done in that case.
+  #
+  # For embedded or containerized we need to use the latest because OCP 3.3 uses
+  # a version of etcd that can only be backed up with etcd-3.x and if it's
+  # containerized then etcd version may be newer than that on the host so
+  # upgrade it.
+  #
+  # On atomic we have neither yum nor dnf so ansible throws a hard-to-debug error
+  # if you use package there, like this: "Could not find a module for unknown."
+  # see https://bugzilla.redhat.com/show_bug.cgi?id=1408668
+  #
+  # TODO - We should refactor all containerized backups to use the containerized
+  # version of etcd to perform the backup rather than relying on the host's
+  # binaries. Until we do that we'll continue to have problems backing up etcd
+  # when atomic host has an older version than the version that's running in the
+  # container whether that's embedded or not
+  - name: Install latest etcd for containerized or embedded
+    package:
+      name: etcd
+      state: latest
+    when: ( embedded_etcd | bool or openshift.common.is_containerized | bool ) and not openshift.common.is_atomic | bool
+
+  # `timestamp` is a pipe lookup that is re-evaluated on every reference, so
+  # resolve the directory once; otherwise the path reported below can differ.
+  - set_fact:
+      l_etcd_backup_dir: "{{ openshift.common.data_dir }}/etcd-backup-{{ backup_tag | default('') }}{{ timestamp }}"
+  - name: Generate etcd backup
+    command: "{{ etcdctl_command }} backup --data-dir={{ openshift.etcd.etcd_data_dir }} --backup-dir={{ l_etcd_backup_dir }}"
+  - set_fact:
+      etcd_backup_complete: True
+  - name: Display location of etcd backup
+    debug:
+      msg: "Etcd backup created in {{ l_etcd_backup_dir }}"
+
+- name: Gate on etcd backup
+  hosts: localhost
+  connection: local
+  become: false
+  tasks:
+  # Hosts in etcd_hosts_to_backup whose hostvars carry etcd_backup_complete == true.
+  - set_fact:
+      etcd_backup_completed: "{{ hostvars | oo_select_keys(groups.etcd_hosts_to_backup) | oo_collect('inventory_hostname', {'etcd_backup_complete': true}) }}"
+  # Every host that was targeted for backup but is missing from the list above.
+  - set_fact:
+      etcd_backup_failed: "{{ groups.etcd_hosts_to_backup | difference(etcd_backup_completed) }}"
+  - fail:
+      msg: "Upgrade cannot continue. The following hosts did not complete etcd backup: {{ etcd_backup_failed | join(',') }}"
+    when: etcd_backup_failed | length > 0
diff --git a/playbooks/common/openshift-cluster/upgrades/etcd/containerized_tasks.yml b/playbooks/common/openshift-cluster/upgrades/etcd/containerized_tasks.yml
new file mode 100644
index 000000000..5f8b59e17
--- /dev/null
+++ b/playbooks/common/openshift-cluster/upgrades/etcd/containerized_tasks.yml
@@ -0,0 +1,46 @@
+---
+- name: Verify cluster is healthy pre-upgrade
+  command: etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health
+# The image in use is read as the last field of the unit file's ExecStart line.
+- name: Get current image
+  shell: grep 'ExecStart=' /etc/systemd/system/etcd_container.service | awk '{print $NF}'
+  register: current_image
+# Keep whatever precedes "/etcd" and replace the rest with "/etcd:<upgrade_version>".
+- name: Set new_etcd_image
+  set_fact:
+    new_etcd_image: "{{ current_image.stdout | regex_replace('/etcd.*$', '/etcd:' ~ upgrade_version) }}"
+# Pull before touching the unit file so a failed pull leaves the service untouched.
+- name: Pull new etcd image
+  command: docker pull {{ new_etcd_image }}
+
+- name: Update to latest etcd image
+  replace:
+    dest: /etc/systemd/system/etcd_container.service
+    regexp: "{{ current_image.stdout | regex_escape }}$"  # match the image reference literally; it contains '.' and may contain other regex metacharacters
+    replace: "{{ new_etcd_image }}"
+
+- name: Restart etcd_container
+  systemd:
+    name: etcd_container
+    daemon_reload: true
+    state: restarted
+
+## TODO: this probably belongs in the backup playbooks. It will also fail on
+## atomic host; etcd backups need revisiting there because the container may be
+## newer than etcdctl on the host. Assumes etcd3 obsoletes etcd (7.3.1)
+- name: Upgrade etcd for etcdctl when not atomic
+  package: {name: etcd, state: latest}
+  when: not openshift.common.is_atomic | bool
+# Re-run the health check until etcdctl exits 0 (retries: 3, 10 seconds apart).
+- name: Verify cluster is healthy
+  command: etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health
+  register: etcdctl
+  until: etcdctl.rc == 0
+  retries: 3
+  delay: 10
+# Record the image now in use as the etcd_image local fact of the etcd role.
+- name: Store new etcd_image
+  openshift_facts:
+    role: etcd
+    local_facts:
+      etcd_image: "{{ new_etcd_image }}"
diff --git a/playbooks/common/openshift-cluster/upgrades/etcd/fedora_tasks.yml b/playbooks/common/openshift-cluster/upgrades/etcd/fedora_tasks.yml
new file mode 100644
index 000000000..30232110e
--- /dev/null
+++ b/playbooks/common/openshift-cluster/upgrades/etcd/fedora_tasks.yml
@@ -0,0 +1,23 @@
+---
+# F23 GA'd with etcd 2.0, currently has 2.2 in updates
+# F24 GA'd with etcd-2.2, currently has 2.2 in updates
+# F25 Beta currently has etcd 3.0
+- name: Verify cluster is healthy pre-upgrade
+  command: etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health
+
+- name: Update etcd
+  package:
+    name: etcd
+    state: latest
+
+- name: Restart etcd
+  service:
+    name: etcd
+    state: restarted
+# Re-run the health check until etcdctl exits 0 (retries: 3, 10 seconds apart).
+- name: Verify cluster is healthy
+  command: etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health
+  register: etcdctl
+  until: etcdctl.rc == 0
+  retries: 3
+  delay: 10
diff --git a/playbooks/common/openshift-cluster/upgrades/etcd/files/etcdctl.sh b/playbooks/common/openshift-cluster/upgrades/etcd/files/etcdctl.sh
new file mode 120000
index 000000000..641e04e44
--- /dev/null
+++ b/playbooks/common/openshift-cluster/upgrades/etcd/files/etcdctl.sh
@@ -0,0 +1 @@
+../roles/etcd/files/etcdctl.sh
+\ No newline at end of file
diff --git a/playbooks/common/openshift-cluster/upgrades/etcd/filter_plugins b/playbooks/common/openshift-cluster/upgrades/etcd/filter_plugins
new file mode 120000
index 000000000..27ddaa18b
--- /dev/null
+++ b/playbooks/common/openshift-cluster/upgrades/etcd/filter_plugins
@@ -0,0 +1 @@
+../../../../../filter_plugins
+\ No newline at end of file
diff --git a/playbooks/common/openshift-cluster/upgrades/etcd/lookup_plugins b/playbooks/common/openshift-cluster/upgrades/etcd/lookup_plugins
new file mode 120000
index 000000000..cf407f69b
--- /dev/null
+++ b/playbooks/common/openshift-cluster/upgrades/etcd/lookup_plugins
@@ -0,0 +1 @@
+../../../../../lookup_plugins
+\ No newline at end of file
diff --git a/playbooks/common/openshift-cluster/upgrades/etcd/main.yml b/playbooks/common/openshift-cluster/upgrades/etcd/main.yml
new file mode 100644
index 000000000..8268adc2e
--- /dev/null
+++ b/playbooks/common/openshift-cluster/upgrades/etcd/main.yml
@@ -0,0 +1,44 @@
+---
+# For 1.4/3.4 we want to upgrade everyone to etcd-3.0. The etcd docs say to
+# upgrade from 2.0.x to 2.1.x to 2.2.x to 2.3.x to 3.0.x. While this is a tedious
+# task for RHEL and CentOS, it's simply not possible in Fedora unless you've
+# mirrored packages on your own, because only the GA and latest versions are
+# available in the repos. So for Fedora we'll simply skip this, sorry.
+
+- include: ../../evaluate_groups.yml
+  tags:
+  - always
+
+# Two groups are built here. etcd_hosts_to_upgrade leaves out embedded etcd,
+# which is upgraded when the master is updated. etcd_hosts_to_backup falls back
+# to the first master so that an embedded etcd still gets backed up.
+- name: Evaluate additional groups for etcd
+  hosts: localhost
+  connection: local
+  become: false
+  tasks:
+  - name: Evaluate etcd_hosts_to_upgrade
+    add_host:
+      name: "{{ item }}"
+      groups: etcd_hosts_to_upgrade
+    with_items: "{{ groups.oo_etcd_to_config | default([]) }}"
+  - name: Evaluate etcd_hosts_to_backup
+    add_host:
+      name: "{{ item }}"
+      groups: etcd_hosts_to_backup
+    with_items: "{{ groups.oo_etcd_to_config | default(groups.oo_first_master, true) }}"
+
+- name: Backup etcd before upgrading anything
+  include: backup.yml
+  vars:
+    backup_tag: 'pre-upgrade-'  # prefix used in the backup directory name
+  when: openshift_etcd_backup | default(true) | bool  # on unless openshift_etcd_backup is set false
+
+- name: Drop etcdctl profiles
+  hosts: etcd_hosts_to_upgrade
+  tasks:
+  - include: roles/etcd/tasks/etcdctl.yml
+
+- name: Perform etcd upgrade
+  include: upgrade.yml
+  when: openshift_etcd_upgrade | default(true) | bool  # on unless openshift_etcd_upgrade is set false
diff --git a/playbooks/common/openshift-cluster/upgrades/etcd/rhel_tasks.yml b/playbooks/common/openshift-cluster/upgrades/etcd/rhel_tasks.yml
new file mode 100644
index 000000000..3a972e8ab
--- /dev/null
+++ b/playbooks/common/openshift-cluster/upgrades/etcd/rhel_tasks.yml
@@ -0,0 +1,20 @@
+---
+- name: Verify cluster is healthy pre-upgrade
+  command: etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health
+# The trailing glob leaves the rest of the version after upgrade_version open.
+- name: Update etcd RPM
+  package:
+    name: "etcd-{{ upgrade_version }}*"
+    state: latest
+
+- name: Restart etcd
+  service:
+    name: etcd
+    state: restarted
+# Re-run the health check until etcdctl exits 0 (retries: 3, 10 seconds apart).
+- name: Verify cluster is healthy
+  command: etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health
+  register: etcdctl
+  until: etcdctl.rc == 0
+  retries: 3
+  delay: 10
diff --git a/playbooks/common/openshift-cluster/upgrades/etcd/roles b/playbooks/common/openshift-cluster/upgrades/etcd/roles
new file mode 120000
index 000000000..6bc1a7aef
--- /dev/null
+++ b/playbooks/common/openshift-cluster/upgrades/etcd/roles
@@ -0,0 +1 @@
+../../../../../roles
+\ No newline at end of file
diff --git a/playbooks/common/openshift-cluster/upgrades/etcd/upgrade.yml b/playbooks/common/openshift-cluster/upgrades/etcd/upgrade.yml
new file mode 100644
index 000000000..0f8d94737
--- /dev/null
+++ b/playbooks/common/openshift-cluster/upgrades/etcd/upgrade.yml
@@ -0,0 +1,94 @@
+---
+- name: Determine etcd version
+  hosts: etcd_hosts_to_upgrade
+  tasks:
+  - name: Record RPM based etcd version
+    command: rpm -qa --qf '%{version}' etcd\*
+    args:
+      warn: false
+    register: etcd_rpm_version  # read later as etcd_rpm_version.stdout
+    when: not openshift.common.is_containerized | bool
+    failed_when: false  # a failing query must not fail the host
+  - name: Record containerized etcd version
+    command: docker exec etcd_container rpm -qa --qf '%{version}' etcd\*
+    register: etcd_container_version  # read later as etcd_container_version.stdout
+    when: openshift.common.is_containerized | bool
+    failed_when: false  # a failing query must not fail the host
+
+# The plays below are deliberate copy/paste: no way was found to loop through
+# the hosts and then through the per-version tasks only where they apply.
+- name: Upgrade to 2.1
+  hosts: etcd_hosts_to_upgrade
+  serial: 1  # one host at a time
+  vars:
+    upgrade_version: "2.1"
+  tasks:
+  - include: rhel_tasks.yml
+    when: (etcd_rpm_version.stdout | default('99') | version_compare('2.1', '<')) and ansible_distribution == 'RedHat' and not (openshift.common.is_containerized | bool)
+
+- name: Upgrade RPM hosts to 2.2
+  hosts: etcd_hosts_to_upgrade
+  serial: 1  # one host at a time
+  vars:
+    upgrade_version: "2.2"
+  tasks:
+  - include: rhel_tasks.yml
+    when: (etcd_rpm_version.stdout | default('99') | version_compare('2.2', '<')) and ansible_distribution == 'RedHat' and not (openshift.common.is_containerized | bool)
+
+- name: Upgrade containerized hosts to 2.2.5
+  hosts: etcd_hosts_to_upgrade
+  serial: 1  # one host at a time
+  vars:
+    upgrade_version: "2.2.5"  # used as the image tag by containerized_tasks.yml
+  tasks:
+  - include: containerized_tasks.yml
+    when: (etcd_container_version.stdout | default('99') | version_compare('2.2', '<')) and (openshift.common.is_containerized | bool)
+
+- name: Upgrade RPM hosts to 2.3
+  hosts: etcd_hosts_to_upgrade
+  serial: 1  # one host at a time
+  vars:
+    upgrade_version: "2.3"
+  tasks:
+  - include: rhel_tasks.yml
+    when: (etcd_rpm_version.stdout | default('99') | version_compare('2.3', '<')) and ansible_distribution == 'RedHat' and not (openshift.common.is_containerized | bool)
+
+- name: Upgrade containerized hosts to 2.3.7
+  hosts: etcd_hosts_to_upgrade
+  serial: 1  # one host at a time
+  vars:
+    upgrade_version: "2.3.7"  # used as the image tag by containerized_tasks.yml
+  tasks:
+  - include: containerized_tasks.yml
+    when: (etcd_container_version.stdout | default('99') | version_compare('2.3', '<')) and (openshift.common.is_containerized | bool)
+
+- name: Upgrade RPM hosts to 3.0
+  hosts: etcd_hosts_to_upgrade
+  serial: 1  # one host at a time
+  vars:
+    upgrade_version: "3.0"
+  tasks:
+  - include: rhel_tasks.yml
+    when: (etcd_rpm_version.stdout | default('99') | version_compare('3.0', '<')) and ansible_distribution == 'RedHat' and not (openshift.common.is_containerized | bool)
+
+- name: Upgrade containerized hosts to etcd3 image
+  hosts: etcd_hosts_to_upgrade
+  serial: 1  # one host at a time
+  vars:
+    upgrade_version: "3.0.15"  # used as the image tag by containerized_tasks.yml
+  tasks:
+  - include: containerized_tasks.yml
+    when: (etcd_container_version.stdout | default('99') | version_compare('3.0', '<')) and (openshift.common.is_containerized | bool)
+
+- name: Upgrade fedora to latest
+  hosts: etcd_hosts_to_upgrade
+  serial: 1  # one host at a time
+  tasks:
+  - include: fedora_tasks.yml
+    when: ansible_distribution == 'Fedora' and not (openshift.common.is_containerized | bool)
+
+- name: Backup etcd
+  include: backup.yml
+  vars:
+    backup_tag: 'post-3.0-'  # prefix used in the backup directory name
+  when: openshift_etcd_backup | default(true) | bool  # on unless openshift_etcd_backup is set false