Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
17 files changed, 758 insertions, 566 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index 40a28cde5..07ec6f7ef 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -10,11 +10,34 @@ from importlib import import_module
 
 from ansible.module_utils import six
 from ansible.module_utils.six.moves import reduce  # pylint: disable=import-error,redefined-builtin
+from ansible.plugins.filter.core import to_bool as ansible_to_bool
 
 
 class OpenShiftCheckException(Exception):
-    """Raised when a check cannot proceed."""
-    pass
+    """Raised when a check encounters a failure condition."""
+
+    def __init__(self, name, msg=None):
+        # msg is for the message the user will see when this is raised.
+        # name is for test code to identify the error without looking at msg text.
+        if msg is None:  # for parameter backward compatibility
+            msg = name
+            name = self.__class__.__name__
+        self.name = name
+        super(OpenShiftCheckException, self).__init__(msg)
+
+
+class OpenShiftCheckExceptionList(OpenShiftCheckException):
+    """A container for multiple logging errors that may be detected in one check."""
+    def __init__(self, errors):
+        self.errors = errors
+        super(OpenShiftCheckExceptionList, self).__init__(
+            'OpenShiftCheckExceptionList',
+            '\n'.join(str(msg) for msg in errors)
+        )
+
+    # make iterable
+    def __getitem__(self, index):
+        return self.errors[index]
 
 
 @six.add_metaclass(ABCMeta)
@@ -35,6 +58,9 @@ class OpenShiftCheck(object):
         self.task_vars = task_vars or {}
         self.tmp = tmp
 
+        # set to True when the check changes the host, for accurate total "changed" count
+        self.changed = False
+
     @abstractproperty
     def name(self):
         """The name of this check, usually derived from the class name."""
@@ -94,16 +120,107 @@ class OpenShiftCheck(object):
 
         Ansible task_vars structures are Python dicts, often mapping strings to
         other dicts. This helper makes it easier to get a nested value, raising
-        OpenShiftCheckException when a key is not found or returning a default value
-        provided as a keyword argument.
+        OpenShiftCheckException when a key is not found.
+
+        Keyword args:
+          default:
+            On missing key, return this as default value instead of raising exception.
+          convert:
+            Supply a function to apply to normalize the value before returning it.
+            None is the default (return as-is).
+            This function should raise ValueError if the user has provided a value
+            that cannot be converted, or OpenShiftCheckException if some other
+            problem needs to be described to the user.
         """
+        if len(keys) == 1:
+            keys = keys[0].split(".")
+
         try:
             value = reduce(operator.getitem, keys, self.task_vars)
         except (KeyError, TypeError):
-            if "default" in kwargs:
-                return kwargs["default"]
-            raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys))))
-        return value
+            if "default" not in kwargs:
+                raise OpenShiftCheckException(
+                    "This check expects the '{}' inventory variable to be defined\n"
+                    "in order to proceed, but it is undefined. There may be a bug\n"
+                    "in Ansible, the checks, or their dependencies."
+                    "".format(".".join(map(str, keys)))
+                )
+            value = kwargs["default"]
+
+        convert = kwargs.get("convert", None)
+        try:
+            if convert is None:
+                return value
+            elif convert is bool:  # interpret bool as Ansible does, instead of python truthiness
+                return ansible_to_bool(value)
+            else:
+                return convert(value)
+
+        except ValueError as error:  # user error in specifying value
+            raise OpenShiftCheckException(
+                'Cannot convert inventory variable to expected type:\n'
+                '  "{var}={value}"\n'
+                '{error}'.format(var=".".join(keys), value=value, error=error)
+            )
+
+        except OpenShiftCheckException:  # some other check-specific problem
+            raise
+
+        except Exception as error:  # probably a bug in the function
+            raise OpenShiftCheckException(
+                'There is a bug in this check. While trying to convert variable \n'
+                '  "{var}={value}"\n'
+                'the given converter cannot be used or failed unexpectedly:\n'
+                '{error}'.format(var=".".join(keys), value=value, error=error)
+            )
+
+    @staticmethod
+    def get_major_minor_version(openshift_image_tag):
+        """Parse and return the deployed version of OpenShift as a tuple."""
+        if openshift_image_tag and openshift_image_tag[0] == 'v':
+            openshift_image_tag = openshift_image_tag[1:]
+
+        # map major release versions across releases
+        # to a common major version
+        openshift_major_release_version = {
+            "1": "3",
+        }
+
+        components = openshift_image_tag.split(".")
+        if not components or len(components) < 2:
+            msg = "An invalid version of OpenShift was found for this host: {}"
+            raise OpenShiftCheckException(msg.format(openshift_image_tag))
+
+        if components[0] in openshift_major_release_version:
+            components[0] = openshift_major_release_version[components[0]]
+
+        components = tuple(int(x) for x in components[:2])
+        return components
+
+    def find_ansible_mount(self, path):
+        """Return the mount point for path from ansible_mounts."""
+
+        # reorganize list of mounts into dict by path
+        mount_for_path = {
+            mount['mount']: mount
+            for mount
+            in self.get_var('ansible_mounts')
+        }
+
+        # NOTE: including base cases '/' and '' to ensure the loop ends
+        mount_targets = set(mount_for_path.keys()) | {'/', ''}
+        mount_point = path
+        while mount_point not in mount_targets:
+            mount_point = os.path.dirname(mount_point)
+
+        try:
+            return mount_for_path[mount_point]
+        except KeyError:
+            known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path))
+            raise OpenShiftCheckException(
+                'Unable to determine mount point for path "{}".\n'
+                'Known mount points: {}.'.format(path, known_mounts or 'none')
+            )
 
 
 LOADER_EXCLUDES = (
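[Note — illustrative sketch, not part of the patch] The new convert= keyword on get_var() normalizes inventory values, with bool routed through Ansible's to_bool filter rather than Python truthiness (so the inventory string "false" becomes False), and single dotted keys now split automatically. A minimal sketch of how a check subclass might use it; the check name and the second variable are hypothetical:

    from openshift_checks import OpenShiftCheck

    class ExampleCheck(OpenShiftCheck):
        name = "example"

        def run(self):
            # "false"/"no"/"0" in the inventory become False, as Ansible interprets them
            deploy = self.get_var("openshift_hosted_logging_deploy", default=False, convert=bool)
            # a non-numeric value raises OpenShiftCheckException instead of a bare ValueError
            timeout = self.get_var("openshift_check_timeout_seconds", default="30", convert=int)
            # dotted lookup: equivalent to self.get_var("openshift", "common", "config_base")
            config_base = self.get_var("openshift.common.config_base")
            return {"failed": False, "msg": "deploy={0} timeout={1}".format(deploy, timeout)}
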
+ "".format(".".join(map(str, keys))) + ) + value = kwargs["default"] + + convert = kwargs.get("convert", None) + try: + if convert is None: + return value + elif convert is bool: # interpret bool as Ansible does, instead of python truthiness + return ansible_to_bool(value) + else: + return convert(value) + + except ValueError as error: # user error in specifying value + raise OpenShiftCheckException( + 'Cannot convert inventory variable to expected type:\n' + ' "{var}={value}"\n' + '{error}'.format(var=".".join(keys), value=value, error=error) + ) + + except OpenShiftCheckException: # some other check-specific problem + raise + + except Exception as error: # probably a bug in the function + raise OpenShiftCheckException( + 'There is a bug in this check. While trying to convert variable \n' + ' "{var}={value}"\n' + 'the given converter cannot be used or failed unexpectedly:\n' + '{error}'.format(var=".".join(keys), value=value, error=error) + ) + + @staticmethod + def get_major_minor_version(openshift_image_tag): + """Parse and return the deployed version of OpenShift as a tuple.""" + if openshift_image_tag and openshift_image_tag[0] == 'v': + openshift_image_tag = openshift_image_tag[1:] + + # map major release versions across releases + # to a common major version + openshift_major_release_version = { + "1": "3", + } + + components = openshift_image_tag.split(".") + if not components or len(components) < 2: + msg = "An invalid version of OpenShift was found for this host: {}" + raise OpenShiftCheckException(msg.format(openshift_image_tag)) + + if components[0] in openshift_major_release_version: + components[0] = openshift_major_release_version[components[0]] + + components = tuple(int(x) for x in components[:2]) + return components + + def find_ansible_mount(self, path): + """Return the mount point for path from ansible_mounts.""" + + # reorganize list of mounts into dict by path + mount_for_path = { + mount['mount']: mount + for mount + in self.get_var('ansible_mounts') + } + + # NOTE: including base cases '/' and '' to ensure the loop ends + mount_targets = set(mount_for_path.keys()) | {'/', ''} + mount_point = path + while mount_point not in mount_targets: + mount_point = os.path.dirname(mount_point) + + try: + return mount_for_path[mount_point] + except KeyError: + known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) + raise OpenShiftCheckException( + 'Unable to determine mount point for path "{}".\n' + 'Known mount points: {}.'.format(path, known_mounts or 'none') + ) LOADER_EXCLUDES = ( diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py index 283461294..6d1dea9ce 100644 --- a/roles/openshift_health_checker/openshift_checks/disk_availability.py +++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py @@ -1,6 +1,5 @@ """Check that there is enough disk space in predefined paths.""" -import os.path import tempfile from openshift_checks import OpenShiftCheck, OpenShiftCheckException @@ -35,6 +34,15 @@ class DiskAvailability(OpenShiftCheck): }, } + # recommended disk space for each location under an upgrade context + recommended_disk_upgrade_bytes = { + '/var': { + 'masters': 10 * 10**9, + 'nodes': 5 * 10 ** 9, + 'etcd': 5 * 10 ** 9, + }, + } + def is_active(self): """Skip hosts that do not have recommended disk space requirements.""" group_names = self.get_var("group_names", default=[]) @@ -46,9 +54,6 @@ class 
DiskAvailability(OpenShiftCheck): def run(self): group_names = self.get_var("group_names") - ansible_mounts = self.get_var("ansible_mounts") - ansible_mounts = {mount['mount']: mount for mount in ansible_mounts} - user_config = self.get_var("openshift_check_min_host_disk_gb", default={}) try: # For backwards-compatibility, if openshift_check_min_host_disk_gb @@ -71,7 +76,7 @@ class DiskAvailability(OpenShiftCheck): # not part of the official recommendation but present in the user # configuration. for path, recommendation in self.recommended_disk_space_bytes.items(): - free_bytes = self.free_bytes(path, ansible_mounts) + free_bytes = self.free_bytes(path) recommended_bytes = max(recommendation.get(name, 0) for name in group_names) config = user_config.get(path, {}) @@ -80,9 +85,34 @@ class DiskAvailability(OpenShiftCheck): config_bytes = max(config.get(name, 0) for name in group_names) * 10**9 recommended_bytes = config_bytes or recommended_bytes + # if an "upgrade" context is set, update the minimum disk requirement + # as this signifies an in-place upgrade - the node might have the + # required total disk space, but some of that space may already be + # in use by the existing OpenShift deployment. + context = self.get_var("r_openshift_health_checker_playbook_context", default="") + if context == "upgrade": + recommended_upgrade_paths = self.recommended_disk_upgrade_bytes.get(path, {}) + if recommended_upgrade_paths: + recommended_bytes = config_bytes or max(recommended_upgrade_paths.get(name, 0) + for name in group_names) + if free_bytes < recommended_bytes: free_gb = float(free_bytes) / 10**9 recommended_gb = float(recommended_bytes) / 10**9 + msg = ( + 'Available disk space in "{}" ({:.1f} GB) ' + 'is below minimum recommended ({:.1f} GB)' + ).format(path, free_gb, recommended_gb) + + # warn if check failed under an "upgrade" context + # due to limits imposed by the user config + if config_bytes and context == "upgrade": + msg += ('\n\nMake sure to account for decreased disk space during an upgrade\n' + 'due to an existing OpenShift deployment. Please check the value of\n' + ' openshift_check_min_host_disk_gb={}\n' + 'in your Ansible inventory, and lower the recommended disk space availability\n' + 'if necessary for this upgrade.').format(config_bytes) + return { 'failed': True, 'msg': ( @@ -93,22 +123,17 @@ class DiskAvailability(OpenShiftCheck): return {} - @staticmethod - def free_bytes(path, ansible_mounts): + def free_bytes(self, path): """Return the size available in path based on ansible_mounts.""" - mount_point = path - # arbitry value to prevent an infinite loop, in the unlike case that '/' - # is not in ansible_mounts. - max_depth = 32 - while mount_point not in ansible_mounts and max_depth > 0: - mount_point = os.path.dirname(mount_point) - max_depth -= 1 - + mount = self.find_ansible_mount(path) try: - free_bytes = ansible_mounts[mount_point]['size_available'] + return mount['size_available'] except KeyError: - known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(ansible_mounts)) or 'none' - msg = 'Unable to determine disk availability for "{}". Known mount points: {}.' 
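[Note — illustrative sketch, not part of the patch] Under an "upgrade" playbook context the /var requirement drops to the smaller in-place-upgrade minimum, but an explicit openshift_check_min_host_disk_gb override still wins. A standalone sketch of that resolution with fabricated values:

    # Hypothetical /var check on a master during an upgrade:
    group_names = ['masters']
    config_bytes = 0                    # no openshift_check_min_host_disk_gb override
    recommended_bytes = 40 * 10**9      # standing recommendation for masters
    upgrade_paths = {'masters': 10 * 10**9, 'nodes': 5 * 10**9, 'etcd': 5 * 10**9}
    context = "upgrade"
    if context == "upgrade" and upgrade_paths:
        # user config (config_bytes) takes precedence; otherwise use the upgrade minimum
        recommended_bytes = config_bytes or max(upgrade_paths.get(name, 0) for name in group_names)
    assert recommended_bytes == 10 * 10**9
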
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index 77180223e..85a922f86 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -41,11 +41,10 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
         return super(DockerImageAvailability, self).is_active() and has_valid_deployment_type
 
     def run(self):
-        msg, failed, changed = self.ensure_dependencies()
+        msg, failed = self.ensure_dependencies()
         if failed:
             return {
                 "failed": True,
-                "changed": changed,
                 "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
             }
 
@@ -54,11 +53,11 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
 
         # exit early if all images were found locally
         if not missing_images:
-            return {"changed": changed}
+            return {}
 
         registries = self.known_docker_registries()
         if not registries:
-            return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed}
+            return {"failed": True, "msg": "Unable to retrieve any docker registries."}
 
         available_images = self.available_images(missing_images, registries)
         unavailable_images = set(missing_images) - set(available_images)
@@ -70,10 +69,9 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
                 "One or more required Docker images are not available:\n    {}\n"
                 "Configured registries: {}"
             ).format(",\n    ".join(sorted(unavailable_images)), ", ".join(registries)),
-            "changed": changed,
         }
 
-        return {"changed": changed}
+        return {}
 
     def required_images(self):
         """
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
index dea15a56e..0558ddf14 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_storage.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -1,6 +1,5 @@
 """Check Docker storage driver and usage."""
 import json
-import os.path
 import re
 
 from openshift_checks import OpenShiftCheck, OpenShiftCheckException
 from openshift_checks.mixins import DockerHostMixin
@@ -43,21 +42,20 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
     ]
 
     def run(self):
-        msg, failed, changed = self.ensure_dependencies()
+        msg, failed = self.ensure_dependencies()
         if failed:
             return {
                 "failed": True,
-                "changed": changed,
                 "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg
             }
 
         # attempt to get the docker info hash from the API
         docker_info = self.execute_module("docker_info", {})
         if docker_info.get("failed"):
-            return {"failed": True, "changed": changed,
+            return {"failed": True,
                     "msg": "Failed to query Docker API. Is docker running on this host?"}
         if not docker_info.get("info"):  # this would be very strange
-            return {"failed": True, "changed": changed,
+            return {"failed": True,
                     "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}
         docker_info = docker_info["info"]
 
@@ -68,7 +66,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
                 "Detected unsupported Docker storage driver '{driver}'.\n"
                 "Supported storage drivers are: {drivers}"
             ).format(driver=driver, drivers=', '.join(self.storage_drivers))
-            return {"failed": True, "changed": changed, "msg": msg}
+            return {"failed": True, "msg": msg}
 
         # driver status info is a list of tuples; convert to dict and validate based on driver
         driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])}
@@ -81,7 +79,6 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
 
         if driver in ['overlay', 'overlay2']:
             result = self.check_overlay_support(docker_info, driver_status)
-            result['changed'] = result.get('changed', False) or changed
 
         return result
 
     def check_devicemapper_support(self, driver_status):
@@ -254,7 +251,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
                 "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold),
             }
 
-        mount = self.find_ansible_mount(path, self.get_var("ansible_mounts"))
+        mount = self.find_ansible_mount(path)
         try:
             free_bytes = mount['size_available']
             total_bytes = mount['size_total']
@@ -277,22 +274,3 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
             }
 
         return {}
-
-    # TODO(lmeyer): migrate to base class
-    @staticmethod
-    def find_ansible_mount(path, ansible_mounts):
-        """Return the mount point for path from ansible_mounts."""
-
-        mount_for_path = {mount['mount']: mount for mount in ansible_mounts}
-        mount_point = path
-        while mount_point not in mount_for_path:
-            if mount_point in ["/", ""]:  # "/" not in ansible_mounts???
-                break
-            mount_point = os.path.dirname(mount_point)
-
-        try:
-            return mount_for_path[mount_point]
-        except KeyError:
-            known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) or 'none'
-            msg = 'Unable to determine mount point for path "{}". Known mount points: {}.'
-            raise OpenShiftCheckException(msg.format(path, known_mounts))
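[Note — illustrative sketch, not part of the patch] Both Docker checks stop threading a `changed` flag through every return value; the base class now carries `self.changed`, and presumably DockerHostMixin.ensure_dependencies() sets it when it installs packages (that mixin change is outside this excerpt). A sketch of the intended pattern, with the check name hypothetical:

    from openshift_checks import OpenShiftCheck
    from openshift_checks.mixins import DockerHostMixin

    class SomeDockerCheck(DockerHostMixin, OpenShiftCheck):
        name = "some_docker_check"

        def run(self):
            msg, failed = self.ensure_dependencies()  # may set self.changed internally
            if failed:
                return {"failed": True, "msg": msg}
            # no "changed" key needed in any return; the action plugin reads
            # self.changed to compute the task's overall changed status
            return {}
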
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
index 28c38504d..f4296753a 100644
--- a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
+++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
@@ -2,7 +2,7 @@
 Ansible module for determining if the size of OpenShift image data exceeds a specified limit in an etcd cluster.
 """
 
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+from openshift_checks import OpenShiftCheck
 
 
 class EtcdImageDataSize(OpenShiftCheck):
@@ -12,7 +12,7 @@ class EtcdImageDataSize(OpenShiftCheck):
     tags = ["etcd"]
 
     def run(self):
-        etcd_mountpath = self._get_etcd_mountpath(self.get_var("ansible_mounts"))
+        etcd_mountpath = self.find_ansible_mount("/var/lib/etcd")
         etcd_avail_diskspace = etcd_mountpath["size_available"]
         etcd_total_diskspace = etcd_mountpath["size_total"]
 
@@ -56,7 +56,7 @@ class EtcdImageDataSize(OpenShiftCheck):
                     reason = etcdkeysize["module_stderr"]
 
                 msg = msg.format(host=etcd_host, reason=reason)
-                return {"failed": True, "changed": False, "msg": msg}
+                return {"failed": True, "msg": msg}
 
             if etcdkeysize["size_limit_exceeded"]:
                 limit = self._to_gigabytes(etcd_imagedata_size_limit)
@@ -65,20 +65,7 @@ class EtcdImageDataSize(OpenShiftCheck):
                        "Use the `oadm prune images` command to cleanup unused Docker images.")
                 return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)}
 
-        return {"changed": False}
-
-    @staticmethod
-    def _get_etcd_mountpath(ansible_mounts):
-        valid_etcd_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
-
-        mount_for_path = {mnt.get("mount"): mnt for mnt in ansible_mounts}
-        for path in valid_etcd_mount_paths:
-            if path in mount_for_path:
-                return mount_for_path[path]
-
-        paths = ', '.join(sorted(mount_for_path)) or 'none'
-        msg = "Unable to determine a valid etcd mountpath. Paths mounted: {}.".format(paths)
-        raise OpenShiftCheckException(msg)
+        return {}
 
     @staticmethod
     def _to_gigabytes(byte_size):
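[Note — illustrative sketch, not part of the patch] The hardcoded fallback list ["/var/lib/etcd", "/var/lib", "/var", "/"] is gone because find_ansible_mount() walks os.path.dirname() until it reaches a known mount point (or "/"), which visits the same candidates. A standalone sketch of that walk with fabricated ansible_mounts data:

    import os.path

    # fabricated mount facts: only "/" and "/var" are mounted
    mount_for_path = {'/': {'mount': '/'}, '/var': {'mount': '/var'}}
    mount_targets = set(mount_for_path) | {'/', ''}  # base cases end the loop
    mount_point = '/var/lib/etcd'
    while mount_point not in mount_targets:
        mount_point = os.path.dirname(mount_point)  # /var/lib/etcd -> /var/lib -> /var
    assert mount_point == '/var'
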
""" -from openshift_checks import OpenShiftCheck, OpenShiftCheckException +from openshift_checks import OpenShiftCheck class EtcdImageDataSize(OpenShiftCheck): @@ -12,7 +12,7 @@ class EtcdImageDataSize(OpenShiftCheck): tags = ["etcd"] def run(self): - etcd_mountpath = self._get_etcd_mountpath(self.get_var("ansible_mounts")) + etcd_mountpath = self.find_ansible_mount("/var/lib/etcd") etcd_avail_diskspace = etcd_mountpath["size_available"] etcd_total_diskspace = etcd_mountpath["size_total"] @@ -56,7 +56,7 @@ class EtcdImageDataSize(OpenShiftCheck): reason = etcdkeysize["module_stderr"] msg = msg.format(host=etcd_host, reason=reason) - return {"failed": True, "changed": False, "msg": msg} + return {"failed": True, "msg": msg} if etcdkeysize["size_limit_exceeded"]: limit = self._to_gigabytes(etcd_imagedata_size_limit) @@ -65,20 +65,7 @@ class EtcdImageDataSize(OpenShiftCheck): "Use the `oadm prune images` command to cleanup unused Docker images.") return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)} - return {"changed": False} - - @staticmethod - def _get_etcd_mountpath(ansible_mounts): - valid_etcd_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"] - - mount_for_path = {mnt.get("mount"): mnt for mnt in ansible_mounts} - for path in valid_etcd_mount_paths: - if path in mount_for_path: - return mount_for_path[path] - - paths = ', '.join(sorted(mount_for_path)) or 'none' - msg = "Unable to determine a valid etcd mountpath. Paths mounted: {}.".format(paths) - raise OpenShiftCheckException(msg) + return {} @staticmethod def _to_gigabytes(byte_size): diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py index cc1b14d8a..b4c8957e9 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py @@ -14,8 +14,8 @@ class EtcdTraffic(OpenShiftCheck): group_names = self.get_var("group_names", default=[]) valid_group_names = "etcd" in group_names - version = self.get_var("openshift", "common", "short_version") - valid_version = version in ("3.4", "3.5", "1.4", "1.5") + version = self.get_major_minor_version(self.get_var("openshift_image_tag")) + valid_version = version in ((3, 4), (3, 5)) return super(EtcdTraffic, self).is_active() and valid_group_names and valid_version diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index da7d0364a..e5d93ff3f 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -1,6 +1,6 @@ """A health check for OpenShift clusters.""" -from openshift_checks import OpenShiftCheck, OpenShiftCheckException +from openshift_checks import OpenShiftCheck class EtcdVolume(OpenShiftCheck): @@ -11,8 +11,8 @@ class EtcdVolume(OpenShiftCheck): # Default device usage threshold. Value should be in the range [0, 100]. default_threshold_percent = 90 - # Where to find ectd data, higher priority first. 
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
index da7d0364a..e5d93ff3f 100644
--- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py
+++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
@@ -1,6 +1,6 @@
 """A health check for OpenShift clusters."""
 
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+from openshift_checks import OpenShiftCheck
 
 
 class EtcdVolume(OpenShiftCheck):
@@ -11,8 +11,8 @@ class EtcdVolume(OpenShiftCheck):
 
     # Default device usage threshold. Value should be in the range [0, 100].
     default_threshold_percent = 90
-    # Where to find ectd data, higher priority first.
-    supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"]
+    # Where to find etcd data
+    etcd_mount_path = "/var/lib/etcd"
 
     def is_active(self):
         etcd_hosts = self.get_var("groups", "etcd", default=[]) or self.get_var("groups", "masters", default=[]) or []
@@ -20,7 +20,7 @@ class EtcdVolume(OpenShiftCheck):
         return super(EtcdVolume, self).is_active() and is_etcd_host
 
     def run(self):
-        mount_info = self._etcd_mount_info()
+        mount_info = self.find_ansible_mount(self.etcd_mount_path)
         available = mount_info["size_available"]
         total = mount_info["size_total"]
         used = total - available
@@ -40,16 +40,4 @@ class EtcdVolume(OpenShiftCheck):
             )
             return {"failed": True, "msg": msg}
 
-        return {"changed": False}
-
-    def _etcd_mount_info(self):
-        ansible_mounts = self.get_var("ansible_mounts")
-        mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts}
-
-        for path in self.supported_mount_paths:
-            if path in mounts:
-                return mounts[path]
-
-        paths = ', '.join(sorted(mounts)) or 'none'
-        msg = "Unable to find etcd storage mount point. Paths mounted: {}.".format(paths)
-        raise OpenShiftCheckException(msg)
+        return {}
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
index f82ae64d7..b27f97172 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/curator.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -1,6 +1,6 @@
 """Check for an aggregated logging Curator deployment"""
 
-from openshift_checks.logging.logging import LoggingCheck
+from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck
 
 
 class Curator(LoggingCheck):
@@ -9,46 +9,35 @@ class Curator(LoggingCheck):
     name = "curator"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        curator_pods, error = super(Curator, self).get_pods_for_component(
-            self.logging_namespace,
-            "curator",
-        )
-        if error:
-            return {"failed": True, "changed": False, "msg": error}
-        check_error = self.check_curator(curator_pods)
-
-        if check_error:
-            msg = ("The following Curator deployment issue was found:"
-                   "\n-------\n"
-                   "{}".format(check_error))
-            return {"failed": True, "changed": False, "msg": msg}
-
+        """Check various things and gather errors. Returns: result as hash"""
+        curator_pods = self.get_pods_for_component("curator")
+        self.check_curator(curator_pods)
         # TODO(lmeyer): run it all again for the ops cluster
-        return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'}
+
+        return {}
 
     def check_curator(self, pods):
         """Check to see if curator is up and working. Returns: error string"""
         if not pods:
-            return (
+            raise OpenShiftCheckException(
+                "MissingComponentPods",
                 "There are no Curator pods for the logging stack,\n"
                 "so nothing will prune Elasticsearch indexes.\n"
                 "Is Curator correctly deployed?"
             )
 
-        not_running = super(Curator, self).not_running_pods(pods)
+        not_running = self.not_running_pods(pods)
         if len(not_running) == len(pods):
-            return (
+            raise OpenShiftCheckException(
+                "CuratorNotRunning",
                 "The Curator pod is not currently in a running state,\n"
                 "so Elasticsearch indexes may increase without bound."
             )
 
         if len(pods) - len(not_running) > 1:
-            return (
+            raise OpenShiftCheckException(
+                "TooManyCurators",
                 "There is more than one Curator pod running. This should not normally happen.\n"
                 "Although this doesn't cause any problems, you may want to investigate."
             )
-
-        return None
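[Note — illustrative sketch, not part of the patch] Failures now raise OpenShiftCheckException with a machine-readable name as the first argument, so tests can assert on the name instead of matching message text. A hypothetical pytest-style test of the behavior above:

    import pytest
    from openshift_checks import OpenShiftCheckException
    from openshift_checks.logging.curator import Curator

    def test_no_curator_pods():
        check = Curator(None, {})  # no execute_module needed, empty task_vars
        with pytest.raises(OpenShiftCheckException) as excinfo:
            check.check_curator([])
        # identify the failure by name, not by message text
        assert excinfo.value.name == "MissingComponentPods"
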
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
index 1e478c04d..7fc843fd7 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -3,6 +3,7 @@
 import json
 import re
 
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
 from openshift_checks.logging.logging import LoggingCheck
 
 
@@ -12,174 +13,181 @@ class Elasticsearch(LoggingCheck):
     name = "elasticsearch"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        es_pods, error = super(Elasticsearch, self).get_pods_for_component(
-            self.logging_namespace,
-            "es",
-        )
-        if error:
-            return {"failed": True, "changed": False, "msg": error}
-        check_error = self.check_elasticsearch(es_pods)
-
-        if check_error:
-            msg = ("The following Elasticsearch deployment issue was found:"
-                   "\n-------\n"
-                   "{}".format(check_error))
-            return {"failed": True, "changed": False, "msg": msg}
-
+        es_pods = self.get_pods_for_component("es")
+        self.check_elasticsearch(es_pods)
         # TODO(lmeyer): run it all again for the ops cluster
-        return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'}
-
-    def _not_running_elasticsearch_pods(self, es_pods):
-        """Returns: list of pods that are not running, list of errors about non-running pods"""
-        not_running = super(Elasticsearch, self).not_running_pods(es_pods)
-        if not_running:
-            return not_running, [(
-                'The following Elasticsearch pods are not running:\n'
-                '{pods}'
-                'These pods will not aggregate logs from their nodes.'
-            ).format(pods=''.join(
-                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
-                for pod in not_running
-            ))]
-        return not_running, []
+
+        return {}
 
     def check_elasticsearch(self, es_pods):
-        """Various checks for elasticsearch. Returns: error string"""
-        not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
-        running_pods = [pod for pod in es_pods if pod not in not_running_pods]
+        """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors."""
+        running_pods, errors = self.running_elasticsearch_pods(es_pods)
         pods_by_name = {
             pod['metadata']['name']: pod for pod in running_pods
             # Filter out pods that are not members of a DC
             if pod['metadata'].get('labels', {}).get('deploymentconfig')
         }
         if not pods_by_name:
-            return 'No logging Elasticsearch pods were found. Is logging deployed?'
-        error_msgs += self._check_elasticsearch_masters(pods_by_name)
-        error_msgs += self._check_elasticsearch_node_list(pods_by_name)
-        error_msgs += self._check_es_cluster_health(pods_by_name)
-        error_msgs += self._check_elasticsearch_diskspace(pods_by_name)
-        return '\n'.join(error_msgs)
+            # nothing running, cannot run the rest of the check
+            errors.append(OpenShiftCheckException(
+                'NoRunningPods',
+                'No logging Elasticsearch pods were found running, so no logs are being aggregated.'
+            ))
+            raise OpenShiftCheckExceptionList(errors)
+
+        errors += self.check_elasticsearch_masters(pods_by_name)
+        errors += self.check_elasticsearch_node_list(pods_by_name)
+        errors += self.check_es_cluster_health(pods_by_name)
+        errors += self.check_elasticsearch_diskspace(pods_by_name)
+        if errors:
+            raise OpenShiftCheckExceptionList(errors)
+
+    def running_elasticsearch_pods(self, es_pods):
+        """Returns: list of running pods, list of errors about non-running pods"""
+        not_running = self.not_running_pods(es_pods)
+        running_pods = [pod for pod in es_pods if pod not in not_running]
+        if not_running:
+            return running_pods, [OpenShiftCheckException(
+                'PodNotRunning',
+                'The following Elasticsearch pods are defined but not running:\n'
+                '{pods}'.format(pods=''.join(
+                    "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+                    for pod in not_running
+                ))
+            )]
+        return running_pods, []
 
     @staticmethod
     def _build_es_curl_cmd(pod_name, url):
         base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
         return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
 
-    def _check_elasticsearch_masters(self, pods_by_name):
-        """Check that Elasticsearch masters are sane. Returns: list of error strings"""
+    def check_elasticsearch_masters(self, pods_by_name):
+        """Check that Elasticsearch masters are sane. Returns: list of errors"""
         es_master_names = set()
-        error_msgs = []
+        errors = []
         for pod_name in pods_by_name.keys():
             # Compare what each ES node reports as master and compare for split brain
             get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
-            master_name_str = self._exec_oc(get_master_cmd, [])
+            master_name_str = self.exec_oc(get_master_cmd, [])
             master_names = (master_name_str or '').split(' ')
             if len(master_names) > 1:
                 es_master_names.add(master_names[1])
             else:
-                error_msgs.append(
-                    'No master? Elasticsearch {pod} returned bad string when asked master name:\n'
+                errors.append(OpenShiftCheckException(
+                    'NoMasterName',
+                    'Elasticsearch {pod} gave unexpected response when asked master name:\n'
                     '  {response}'.format(pod=pod_name, response=master_name_str)
-                )
+                ))
 
         if not es_master_names:
-            error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?')
-            return '\n'.join(error_msgs)
+            errors.append(OpenShiftCheckException(
+                'NoMasterFound',
+                'No logging Elasticsearch masters were found.'
+            ))
+            return errors
 
         if len(es_master_names) > 1:
-            error_msgs.append(
+            errors.append(OpenShiftCheckException(
+                'SplitBrainMasters',
                 'Found multiple Elasticsearch masters according to the pods:\n'
                 '{master_list}\n'
                 'This implies that the masters have "split brain" and are not correctly\n'
                 'replicating data for the logging cluster. Log loss is likely to occur.'
                 .format(master_list='\n'.join('  ' + master for master in es_master_names))
-            )
+            ))
 
-        return error_msgs
+        return errors
 
-    def _check_elasticsearch_node_list(self, pods_by_name):
-        """Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
+    def check_elasticsearch_node_list(self, pods_by_name):
+        """Check that reported ES masters are accounted for by pods. Returns: list of errors"""
 
         if not pods_by_name:
-            return ['No logging Elasticsearch masters were found. Is logging deployed?']
+            return [OpenShiftCheckException(
+                'MissingComponentPods',
+                'No logging Elasticsearch pods were found.'
+            )]
 
         # get ES cluster nodes
         node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
-        cluster_node_data = self._exec_oc(node_cmd, [])
+        cluster_node_data = self.exec_oc(node_cmd, [])
         try:
             cluster_nodes = json.loads(cluster_node_data)['nodes']
         except (ValueError, KeyError):
-            return [
+            return [OpenShiftCheckException(
+                'MissingNodeList',
                 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
                 cluster_node_data
-            ]
+            )]
 
         # Try to match all ES-reported node hosts to known pods.
-        error_msgs = []
+        errors = []
         for node in cluster_nodes.values():
             # Note that with 1.4/3.4 the pod IP may be used as the master name
             if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
                        for pod_name, pod in pods_by_name.items()):
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'EsPodNodeMismatch',
                     'The Elasticsearch cluster reports a member node "{node}"\n'
                     'that does not correspond to any known ES pod.'.format(node=node['host'])
-                )
+                ))
 
-        return error_msgs
+        return errors
 
-    def _check_es_cluster_health(self, pods_by_name):
+    def check_es_cluster_health(self, pods_by_name):
         """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
-        error_msgs = []
+        errors = []
         for pod_name in pods_by_name.keys():
             cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
-            cluster_health_data = self._exec_oc(cluster_health_cmd, [])
+            cluster_health_data = self.exec_oc(cluster_health_cmd, [])
             try:
                 health_res = json.loads(cluster_health_data)
                 if not health_res or not health_res.get('status'):
                     raise ValueError()
             except ValueError:
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'BadEsResponse',
                     'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
                     'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
-                )
+                ))
                 continue
 
             if health_res['status'] not in ['green', 'yellow']:
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'EsClusterHealthRed',
                     'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
-                )
+                ))
 
-        return error_msgs
+        return errors
 
-    def _check_elasticsearch_diskspace(self, pods_by_name):
+    def check_elasticsearch_diskspace(self, pods_by_name):
         """
         Exec into an ES pod and query the diskspace on the persistent volume.
         Returns: list of errors
         """
-        error_msgs = []
+        errors = []
         for pod_name in pods_by_name.keys():
             df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
-            disk_output = self._exec_oc(df_cmd, [])
+            disk_output = self.exec_oc(df_cmd, [])
             lines = disk_output.splitlines()
             # expecting one header looking like 'IUse% Use%' and one body line
             body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
             if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'BadDfResponse',
                     'Could not retrieve storage usage from logging ES pod "{pod}".\n'
                     'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
-                )
+                ))
                 continue
             inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
 
             inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
             if int(inode_pct) >= int(inode_pct_thresh):
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'InodeUsageTooHigh',
                     'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
                     '  is {pct}, greater than threshold {limit}.\n'
                     '  Note: threshold can be specified in inventory with {param}'.format(
@@ -187,10 +195,11 @@ class Elasticsearch(LoggingCheck):
                         pct=str(inode_pct),
                         limit=str(inode_pct_thresh),
                         param='openshift_check_efk_es_inode_pct',
-                    ))
+                    )))
 
             disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
             if int(disk_pct) >= int(disk_pct_thresh):
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'DiskUsageTooHigh',
                     'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
                     '  is {pct}, greater than threshold {limit}.\n'
                     '  Note: threshold can be specified in inventory with {param}'.format(
@@ -198,13 +207,6 @@ class Elasticsearch(LoggingCheck):
                         pct=str(disk_pct),
                         limit=str(disk_pct_thresh),
                         param='openshift_check_efk_es_storage_pct',
-                    ))
-
-        return error_msgs
+                    )))
 
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Elasticsearch, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
-        )
+        return errors
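[Note — illustrative sketch, not part of the patch] The sub-checks now accumulate OpenShiftCheckException objects and raise them together as an OpenShiftCheckExceptionList, whose __getitem__ makes individual errors addressable in tests. A standalone sketch of that aggregation with fabricated errors:

    from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList

    errors = [
        OpenShiftCheckException('NoMasterName', 'pod logging-es-abc gave no master name'),
        OpenShiftCheckException('EsClusterHealthRed', 'cluster health is RED'),
    ]
    try:
        raise OpenShiftCheckExceptionList(errors)
    except OpenShiftCheckExceptionList as excs:
        assert excs[0].name == 'NoMasterName'  # indexable via __getitem__
        print(str(excs))                       # all messages, joined with newlines
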
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
index 063e707a9..3b192a281 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -2,6 +2,8 @@
 
 import json
 
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
 from openshift_checks.logging.logging import LoggingCheck
 
 
@@ -11,61 +13,97 @@ class Fluentd(LoggingCheck):
     name = "fluentd"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
-        """Check various things and gather errors. Returns: result as hash"""
+        """Check the Fluentd deployment and raise an error if any problems are found."""
+
+        fluentd_pods = self.get_pods_for_component("fluentd")
+        self.check_fluentd(fluentd_pods)
+        return {}
+
+    def check_fluentd(self, pods):
+        """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found."""
 
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
-            self.logging_namespace,
-            "fluentd",
+        node_selector = self.get_var(
+            'openshift_logging_fluentd_nodeselector',
+            default='logging-infra-fluentd=true'
         )
-        if error:
-            return {"failed": True, "changed": False, "msg": error}
-        check_error = self.check_fluentd(fluentd_pods)
 
-        if check_error:
-            msg = ("The following Fluentd deployment issue was found:"
-                   "\n-------\n"
-                   "{}".format(check_error))
-            return {"failed": True, "changed": False, "msg": msg}
+        nodes_by_name = self.get_nodes_by_name()
+        fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+
+        errors = []
+        errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
+        errors += self.check_nodes_have_fluentd(pods, fluentd_nodes)
+        errors += self.check_fluentd_pods_running(pods)
+
+        # Make sure there are no extra fluentd pods
+        if len(pods) > len(fluentd_nodes):
+            errors.append(OpenShiftCheckException(
+                'TooManyFluentdPods',
+                'There are more Fluentd pods running than nodes labeled.\n'
+                'This may not cause problems with logging but it likely indicates something wrong.'
+            ))
+
+        if errors:
+            raise OpenShiftCheckExceptionList(errors)
 
-        # TODO(lmeyer): run it all again for the ops cluster
-        return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'}
+    def get_nodes_by_name(self):
+        """Retrieve all the node definitions. Returns: dict(name: node)"""
+        nodes_json = self.exec_oc("get nodes -o json", [])
+        try:
+            nodes = json.loads(nodes_json)
+        except ValueError:  # no valid json - should not happen
+            raise OpenShiftCheckException(
+                "BadOcNodeList",
+                "Could not obtain a list of nodes to validate fluentd.\n"
+                "Output from oc get:\n" + nodes_json
+            )
+        if not nodes or not nodes.get('items'):  # also should not happen
+            raise OpenShiftCheckException(
+                "NoNodesDefined",
+                "No nodes appear to be defined according to the API."
+            )
+        return {
+            node['metadata']['name']: node
+            for node in nodes['items']
+        }
 
     @staticmethod
-    def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
-        """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+    def filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+        """Filter to all nodes with fluentd label. Returns dict(name: node)"""
         label, value = node_selector.split('=', 1)
         fluentd_nodes = {
             name: node for name, node in nodes_by_name.items()
             if node['metadata']['labels'].get(label) == value
         }
         if not fluentd_nodes:
-            return None, (
+            raise OpenShiftCheckException(
+                'NoNodesLabeled',
                 'There are no nodes with the fluentd label {label}.\n'
-                'This means no logs will be aggregated from the nodes.'
-            ).format(label=node_selector)
-        return fluentd_nodes, None
+                'This means no logs will be aggregated from the nodes.'.format(label=node_selector)
+            )
+        return fluentd_nodes
 
-    def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
-        """Note if nodes are not labeled as expected. Returns: error string"""
+    def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
+        """Note if nodes are not labeled as expected. Returns: error list"""
 
         intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
         if not intended_nodes or '--all' in intended_nodes:
             intended_nodes = nodes_by_name.keys()
         nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
         if nodes_missing_labels:
-            return (
+            return [OpenShiftCheckException(
+                'NodesUnlabeled',
                 'The following nodes are supposed to be labeled with {label} but are not:\n'
                 '  {nodes}\n'
-                'Fluentd will not aggregate logs from these nodes.'
-            ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
-        return None
+                'Fluentd will not aggregate logs from these nodes.'.format(
+                    label=node_selector, nodes=', '.join(nodes_missing_labels)
+                ))]
+
+        return []
 
     @staticmethod
-    def _check_nodes_have_fluentd(pods, fluentd_nodes):
-        """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+    def check_nodes_have_fluentd(pods, fluentd_nodes):
+        """Make sure fluentd is on all the labeled nodes. Returns: error list"""
         unmatched_nodes = fluentd_nodes.copy()
         node_names_by_label = {
             node['metadata']['labels']['kubernetes.io/hostname']: name
@@ -85,83 +123,32 @@ class Fluentd(LoggingCheck):
         ]:
             unmatched_nodes.pop(name, None)
         if unmatched_nodes:
-            return (
+            return [OpenShiftCheckException(
+                'MissingFluentdPod',
                 'The following nodes are supposed to have a Fluentd pod but do not:\n'
-                '{nodes}'
-                'These nodes will not have their logs aggregated.'
-            ).format(nodes=''.join(
-                "  {}\n".format(name)
-                for name in unmatched_nodes.keys()
-            ))
-        return None
+                '  {nodes}\n'
+                'These nodes will not have their logs aggregated.'.format(
+                    nodes='\n  '.join(unmatched_nodes.keys())
+                ))]
+
+        return []
 
-    def _check_fluentd_pods_running(self, pods):
+    def check_fluentd_pods_running(self, pods):
         """Make sure all fluentd pods are running. Returns: error string"""
         not_running = super(Fluentd, self).not_running_pods(pods)
         if not_running:
-            return (
+            return [OpenShiftCheckException(
+                'FluentdNotRunning',
                 'The following Fluentd pods are supposed to be running but are not:\n'
-                '{pods}'
-                'These pods will not aggregate logs from their nodes.'
-            ).format(pods=''.join(
-                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
-                for pod in not_running
-            ))
-        return None
-
-    def check_fluentd(self, pods):
-        """Verify fluentd is running everywhere. Returns: error string"""
-
-        node_selector = self.get_var(
-            'openshift_logging_fluentd_nodeselector',
-            default='logging-infra-fluentd=true'
-        )
-
-        nodes_by_name, error = self.get_nodes_by_name()
-
-        if error:
-            return error
-        fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
-        if error:
-            return error
-
-        error_msgs = []
-        error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
-        if error:
-            error_msgs.append(error)
-        error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
-        if error:
-            error_msgs.append(error)
-        error = self._check_fluentd_pods_running(pods)
-        if error:
-            error_msgs.append(error)
-
-        # Make sure there are no extra fluentd pods
-        if len(pods) > len(fluentd_nodes):
-            error_msgs.append(
-                'There are more Fluentd pods running than nodes labeled.\n'
-                'This may not cause problems with logging but it likely indicates something wrong.'
-            )
-
-        return '\n'.join(error_msgs)
-
-    def get_nodes_by_name(self):
-        """Retrieve all the node definitions. Returns: dict(name: node), error string"""
-        nodes_json = self._exec_oc("get nodes -o json", [])
-        try:
-            nodes = json.loads(nodes_json)
-        except ValueError:  # no valid json - should not happen
-            return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
-        if not nodes or not nodes.get('items'):  # also should not happen
-            return None, "No nodes appear to be defined according to the API."
-        return {
-            node['metadata']['name']: node
-            for node in nodes['items']
-        }, None
-
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Fluentd, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
-        )
+                '  {pods}\n'
+                'These pods will not aggregate logs from their nodes.'.format(
+                    pods='\n'.join(
+                        "    {name} ({host})".format(
+                            name=pod['metadata']['name'],
+                            host=pod['spec'].get('host', 'None')
+                        )
+                        for pod in not_running
+                    )
+                ))]
+
+        return []
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
new file mode 100644
index 000000000..d783e6760
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -0,0 +1,131 @@
+"""
+Module for performing checks on a Fluentd logging deployment configuration
+"""
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class FluentdConfig(LoggingCheck):
+    """Module that checks logging configuration of an integrated logging Fluentd deployment"""
+    name = "fluentd_config"
+    tags = ["health"]
+
+    def is_active(self):
+        logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+
+        try:
+            version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+        except ValueError:
+            # if failed to parse OpenShift version, perform check anyway (if logging enabled)
+            return logging_deployed
+
+        return logging_deployed and version < (3, 6)
+
+    def run(self):
+        """Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
+        config_error = self.check_logging_config()
+        if config_error:
+            msg = ("The following Fluentd logging configuration problem was found:"
+                   "\n{}".format(config_error))
+            return {"failed": True, "msg": msg}
+
+        return {}
+
+    def check_logging_config(self):
+        """Ensure that the configured Docker logging driver matches fluentd settings.
+        This means that, at least for now, if the following condition is met:
+
+            openshift_logging_fluentd_use_journal == True
+
+        then the value of the configured Docker logging driver should be "journald".
+        Otherwise, the value of the Docker logging driver should be "json-file".
+        Returns an error string if the above condition is not met, or None otherwise."""
+        use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True)
+
+        # if check is running on a master, retrieve all running pods
+        # and check any pod's container for the env var "USE_JOURNAL"
+        group_names = self.get_var("group_names")
+        if "masters" in group_names:
+            use_journald = self.check_fluentd_env_var()
+
+        docker_info = self.execute_module("docker_info", {})
+        try:
+            logging_driver = docker_info["info"]["LoggingDriver"]
+        except KeyError:
+            return "Unable to determine Docker logging driver."
+
+        logging_driver = docker_info["info"]["LoggingDriver"]
+        recommended_logging_driver = "journald"
+        error = None
+
+        # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver`
+        # option as an inventory file variable, or adding the log driver value as part of the
+        # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that
+        # can be passed to the Docker binary; the only other recommendation that can be made, would be
+        # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running
+        # individual containers.
+        if use_journald and logging_driver != "journald":
+            error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n'
+                     'This differs from your Docker configuration, which has been set to use "{driver}" '
+                     'as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs'
+                     'from your Docker containers.').format(driver=logging_driver)
+        elif not use_journald and logging_driver != "json-file":
+            recommended_logging_driver = "json-file"
+            error = ('Your Fluentd configuration is set to aggregate Docker container logs from '
+                     'individual json log files per container.\n '
+                     'This differs from your Docker configuration, which has been set to use '
+                     '"{driver}" as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs'
+                     'from your Docker containers.').format(driver=logging_driver)
+
+        if error:
+            error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n'
+                      '  openshift_docker_options="--log-driver={driver}"\n\n'
+                      'Alternatively, you can add the following option to your Docker configuration, located in'
+                      '"/etc/docker/daemon.json":\n\n'
+                      '{{ "log-driver": "{driver}" }}\n\n'
+                      'See https://docs.docker.com/engine/admin/logging/json-file '
+                      'for more information.').format(driver=recommended_logging_driver)
+
+        return error
+
+    def check_fluentd_env_var(self):
+        """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod."""
+        running_pods = self.running_fluentd_pods()
+
+        try:
+            pod_containers = running_pods[0]["spec"]["containers"]
+        except KeyError:
+            return "Unable to detect running containers on selected Fluentd pod."
+
+        if not pod_containers:
+            msg = ('There are no running containers on selected Fluentd pod "{}".\n'
+                   'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", ""))
+            raise OpenShiftCheckException(msg)
+
+        pod_env = pod_containers[0].get("env")
+        if not pod_env:
+            msg = ('There are no environment variables set on the Fluentd container "{}".\n'
+                   'Unable to calculate expected logging driver.').format(pod_containers[0].get("name"))
+            raise OpenShiftCheckException(msg)
+
+        for env in pod_env:
+            if env["name"] == "USE_JOURNAL":
+                return env.get("value", "false") != "false"
+
+        return False
+
+    def running_fluentd_pods(self):
+        """Return a list of running fluentd pods."""
+        fluentd_pods = self.get_pods_for_component("fluentd")
+
+        running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
+        if not running_fluentd_pods:
+            raise OpenShiftCheckException(
+                'No Fluentd pods were found to be in the "Running" state. '
+                'At least one Fluentd pod is required in order to perform this check.'
+            )
+
+        return running_fluentd_pods
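[Note — illustrative sketch, not part of the patch] The core of check_logging_config() is a two-way agreement test between Fluentd's source setting and Docker's logging driver. A minimal decision-table sketch with fabricated inputs:

    def expected_driver(use_journald):
        # Fluentd reading journald implies Docker must write to journald;
        # otherwise Docker must write per-container json files.
        return "journald" if use_journald else "json-file"

    cases = [(True, "journald"), (True, "json-file"), (False, "json-file"), (False, "journald")]
    for use_journald, docker_driver in cases:
        mismatch = docker_driver != expected_driver(use_journald)
        print(use_journald, docker_driver, "error" if mismatch else "ok")
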
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
index 60f94e106..3b1cf8baa 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -12,7 +12,7 @@ except ImportError:
     from urllib.error import HTTPError, URLError
     import urllib.request as urllib2
 
-from openshift_checks.logging.logging import LoggingCheck
+from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException
 
 
 class Kibana(LoggingCheck):
@@ -21,31 +21,15 @@ class Kibana(LoggingCheck):
     name = "kibana"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        kibana_pods, error = super(Kibana, self).get_pods_for_component(
-            self.logging_namespace,
-            "kibana",
-        )
-        if error:
-            return {"failed": True, "changed": False, "msg": error}
-        check_error = self.check_kibana(kibana_pods)
-
-        if not check_error:
-            check_error = self._check_kibana_route()
-
-        if check_error:
-            msg = ("The following Kibana deployment issue was found:"
-                   "\n-------\n"
-                   "{}".format(check_error))
-            return {"failed": True, "changed": False, "msg": msg}
-
+        kibana_pods = self.get_pods_for_component("kibana")
+        self.check_kibana(kibana_pods)
+        self.check_kibana_route()
         # TODO(lmeyer): run it all again for the ops cluster
-        return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'}
+
+        return {}
 
     def _verify_url_internal(self, url):
         """
@@ -68,7 +52,7 @@ class Kibana(LoggingCheck):
     def _verify_url_external(url):
         """
         Try to reach a URL from ansible control host.
-        Returns: success (bool), reason (for failure)
+        Raise an OpenShiftCheckException if anything goes wrong.
         """
         # This actually checks from the ansible control host, which may or may not
         # really be "external" to the cluster.
@@ -94,133 +78,149 @@ class Kibana(LoggingCheck):
         return None
 
     def check_kibana(self, pods):
-        """Check to see if Kibana is up and working. Returns: error string."""
+        """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not."""
 
         if not pods:
-            return "There are no Kibana pods deployed, so no access to the logging UI."
+            raise OpenShiftCheckException(
+                "MissingComponentPods",
+                "There are no Kibana pods deployed, so no access to the logging UI."
+            )
 
         not_running = self.not_running_pods(pods)
         if len(not_running) == len(pods):
-            return "No Kibana pod is in a running state, so there is no access to the logging UI."
+            raise OpenShiftCheckException(
+                "NoRunningPods",
+                "No Kibana pod is in a running state, so there is no access to the logging UI."
+            )
         elif not_running:
-            return (
+            raise OpenShiftCheckException(
+                "PodNotRunning",
                 "The following Kibana pods are not currently in a running state:\n"
-                "{pods}"
-                "However at least one is, so service may not be impacted."
-            ).format(pods="".join("  " + pod['metadata']['name'] + "\n" for pod in not_running))
-
-        return None
+                "  {pods}\n"
+                "However at least one is, so service may not be impacted.".format(
+                    pods="\n  ".join(pod['metadata']['name'] for pod in not_running)
+                )
+            )
 
     def _get_kibana_url(self):
         """
         Get kibana route or report error.
-        Returns: url (or empty), reason for failure
+        Returns: url
         """
 
         # Get logging url
-        get_route = self._exec_oc("get route logging-kibana -o json", [])
+        get_route = self.exec_oc("get route logging-kibana -o json", [])
         if not get_route:
-            return None, 'no_route_exists'
+            raise OpenShiftCheckException(
+                'no_route_exists',
+                'No route is defined for Kibana in the logging namespace,\n'
+                'so the logging stack is not accessible. Is logging deployed?\n'
+                'Did something remove the logging-kibana route?'
+            )
 
-        route = json.loads(get_route)
+        try:
+            route = json.loads(get_route)
+            # check that the route has been accepted by a router
+            ingress = route["status"]["ingress"]
+        except (ValueError, KeyError):
+            raise OpenShiftCheckException(
+                'get_route_failed',
+                '"oc get route" returned an unexpected response:\n' + get_route
+            )
 
-        # check that the route has been accepted by a router
-        ingress = route["status"]["ingress"]
         # ingress can be null if there is no router, or empty if not routed
         if not ingress or not ingress[0]:
-            return None, 'route_not_accepted'
+            raise OpenShiftCheckException(
+                'route_not_accepted',
+                'The logging-kibana route is not being routed by any router.\n'
+                'Is the router deployed and working?'
+            )
 
         host = route.get("spec", {}).get("host")
         if not host:
-            return None, 'route_missing_host'
+            raise OpenShiftCheckException(
+                'route_missing_host',
+                'The logging-kibana route has no hostname defined,\n'
+                'which should never happen. Did something alter its definition?'
+            )
 
-        return 'https://{}/'.format(host), None
+        return 'https://{}/'.format(host)
 
-    def _check_kibana_route(self):
+    def check_kibana_route(self):
         """
         Check to see if kibana route is up and working.
-        Returns: error string
+        Raises exception if not.
         """
 
-        known_errors = dict(
-            no_route_exists=(
-                'No route is defined for Kibana in the logging namespace,\n'
-                'so the logging stack is not accessible. Is logging deployed?\n'
-                'Did something remove the logging-kibana route?'
-            ),
-            route_not_accepted=(
-                'The logging-kibana route is not being routed by any router.\n'
-                'Is the router deployed and working?'
-            ),
-            route_missing_host=(
-                'The logging-kibana route has no hostname defined,\n'
-                'which should never happen. Did something alter its definition?'
-            ),
-        )
-        kibana_url, error = self._get_kibana_url()
-        if not kibana_url:
-            return known_errors.get(error, error)
+        kibana_url = self._get_kibana_url()
 
         # first, check that kibana is reachable from the master.
         error = self._verify_url_internal(kibana_url)
         if error:
             if 'urlopen error [Errno 111] Connection refused' in error:
-                error = (
+                raise OpenShiftCheckException(
+                    'FailedToConnectInternal',
                     'Failed to connect from this master to Kibana URL {url}\n'
-                    'Is kibana running, and is at least one router routing to it?'
-                ).format(url=kibana_url)
+                    'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url)
+                )
             elif 'urlopen error [Errno -2] Name or service not known' in error:
-                error = (
+                raise OpenShiftCheckException(
+                    'FailedToResolveInternal',
                     'Failed to connect from this master to Kibana URL {url}\n'
                     'because the hostname does not resolve.\n'
-                    'Is DNS configured for the Kibana hostname?'
-                ).format(url=kibana_url)
+                    'Is DNS configured for the Kibana hostname?'.format(url=kibana_url)
+                )
             elif 'Status code was not' in error:
-                error = (
+                raise OpenShiftCheckException(
+                    'WrongReturnCodeInternal',
                     'A request from this master to the Kibana URL {url}\n'
                     'did not return the correct status code (302).\n'
                     'This could mean that Kibana is malfunctioning, the hostname is\n'
                     'resolving incorrectly, or other network issues. The output was:\n'
-                    '  {error}'
-                ).format(url=kibana_url, error=error)
-            return 'Error validating the logging Kibana route:\n' + error
+                    '  {error}'.format(url=kibana_url, error=error)
+                )
+            raise OpenShiftCheckException(
+                'MiscRouteErrorInternal',
+                'Error validating the logging Kibana route internally:\n' + error
+            )
 
         # in production we would like the kibana route to work from outside the
         # cluster too; but that may not be the case, so allow disabling just this part.
-        if not self.get_var("openshift_check_efk_kibana_external", default=True):
-            return None
+        if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true":
+            return
         error = self._verify_url_external(kibana_url)
-        if error:
-            if 'urlopen error [Errno 111] Connection refused' in error:
-                error = (
-                    'Failed to connect from the Ansible control host to Kibana URL {url}\n'
-                    'Is the router for the Kibana hostname exposed externally?'
-                ).format(url=kibana_url)
-            elif 'urlopen error [Errno -2] Name or service not known' in error:
-                error = (
-                    'Failed to resolve the Kibana hostname in {url}\n'
-                    'from the Ansible control host.\n'
-                    'Is DNS configured to resolve this Kibana hostname externally?'
-                ).format(url=kibana_url)
-            elif 'Expected success (200)' in error:
-                error = (
-                    'A request to Kibana at {url}\n'
-                    'returned the wrong error code:\n'
-                    '  {error}\n'
-                    'This could mean that Kibana is malfunctioning, the hostname is\n'
-                    'resolving incorrectly, or other network issues.'
-                ).format(url=kibana_url, error=error)
-            error = (
-                'Error validating the logging Kibana route:\n{error}\n'
-                'To disable external Kibana route validation, set in your inventory:\n'
-                '  openshift_check_efk_kibana_external=False'
-            ).format(error=error)
-            return error
-        return None
 
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Kibana, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
+        if not error:
+            return
+
+        error_fmt = (
+            'Error validating the logging Kibana route:\n{error}\n'
+            'To disable external Kibana route validation, set the variable:\n'
+            '  openshift_check_efk_kibana_external=False'
+        )
+        if 'urlopen error [Errno 111] Connection refused' in error:
+            msg = (
+                'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+                'Is the router for the Kibana hostname exposed externally?'
+            ).format(url=kibana_url)
+            raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg))
+        elif 'urlopen error [Errno -2] Name or service not known' in error:
+            msg = (
+                'Failed to resolve the Kibana hostname in {url}\n'
+                'from the Ansible control host.\n'
+                'Is DNS configured to resolve this Kibana hostname externally?'
+            ).format(url=kibana_url)
+            raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg))
+        elif 'Expected success (200)' in error:
+            msg = (
+                'A request to Kibana at {url}\n'
+                'returned the wrong error code:\n'
+                '  {error}\n'
+                'This could mean that Kibana is malfunctioning, the hostname is\n'
+                'resolving incorrectly, or other network issues.'
+            ).format(url=kibana_url, error=error)
+            raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg))
+        raise OpenShiftCheckException(
+            'MiscRouteError',
+            'Error validating the logging Kibana route externally:\n' + error
        )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
index 43ba6c406..ecd8adb64 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -8,6 +8,16 @@ import os
 from openshift_checks import OpenShiftCheck, OpenShiftCheckException
 
 
+class MissingComponentPods(OpenShiftCheckException):
+    """Raised when a component has no pods in the namespace."""
+    pass
+
+
+class CouldNotUseOc(OpenShiftCheckException):
+    """Raised when ocutil has a failure running oc."""
+    pass
+
+
 class LoggingCheck(OpenShiftCheck):
     """Base class for OpenShift aggregated logging component checks"""
 
@@ -15,10 +25,9 @@ class LoggingCheck(OpenShiftCheck):
     # run by itself.
     name = "logging"
 
-    logging_namespace = "logging"
-
     def is_active(self):
-        logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+        logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False)
         return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master()
 
     def is_first_master(self):
@@ -32,22 +41,24 @@ class LoggingCheck(OpenShiftCheck):
     def run(self):
         return {}
 
-    def get_pods_for_component(self, namespace, logging_component):
-        """Get all pods for a given component. Returns: list of pods for component, error string"""
+    def get_pods_for_component(self, logging_component):
+        """Get all pods for a given component. Returns: list of pods."""
         pod_output = self.exec_oc(
-            namespace,
             "get pods -l component={} -o json".format(logging_component),
             [],
         )
         try:
-            pods = json.loads(pod_output)
-            if not pods or not pods.get('items'):
+            pods = json.loads(pod_output)  # raises ValueError if deserialize fails
+            if not pods or not pods.get('items'):  # also a broken response, treat the same
                 raise ValueError()
         except ValueError:
-            # successful run but non-parsing data generally means there were no pods in the namespace
-            return None, 'No pods were found for the "{}" logging component.'.format(logging_component)
+            # successful run but non-parsing data generally means there were no pods to be found
+            raise MissingComponentPods(
+                'There are no "{}" component pods in the "{}" namespace.\n'
+                'Is logging deployed?'.format(logging_component, self.logging_namespace())
+            )
 
-        return pods['items'], None
+        return pods['items']
 
     @staticmethod
     def not_running_pods(pods):
@@ -63,15 +74,19 @@ class LoggingCheck(OpenShiftCheck):
             )
         ]
 
-    def exec_oc(self, namespace="logging", cmd_str="", extra_args=None):
+    def logging_namespace(self):
+        """Returns the namespace in which logging is configured to deploy."""
+        return self.get_var("openshift_logging_namespace", default="logging")
+
+    def exec_oc(self, cmd_str="", extra_args=None):
         """
         Execute an 'oc' command in the remote host.
         Returns: output of command and namespace,
-        or raises OpenShiftCheckException on error
+        or raises CouldNotUseOc on error
        """
         config_base = self.get_var("openshift", "common", "config_base")
         args = {
-            "namespace": namespace,
+            "namespace": self.logging_namespace(),
             "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
             "cmd": cmd_str,
             "extra_args": list(extra_args) if extra_args else [],
@@ -79,17 +94,16 @@ class LoggingCheck(OpenShiftCheck):
 
         result = self.execute_module("ocutil", args)
         if result.get("failed"):
-            msg = (
-                'Unexpected error using `oc` to validate the logging stack components.\n'
-                'Error executing `oc {cmd}`:\n'
-                '{error}'
-            ).format(cmd=args['cmd'], error=result['result'])
-
             if result['result'] == '[Errno 2] No such file or directory':
-                msg = (
+                raise CouldNotUseOc(
                     "This host is supposed to be a master but does not have the `oc` command where expected.\n"
                     "Has an installation been run on this host yet?"
                 )
-            raise OpenShiftCheckException(msg)
+
+            raise CouldNotUseOc(
+                'Unexpected error using `oc` to validate the logging stack components.\n'
+                'Error executing `oc {cmd}`:\n'
+                '{error}'.format(cmd=args['cmd'], error=result['result'])
+            )
 
         return result.get("result", "")
        Returns: output of command and namespace,
-        or raises OpenShiftCheckException on error
+        or raises CouldNotUseOc on error
         """
         config_base = self.get_var("openshift", "common", "config_base")
         args = {
-            "namespace": namespace,
+            "namespace": self.logging_namespace(),
             "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
             "cmd": cmd_str,
             "extra_args": list(extra_args) if extra_args else [],
@@ -79,17 +94,16 @@ class LoggingCheck(OpenShiftCheck):
         result = self.execute_module("ocutil", args)
 
         if result.get("failed"):
-            msg = (
-                'Unexpected error using `oc` to validate the logging stack components.\n'
-                'Error executing `oc {cmd}`:\n'
-                '{error}'
-            ).format(cmd=args['cmd'], error=result['result'])
-            if result['result'] == '[Errno 2] No such file or directory':
-                msg = (
+            if result['result'] == '[Errno 2] No such file or directory':
+                raise CouldNotUseOc(
                     "This host is supposed to be a master but does not have the `oc` command where expected.\n"
                     "Has an installation been run on this host yet?"
                 )
-            raise OpenShiftCheckException(msg)
+
+            raise CouldNotUseOc(
+                'Unexpected error using `oc` to validate the logging stack components.\n'
+                'Error executing `oc {cmd}`:\n'
+                '{error}'.format(cmd=args['cmd'], error=result['result'])
+            )
 
         return result.get("result", "")
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
index b24e88e05..d781db649 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -19,8 +19,6 @@ class LoggingIndexTime(LoggingCheck):
     name = "logging_index_time"
     tags = ["health", "logging"]
 
-    logging_namespace = "logging"
-
     def run(self):
         """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs."""
         try:
@@ -28,29 +26,25 @@ class LoggingIndexTime(LoggingCheck):
             self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
         )
         except ValueError:
-            return {
-                "failed": True,
-                "msg": ('Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
-                        'Value must be an integer representing an amount in seconds.'),
-            }
+            raise OpenShiftCheckException(
+                'InvalidTimeout',
+                'Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
+                'Value must be an integer representing an amount in seconds.'
+            )
 
         running_component_pods = dict()
 
         # get all component pods
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)
         for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']):
-            pods, error = self.get_pods_for_component(self.logging_namespace, component)
-
-            if error:
-                msg = 'Unable to retrieve pods for the {} logging component: {}'
-                return {"failed": True, "changed": False, "msg": msg.format(name, error)}
-
+            pods = self.get_pods_for_component(component)
             running_pods = self.running_pods(pods)
             if not running_pods:
-                msg = ('No {} pods in the "Running" state were found.'
-                       'At least one pod is required in order to perform this check.')
-                return {"failed": True, "changed": False, "msg": msg.format(name)}
+                raise OpenShiftCheckException(
+                    component + 'NoRunningPods',
+                    'No {} pods in the "Running" state were found. '
+ 'At least one pod is required in order to perform this check.'.format(name) + ) running_component_pods[component] = running_pods @@ -65,8 +59,11 @@ class LoggingIndexTime(LoggingCheck): interval = 1 while not self.query_es_from_es(es_pod, uuid): if time.time() + interval > deadline: - msg = "expecting match in Elasticsearch for message with uuid {}, but no matches were found after {}s." - raise OpenShiftCheckException(msg.format(uuid, timeout_secs)) + raise OpenShiftCheckException( + "NoMatchFound", + "expecting match in Elasticsearch for message with uuid {}, " + "but no matches were found after {}s.".format(uuid, timeout_secs) + ) time.sleep(interval) def curl_kibana_with_uuid(self, kibana_pod): @@ -76,22 +73,23 @@ class LoggingIndexTime(LoggingCheck): exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}" exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid) - error_str = self.exec_oc(self.logging_namespace, exec_cmd, []) + error_str = self.exec_oc(exec_cmd, []) try: error_code = json.loads(error_str)["statusCode"] - except KeyError: - msg = ('invalid response returned from Kibana request (Missing "statusCode" key):\n' - 'Command: {}\nResponse: {}').format(exec_cmd, error_str) - raise OpenShiftCheckException(msg) - except ValueError: - msg = ('invalid response returned from Kibana request (Non-JSON output):\n' - 'Command: {}\nResponse: {}').format(exec_cmd, error_str) - raise OpenShiftCheckException(msg) + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'kibanaInvalidResponse', + 'invalid response returned from Kibana request:\n' + 'Command: {}\nResponse: {}'.format(exec_cmd, error_str) + ) if error_code != 404: - msg = 'invalid error code returned from Kibana request. Expecting error code "404", but got "{}" instead.' 
- raise OpenShiftCheckException(msg.format(error_code)) + raise OpenShiftCheckException( + 'kibanaInvalidReturnCode', + 'invalid error code returned from Kibana request.\n' + 'Expecting error code "404", but got "{}" instead.'.format(error_code) + ) return uuid @@ -105,17 +103,18 @@ class LoggingIndexTime(LoggingCheck): "--key /etc/elasticsearch/secret/admin-key " "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}" ) - exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace, uuid=uuid) - result = self.exec_oc(self.logging_namespace, exec_cmd, []) + exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid) + result = self.exec_oc(exec_cmd, []) try: count = json.loads(result)["count"] - except KeyError: - msg = 'invalid response from Elasticsearch query:\n"{}"\nMissing "count" key:\n{}' - raise OpenShiftCheckException(msg.format(exec_cmd, result)) - except ValueError: - msg = 'invalid response from Elasticsearch query:\n"{}"\nNon-JSON output:\n{}' - raise OpenShiftCheckException(msg.format(exec_cmd, result)) + except (KeyError, ValueError): + raise OpenShiftCheckException( + 'esInvalidResponse', + 'Invalid response from Elasticsearch query:\n' + ' {}\n' + 'Response was:\n{}'.format(exec_cmd, result) + ) return count diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py index 3b2c64e6a..e9bae60a3 100644 --- a/roles/openshift_health_checker/openshift_checks/mixins.py +++ b/roles/openshift_health_checker/openshift_checks/mixins.py @@ -29,10 +29,10 @@ class DockerHostMixin(object): """ Ensure that docker-related packages exist, but not on atomic hosts (which would not be able to install but should already have them). - Returns: msg, failed, changed + Returns: msg, failed """ if self.get_var("openshift", "common", "is_atomic"): - return "", False, False + return "", False # NOTE: we would use the "package" module but it's actually an action plugin # and it's not clear how to invoke one of those. 
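Taken together, the logging-check hunks above replace returned error strings with raised exceptions that carry a stable name ('no_route_exists', 'kibanaInvalidReturnCode', 'esInvalidResponse', and so on). A minimal sketch, not part of this diff, of how test code can assert on that name instead of matching message text; it assumes pytest and that openshift_checks is importable as in the role's existing unit tests, and the lambda stub for exec_oc is a hypothetical stand-in for a real fixture:

    import pytest

    from openshift_checks import OpenShiftCheckException
    from openshift_checks.logging.kibana import Kibana


    def test_missing_kibana_route_is_named(monkeypatch):
        check = Kibana(execute_module=None, task_vars={})
        # Simulate `oc get route logging-kibana -o json` returning no output.
        monkeypatch.setattr(check, "exec_oc", lambda cmd, args: "")
        with pytest.raises(OpenShiftCheckException) as excinfo:
            check.check_kibana_route()
        # The stable `name` attribute identifies the failure mode.
        assert excinfo.value.name == "no_route_exists"

Because `name` is set independently of the user-facing message, the wording of the messages can be revised freely without breaking such tests.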
This is about the same anyway: @@ -49,5 +49,5 @@ class DockerHostMixin(object): " {deps}\n{msg}" ).format(deps=',\n '.join(self.dependencies), msg=msg) failed = result.get("failed", False) or result.get("rc", 0) != 0 - changed = result.get("changed", False) - return msg, failed, changed + self.changed = result.get("changed", False) + return msg, failed diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py index cd6ebd493..363c12def 100644 --- a/roles/openshift_health_checker/openshift_checks/ovs_version.py +++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py @@ -16,17 +16,11 @@ class OvsVersion(NotContainerizedMixin, OpenShiftCheck): tags = ["health"] openshift_to_ovs_version = { - "3.6": "2.6", - "3.5": "2.6", + "3.6": ["2.6", "2.7"], + "3.5": ["2.6", "2.7"], "3.4": "2.4", } - # map major release versions across releases - # to a common major version - openshift_major_release_version = { - "1": "3", - } - def is_active(self): """Skip hosts that do not have package requirements.""" group_names = self.get_var("group_names", default=[]) @@ -46,32 +40,15 @@ class OvsVersion(NotContainerizedMixin, OpenShiftCheck): def get_required_ovs_version(self): """Return the correct Open vSwitch version for the current OpenShift version""" - openshift_version = self._get_openshift_version() + openshift_version_tuple = self.get_major_minor_version(self.get_var("openshift_image_tag")) - if float(openshift_version) < 3.5: + if openshift_version_tuple < (3, 5): return self.openshift_to_ovs_version["3.4"] - ovs_version = self.openshift_to_ovs_version.get(str(openshift_version)) + openshift_version = ".".join(str(x) for x in openshift_version_tuple) + ovs_version = self.openshift_to_ovs_version.get(openshift_version) if ovs_version: - return self.openshift_to_ovs_version[str(openshift_version)] + return self.openshift_to_ovs_version[openshift_version] msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}" raise OpenShiftCheckException(msg.format(openshift_version)) - - def _get_openshift_version(self): - openshift_version = self.get_var("openshift_image_tag") - if openshift_version and openshift_version[0] == 'v': - openshift_version = openshift_version[1:] - - return self._parse_version(openshift_version) - - def _parse_version(self, version): - components = version.split(".") - if not components or len(components) < 2: - msg = "An invalid version of OpenShift was found for this host: {}" - raise OpenShiftCheckException(msg.format(version)) - - if components[0] in self.openshift_major_release_version: - components[0] = self.openshift_major_release_version[components[0]] - - return '.'.join(components[:2]) diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py index 020786804..8b780114f 100644 --- a/roles/openshift_health_checker/openshift_checks/package_version.py +++ b/roles/openshift_health_checker/openshift_checks/package_version.py @@ -1,4 +1,7 @@ """Check that available RPM packages match the required versions.""" + +import re + from openshift_checks import OpenShiftCheck, OpenShiftCheckException from openshift_checks.mixins import NotContainerizedMixin @@ -9,23 +12,25 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck): name = "package_version" tags = ["preflight"] + # NOTE: versions outside those specified are mapped to least/greatest 
openshift_to_ovs_version = { - "3.6": ["2.6", "2.7"], - "3.5": ["2.6", "2.7"], - "3.4": "2.4", + (3, 4): "2.4", + (3, 5): ["2.6", "2.7"], + (3, 6): ["2.6", "2.7"], } openshift_to_docker_version = { - "3.1": "1.8", - "3.2": "1.10", - "3.3": "1.10", - "3.4": "1.12", + (3, 1): "1.8", + (3, 2): "1.10", + (3, 3): "1.10", + (3, 4): "1.12", + (3, 5): "1.12", + (3, 6): "1.12", } - # map major release versions across releases - # to a common major version - openshift_major_release_version = { - "1": "3", + # map major OpenShift release versions across releases to a common major version + map_major_release_version = { + 1: 3, } def is_active(self): @@ -73,54 +78,49 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck): return self.execute_module("aos_version", args) def get_required_ovs_version(self): - """Return the correct Open vSwitch version for the current OpenShift version. - If the current OpenShift version is >= 3.5, ensure Open vSwitch version 2.6, - Else ensure Open vSwitch version 2.4""" - openshift_version = self.get_openshift_version() + """Return the correct Open vSwitch version(s) for the current OpenShift version.""" + openshift_version = self.get_openshift_version_tuple() - if float(openshift_version) < 3.5: - return self.openshift_to_ovs_version["3.4"] + earliest = min(self.openshift_to_ovs_version) + latest = max(self.openshift_to_ovs_version) + if openshift_version < earliest: + return self.openshift_to_ovs_version[earliest] + if openshift_version > latest: + return self.openshift_to_ovs_version[latest] - ovs_version = self.openshift_to_ovs_version.get(str(openshift_version)) - if ovs_version: - return ovs_version + ovs_version = self.openshift_to_ovs_version.get(openshift_version) + if not ovs_version: + msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}" + raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version))) - msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}" - raise OpenShiftCheckException(msg.format(openshift_version)) + return ovs_version def get_required_docker_version(self): - """Return the correct Docker version for the current OpenShift version. - If the OpenShift version is 3.1, ensure Docker version 1.8. - If the OpenShift version is 3.2 or 3.3, ensure Docker version 1.10. 
-        If the current OpenShift version is >= 3.4, ensure Docker version 1.12."""
-        openshift_version = self.get_openshift_version()
-
-        if float(openshift_version) >= 3.4:
-            return self.openshift_to_docker_version["3.4"]
-
-        docker_version = self.openshift_to_docker_version.get(str(openshift_version))
-        if docker_version:
-            return docker_version
-
-        msg = "There is no recommended version of Docker for the current version of OpenShift: {}"
-        raise OpenShiftCheckException(msg.format(openshift_version))
-
-    def get_openshift_version(self):
-        """Return received image tag as a normalized X.Y minor version string."""
-        openshift_version = self.get_var("openshift_image_tag")
-        if openshift_version and openshift_version[0] == 'v':
-            openshift_version = openshift_version[1:]
-
-        return self.parse_version(openshift_version)
-
-    def parse_version(self, version):
-        """Return a normalized X.Y minor version string."""
-        components = version.split(".")
-        if not components or len(components) < 2:
+        """Return the correct Docker version(s) for the current OpenShift version."""
+        openshift_version = self.get_openshift_version_tuple()
+
+        earliest = min(self.openshift_to_docker_version)
+        latest = max(self.openshift_to_docker_version)
+        if openshift_version < earliest:
+            return self.openshift_to_docker_version[earliest]
+        if openshift_version > latest:
+            return self.openshift_to_docker_version[latest]
+
+        docker_version = self.openshift_to_docker_version.get(openshift_version)
+        if not docker_version:
+            msg = "There is no recommended version of Docker for the current version of OpenShift: {}"
+            raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version)))
+
+        return docker_version
+
+    def get_openshift_version_tuple(self):
+        """Return received image tag as a normalized (X, Y) minor version tuple."""
+        version = self.get_var("openshift_image_tag")
+        comps = [int(component) for component in re.findall(r'\d+', version)]
+
+        if len(comps) < 2:
             msg = "An invalid version of OpenShift was found for this host: {}"
             raise OpenShiftCheckException(msg.format(version))
 
-        if components[0] in self.openshift_major_release_version:
-            components[0] = self.openshift_major_release_version[components[0]]
-
-        return '.'.join(components[:2])
+        comps[0] = self.map_major_release_version.get(comps[0], comps[0])
+        return tuple(comps[0:2])
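For reference, the clamping that the NOTE in package_version.py describes ("versions outside those specified are mapped to least/greatest") can be seen in a standalone sketch. The names below are illustrative, not repo code, though the mapping values mirror openshift_to_docker_version and the tag parsing mirrors get_openshift_version_tuple:

    import re

    # Hypothetical standalone copy of the OpenShift -> Docker version mapping.
    OPENSHIFT_TO_DOCKER = {
        (3, 1): "1.8",
        (3, 2): "1.10",
        (3, 3): "1.10",
        (3, 4): "1.12",
        (3, 5): "1.12",
        (3, 6): "1.12",
    }

    def version_tuple(image_tag):
        # "v3.6.173.0.5" -> (3, 6), as in get_openshift_version_tuple above.
        comps = [int(c) for c in re.findall(r'\d+', image_tag)]
        return tuple(comps[:2])

    def required_docker(openshift_version):
        # Versions outside the mapped range clamp to the earliest/latest entry
        # instead of failing the check outright.
        earliest, latest = min(OPENSHIFT_TO_DOCKER), max(OPENSHIFT_TO_DOCKER)
        if openshift_version < earliest:
            return OPENSHIFT_TO_DOCKER[earliest]
        if openshift_version > latest:
            return OPENSHIFT_TO_DOCKER[latest]
        return OPENSHIFT_TO_DOCKER[openshift_version]

    assert version_tuple("v3.6.173.0.5") == (3, 6)
    assert required_docker((3, 0)) == "1.8"    # below range: clamp to earliest
    assert required_docker((3, 7)) == "1.12"   # above range: clamp to latest
    assert required_docker((3, 3)) == "1.10"

Tuple keys also make the min/max comparisons and range checks numeric, so a future (3, 10) sorts after (3, 9); string keys like "3.10" would compare lexicographically and sort before "3.9".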