diff options
Diffstat (limited to 'roles/openshift_health_checker')
12 files changed, 699 insertions, 0 deletions
diff --git a/roles/openshift_health_checker/HOWTO_CHECKS.md b/roles/openshift_health_checker/HOWTO_CHECKS.md new file mode 100644 index 000000000..1573c14da --- /dev/null +++ b/roles/openshift_health_checker/HOWTO_CHECKS.md @@ -0,0 +1,34 @@ +# OpenShift health checks + +This Ansible role contains health checks to diagnose problems in OpenShift +environments. + +Checks are typically implemented as two parts: + +1. a Python module in [openshift_checks/](openshift_checks), with a class that + inherits from `OpenShiftCheck`. +2. a custom Ansible module in [library/](library), for cases when the modules + shipped with Ansible do not provide the required functionality. + +The checks are called from an Ansible playbooks via the `openshift_health_check` +action plugin. See +[playbooks/byo/openshift-preflight/check.yml](../../playbooks/byo/openshift-preflight/check.yml) +for an example. + +The action plugin dynamically discovers all checks and executes only those +selected in the play. + +Checks can determine when they are active by implementing the method +`is_active`. Inactive checks are skipped. This is similar to the `when` +instruction in Ansible plays. + +Checks may have tags, which are a way to group related checks together. For +instance, to run all preflight checks, pass in the group `'@preflight'` to +`openshift_health_check`. + +Groups are automatically computed from tags. + +Groups and individual check names can be used together in the argument list to +`openshift_health_check`. + +Look at existing checks for the implementation details. diff --git a/roles/openshift_health_checker/README.md b/roles/openshift_health_checker/README.md new file mode 100644 index 000000000..4ab5f1f7b --- /dev/null +++ b/roles/openshift_health_checker/README.md @@ -0,0 +1,45 @@ +OpenShift Health Checker +======================== + +This role detects common problems with OpenShift installations or with +environments prior to install. + +For more information about creating new checks, see [HOWTO_CHECKS.md](HOWTO_CHECKS.md). + +Requirements +------------ + +* Ansible 2.2+ + +Role Variables +-------------- + +None + +Dependencies +------------ + +- openshift_facts + +Example Playbook +---------------- + +```yaml +--- +- hosts: OSEv3 + name: run OpenShift health checks + roles: + - openshift_health_checker + post_tasks: + - action: openshift_health_check +``` + +License +------- + +Apache License Version 2.0 + +Author Information +------------------ + +Customer Success team (dev@lists.openshift.redhat.com) diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py new file mode 100644 index 000000000..0411797b1 --- /dev/null +++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py @@ -0,0 +1,116 @@ +""" +Ansible action plugin to execute health checks in OpenShift clusters. +""" +# pylint: disable=wrong-import-position,missing-docstring,invalid-name +import sys +import os + +try: + from __main__ import display +except ImportError: + from ansible.utils.display import Display + display = Display() + +from ansible.plugins.action import ActionBase + +# Augment sys.path so that we can import checks from a directory relative to +# this callback plugin. +sys.path.insert(1, os.path.dirname(os.path.dirname(__file__))) + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException # noqa: E402 + + +class ActionModule(ActionBase): + + def run(self, tmp=None, task_vars=None): + result = super(ActionModule, self).run(tmp, task_vars) + + if task_vars is None: + task_vars = {} + + if "openshift" not in task_vars: + result["failed"] = True + result["msg"] = "'openshift' is undefined, did 'openshift_facts' run?" + return result + + try: + known_checks = self.load_known_checks() + except OpenShiftCheckException as e: + result["failed"] = True + result["msg"] = str(e) + return result + + args = self._task.args + requested_checks = resolve_checks(args.get("checks", []), known_checks.values()) + + unknown_checks = requested_checks - set(known_checks) + if unknown_checks: + result["failed"] = True + result["msg"] = ( + "One or more checks are unknown: {}. " + "Make sure there is no typo in the playbook and no files are missing." + ).format(", ".join(unknown_checks)) + return result + + result["checks"] = check_results = {} + + for check_name in requested_checks & set(known_checks): + display.banner("CHECK [{} : {}]".format(check_name, task_vars["ansible_host"])) + check = known_checks[check_name] + + if check.is_active(task_vars): + try: + r = check.run(tmp, task_vars) + except OpenShiftCheckException as e: + r = {} + r["failed"] = True + r["msg"] = str(e) + else: + r = {"skipped": True} + + check_results[check_name] = r + + if r.get("failed", False): + result["failed"] = True + result["msg"] = "One or more checks failed" + + return result + + def load_known_checks(self): + known_checks = {} + + known_check_classes = set(cls for cls in OpenShiftCheck.subclasses()) + + for cls in known_check_classes: + check_name = cls.name + if check_name in known_checks: + other_cls = known_checks[check_name].__class__ + raise OpenShiftCheckException( + "non-unique check name '{}' in: '{}.{}' and '{}.{}'".format( + check_name, + cls.__module__, cls.__name__, + other_cls.__module__, other_cls.__name__)) + known_checks[check_name] = cls(module_executor=self._execute_module) + + return known_checks + + +def resolve_checks(names, all_checks): + """Returns a set of resolved check names. + + Resolving a check name involves expanding tag references (e.g., '@tag') with + all the checks that contain the given tag. + + names should be a sequence of strings. + + all_checks should be a sequence of check classes/instances. + """ + resolved = set() + for name in names: + if name.startswith("@"): + for check in all_checks: + if name[1:] in check.tags: + resolved.add(check.name) + else: + resolved.add(name) + return resolved diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py new file mode 100644 index 000000000..8caefab15 --- /dev/null +++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py @@ -0,0 +1,100 @@ +# vim: expandtab:tabstop=4:shiftwidth=4 +''' +Ansible callback plugin. +''' + +from pprint import pformat + +from ansible.plugins.callback import CallbackBase +from ansible import constants as C +from ansible.utils.color import stringc + + +class CallbackModule(CallbackBase): + ''' + This callback plugin stores task results and summarizes failures. + The file name is prefixed with `zz_` to make this plugin be loaded last by + Ansible, thus making its output the last thing that users see. + ''' + + CALLBACK_VERSION = 2.0 + CALLBACK_TYPE = 'aggregate' + CALLBACK_NAME = 'failure_summary' + CALLBACK_NEEDS_WHITELIST = False + + def __init__(self): + super(CallbackModule, self).__init__() + self.__failures = [] + + def v2_runner_on_failed(self, result, ignore_errors=False): + super(CallbackModule, self).v2_runner_on_failed(result, ignore_errors) + self.__failures.append(dict(result=result, ignore_errors=ignore_errors)) + + def v2_playbook_on_stats(self, stats): + super(CallbackModule, self).v2_playbook_on_stats(stats) + # TODO: update condition to consider a host var or env var to + # enable/disable the summary, so that we can control the output from a + # play. + if self.__failures: + self._print_failure_summary() + + def _print_failure_summary(self): + '''Print a summary of failed tasks (including ignored failures).''' + self._display.display(u'\nFailure summary:\n') + + # TODO: group failures by host or by task. If grouped by host, it is + # easy to see all problems of a given host. If grouped by task, it is + # easy to see what hosts needs the same fix. + + width = len(str(len(self.__failures))) + initial_indent_format = u' {{:>{width}}}. '.format(width=width) + initial_indent_len = len(initial_indent_format.format(0)) + subsequent_indent = u' ' * initial_indent_len + subsequent_extra_indent = u' ' * (initial_indent_len + 10) + + for i, failure in enumerate(self.__failures, 1): + lines = _format_failure(failure) + self._display.display(u'\n{}{}'.format(initial_indent_format.format(i), lines[0])) + for line in lines[1:]: + line = line.replace(u'\n', u'\n' + subsequent_extra_indent) + indented = u'{}{}'.format(subsequent_indent, line) + self._display.display(indented) + + +# Reason: disable pylint protected-access because we need to access _* +# attributes of a task result to implement this method. +# Status: permanently disabled unless Ansible's API changes. +# pylint: disable=protected-access +def _format_failure(failure): + '''Return a list of pretty-formatted lines describing a failure, including + relevant information about it. Line separators are not included.''' + result = failure['result'] + host = result._host.get_name() + play = _get_play(result._task) + if play: + play = play.get_name() + task = result._task.get_name() + msg = result._result.get('msg', u'???') + rows = ( + (u'Host', host), + (u'Play', play), + (u'Task', task), + (u'Message', stringc(msg, C.COLOR_ERROR)), + ) + if 'checks' in result._result: + rows += ((u'Details', stringc(pformat(result._result['checks']), C.COLOR_ERROR)),) + row_format = '{:10}{}' + return [row_format.format(header + u':', body) for header, body in rows] + + +# Reason: disable pylint protected-access because we need to access _* +# attributes of obj to implement this function. +# This is inspired by ansible.playbook.base.Base.dump_me. +# Status: permanently disabled unless Ansible's API changes. +# pylint: disable=protected-access +def _get_play(obj): + '''Given a task or block, recursively tries to find its parent play.''' + if hasattr(obj, '_play'): + return obj._play + if getattr(obj, '_parent'): + return _get_play(obj._parent) diff --git a/roles/openshift_health_checker/library/aos_version.py b/roles/openshift_health_checker/library/aos_version.py new file mode 100755 index 000000000..13b7d310b --- /dev/null +++ b/roles/openshift_health_checker/library/aos_version.py @@ -0,0 +1,91 @@ +#!/usr/bin/python +# vim: expandtab:tabstop=4:shiftwidth=4 +''' +Ansible module for determining if multiple versions of an OpenShift package are +available, and if the version requested is available down to the given +precision. + +Multiple versions available suggest that multiple repos are enabled for the +different versions, which may cause installation problems. +''' + +import yum # pylint: disable=import-error + +from ansible.module_utils.basic import AnsibleModule + + +def main(): # pylint: disable=missing-docstring,too-many-branches + module = AnsibleModule( + argument_spec=dict( + prefix=dict(required=True), # atomic-openshift, origin, ... + version=dict(required=True), + ), + supports_check_mode=True + ) + + def bail(error): # pylint: disable=missing-docstring + module.fail_json(msg=error) + + rpm_prefix = module.params['prefix'] + + if not rpm_prefix: + bail("prefix must not be empty") + + yb = yum.YumBase() # pylint: disable=invalid-name + + # search for package versions available for aos pkgs + expected_pkgs = [ + rpm_prefix, + rpm_prefix + '-master', + rpm_prefix + '-node', + ] + try: + pkgs = yb.pkgSack.returnPackages(patterns=expected_pkgs) + except yum.Errors.PackageSackError as e: # pylint: disable=invalid-name + # you only hit this if *none* of the packages are available + bail('Unable to find any OpenShift packages.\nCheck your subscription and repo settings.\n%s' % e) + + # determine what level of precision we're expecting for the version + expected_version = module.params['version'] + if expected_version.startswith('v'): # v3.3 => 3.3 + expected_version = expected_version[1:] + num_dots = expected_version.count('.') + + pkgs_by_name_version = {} + pkgs_precise_version_found = {} + for pkg in pkgs: + # get expected version precision + match_version = '.'.join(pkg.version.split('.')[:num_dots + 1]) + if match_version == expected_version: + pkgs_precise_version_found[pkg.name] = True + # get x.y version precision + minor_version = '.'.join(pkg.version.split('.')[:2]) + if pkg.name not in pkgs_by_name_version: + pkgs_by_name_version[pkg.name] = {} + pkgs_by_name_version[pkg.name][minor_version] = True + + # see if any packages couldn't be found at requested version + # see if any packages are available in more than one minor version + not_found = [] + multi_found = [] + for name in expected_pkgs: + if name not in pkgs_precise_version_found: + not_found.append(name) + if name in pkgs_by_name_version and len(pkgs_by_name_version[name]) > 1: + multi_found.append(name) + if not_found: + msg = 'Not all of the required packages are available at requested version %s:\n' % expected_version + for name in not_found: + msg += ' %s\n' % name + bail(msg + 'Please check your subscriptions and enabled repositories.') + if multi_found: + msg = 'Multiple minor versions of these packages are available\n' + for name in multi_found: + msg += ' %s\n' % name + bail(msg + "There should only be one OpenShift version's repository enabled at a time.") + + module.exit_json(changed=False) + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/library/check_yum_update.py b/roles/openshift_health_checker/library/check_yum_update.py new file mode 100755 index 000000000..9bc14fd47 --- /dev/null +++ b/roles/openshift_health_checker/library/check_yum_update.py @@ -0,0 +1,105 @@ +#!/usr/bin/python +# vim: expandtab:tabstop=4:shiftwidth=4 +''' +Ansible module to test whether a yum update or install will succeed, +without actually performing it or running yum. +parameters: + packages: (optional) A list of package names to install or update. + If omitted, all installed RPMs are considered for updates. +''' + +import sys + +import yum # pylint: disable=import-error + +from ansible.module_utils.basic import AnsibleModule + + +def main(): # pylint: disable=missing-docstring,too-many-branches + module = AnsibleModule( + argument_spec=dict( + packages=dict(type='list', default=[]) + ), + supports_check_mode=True + ) + + def bail(error): # pylint: disable=missing-docstring + module.fail_json(msg=error) + + yb = yum.YumBase() # pylint: disable=invalid-name + # determine if the existing yum configuration is valid + try: + yb.repos.populateSack(mdtype='metadata', cacheonly=1) + # for error of type: + # 1. can't reach the repo URL(s) + except yum.Errors.NoMoreMirrorsRepoError as e: # pylint: disable=invalid-name + bail('Error getting data from at least one yum repository: %s' % e) + # 2. invalid repo definition + except yum.Errors.RepoError as e: # pylint: disable=invalid-name + bail('Error with yum repository configuration: %s' % e) + # 3. other/unknown + # * just report the problem verbatim + except: # pylint: disable=bare-except; # noqa + bail('Unexpected error with yum repository: %s' % sys.exc_info()[1]) + + packages = module.params['packages'] + no_such_pkg = [] + for pkg in packages: + try: + yb.install(name=pkg) + except yum.Errors.InstallError as e: # pylint: disable=invalid-name + no_such_pkg.append(pkg) + except: # pylint: disable=bare-except; # noqa + bail('Unexpected error with yum install/update: %s' % + sys.exc_info()[1]) + if not packages: + # no packages requested means test a yum update of everything + yb.update() + elif no_such_pkg: + # wanted specific packages to install but some aren't available + user_msg = 'Cannot install all of the necessary packages. Unavailable:\n' + for pkg in no_such_pkg: + user_msg += ' %s\n' % pkg + user_msg += 'You may need to enable one or more yum repositories to make this content available.' + bail(user_msg) + + try: + txn_result, txn_msgs = yb.buildTransaction() + except: # pylint: disable=bare-except; # noqa + bail('Unexpected error during dependency resolution for yum update: \n %s' % + sys.exc_info()[1]) + + # find out if there are any errors with the update/install + if txn_result == 0: # 'normal exit' meaning there's nothing to install/update + pass + elif txn_result == 1: # error with transaction + user_msg = 'Could not perform a yum update.\n' + if len(txn_msgs) > 0: + user_msg += 'Errors from dependency resolution:\n' + for msg in txn_msgs: + user_msg += ' %s\n' % msg + user_msg += 'You should resolve these issues before proceeding with an install.\n' + user_msg += 'You may need to remove or downgrade packages or enable/disable yum repositories.' + bail(user_msg) + # TODO: it would be nice depending on the problem: + # 1. dependency for update not found + # * construct the dependency tree + # * find the installed package(s) that required the missing dep + # * determine if any of these packages matter to openshift + # * build helpful error output + # 2. conflicts among packages in available content + # * analyze dependency tree and build helpful error output + # 3. other/unknown + # * report the problem verbatim + # * add to this list as we come across problems we can clearly diagnose + elif txn_result == 2: # everything resolved fine + pass + else: + bail('Unknown error(s) from dependency resolution. Exit Code: %d:\n%s' % + (txn_result, txn_msgs)) + + module.exit_json(changed=False) + + +if __name__ == '__main__': + main() diff --git a/roles/openshift_health_checker/meta/main.yml b/roles/openshift_health_checker/meta/main.yml new file mode 100644 index 000000000..0bbeadd34 --- /dev/null +++ b/roles/openshift_health_checker/meta/main.yml @@ -0,0 +1,3 @@ +--- +dependencies: + - role: openshift_facts diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py new file mode 100644 index 000000000..c31242624 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/__init__.py @@ -0,0 +1,84 @@ +""" +Health checks for OpenShift clusters. +""" + +import os +from abc import ABCMeta, abstractmethod, abstractproperty +from importlib import import_module +import operator + +import six +from six.moves import reduce + + +class OpenShiftCheckException(Exception): + """Raised when a check cannot proceed.""" + pass + + +@six.add_metaclass(ABCMeta) +class OpenShiftCheck(object): + """A base class for defining checks for an OpenShift cluster environment.""" + + def __init__(self, module_executor): + self.module_executor = module_executor + + @abstractproperty + def name(self): + """The name of this check, usually derived from the class name.""" + return "openshift_check" + + @property + def tags(self): + """A list of tags that this check satisfy. + + Tags are used to reference multiple checks with a single '@tagname' + special check name. + """ + return [] + + @classmethod + def is_active(cls, task_vars): # pylint: disable=unused-argument + """Returns true if this check applies to the ansible-playbook run.""" + return True + + @abstractmethod + def run(self, tmp, task_vars): + """Executes a check, normally implemented as a module.""" + return {} + + @classmethod + def subclasses(cls): + """Returns a generator of subclasses of this class and its subclasses.""" + for subclass in cls.__subclasses__(): # pylint: disable=no-member + yield subclass + for subclass in subclass.subclasses(): + yield subclass + + +def get_var(task_vars, *keys, **kwargs): + """Helper function to get deeply nested values from task_vars. + + Ansible task_vars structures are Python dicts, often mapping strings to + other dicts. This helper makes it easier to get a nested value, raising + OpenShiftCheckException when a key is not found. + """ + try: + value = reduce(operator.getitem, keys, task_vars) + except (KeyError, TypeError): + if "default" in kwargs: + return kwargs["default"] + raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys)))) + return value + + +# Dynamically import all submodules for the side effect of loading checks. + +EXCLUDES = ( + "__init__.py", + "mixins.py", +) + +for name in os.listdir(os.path.dirname(__file__)): + if name.endswith(".py") and name not in EXCLUDES: + import_module(__package__ + "." + name[:-3]) diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py new file mode 100644 index 000000000..4029fba62 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/mixins.py @@ -0,0 +1,21 @@ +# pylint: disable=missing-docstring +from openshift_checks import get_var + + +class NotContainerized(object): + """Mixin for checks that are only active when not in containerized mode.""" + + @classmethod + def is_active(cls, task_vars): + return ( + # This mixin is meant to be used with subclasses of + # OpenShiftCheck. Pylint disables this by default on mixins, + # though it relies on the class name ending in 'mixin'. + # pylint: disable=no-member + super(NotContainerized, cls).is_active(task_vars) and + not cls.is_containerized(task_vars) + ) + + @staticmethod + def is_containerized(task_vars): + return get_var(task_vars, "openshift", "common", "is_containerized") diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py new file mode 100644 index 000000000..8faeef5ee --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/package_availability.py @@ -0,0 +1,66 @@ +# pylint: disable=missing-docstring +from openshift_checks import OpenShiftCheck, get_var +from openshift_checks.mixins import NotContainerized + + +class PackageAvailability(NotContainerized, OpenShiftCheck): + """Check that required RPM packages are available.""" + + name = "package_availability" + tags = ["preflight"] + + def run(self, tmp, task_vars): + rpm_prefix = get_var(task_vars, "openshift", "common", "service_type") + group_names = get_var(task_vars, "group_names", default=[]) + + packages = set() + + if "masters" in group_names: + packages.update(self.master_packages(rpm_prefix)) + if "nodes" in group_names: + packages.update(self.node_packages(rpm_prefix)) + + args = {"packages": sorted(set(packages))} + return self.module_executor("check_yum_update", args, tmp, task_vars) + + @staticmethod + def master_packages(rpm_prefix): + return [ + "{rpm_prefix}".format(rpm_prefix=rpm_prefix), + "{rpm_prefix}-clients".format(rpm_prefix=rpm_prefix), + "{rpm_prefix}-master".format(rpm_prefix=rpm_prefix), + "bash-completion", + "cockpit-bridge", + "cockpit-docker", + "cockpit-kubernetes", + "cockpit-shell", + "cockpit-ws", + "etcd", + "httpd-tools", + ] + + @staticmethod + def node_packages(rpm_prefix): + return [ + "{rpm_prefix}".format(rpm_prefix=rpm_prefix), + "{rpm_prefix}-node".format(rpm_prefix=rpm_prefix), + "{rpm_prefix}-sdn-ovs".format(rpm_prefix=rpm_prefix), + "bind", + "ceph-common", + "dnsmasq", + "docker", + "firewalld", + "flannel", + "glusterfs-fuse", + "iptables-services", + "iptables", + "iscsi-initiator-utils", + "libselinux-python", + "nfs-utils", + "ntp", + "openssl", + "pyparted", + "python-httplib2", + "PyYAML", + "yum-utils", + ] diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py new file mode 100644 index 000000000..86b7b6245 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/package_update.py @@ -0,0 +1,14 @@ +# pylint: disable=missing-docstring +from openshift_checks import OpenShiftCheck +from openshift_checks.mixins import NotContainerized + + +class PackageUpdate(NotContainerized, OpenShiftCheck): + """Check that there are no conflicts in RPM packages.""" + + name = "package_update" + tags = ["preflight"] + + def run(self, tmp, task_vars): + args = {"packages": []} + return self.module_executor("check_yum_update", args, tmp, task_vars) diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py new file mode 100644 index 000000000..7fa09cbfd --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/package_version.py @@ -0,0 +1,20 @@ +# pylint: disable=missing-docstring +from openshift_checks import OpenShiftCheck, get_var +from openshift_checks.mixins import NotContainerized + + +class PackageVersion(NotContainerized, OpenShiftCheck): + """Check that available RPM packages match the required versions.""" + + name = "package_version" + tags = ["preflight"] + + def run(self, tmp, task_vars): + rpm_prefix = get_var(task_vars, "openshift", "common", "service_type") + openshift_release = get_var(task_vars, "openshift_release") + + args = { + "prefix": rpm_prefix, + "version": openshift_release, + } + return self.module_executor("aos_version", args, tmp, task_vars) |