Diffstat (limited to 'roles/openshift_health_checker')
-rw-r--r--  roles/openshift_health_checker/action_plugins/openshift_health_check.py  | 306
-rw-r--r--  roles/openshift_health_checker/callback_plugins/zz_failure_summary.py  | 282
-rw-r--r-- (was -rwxr-xr-x)  roles/openshift_health_checker/library/aos_version.py  | 198
-rw-r--r-- (was -rwxr-xr-x)  roles/openshift_health_checker/library/check_yum_update.py  | 0
-rw-r--r--  roles/openshift_health_checker/library/docker_info.py  | 2
-rw-r--r--  roles/openshift_health_checker/library/etcdkeysize.py  | 122
-rw-r--r--  roles/openshift_health_checker/library/ocutil.py  | 73
-rw-r--r--  roles/openshift_health_checker/library/rpm_version.py  | 133
-rw-r--r--  roles/openshift_health_checker/library/search_journalctl.py  | 150
-rw-r--r--  roles/openshift_health_checker/meta/main.yml  | 3
-rw-r--r--  roles/openshift_health_checker/openshift_checks/__init__.py  | 351
-rw-r--r--  roles/openshift_health_checker/openshift_checks/diagnostics.py  | 62
-rw-r--r--  roles/openshift_health_checker/openshift_checks/disk_availability.py  | 173
-rw-r--r--  roles/openshift_health_checker/openshift_checks/docker_image_availability.py  | 372
-rw-r--r--  roles/openshift_health_checker/openshift_checks/docker_storage.py  | 276
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py  | 72
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_traffic.py  | 44
-rw-r--r--  roles/openshift_health_checker/openshift_checks/etcd_volume.py  | 47
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/__init__.py  | 0
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/curator.py  | 43
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py  | 212
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/fluentd.py  | 154
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py  | 131
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/kibana.py  | 226
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging.py  | 101
-rw-r--r--  roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py  | 129
-rw-r--r--  roles/openshift_health_checker/openshift_checks/memory_availability.py  | 44
-rw-r--r--  roles/openshift_health_checker/openshift_checks/mixins.py  | 53
-rw-r--r--  roles/openshift_health_checker/openshift_checks/ovs_version.py  | 54
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_availability.py  | 28
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_update.py  | 8
-rw-r--r--  roles/openshift_health_checker/openshift_checks/package_version.py  | 126
-rw-r--r--  roles/openshift_health_checker/test/action_plugin_test.py  | 183
-rw-r--r--  roles/openshift_health_checker/test/aos_version_test.py  | 200
-rw-r--r--  roles/openshift_health_checker/test/conftest.py  | 1
-rw-r--r--  roles/openshift_health_checker/test/curator_test.py  | 57
-rw-r--r--  roles/openshift_health_checker/test/diagnostics_test.py  | 50
-rw-r--r--  roles/openshift_health_checker/test/disk_availability_test.py  | 209
-rw-r--r--  roles/openshift_health_checker/test/docker_image_availability_test.py  | 269
-rw-r--r--  roles/openshift_health_checker/test/docker_storage_test.py  | 305
-rw-r--r--  roles/openshift_health_checker/test/elasticsearch_test.py  | 203
-rw-r--r--  roles/openshift_health_checker/test/etcd_imagedata_size_test.py  | 329
-rw-r--r--  roles/openshift_health_checker/test/etcd_traffic_test.py  | 72
-rw-r--r--  roles/openshift_health_checker/test/etcd_volume_test.py  | 147
-rw-r--r--  roles/openshift_health_checker/test/fluentd_config_test.py  | 348
-rw-r--r--  roles/openshift_health_checker/test/fluentd_test.py  | 112
-rw-r--r--  roles/openshift_health_checker/test/kibana_test.py  | 244
-rw-r--r--  roles/openshift_health_checker/test/logging_check_test.py  | 166
-rw-r--r--  roles/openshift_health_checker/test/logging_index_time_test.py  | 170
-rw-r--r--  roles/openshift_health_checker/test/memory_availability_test.py  | 96
-rw-r--r--  roles/openshift_health_checker/test/mixins_test.py  | 4
-rw-r--r--  roles/openshift_health_checker/test/openshift_check_test.py  | 92
-rw-r--r--  roles/openshift_health_checker/test/ovs_version_test.py  | 86
-rw-r--r--  roles/openshift_health_checker/test/package_availability_test.py  | 19
-rw-r--r--  roles/openshift_health_checker/test/package_update_test.py  | 11
-rw-r--r--  roles/openshift_health_checker/test/package_version_test.py  | 112
-rw-r--r--  roles/openshift_health_checker/test/rpm_version_test.py  | 82
-rw-r--r--  roles/openshift_health_checker/test/search_journalctl_test.py  | 157
-rw-r--r--  roles/openshift_health_checker/test/zz_failure_summary_test.py  | 85
59 files changed, 7051 insertions, 733 deletions
diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
index 03c40b78b..3ee3b132c 100644
--- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py
+++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
@@ -1,91 +1,145 @@
"""
Ansible action plugin to execute health checks in OpenShift clusters.
"""
-# pylint: disable=wrong-import-position,missing-docstring,invalid-name
import sys
import os
+import base64
+import traceback
+import errno
+import json
from collections import defaultdict
+from ansible.plugins.action import ActionBase
+from ansible.module_utils.six import string_types
+
try:
from __main__ import display
except ImportError:
+ # pylint: disable=ungrouped-imports; this is the standard way to import
+ # the default display object in Ansible action plugins.
from ansible.utils.display import Display
display = Display()
-from ansible.plugins.action import ActionBase
-
# Augment sys.path so that we can import checks from a directory relative to
# this callback plugin.
sys.path.insert(1, os.path.dirname(os.path.dirname(__file__)))
+# pylint: disable=wrong-import-position; the import statement must come after
+# the manipulation of sys.path.
from openshift_checks import OpenShiftCheck, OpenShiftCheckException, load_checks # noqa: E402
class ActionModule(ActionBase):
+ """Action plugin to execute health checks."""
def run(self, tmp=None, task_vars=None):
result = super(ActionModule, self).run(tmp, task_vars)
+ task_vars = task_vars or {}
- if task_vars is None:
- task_vars = {}
+ # callback plugins cannot read Ansible vars, but we would like
+ # zz_failure_summary to have access to certain values. We do so by
+ # storing the information we need in the result.
+ result['playbook_context'] = task_vars.get('r_openshift_health_checker_playbook_context')
- if "openshift" not in task_vars:
- result["failed"] = True
- result["msg"] = "'openshift' is undefined, did 'openshift_facts' run?"
- return result
+ # if the user wants to write check results to files, they provide this directory:
+ output_dir = task_vars.get("openshift_checks_output_dir")
+ if output_dir:
+ output_dir = os.path.join(output_dir, task_vars["ansible_host"])
try:
- known_checks = self.load_known_checks()
- except OpenShiftCheckException as e:
+ known_checks = self.load_known_checks(tmp, task_vars, output_dir)
+ args = self._task.args
+ requested_checks = normalize(args.get('checks', []))
+
+ if not requested_checks:
+ result['failed'] = True
+ result['msg'] = list_known_checks(known_checks)
+ return result
+
+ resolved_checks = resolve_checks(requested_checks, known_checks.values())
+ except OpenShiftCheckException as exc:
result["failed"] = True
- result["msg"] = str(e)
+ result["msg"] = str(exc)
return result
- args = self._task.args
- resolved_checks = resolve_checks(args.get("checks", []), known_checks.values())
+ if "openshift" not in task_vars:
+ result["failed"] = True
+ result["msg"] = "'openshift' is undefined, did 'openshift_facts' run?"
+ return result
result["checks"] = check_results = {}
- for check_name in resolved_checks:
- display.banner("CHECK [{} : {}]".format(check_name, task_vars["ansible_host"]))
- check = known_checks[check_name]
-
- if check.is_active(task_vars):
- try:
- r = check.run(tmp, task_vars)
- except OpenShiftCheckException as e:
- r = {}
- r["failed"] = True
- r["msg"] = str(e)
- else:
- r = {"skipped": True}
+ user_disabled_checks = normalize(task_vars.get('openshift_disable_check', []))
- check_results[check_name] = r
+ for name in resolved_checks:
+ display.banner("CHECK [{} : {}]".format(name, task_vars["ansible_host"]))
+ check_results[name] = run_check(name, known_checks[name], user_disabled_checks, output_dir)
- if r.get("failed", False):
- result["failed"] = True
- result["msg"] = "One or more checks failed"
+ result["changed"] = any(r.get("changed") for r in check_results.values())
+ if any(r.get("failed") for r in check_results.values()):
+ result["failed"] = True
+ result["msg"] = "One or more checks failed"
+ write_result_to_output_dir(output_dir, result)
- result["changed"] = any(r.get("changed", False) for r in check_results.values())
return result
- def load_known_checks(self):
+ def load_known_checks(self, tmp, task_vars, output_dir=None):
+ """Find all existing checks and return a mapping of names to instances."""
load_checks()
+ want_full_results = bool(output_dir)
known_checks = {}
for cls in OpenShiftCheck.subclasses():
- check_name = cls.name
- if check_name in known_checks:
- other_cls = known_checks[check_name].__class__
+ name = cls.name
+ if name in known_checks:
+ other_cls = known_checks[name].__class__
raise OpenShiftCheckException(
- "non-unique check name '{}' in: '{}.{}' and '{}.{}'".format(
- check_name,
- cls.__module__, cls.__name__,
- other_cls.__module__, other_cls.__name__))
- known_checks[check_name] = cls(execute_module=self._execute_module)
+ "duplicate check name '{}' in: '{}' and '{}'"
+ "".format(name, full_class_name(cls), full_class_name(other_cls))
+ )
+ known_checks[name] = cls(
+ execute_module=self._execute_module,
+ tmp=tmp,
+ task_vars=task_vars,
+ want_full_results=want_full_results,
+ templar=self._templar
+ )
return known_checks
+def list_known_checks(known_checks):
+ """Return text listing the existing checks and tags."""
+ # TODO: we could include a description of each check by taking it from a
+ # check class attribute (e.g., __doc__) when building the message below.
+ msg = (
+ 'This playbook is meant to run health checks, but no checks were '
+ 'requested. Set the `openshift_checks` variable to a comma-separated '
+ 'list of check names or a YAML list. Available checks:\n {}'
+ ).format('\n '.join(sorted(known_checks)))
+
+ tags = describe_tags(known_checks.values())
+
+ msg += (
+ '\n\nTags can be used as a shortcut to select multiple '
+ 'checks. Available tags and the checks they select:\n {}'
+ ).format('\n '.join(tags))
+
+ return msg
+
+
+def describe_tags(check_classes):
+ """Return a sorted list of strings describing tags and the checks they include."""
+ tag_checks = defaultdict(list)
+ for cls in check_classes:
+ for tag in cls.tags:
+ tag_checks[tag].append(cls.name)
+ tags = [
+ '@{} = {}'.format(tag, ','.join(sorted(checks)))
+ for tag, checks in tag_checks.items()
+ ]
+ return sorted(tags)
+
+
def resolve_checks(names, all_checks):
"""Returns a set of resolved check names.
@@ -113,6 +167,12 @@ def resolve_checks(names, all_checks):
if unknown_tag_names:
msg.append('Unknown tag names: {}.'.format(', '.join(sorted(unknown_tag_names))))
msg.append('Make sure there is no typo in the playbook and no files are missing.')
+ # TODO: implement a "Did you mean ...?" when the input is similar to a
+ # valid check or tag.
+ msg.append('Known checks:')
+ msg.append(' {}'.format('\n '.join(sorted(known_check_names))))
+ msg.append('Known tags:')
+ msg.append(' {}'.format('\n '.join(describe_tags(all_checks))))
raise OpenShiftCheckException('\n'.join(msg))
tag_to_checks = defaultdict(set)
@@ -125,3 +185,167 @@ def resolve_checks(names, all_checks):
resolved.update(tag_to_checks[tag])
return resolved
+
+
+def normalize(checks):
+ """Return a clean list of check names.
+
+ The input may be a comma-separated string or a sequence. Leading and
+ trailing whitespace characters are removed. Empty items are discarded.
+ """
+ if isinstance(checks, string_types):
+ checks = checks.split(',')
+ return [name.strip() for name in checks if name.strip()]
+
+
+def run_check(name, check, user_disabled_checks, output_dir=None):
+ """Run a single check if enabled and return a result dict."""
+
+ # determine if we're going to run the check (not inactive or disabled)
+ if name in user_disabled_checks or '*' in user_disabled_checks:
+ return dict(skipped=True, skipped_reason="Disabled by user request")
+
+ # pylint: disable=broad-except; capturing exceptions broadly is intentional,
+ # to isolate arbitrary failures in one check from others.
+ try:
+ is_active = check.is_active()
+ except Exception as exc:
+ reason = "Could not determine if check should be run, exception: {}".format(exc)
+ return dict(skipped=True, skipped_reason=reason, exception=traceback.format_exc())
+
+ if not is_active:
+ return dict(skipped=True, skipped_reason="Not active for this host")
+
+ # run the check
+ result = {}
+ try:
+ result = check.run()
+ except OpenShiftCheckException as exc:
+ check.register_failure(exc)
+ except Exception as exc:
+ check.register_failure("\n".join([str(exc), traceback.format_exc()]))
+
+ # process the check state; compose the result hash, write files as needed
+ if check.changed:
+ result["changed"] = True
+ if check.failures or result.get("failed"):
+ if "msg" in result: # failure result has msg; combine with any registered failures
+ check.register_failure(result.get("msg"))
+ result["failures"] = [(fail.name, str(fail)) for fail in check.failures]
+ result["failed"] = True
+ result["msg"] = "\n".join(str(fail) for fail in check.failures)
+ write_to_output_file(output_dir, name + ".failures.json", result["failures"])
+ if check.logs:
+ write_to_output_file(output_dir, name + ".log.json", check.logs)
+ if check.files_to_save:
+ write_files_to_save(output_dir, check)
+
+ return result
+
+
+def prepare_output_dir(dirname):
+ """Create the directory, including parents. Return bool for success/failure."""
+ try:
+ os.makedirs(dirname)
+ return True
+ except OSError as exc:
+ # trying to create existing dir leads to error;
+ # that error is fine, but for any other, assume the dir is not there
+ return exc.errno == errno.EEXIST
+
+
+def copy_remote_file_to_dir(check, file_to_save, output_dir, fname):
+ """Copy file from remote host to local file in output_dir, if given."""
+ if not output_dir or not prepare_output_dir(output_dir):
+ return
+ local_file = os.path.join(output_dir, fname)
+
+ # pylint: disable=broad-except; do not need to do anything about failure to write dir/file
+ # and do not want exceptions to break anything.
+ try:
+ # NOTE: it would have been nice to copy the file directly without loading it into
+ # memory, but there does not seem to be a good way to do this via ansible.
+ result = check.execute_module("slurp", dict(src=file_to_save), register=False)
+ if result.get("failed"):
+ display.warning("Could not retrieve file {}: {}".format(file_to_save, result.get("msg")))
+ return
+
+ content = result["content"]
+ if result.get("encoding") == "base64":
+ content = base64.b64decode(content)
+ with open(local_file, "wb") as outfile:
+ outfile.write(content)
+ except Exception as exc:
+ display.warning("Failed writing remote {} to local {}: {}".format(file_to_save, local_file, exc))
+ return
+
+
+def _no_fail(obj):
+ # pylint: disable=broad-except; do not want serialization to fail for any reason
+ try:
+ return str(obj)
+ except Exception:
+ return "[not serializable]"
+
+
+def write_to_output_file(output_dir, filename, data):
+ """If output_dir provided, write data to file. Serialize as JSON if data is not a string."""
+
+ if not output_dir or not prepare_output_dir(output_dir):
+ return
+ filename = os.path.join(output_dir, filename)
+ try:
+ with open(filename, 'w') as outfile:
+ if isinstance(data, string_types):
+ outfile.write(data)
+ else:
+ json.dump(data, outfile, sort_keys=True, indent=4, default=_no_fail)
+ # pylint: disable=broad-except; do not want serialization/write to break for any reason
+ except Exception as exc:
+ display.warning("Could not write output file {}: {}".format(filename, exc))
+
+
+def write_result_to_output_dir(output_dir, result):
+ """If output_dir provided, write the result as json to result.json.
+
+ Success/failure of the write is recorded as "output_files" in the result hash afterward.
+ Otherwise this is much like write_to_output_file.
+ """
+
+ if not output_dir:
+ return
+ if not prepare_output_dir(output_dir):
+ result["output_files"] = "Error creating output directory " + output_dir
+ return
+
+ filename = os.path.join(output_dir, "result.json")
+ try:
+ with open(filename, 'w') as outfile:
+ json.dump(result, outfile, sort_keys=True, indent=4, default=_no_fail)
+ result["output_files"] = "Check results for this host written to " + filename
+ # pylint: disable=broad-except; do not want serialization/write to break for any reason
+ except Exception as exc:
+ result["output_files"] = "Error writing check results to {}:\n{}".format(filename, exc)
+
+
+def write_files_to_save(output_dir, check):
+ """Write files to check subdir in output dir."""
+ if not output_dir:
+ return
+ output_dir = os.path.join(output_dir, check.name)
+ seen_file = defaultdict(lambda: 0)
+ for file_to_save in check.files_to_save:
+ fname = file_to_save.filename
+ while seen_file[fname]: # just to be sure we never re-write a file, append numbers as needed
+ seen_file[fname] += 1
+ fname = "{}.{}".format(fname, seen_file[fname])
+ seen_file[fname] += 1
+ if file_to_save.remote_filename:
+ copy_remote_file_to_dir(check, file_to_save.remote_filename, output_dir, fname)
+ else:
+ write_to_output_file(output_dir, fname, file_to_save.contents)
+
+
+def full_class_name(cls):
+ """Return the name of a class prefixed with its module name."""
+ return '{}.{}'.format(cls.__module__, cls.__name__)
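For orientation, here is a rough sketch of the on-disk layout the action plugin above produces when `openshift_checks_output_dir` is set; the host and check names are placeholders, not values from this change:

    # <output_dir>/<ansible_host>/result.json             overall result hash (write_result_to_output_dir)
    # <output_dir>/<ansible_host>/<check>.failures.json   failures registered by a check
    # <output_dir>/<ansible_host>/<check>.log.json        logs collected by a check
    # <output_dir>/<ansible_host>/<check>/<file>          files_to_save, fetched with "slurp" when remote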
diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
index 7bce7f107..dcaf87eca 100644
--- a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
+++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
@@ -1,20 +1,23 @@
-'''
-Ansible callback plugin.
-'''
+"""Ansible callback plugin to print a nicely formatted summary of failures.
-from pprint import pformat
+The file / module name is prefixed with `zz_` to make this plugin be loaded last
+by Ansible, thus making its output the last thing that users see.
+"""
+
+from collections import defaultdict
+import traceback
from ansible.plugins.callback import CallbackBase
from ansible import constants as C
from ansible.utils.color import stringc
+from ansible.module_utils.six import string_types
+
+
+FAILED_NO_MSG = u'Failed without returning a message.'
class CallbackModule(CallbackBase):
- '''
- This callback plugin stores task results and summarizes failures.
- The file name is prefixed with `zz_` to make this plugin be loaded last by
- Ansible, thus making its output the last thing that users see.
- '''
+ """This callback plugin stores task results and summarizes failures."""
CALLBACK_VERSION = 2.0
CALLBACK_TYPE = 'aggregate'
@@ -24,90 +27,209 @@ class CallbackModule(CallbackBase):
def __init__(self):
super(CallbackModule, self).__init__()
self.__failures = []
+ self.__playbook_file = ''
+
+ def v2_playbook_on_start(self, playbook):
+ super(CallbackModule, self).v2_playbook_on_start(playbook)
+ # pylint: disable=protected-access; Ansible gives us no public API to
+ # get the file name of the current playbook from a callback plugin.
+ self.__playbook_file = playbook._file_name
def v2_runner_on_failed(self, result, ignore_errors=False):
super(CallbackModule, self).v2_runner_on_failed(result, ignore_errors)
- self.__failures.append(dict(result=result, ignore_errors=ignore_errors))
+ if not ignore_errors:
+ self.__failures.append(result)
def v2_playbook_on_stats(self, stats):
super(CallbackModule, self).v2_playbook_on_stats(stats)
- # TODO: update condition to consider a host var or env var to
- # enable/disable the summary, so that we can control the output from a
- # play.
- if self.__failures:
- self._print_failure_summary()
-
- def _print_failure_summary(self):
- '''Print a summary of failed tasks (including ignored failures).'''
- self._display.display(u'\nFailure summary:\n')
-
- # TODO: group failures by host or by task. If grouped by host, it is
- # easy to see all problems of a given host. If grouped by task, it is
- # easy to see what hosts needs the same fix.
-
- width = len(str(len(self.__failures)))
- initial_indent_format = u' {{:>{width}}}. '.format(width=width)
- initial_indent_len = len(initial_indent_format.format(0))
- subsequent_indent = u' ' * initial_indent_len
- subsequent_extra_indent = u' ' * (initial_indent_len + 10)
-
- for i, failure in enumerate(self.__failures, 1):
- entries = _format_failure(failure)
- self._display.display(u'\n{}{}'.format(initial_indent_format.format(i), entries[0]))
- for entry in entries[1:]:
- entry = entry.replace(u'\n', u'\n' + subsequent_extra_indent)
- indented = u'{}{}'.format(subsequent_indent, entry)
- self._display.display(indented)
-
-
-# Reason: disable pylint protected-access because we need to access _*
-# attributes of a task result to implement this method.
-# Status: permanently disabled unless Ansible's API changes.
-# pylint: disable=protected-access
-def _format_failure(failure):
- '''Return a list of pretty-formatted text entries describing a failure, including
+ # pylint: disable=broad-except; capturing exceptions broadly is
+ # intentional, to isolate arbitrary failures in this callback plugin.
+ try:
+ if self.__failures:
+ self._display.display(failure_summary(self.__failures, self.__playbook_file))
+ except Exception:
+ msg = stringc(
+ u'An error happened while generating a summary of failures:\n'
+ u'{}'.format(traceback.format_exc()), C.COLOR_WARN)
+ self._display.v(msg)
+
+
+def failure_summary(failures, playbook):
+ """Return a summary of failed tasks, including details on health checks."""
+ if not failures:
+ return u''
+
+ # NOTE: because we don't have access to task_vars from callback plugins, we
+ # store the playbook context in the task result when the
+ # openshift_health_check action plugin is used, and we use this context to
+ # customize the error message.
+ # pylint: disable=protected-access; Ansible gives us no sufficient public
+ # API on TaskResult objects.
+ context = next((
+ context for context in
+ (failure._result.get('playbook_context') for failure in failures)
+ if context
+ ), None)
+
+ failures = [failure_to_dict(failure) for failure in failures]
+ failures = deduplicate_failures(failures)
+
+ summary = [u'', u'', u'Failure summary:', u'']
+
+ width = len(str(len(failures)))
+ initial_indent_format = u' {{:>{width}}}. '.format(width=width)
+ initial_indent_len = len(initial_indent_format.format(0))
+ subsequent_indent = u' ' * initial_indent_len
+ subsequent_extra_indent = u' ' * (initial_indent_len + 10)
+
+ for i, failure in enumerate(failures, 1):
+ entries = format_failure(failure)
+ summary.append(u'\n{}{}'.format(initial_indent_format.format(i), entries[0]))
+ for entry in entries[1:]:
+ entry = entry.replace(u'\n', u'\n' + subsequent_extra_indent)
+ indented = u'{}{}'.format(subsequent_indent, entry)
+ summary.append(indented)
+
+ failed_checks = set()
+ for failure in failures:
+ failed_checks.update(name for name, message in failure['checks'])
+ if failed_checks:
+ summary.append(check_failure_footer(failed_checks, context, playbook))
+
+ return u'\n'.join(summary)
+
+
+def failure_to_dict(failed_task_result):
+ """Extract information out of a failed TaskResult into a dict.
+
+ The intent is to transform a TaskResult object into something easier to
+ manipulate. TaskResult is ansible.executor.task_result.TaskResult.
+ """
+ # pylint: disable=protected-access; Ansible gives us no sufficient public
+ # API on TaskResult objects.
+ _result = failed_task_result._result
+ return {
+ 'host': failed_task_result._host.get_name(),
+ 'play': play_name(failed_task_result._task),
+ 'task': failed_task_result.task_name,
+ 'msg': _result.get('msg', FAILED_NO_MSG),
+ 'checks': tuple(
+ (name, result.get('msg', FAILED_NO_MSG))
+ for name, result in sorted(_result.get('checks', {}).items())
+ if result.get('failed')
+ ),
+ }
+
+
+def play_name(obj):
+ """Given a task or block, return the name of its parent play.
+
+ This is loosely inspired by ansible.playbook.base.Base.dump_me.
+ """
+ # pylint: disable=protected-access; Ansible gives us no sufficient public
+ # API to implement this.
+ if not obj:
+ return ''
+ if hasattr(obj, '_play'):
+ return obj._play.get_name()
+ return play_name(getattr(obj, '_parent'))
+
+
+def deduplicate_failures(failures):
+ """Group together similar failures from different hosts.
+
+ Returns a new list of failures such that identical failures from different
+ hosts are grouped together in a single entry. The relative order of failures
+ is preserved.
+
+ If failures is unhashable, the original list of failures is returned.
+ """
+ groups = defaultdict(list)
+ for failure in failures:
+ group_key = tuple(sorted((key, value) for key, value in failure.items() if key != 'host'))
+ try:
+ groups[group_key].append(failure)
+ except TypeError:
+ # abort and return original list of failures when failures has an
+ # unhashable type.
+ return failures
+
+ result = []
+ for failure in failures:
+ group_key = tuple(sorted((key, value) for key, value in failure.items() if key != 'host'))
+ if group_key not in groups:
+ continue
+ failure['host'] = tuple(sorted(g_failure['host'] for g_failure in groups.pop(group_key)))
+ result.append(failure)
+ return result
+
+
+def format_failure(failure):
+ """Return a list of pretty-formatted text entries describing a failure, including
relevant information about it. Expect that the list of text entries will be joined
- by a newline separator when output to the user.'''
- result = failure['result']
- host = result._host.get_name()
- play = _get_play(result._task)
- if play:
- play = play.get_name()
- task = result._task.get_name()
- msg = result._result.get('msg', u'???')
+ by a newline separator when output to the user."""
+ if isinstance(failure['host'], string_types):
+ host = failure['host']
+ else:
+ host = u', '.join(failure['host'])
+ play = failure['play']
+ task = failure['task']
+ msg = failure['msg']
+ checks = failure['checks']
fields = (
- (u'Host', host),
+ (u'Hosts', host),
(u'Play', play),
(u'Task', task),
(u'Message', stringc(msg, C.COLOR_ERROR)),
)
- if 'checks' in result._result:
- fields += ((u'Details', _format_failed_checks(result._result['checks'])),)
+ if checks:
+ fields += ((u'Details', format_failed_checks(checks)),)
row_format = '{:10}{}'
return [row_format.format(header + u':', body) for header, body in fields]
-def _format_failed_checks(checks):
- '''Return pretty-formatted text describing checks that failed.'''
- failed_check_msgs = []
- for check, body in checks.items():
- if body.get('failed', False): # only show the failed checks
- msg = body.get('msg', u"Failed without returning a message")
- failed_check_msgs.append('check "%s":\n%s' % (check, msg))
- if failed_check_msgs:
- return stringc("\n\n".join(failed_check_msgs), C.COLOR_ERROR)
- else: # something failed but no checks will admit to it, so dump everything
- return stringc(pformat(checks), C.COLOR_ERROR)
-
-
-# Reason: disable pylint protected-access because we need to access _*
-# attributes of obj to implement this function.
-# This is inspired by ansible.playbook.base.Base.dump_me.
-# Status: permanently disabled unless Ansible's API changes.
-# pylint: disable=protected-access
-def _get_play(obj):
- '''Given a task or block, recursively tries to find its parent play.'''
- if hasattr(obj, '_play'):
- return obj._play
- if getattr(obj, '_parent'):
- return _get_play(obj._parent)
+def format_failed_checks(checks):
+ """Return pretty-formatted text describing checks that failed."""
+ messages = []
+ for name, message in checks:
+ messages.append(u'check "{}":\n{}'.format(name, message))
+ return stringc(u'\n\n'.join(messages), C.COLOR_ERROR)
+
+
+def check_failure_footer(failed_checks, context, playbook):
+ """Return a textual explanation about checks depending on context.
+
+ The purpose of specifying context is to vary the output depending on what
+ the user was expecting to happen (based on which playbook they ran). The
+ only use currently is to vary the message depending on whether the user was
+ deliberately running checks or was trying to install/upgrade and checks are
+ just included. Other use cases may arise.
+ """
+ checks = ','.join(sorted(failed_checks))
+ summary = [u'']
+ if context in ['pre-install', 'health', 'adhoc']:
+ # User was expecting to run checks, less explanation needed.
+ summary.extend([
+ u'You may configure or disable checks by setting Ansible '
+ u'variables. To disable those above, set:',
+ u' openshift_disable_check={checks}'.format(checks=checks),
+ u'Consult check documentation for configurable variables.',
+ ])
+ else:
+ # User may not be familiar with the checks, explain what checks are in
+ # the first place.
+ summary.extend([
+ u'The execution of "{playbook}" includes checks designed to fail '
+ u'early if the requirements of the playbook are not met. One or '
+ u'more of these checks failed. To disregard these results, '
+ u'explicitly disable checks by setting an Ansible variable:'.format(playbook=playbook),
+ u' openshift_disable_check={checks}'.format(checks=checks),
+ u'Failing check names are shown in the failure details above. '
+ u'Some checks may be configurable by variables if your requirements '
+ u'are different from the defaults; consult check documentation.',
+ ])
+ summary.append(
+ u'Variables can be set in the inventory or passed on the command line '
+ u'using the -e flag to ansible-playbook.'
+ )
+ return u'\n'.join(summary)
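As a rough illustration of `deduplicate_failures` above (hosts and message are made up), identical failures from different hosts collapse into a single entry whose `host` field becomes a tuple:

    failures = [
        {'host': 'node1.example.com', 'play': 'Health', 'task': 'Run checks',
         'msg': 'One or more checks failed', 'checks': ()},
        {'host': 'node2.example.com', 'play': 'Health', 'task': 'Run checks',
         'msg': 'One or more checks failed', 'checks': ()},
    ]
    # deduplicate_failures(failures) returns a single entry with
    # host == ('node1.example.com', 'node2.example.com'); the other fields are unchanged.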
diff --git a/roles/openshift_health_checker/library/aos_version.py b/roles/openshift_health_checker/library/aos_version.py
index 4460ec324..db3c0b654 100755..100644
--- a/roles/openshift_health_checker/library/aos_version.py
+++ b/roles/openshift_health_checker/library/aos_version.py
@@ -1,5 +1,5 @@
#!/usr/bin/python
-'''
+"""
Ansible module for yum-based systems determining if multiple releases
of an OpenShift package are available, and if the release requested
(if any) is available down to the given precision.
@@ -16,71 +16,87 @@ of release availability already. Without duplicating all that, we would
like the user to have a helpful error message if we detect things will
not work out right. Note that if openshift_release is not specified in
the inventory, the version comparison checks just pass.
-
-TODO: fail gracefully on non-yum systems (dnf in Fedora)
-'''
+"""
from ansible.module_utils.basic import AnsibleModule
+# NOTE: because of the dependency on yum (Python 2-only), this module does not
+# work under Python 3. But since we run unit tests against both Python 2 and
+# Python 3, we use six for cross compatibility in this module alone:
+from ansible.module_utils.six import string_types
-IMPORT_EXCEPTION = None
+YUM_IMPORT_EXCEPTION = None
+DNF_IMPORT_EXCEPTION = None
try:
import yum # pylint: disable=import-error
except ImportError as err:
- IMPORT_EXCEPTION = err # in tox test env, yum import fails
+ YUM_IMPORT_EXCEPTION = err
+
+try:
+ import dnf # pylint: disable=import-error
+except ImportError as err:
+ DNF_IMPORT_EXCEPTION = err
class AosVersionException(Exception):
- '''Base exception class for package version problems'''
+ """Base exception class for package version problems"""
def __init__(self, message, problem_pkgs=None):
Exception.__init__(self, message)
self.problem_pkgs = problem_pkgs
def main():
- '''Entrypoint for this Ansible module'''
+ """Entrypoint for this Ansible module"""
module = AnsibleModule(
argument_spec=dict(
- requested_openshift_release=dict(type="str", default=''),
- openshift_deployment_type=dict(required=True),
- rpm_prefix=dict(required=True), # atomic-openshift, origin, ...?
+ package_list=dict(type="list", required=True),
+ package_mgr=dict(type="str", required=True),
),
supports_check_mode=True
)
- if IMPORT_EXCEPTION:
- module.fail_json(msg="aos_version module could not import yum: %s" % IMPORT_EXCEPTION)
+ # determine the package manager to use
+ package_mgr = module.params['package_mgr']
+ if package_mgr not in ('yum', 'dnf'):
+ module.fail_json(msg="package_mgr must be one of: yum, dnf")
+ pkg_mgr_exception = dict(yum=YUM_IMPORT_EXCEPTION, dnf=DNF_IMPORT_EXCEPTION)[package_mgr]
+ if pkg_mgr_exception:
+ module.fail_json(
+ msg="aos_version module could not import {}: {}".format(package_mgr, pkg_mgr_exception)
+ )
# determine the packages we will look for
- rpm_prefix = module.params['rpm_prefix']
- if not rpm_prefix:
- module.fail_json(msg="rpm_prefix must not be empty")
- expected_pkgs = set([
- rpm_prefix,
- rpm_prefix + '-master',
- rpm_prefix + '-node',
- ])
-
- # determine what level of precision the user specified for the openshift version.
- # should look like a version string with possibly many segments e.g. "3.4.1":
- requested_openshift_release = module.params['requested_openshift_release']
+ package_list = module.params['package_list']
+ if not package_list:
+ module.fail_json(msg="package_list must not be empty")
+
+ # generate a list with only the names of expected packages
+ expected_pkg_names = [p["name"] for p in package_list]
+
+ # gather packages that require a multi_minor_release check
+ multi_minor_pkgs = [p for p in package_list if p["check_multi"]]
+
+ # generate list of packages with a specified (non-empty) version
+ # should look like a version string with possibly many segments e.g. "3.4.1"
+ versioned_pkgs = [p for p in package_list if p["version"]]
# get the list of packages available and complain if anything is wrong
try:
- pkgs = _retrieve_available_packages(expected_pkgs)
- if requested_openshift_release:
- _check_precise_version_found(pkgs, expected_pkgs, requested_openshift_release)
- _check_higher_version_found(pkgs, expected_pkgs, requested_openshift_release)
- if module.params['openshift_deployment_type'] in ['openshift-enterprise']:
- _check_multi_minor_release(pkgs, expected_pkgs)
+ pkgs = _retrieve_available_packages(package_mgr, expected_pkg_names)
+ if versioned_pkgs:
+ _check_precise_version_found(pkgs, _to_dict(versioned_pkgs))
+ _check_higher_version_found(pkgs, _to_dict(versioned_pkgs))
+ if multi_minor_pkgs:
+ _check_multi_minor_release(pkgs, _to_dict(multi_minor_pkgs))
except AosVersionException as excinfo:
module.fail_json(msg=str(excinfo))
module.exit_json(changed=False)
-def _retrieve_available_packages(expected_pkgs):
- # search for package versions available for openshift pkgs
- yb = yum.YumBase() # pylint: disable=invalid-name
+def _to_dict(pkg_list):
+ return {pkg["name"]: pkg for pkg in pkg_list}
+
+def _retrieve_available_packages(pkg_mgr, expected_pkgs):
# The openshift excluder prevents unintended updates to openshift
# packages by setting yum excludes on those packages. See:
# https://wiki.centos.org/SpecialInterestGroup/PaaS/OpenShift-Origin-Control-Updates
@@ -89,71 +105,111 @@ def _retrieve_available_packages(expected_pkgs):
# attempt to determine what packages are available via yum they may
# be excluded. So, for our purposes here, disable excludes to see
# what will really be available during an install or upgrade.
- yb.conf.disable_excludes = ['all']
- try:
- pkgs = yb.pkgSack.returnPackages(patterns=expected_pkgs)
- except yum.Errors.PackageSackError as excinfo:
- # you only hit this if *none* of the packages are available
- raise AosVersionException('\n'.join([
- 'Unable to find any OpenShift packages.',
- 'Check your subscription and repo settings.',
- str(excinfo),
- ]))
+ if pkg_mgr == "yum":
+ # search for package versions available for openshift pkgs
+ yb = yum.YumBase() # pylint: disable=invalid-name
+
+ yb.conf.disable_excludes = ['all']
+
+ try:
+ pkgs = yb.rpmdb.returnPackages(patterns=expected_pkgs)
+ pkgs += yb.pkgSack.returnPackages(patterns=expected_pkgs)
+ except yum.Errors.PackageSackError as excinfo:
+ # you only hit this if *none* of the packages are available
+ raise AosVersionException('\n'.join([
+ 'Unable to find any OpenShift packages.',
+ 'Check your subscription and repo settings.',
+ str(excinfo),
+ ]))
+ elif pkg_mgr == "dnf":
+ dbase = dnf.Base() # pylint: disable=invalid-name
+
+ dbase.conf.disable_excludes = ['all']
+ dbase.read_all_repos()
+ dbase.fill_sack(load_system_repo=False, load_available_repos=True)
+
+ dquery = dbase.sack.query()
+ aquery = dquery.available()
+ iquery = dquery.installed()
+
+ available_pkgs = list(aquery.filter(name=expected_pkgs))
+ installed_pkgs = list(iquery.filter(name=expected_pkgs))
+ pkgs = available_pkgs + installed_pkgs
+
+ if not pkgs:
+ # pkgs list is empty, raise because no expected packages found
+ raise AosVersionException('\n'.join([
+ 'Unable to find any OpenShift packages.',
+ 'Check your subscription and repo settings.',
+ ]))
+
return pkgs
class PreciseVersionNotFound(AosVersionException):
- '''Exception for reporting packages not available at given release'''
- def __init__(self, requested_release, not_found):
- msg = ['Not all of the required packages are available at requested version %s:' % requested_release]
- msg += [' ' + name for name in not_found]
+ """Exception for reporting packages not available at given version"""
+ def __init__(self, not_found):
+ msg = ['Not all of the required packages are available at their requested version']
+ msg += ['{}:{} '.format(pkg["name"], pkg["version"]) for pkg in not_found]
msg += ['Please check your subscriptions and enabled repositories.']
AosVersionException.__init__(self, '\n'.join(msg), not_found)
-def _check_precise_version_found(pkgs, expected_pkgs, requested_openshift_release):
+def _check_precise_version_found(pkgs, expected_pkgs_dict):
# see if any packages couldn't be found at requested release version
# we would like to verify that the latest available pkgs have however specific a version is given.
# so e.g. if there is a package version 3.4.1.5 the check passes; if only 3.4.0, it fails.
- pkgs_precise_version_found = {}
+ pkgs_precise_version_found = set()
for pkg in pkgs:
- if pkg.name not in expected_pkgs:
+ if pkg.name not in expected_pkgs_dict:
continue
- # does the version match, to the precision requested?
- # and, is it strictly greater, at the precision requested?
- match_version = '.'.join(pkg.version.split('.')[:requested_openshift_release.count('.') + 1])
- if match_version == requested_openshift_release:
- pkgs_precise_version_found[pkg.name] = True
+ expected_pkg_versions = expected_pkgs_dict[pkg.name]["version"]
+ if isinstance(expected_pkg_versions, string_types):
+ expected_pkg_versions = [expected_pkg_versions]
+ for expected_pkg_version in expected_pkg_versions:
+ # does the version match, to the precision requested?
+ # and, is it strictly greater, at the precision requested?
+ match_version = '.'.join(pkg.version.split('.')[:expected_pkg_version.count('.') + 1])
+ if match_version == expected_pkg_version:
+ pkgs_precise_version_found.add(pkg.name)
not_found = []
- for name in expected_pkgs:
+ for name, pkg in expected_pkgs_dict.items():
if name not in pkgs_precise_version_found:
- not_found.append(name)
+ not_found.append(pkg)
if not_found:
- raise PreciseVersionNotFound(requested_openshift_release, not_found)
+ raise PreciseVersionNotFound(not_found)
class FoundHigherVersion(AosVersionException):
- '''Exception for reporting that a higher version than requested is available'''
- def __init__(self, requested_release, higher_found):
+ """Exception for reporting that a higher version than requested is available"""
+ def __init__(self, higher_found):
msg = ['Some required package(s) are available at a version',
- 'that is higher than requested %s:' % requested_release]
+ 'that is higher than requested']
msg += [' ' + name for name in higher_found]
msg += ['This will prevent installing the version you requested.']
msg += ['Please check your enabled repositories or adjust openshift_release.']
AosVersionException.__init__(self, '\n'.join(msg), higher_found)
-def _check_higher_version_found(pkgs, expected_pkgs, requested_openshift_release):
- req_release_arr = [int(segment) for segment in requested_openshift_release.split(".")]
+def _check_higher_version_found(pkgs, expected_pkgs_dict):
+ expected_pkg_names = list(expected_pkgs_dict)
+
# see if any packages are available in a version higher than requested
higher_version_for_pkg = {}
for pkg in pkgs:
- if pkg.name not in expected_pkgs:
+ if pkg.name not in expected_pkg_names:
continue
+ expected_pkg_versions = expected_pkgs_dict[pkg.name]["version"]
+ if isinstance(expected_pkg_versions, string_types):
+ expected_pkg_versions = [expected_pkg_versions]
+ # NOTE: the list of versions is assumed to be sorted so that the highest
+ # desirable version is the last.
+ highest_desirable_version = expected_pkg_versions[-1]
+ req_release_arr = [int(segment) for segment in highest_desirable_version.split(".")]
version = [int(segment) for segment in pkg.version.split(".")]
too_high = version[:len(req_release_arr)] > req_release_arr
higher_than_seen = version > higher_version_for_pkg.get(pkg.name, [])
@@ -164,11 +220,11 @@ def _check_higher_version_found(pkgs, expected_pkgs, requested_openshift_release
higher_found = []
for name, version in higher_version_for_pkg.items():
higher_found.append(name + '-' + '.'.join(str(segment) for segment in version))
- raise FoundHigherVersion(requested_openshift_release, higher_found)
+ raise FoundHigherVersion(higher_found)
class FoundMultiRelease(AosVersionException):
- '''Exception for reporting multiple minor releases found for same package'''
+ """Exception for reporting multiple minor releases found for same package"""
def __init__(self, multi_found):
msg = ['Multiple minor versions of these packages are available']
msg += [' ' + name for name in multi_found]
@@ -176,18 +232,18 @@ class FoundMultiRelease(AosVersionException):
AosVersionException.__init__(self, '\n'.join(msg), multi_found)
-def _check_multi_minor_release(pkgs, expected_pkgs):
+def _check_multi_minor_release(pkgs, expected_pkgs_dict):
# see if any packages are available in more than one minor version
pkgs_by_name_version = {}
for pkg in pkgs:
# keep track of x.y (minor release) versions seen
minor_release = '.'.join(pkg.version.split('.')[:2])
if pkg.name not in pkgs_by_name_version:
- pkgs_by_name_version[pkg.name] = {}
- pkgs_by_name_version[pkg.name][minor_release] = True
+ pkgs_by_name_version[pkg.name] = set()
+ pkgs_by_name_version[pkg.name].add(minor_release)
multi_found = []
- for name in expected_pkgs:
+ for name in expected_pkgs_dict:
if name in pkgs_by_name_version and len(pkgs_by_name_version[name]) > 1:
multi_found.append(name)
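A small standalone sketch (hypothetical version strings) of the precision rule used by `_check_precise_version_found` above: the available version is truncated to one more segment than the requested version before comparing, so a request for "3.4" is satisfied by 3.4.1.5 but not by 3.3.2:

    requested = "3.4"
    available = "3.4.1.5"
    match = '.'.join(available.split('.')[:requested.count('.') + 1])
    assert match == requested                               # 3.4.1.5 satisfies "3.4"
    assert '.'.join("3.3.2".split('.')[:2]) != requested    # 3.3.2 does not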
diff --git a/roles/openshift_health_checker/library/check_yum_update.py b/roles/openshift_health_checker/library/check_yum_update.py
index 433795b67..433795b67 100755..100644
--- a/roles/openshift_health_checker/library/check_yum_update.py
+++ b/roles/openshift_health_checker/library/check_yum_update.py
diff --git a/roles/openshift_health_checker/library/docker_info.py b/roles/openshift_health_checker/library/docker_info.py
index 7f712bcff..0d0ddae8b 100644
--- a/roles/openshift_health_checker/library/docker_info.py
+++ b/roles/openshift_health_checker/library/docker_info.py
@@ -1,4 +1,3 @@
-# pylint: disable=missing-docstring
"""
Ansible module for determining information about the docker host.
@@ -13,6 +12,7 @@ from ansible.module_utils.docker_common import AnsibleDockerClient
def main():
+ """Entrypoint for running an Ansible module."""
client = AnsibleDockerClient()
client.module.exit_json(
diff --git a/roles/openshift_health_checker/library/etcdkeysize.py b/roles/openshift_health_checker/library/etcdkeysize.py
new file mode 100644
index 000000000..620e82d87
--- /dev/null
+++ b/roles/openshift_health_checker/library/etcdkeysize.py
@@ -0,0 +1,122 @@
+#!/usr/bin/python
+"""Ansible module that recursively determines if the size of a key in an etcd cluster exceeds a given limit."""
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+try:
+ import etcd
+
+ IMPORT_EXCEPTION_MSG = None
+except ImportError as err:
+ IMPORT_EXCEPTION_MSG = str(err)
+
+ from collections import namedtuple
+ EtcdMock = namedtuple("etcd", ["EtcdKeyNotFound"])
+ etcd = EtcdMock(KeyError)
+
+
+# pylint: disable=too-many-arguments
+def check_etcd_key_size(client, key, size_limit, total_size=0, depth=0, depth_limit=1000, visited=None):
+ """Check size of an etcd path starting at given key. Returns tuple (string, bool)"""
+ if visited is None:
+ visited = set()
+
+ if key in visited:
+ return 0, False
+
+ visited.add(key)
+
+ try:
+ result = client.read(key, recursive=False)
+ except etcd.EtcdKeyNotFound:
+ return 0, False
+
+ size = 0
+ limit_exceeded = False
+
+ for node in result.leaves:
+ if depth >= depth_limit:
+ raise Exception("Maximum recursive stack depth ({}) exceeded.".format(depth_limit))
+
+ if size_limit and total_size + size > size_limit:
+ return size, True
+
+ if not node.dir:
+ size += len(node.value)
+ continue
+
+ key_size, limit_exceeded = check_etcd_key_size(client, node.key,
+ size_limit,
+ total_size + size,
+ depth + 1,
+ depth_limit, visited)
+ size += key_size
+
+ max_limit_exceeded = limit_exceeded or (total_size + size > size_limit)
+ return size, max_limit_exceeded
+
+
+def main(): # pylint: disable=missing-docstring,too-many-branches
+ module = AnsibleModule(
+ argument_spec=dict(
+ size_limit_bytes=dict(type="int", default=0),
+ paths=dict(type="list", default=["/openshift.io/images"]),
+ host=dict(type="str", default="127.0.0.1"),
+ port=dict(type="int", default=4001),
+ protocol=dict(type="str", default="http"),
+ version_prefix=dict(type="str", default=""),
+ allow_redirect=dict(type="bool", default=False),
+ cert=dict(type="dict", default=""),
+ ca_cert=dict(type="str", default=None),
+ ),
+ supports_check_mode=True
+ )
+
+ module.params["cert"] = (
+ module.params["cert"]["cert"],
+ module.params["cert"]["key"],
+ )
+
+ size_limit = module.params.pop("size_limit_bytes")
+ paths = module.params.pop("paths")
+
+ limit_exceeded = False
+
+ try:
+ # pylint: disable=no-member
+ client = etcd.Client(**module.params)
+ except AttributeError as attrerr:
+ msg = str(attrerr)
+ if IMPORT_EXCEPTION_MSG:
+ msg = IMPORT_EXCEPTION_MSG
+ if "No module named etcd" in IMPORT_EXCEPTION_MSG:
+ # pylint: disable=redefined-variable-type
+ msg = ('Unable to import the python "etcd" dependency. '
+ 'Make sure python-etcd is installed on the host.')
+
+ module.exit_json(
+ failed=True,
+ changed=False,
+ size_limit_exceeded=limit_exceeded,
+ msg=msg,
+ )
+
+ return
+
+ size = 0
+ for path in paths:
+ path_size, limit_exceeded = check_etcd_key_size(client, path, size_limit - size)
+ size += path_size
+
+ if limit_exceeded:
+ break
+
+ module.exit_json(
+ changed=False,
+ size_limit_exceeded=limit_exceeded,
+ )
+
+
+if __name__ == '__main__':
+ main()
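To illustrate how `main()` above walks the configured paths (all numbers are made up): the size already accumulated is subtracted from the limit passed to the next path, so the limit effectively applies to the combined size of all paths:

    size_limit = 50 * 10**6                   # hypothetical 50 MB limit
    path_sizes = [30 * 10**6, 25 * 10**6]     # sizes returned for two paths
    size, limit_exceeded = 0, False
    for path_size in path_sizes:
        size += path_size
        if size > size_limit:
            limit_exceeded = True
            break
    # limit_exceeded becomes True once the combined size passes the limit.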
diff --git a/roles/openshift_health_checker/library/ocutil.py b/roles/openshift_health_checker/library/ocutil.py
new file mode 100644
index 000000000..c72f4c5b3
--- /dev/null
+++ b/roles/openshift_health_checker/library/ocutil.py
@@ -0,0 +1,73 @@
+#!/usr/bin/python
+"""Interface to OpenShift oc command"""
+
+import os
+import shlex
+import shutil
+import subprocess
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+ADDITIONAL_PATH_LOOKUPS = ['/usr/local/bin', os.path.expanduser('~/bin')]
+
+
+def locate_oc_binary():
+ """Find and return oc binary file"""
+ # https://github.com/openshift/openshift-ansible/issues/3410
+ # oc can be in /usr/local/bin in some cases, but that may not
+ # be in $PATH due to ansible/sudo
+ paths = os.environ.get("PATH", os.defpath).split(os.pathsep) + ADDITIONAL_PATH_LOOKUPS
+
+ oc_binary = 'oc'
+
+ # Use shutil.which if it is available, otherwise fall back to a naive path search
+ try:
+ which_result = shutil.which(oc_binary, path=os.pathsep.join(paths))
+ if which_result is not None:
+ oc_binary = which_result
+ except AttributeError:
+ for path in paths:
+ if os.path.exists(os.path.join(path, oc_binary)):
+ oc_binary = os.path.join(path, oc_binary)
+ break
+
+ return oc_binary
+
+
+def main():
+ """Module that executes commands on a remote OpenShift cluster"""
+
+ module = AnsibleModule(
+ argument_spec=dict(
+ namespace=dict(type="str", required=False),
+ config_file=dict(type="str", required=True),
+ cmd=dict(type="str", required=True),
+ extra_args=dict(type="list", default=[]),
+ ),
+ )
+
+ cmd = [locate_oc_binary(), '--config', module.params["config_file"]]
+ if module.params["namespace"]:
+ cmd += ['-n', module.params["namespace"]]
+ cmd += shlex.split(module.params["cmd"]) + module.params["extra_args"]
+
+ failed = True
+ try:
+ cmd_result = subprocess.check_output(list(cmd), stderr=subprocess.STDOUT)
+ failed = False
+ except subprocess.CalledProcessError as exc:
+ cmd_result = '[rc {}] {}\n{}'.format(exc.returncode, ' '.join(exc.cmd), exc.output)
+ except OSError as exc:
+ # we get this when 'oc' is not there
+ cmd_result = str(exc)
+
+ module.exit_json(
+ changed=False,
+ failed=failed,
+ result=cmd_result,
+ )
+
+
+if __name__ == '__main__':
+ main()
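A minimal sketch (the arguments are hypothetical) of how the module above assembles the oc command line with shlex:

    import shlex

    params = {"namespace": "default",
              "config_file": "/etc/origin/master/admin.kubeconfig",
              "cmd": "get nodes", "extra_args": ["-o", "wide"]}
    cmd = ['oc', '--config', params["config_file"]]
    if params["namespace"]:
        cmd += ['-n', params["namespace"]]
    cmd += shlex.split(params["cmd"]) + params["extra_args"]
    # cmd == ['oc', '--config', '/etc/origin/master/admin.kubeconfig',
    #         '-n', 'default', 'get', 'nodes', '-o', 'wide']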
diff --git a/roles/openshift_health_checker/library/rpm_version.py b/roles/openshift_health_checker/library/rpm_version.py
new file mode 100644
index 000000000..c24fbba3b
--- /dev/null
+++ b/roles/openshift_health_checker/library/rpm_version.py
@@ -0,0 +1,133 @@
+#!/usr/bin/python
+"""
+Ansible module for rpm-based systems determining existing package version information in a host.
+"""
+
+from ansible.module_utils.basic import AnsibleModule
+from ansible.module_utils.six import string_types
+
+IMPORT_EXCEPTION = None
+try:
+ import rpm # pylint: disable=import-error
+except ImportError as err:
+ IMPORT_EXCEPTION = err # in tox test env, rpm import fails
+
+
+class RpmVersionException(Exception):
+ """Base exception class for package version problems"""
+ def __init__(self, message, problem_pkgs=None):
+ Exception.__init__(self, message)
+ self.problem_pkgs = problem_pkgs
+
+
+def main():
+ """Entrypoint for this Ansible module"""
+ module = AnsibleModule(
+ argument_spec=dict(
+ package_list=dict(type="list", required=True),
+ ),
+ supports_check_mode=True
+ )
+
+ if IMPORT_EXCEPTION:
+ module.fail_json(msg="rpm_version module could not import rpm: %s" % IMPORT_EXCEPTION)
+
+ # determine the packages we will look for
+ pkg_list = module.params['package_list']
+ if not pkg_list:
+ module.fail_json(msg="package_list must not be empty")
+
+ # get list of packages available and complain if any
+ # of them are missing or if any errors occur
+ try:
+ pkg_versions = _retrieve_expected_pkg_versions(_to_dict(pkg_list))
+ _check_pkg_versions(pkg_versions, _to_dict(pkg_list))
+ except RpmVersionException as excinfo:
+ module.fail_json(msg=str(excinfo))
+ module.exit_json(changed=False)
+
+
+def _to_dict(pkg_list):
+ return {pkg["name"]: pkg for pkg in pkg_list}
+
+
+def _retrieve_expected_pkg_versions(expected_pkgs_dict):
+ """Search for installed packages matching given pkg names
+ and versions. Returns a dictionary: {pkg_name: [versions]}"""
+
+ transaction = rpm.TransactionSet()
+ pkgs = {}
+
+ for pkg_name in expected_pkgs_dict:
+ matched_pkgs = transaction.dbMatch("name", pkg_name)
+ if not matched_pkgs:
+ continue
+
+ for header in matched_pkgs:
+ if header['name'] == pkg_name:
+ if pkg_name not in pkgs:
+ pkgs[pkg_name] = []
+
+ pkgs[pkg_name].append(header['version'])
+
+ return pkgs
+
+
+def _check_pkg_versions(found_pkgs_dict, expected_pkgs_dict):
+ invalid_pkg_versions = {}
+ not_found_pkgs = []
+
+ for pkg_name, pkg in expected_pkgs_dict.items():
+ if not found_pkgs_dict.get(pkg_name):
+ not_found_pkgs.append(pkg_name)
+ continue
+
+ found_versions = [_parse_version(version) for version in found_pkgs_dict[pkg_name]]
+
+ if isinstance(pkg["version"], string_types):
+ expected_versions = [_parse_version(pkg["version"])]
+ else:
+ expected_versions = [_parse_version(version) for version in pkg["version"]]
+
+ if not set(expected_versions) & set(found_versions):
+ invalid_pkg_versions[pkg_name] = {
+ "found_versions": found_versions,
+ "required_versions": expected_versions,
+ }
+
+ if not_found_pkgs:
+ raise RpmVersionException(
+ '\n'.join([
+ "The following packages were not found to be installed: {}".format('\n '.join([
+ "{}".format(pkg)
+ for pkg in not_found_pkgs
+ ]))
+ ]),
+ not_found_pkgs,
+ )
+
+ if invalid_pkg_versions:
+ raise RpmVersionException(
+ '\n '.join([
+ "The following packages were found to be installed with an incorrect version: {}".format('\n'.join([
+ " \n{}\n Required version: {}\n Found versions: {}".format(
+ pkg_name,
+ ', '.join(pkg["required_versions"]),
+ ', '.join([version for version in pkg["found_versions"]]))
+ for pkg_name, pkg in invalid_pkg_versions.items()
+ ]))
+ ]),
+ invalid_pkg_versions,
+ )
+
+
+def _parse_version(version_str):
+ segs = version_str.split('.')
+ if not segs or len(segs) <= 2:
+ return version_str
+
+ return '.'.join(segs[0:2])
+
+
+if __name__ == '__main__':
+ main()
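The comparison granularity used above comes from `_parse_version`, which keeps only the first two segments when a version has more than two. A short sketch with made-up versions (the helper is repeated here only for illustration):

    def _parse_version(version_str):
        segs = version_str.split('.')
        if not segs or len(segs) <= 2:
            return version_str
        return '.'.join(segs[0:2])

    assert _parse_version("3.6.173.0.5") == "3.6"   # compared at minor-version precision
    assert _parse_version("1.12") == "1.12"         # short versions are left as-is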
diff --git a/roles/openshift_health_checker/library/search_journalctl.py b/roles/openshift_health_checker/library/search_journalctl.py
new file mode 100644
index 000000000..3631f71c8
--- /dev/null
+++ b/roles/openshift_health_checker/library/search_journalctl.py
@@ -0,0 +1,150 @@
+#!/usr/bin/python
+"""Interface to journalctl."""
+
+from time import time
+import json
+import re
+import subprocess
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+class InvalidMatcherRegexp(Exception):
+ """Exception class for invalid matcher regexp."""
+ pass
+
+
+class InvalidLogEntry(Exception):
+ """Exception class for invalid / non-json log entries."""
+ pass
+
+
+class LogInputSubprocessError(Exception):
+ """Exception class for errors that occur while executing a subprocess."""
+ pass
+
+
+def main():
+ """Scan a given list of "log_matchers" for journalctl messages containing given patterns.
+ "log_matchers" is a list of dicts consisting of three keys that help fine-tune log searching:
+ 'start_regexp', 'regexp', and 'unit'.
+
+ Sample "log_matchers" list:
+
+ [
+ {
+ 'start_regexp': r'Beginning of systemd unit',
+ 'regexp': r'the specific log message to find',
+ 'unit': 'etcd',
+ }
+ ]
+ """
+ module = AnsibleModule(
+ argument_spec=dict(
+ log_count_limit=dict(type="int", default=500),
+ log_matchers=dict(type="list", required=True),
+ ),
+ )
+
+ timestamp_limit_seconds = time() - 60 * 60 # 1 hour
+
+ log_count_limit = module.params["log_count_limit"]
+ log_matchers = module.params["log_matchers"]
+
+ matched_regexp, errors = get_log_matches(log_matchers, log_count_limit, timestamp_limit_seconds)
+
+ module.exit_json(
+ changed=False,
+ failed=bool(errors),
+ errors=errors,
+ matched=matched_regexp,
+ )
+
+
+def get_log_matches(matchers, log_count_limit, timestamp_limit_seconds):
+ """Return a list of up to log_count_limit matches for each matcher.
+
+ Log entries are only considered if newer than timestamp_limit_seconds.
+ """
+ matched_regexp = []
+ errors = []
+
+ for matcher in matchers:
+ try:
+ log_output = get_log_output(matcher)
+ except LogInputSubprocessError as err:
+ errors.append(str(err))
+ continue
+
+ try:
+ matched = find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds)
+ if matched:
+ matched_regexp.append(matcher.get("regexp", ""))
+ except InvalidMatcherRegexp as err:
+ errors.append(str(err))
+ except InvalidLogEntry as err:
+ errors.append(str(err))
+
+ return matched_regexp, errors
+
+
+def get_log_output(matcher):
+ """Return an iterator on the logs of a given matcher."""
+ try:
+ cmd_output = subprocess.Popen(list([
+ '/bin/journalctl',
+ '-ru', matcher.get("unit", ""),
+ '--output', 'json',
+ ]), stdout=subprocess.PIPE)
+
+ return iter(cmd_output.stdout.readline, '')
+
+ except subprocess.CalledProcessError as exc:
+ msg = "Could not obtain journalctl logs for the specified systemd unit: {}: {}"
+ raise LogInputSubprocessError(msg.format(matcher.get("unit", "<missing>"), str(exc)))
+ except OSError as exc:
+ raise LogInputSubprocessError(str(exc))
+
+
+def find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds):
+ """Return log messages matched in iterable log_output by a given matcher.
+
+ Ignore any log_output items older than timestamp_limit_seconds.
+ """
+ try:
+ regexp = re.compile(matcher.get("regexp", ""))
+ start_regexp = re.compile(matcher.get("start_regexp", ""))
+ except re.error as err:
+ msg = "A log matcher object was provided with an invalid regular expression: {}"
+ raise InvalidMatcherRegexp(msg.format(str(err)))
+
+ matched = None
+
+ for log_count, line in enumerate(log_output):
+ if log_count >= log_count_limit:
+ break
+
+ try:
+ obj = json.loads(line)
+
+ # don't need to look past the most recent service restart
+ if start_regexp.match(obj["MESSAGE"]):
+ break
+
+ log_timestamp_seconds = float(obj["__REALTIME_TIMESTAMP"]) / 1000000
+ if log_timestamp_seconds < timestamp_limit_seconds:
+ break
+
+ if regexp.match(obj["MESSAGE"]):
+ matched = line
+ break
+
+ except ValueError:
+ msg = "Log entry for systemd unit {} contained invalid json syntax: {}"
+ raise InvalidLogEntry(msg.format(matcher.get("unit"), line))
+
+ return matched
+
+
+if __name__ == '__main__':
+ main()
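For illustration, a minimal sketch of how find_matches() above consumes journalctl's JSON output. It assumes the new file is importable as search_journalctl and that Ansible is installed (the module imports AnsibleModule at import time); the unit, message, and timestamp below are hypothetical:

    import json
    import time

    from search_journalctl import find_matches

    matcher = {
        "start_regexp": r"Starting Etcd Server",
        "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
        "unit": "etcd",
    }

    # `journalctl --output json` emits one JSON object per line, newest first;
    # __REALTIME_TIMESTAMP is in microseconds.
    now_usec = int(time.time() * 1000000)
    log_output = [
        json.dumps({
            "MESSAGE": "etcd: sync duration of 2.5s, expected less than 1s",
            "__REALTIME_TIMESTAMP": str(now_usec),
        }),
    ]

    match = find_matches(log_output, matcher, 500, time.time() - 3600)
    print(bool(match))  # True: the regexp matched within the one-hour window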
diff --git a/roles/openshift_health_checker/meta/main.yml b/roles/openshift_health_checker/meta/main.yml
index cd9b55902..bc8e7bdcf 100644
--- a/roles/openshift_health_checker/meta/main.yml
+++ b/roles/openshift_health_checker/meta/main.yml
@@ -1,4 +1,3 @@
---
dependencies:
- - role: openshift_facts
- - role: openshift_repos
+- role: openshift_facts
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index be63d864a..b7b16e0ea 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -2,32 +2,98 @@
Health checks for OpenShift clusters.
"""
+import json
import operator
import os
+import time
+import collections
from abc import ABCMeta, abstractmethod, abstractproperty
from importlib import import_module
from ansible.module_utils import six
from ansible.module_utils.six.moves import reduce # pylint: disable=import-error,redefined-builtin
+from ansible.module_utils.six import string_types
+from ansible.plugins.filter.core import to_bool as ansible_to_bool
class OpenShiftCheckException(Exception):
- """Raised when a check cannot proceed."""
- pass
+ """Raised when a check encounters a failure condition."""
+ def __init__(self, name, msg=None):
+ # msg is for the message the user will see when this is raised.
+ # name is for test code to identify the error without looking at msg text.
+ if msg is None: # for parameter backward compatibility
+ msg = name
+ name = self.__class__.__name__
+ self.name = name
+ super(OpenShiftCheckException, self).__init__(msg)
+
+class OpenShiftCheckExceptionList(OpenShiftCheckException):
+ """A container for multiple errors that may be detected in one check."""
+ def __init__(self, errors):
+ self.errors = errors
+ super(OpenShiftCheckExceptionList, self).__init__(
+ 'OpenShiftCheckExceptionList',
+ '\n'.join(str(msg) for msg in errors)
+ )
+
+ # make iterable
+ def __getitem__(self, index):
+ return self.errors[index]
+
+
+FileToSave = collections.namedtuple("FileToSave", "filename contents remote_filename")
+
+
+# pylint: disable=too-many-instance-attributes; all represent significantly different state.
+# Arguably they could be separated into two hashes, one for storing parameters, and one for
+# storing result state; but that smells more like clutter than clarity.
@six.add_metaclass(ABCMeta)
class OpenShiftCheck(object):
- """A base class for defining checks for an OpenShift cluster environment."""
+ """A base class for defining checks for an OpenShift cluster environment.
+
+ Optional init params: method execute_module, dict task_vars, and string tmp
+ execute_module is expected to have a signature compatible with _execute_module
+ from ansible plugins/action/__init__.py, e.g.:
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *args):
+ This is stored so that it can be invoked in subclasses via check.execute_module("name", args)
+ which provides the check's stored task_vars and tmp.
- def __init__(self, execute_module=None, module_executor=None):
- if execute_module is module_executor is None:
- raise TypeError(
- "__init__() takes either execute_module (recommended) "
- "or module_executor (deprecated), none given")
- self.execute_module = execute_module or module_executor
- self.module_executor = self.execute_module
+ Optional init param: want_full_results
+ If the check can gather logs, tarballs, etc., do so when True; but no need to spend
+ the time if they're not wanted (won't be written to output directory).
+ """
+ # pylint: disable=too-many-arguments
+ def __init__(self, execute_module=None, task_vars=None, tmp=None, want_full_results=False,
+ templar=None):
+ # store a method for executing ansible modules from the check
+ self._execute_module = execute_module
+ # the task variables and tmpdir passed into the health checker task
+ self.task_vars = task_vars or {}
+ # We may need to template some task_vars
+ self._templar = templar
+ self.tmp = tmp
+ # a boolean for disabling the gathering of results (files, computations) that won't
+ # actually be recorded/used
+ self.want_full_results = want_full_results
+
+ # mainly for testing purposes; see execute_module_with_retries
+ self._module_retries = 3
+ self._module_retry_interval = 5 # seconds
+
+ # state to be recorded for inspection after the check runs:
+ #
+ # set to True when the check changes the host, for accurate total "changed" count
+ self.changed = False
+ # list of OpenShiftCheckException for check to report (alternative to returning a failed result)
+ self.failures = []
+ # list of FileToSave - files the check specifies to be written locally if so configured
+ self.files_to_save = []
+ # log messages for the check - tuples of (description, msg) where msg is serializable.
+ # These are intended to be a sequential record of what the check observed and determined.
+ self.logs = []
@abstractproperty
def name(self):
@@ -43,14 +109,25 @@ class OpenShiftCheck(object):
"""
return []
- @classmethod
- def is_active(cls, task_vars): # pylint: disable=unused-argument
+ @staticmethod
+ def is_active():
"""Returns true if this check applies to the ansible-playbook run."""
return True
+ def is_first_master(self):
+ """Determine if running on first master. Returns: bool"""
+ masters = self.get_var("groups", "oo_first_master", default=None) or [None]
+ return masters[0] == self.get_var("ansible_host")
+
@abstractmethod
- def run(self, tmp, task_vars):
- """Executes a check, normally implemented as a module."""
+ def run(self):
+ """Executes a check against a host and returns a result hash similar to Ansible modules.
+
+ Actually the direction ahead is to record state in the attributes and
+ not bother building a result hash. Instead, return an empty hash and let
+ the action plugin fill it in. Or raise an OpenShiftCheckException.
+ Returning a hash may become deprecated if it does not prove necessary.
+ """
return {}
@classmethod
@@ -62,34 +139,240 @@ class OpenShiftCheck(object):
for subclass in subclass.subclasses():
yield subclass
+ def register_failure(self, error):
+ """Record in the check that a failure occurred.
+
+ Recorded failures are merged into the result hash for now. They are also saved to the output
+ directory (if provided) as <check>.failures.json and registered as a log entry in <check>.log.json.
+ """
+ # It should be an exception; make it one if not
+ if not isinstance(error, OpenShiftCheckException):
+ error = OpenShiftCheckException(str(error))
+ self.failures.append(error)
+ # duplicate it in the logs so it can be seen in the context of any
+ # information that led to the failure
+ self.register_log("failure: " + error.name, str(error))
+
+ def register_log(self, context, msg):
+ """Record an entry for the check log.
+
+ Log entries are intended to provide context for the whole sequence of what the check observed.
+ They are saved as an ordered list in a local check log file.
+ They are not included in the result or in the ansible log; they exist just for the record.
+ """
+ self.logs.append([context, msg])
+
+ def register_file(self, filename, contents=None, remote_filename=""):
+ """Record a file that a check makes available to be saved individually to output directory.
+
+ Either file contents should be passed in, or a file to be copied from the remote host
+ should be specified. Contents that are not a string are to be serialized as JSON.
+
+ NOTE: When copying a file from remote host, it is slurped into memory as base64, meaning
+ you should avoid using this on huge files (more than say 10M).
+ """
+ if contents is None and not remote_filename:
+ raise OpenShiftCheckException("File data/source not specified; this is a bug in the check.")
+ self.files_to_save.append(FileToSave(filename, contents, remote_filename))
+
+ def execute_module(self, module_name=None, module_args=None, save_as_name=None, register=True):
+ """Invoke an Ansible module from a check.
+
+ Invoke stored _execute_module, normally copied from the action
+ plugin, with its params and the task_vars and tmp given at
+ check initialization. No positional parameters beyond these
+ are specified. If it's necessary to specify any of the other
+ parameters to _execute_module then that should just be invoked
+ directly (with awareness of changes in method signature per
+ Ansible version).
+
+ So e.g. check.execute_module("foo", dict(arg1=...))
+
+ save_as_name specifies a file name for saving the result to an output directory,
+ if needed, and is intended to uniquely identify the result of invoking execute_module.
+ If not provided, the module name will be used.
+ If register is set False, then the result won't be registered in logs or files to save.
+
+ Return: result hash from module execution.
+ """
+ if self._execute_module is None:
+ raise NotImplementedError(
+ self.__class__.__name__ +
+ " invoked execute_module without providing the method at initialization."
+ )
+ result = self._execute_module(module_name, module_args, self.tmp, self.task_vars)
+ if result.get("changed"):
+ self.changed = True
+ for output in ["result", "stdout"]:
+ # output is often JSON; attempt to decode
+ try:
+ result[output + "_json"] = json.loads(result[output])
+ except (KeyError, ValueError):
+ pass
+
+ if register:
+ self.register_log("execute_module: " + module_name, result)
+ self.register_file(save_as_name or module_name + ".json", result)
+ return result
+
+ def execute_module_with_retries(self, module_name, module_args):
+ """Run execute_module and retry on failure."""
+ result = {}
+ tries = 0
+ while True:
+ res = self.execute_module(module_name, module_args)
+ if tries > self._module_retries or not res.get("failed"):
+ result.update(res)
+ return result
+ result["last_failed"] = res
+ tries += 1
+ time.sleep(self._module_retry_interval)
+
+ def get_var(self, *keys, **kwargs):
+ """Get deeply nested values from task_vars.
+
+ Ansible task_vars structures are Python dicts, often mapping strings to
+ other dicts. This helper makes it easier to get a nested value, raising
+ OpenShiftCheckException when a key is not found.
+
+ Keyword args:
+ default:
+ On missing key, return this as default value instead of raising exception.
+ convert:
+ Supply a function to apply to normalize the value before returning it.
+ None is the default (return as-is).
+ This function should raise ValueError if the user has provided a value
+ that cannot be converted, or OpenShiftCheckException if some other
+ problem needs to be described to the user.
+ """
+ if len(keys) == 1:
+ keys = keys[0].split(".")
+
+ try:
+ value = reduce(operator.getitem, keys, self.task_vars)
+ except (KeyError, TypeError):
+ if "default" not in kwargs:
+ raise OpenShiftCheckException(
+ "This check expects the '{}' inventory variable to be defined\n"
+ "in order to proceed, but it is undefined. There may be a bug\n"
+ "in Ansible, the checks, or their dependencies."
+ "".format(".".join(map(str, keys)))
+ )
+ value = kwargs["default"]
+
+ convert = kwargs.get("convert", None)
+ try:
+ if convert is None:
+ return value
+ elif convert is bool: # interpret bool as Ansible does, instead of python truthiness
+ return ansible_to_bool(value)
+ else:
+ return convert(value)
+
+ except ValueError as error: # user error in specifying value
+ raise OpenShiftCheckException(
+ 'Cannot convert inventory variable to expected type:\n'
+ ' "{var}={value}"\n'
+ '{error}'.format(var=".".join(keys), value=value, error=error)
+ )
+
+ except OpenShiftCheckException: # some other check-specific problem
+ raise
+
+ except Exception as error: # probably a bug in the function
+ raise OpenShiftCheckException(
+ 'There is a bug in this check. While trying to convert variable \n'
+ ' "{var}={value}"\n'
+ 'the given converter cannot be used or failed unexpectedly:\n'
+ '{type}: {error}'.format(
+ var=".".join(keys),
+ value=value,
+ type=error.__class__.__name__,
+ error=error
+ ))
+
+ @staticmethod
+ def normalize(name_list):
+ """Return a clean list of names.
+
+ The input may be a comma-separated string or a sequence. Leading and
+ trailing whitespace characters are removed. Empty items are discarded.
+ """
+ if isinstance(name_list, string_types):
+ name_list = name_list.split(',')
+ return [name.strip() for name in name_list if name.strip()]
+
+ @staticmethod
+ def get_major_minor_version(openshift_image_tag):
+ """Parse and return the deployed version of OpenShift as a tuple."""
+ if openshift_image_tag and openshift_image_tag[0] == 'v':
+ openshift_image_tag = openshift_image_tag[1:]
+
+ # map major release versions across releases
+ # to a common major version
+ openshift_major_release_version = {
+ "1": "3",
+ }
+
+ components = openshift_image_tag.split(".")
+ if not components or len(components) < 2:
+ msg = "An invalid version of OpenShift was found for this host: {}"
+ raise OpenShiftCheckException(msg.format(openshift_image_tag))
+
+ if components[0] in openshift_major_release_version:
+ components[0] = openshift_major_release_version[components[0]]
+
+ components = tuple(int(x) for x in components[:2])
+ return components
+
+ def find_ansible_mount(self, path):
+ """Return the mount point for path from ansible_mounts."""
+
+ # reorganize list of mounts into dict by path
+ mount_for_path = {
+ mount['mount']: mount
+ for mount
+ in self.get_var('ansible_mounts')
+ }
+
+ # NOTE: including base cases '/' and '' to ensure the loop ends
+ mount_targets = set(mount_for_path.keys()) | {'/', ''}
+ mount_point = path
+ while mount_point not in mount_targets:
+ mount_point = os.path.dirname(mount_point)
+
+ try:
+ mount = mount_for_path[mount_point]
+ self.register_log("mount point for " + path, mount)
+ return mount
+ except KeyError:
+ known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path))
+ raise OpenShiftCheckException(
+ 'Unable to determine mount point for path "{}".\n'
+ 'Known mount points: {}.'.format(path, known_mounts or 'none')
+ )
+
LOADER_EXCLUDES = (
"__init__.py",
"mixins.py",
+ "logging.py",
)
-def load_checks():
+def load_checks(path=None, subpkg=""):
"""Dynamically import all check modules for the side effect of registering checks."""
- return [
- import_module(__package__ + "." + name[:-3])
- for name in os.listdir(os.path.dirname(__file__))
- if name.endswith(".py") and name not in LOADER_EXCLUDES
- ]
+ if path is None:
+ path = os.path.dirname(__file__)
+ modules = []
-def get_var(task_vars, *keys, **kwargs):
- """Helper function to get deeply nested values from task_vars.
+ for name in os.listdir(path):
+ if os.path.isdir(os.path.join(path, name)):
+ modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name)
+ continue
- Ansible task_vars structures are Python dicts, often mapping strings to
- other dicts. This helper makes it easier to get a nested value, raising
- OpenShiftCheckException when a key is not found or returning a default value
- provided as a keyword argument.
- """
- try:
- value = reduce(operator.getitem, keys, task_vars)
- except (KeyError, TypeError):
- if "default" in kwargs:
- return kwargs["default"]
- raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys))))
- return value
+ if name.endswith(".py") and name not in LOADER_EXCLUDES:
+ modules.append(import_module(__package__ + subpkg + "." + name[:-3]))
+
+ return modules
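To show how the reworked base class is meant to be used, here is a minimal hypothetical check built on it; the check name, inventory variable, and threshold are examples only, not part of this change:

    from openshift_checks import OpenShiftCheck, OpenShiftCheckException


    class ExampleMemory(OpenShiftCheck):
        """Hypothetical check: fail if reported memory is below an example floor."""

        name = "example_memory"
        tags = ["preflight"]

        def run(self):
            # get_var raises OpenShiftCheckException if the fact is missing
            total_mb = self.get_var("ansible_memtotal_mb", convert=int)
            self.register_log("ansible_memtotal_mb", total_mb)
            if total_mb < 1024:  # example threshold, 1 GiB
                self.register_failure(OpenShiftCheckException(
                    "LowMemory",
                    "Only {} MiB of memory detected.".format(total_mb),
                ))
            return {}


    # Exercised directly with stubbed task_vars, no Ansible run required:
    check = ExampleMemory(task_vars={"ansible_memtotal_mb": 512})
    check.run()
    print([str(err) for err in check.failures])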
diff --git a/roles/openshift_health_checker/openshift_checks/diagnostics.py b/roles/openshift_health_checker/openshift_checks/diagnostics.py
new file mode 100644
index 000000000..1cfdc1129
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/diagnostics.py
@@ -0,0 +1,62 @@
+"""
+A check to run relevant diagnostics via `oc adm diagnostics`.
+"""
+
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+DIAGNOSTIC_LIST = (
+ "AggregatedLogging ClusterRegistry ClusterRoleBindings ClusterRoles "
+ "ClusterRouter DiagnosticPod NetworkCheck"
+).split()
+
+
+class DiagnosticCheck(OpenShiftCheck):
+ """A check to run relevant diagnostics via `oc adm diagnostics`."""
+
+ name = "diagnostics"
+ tags = ["health"]
+
+ def is_active(self):
+ return super(DiagnosticCheck, self).is_active() and self.is_first_master()
+
+ def run(self):
+ if self.exec_diagnostic("ConfigContexts"):
+ # only run the other diagnostics if that one succeeds (otherwise, all will fail)
+ diagnostics = self.get_var("openshift_check_diagnostics", default=DIAGNOSTIC_LIST)
+ for diagnostic in self.normalize(diagnostics):
+ self.exec_diagnostic(diagnostic)
+ return {}
+
+ def exec_diagnostic(self, diagnostic):
+ """
+ Execute an 'oc adm diagnostics' command on the remote host.
+ Raises OcNotFound or registers OcDiagFailed.
+ Returns True on success or False on failure (non-zero rc).
+ """
+ config_base = self.get_var("openshift.common.config_base")
+ args = {
+ "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+ "cmd": "adm diagnostics",
+ "extra_args": [diagnostic],
+ }
+
+ result = self.execute_module("ocutil", args, save_as_name=diagnostic + ".failure.json")
+ self.register_file(diagnostic + ".txt", result['result'])
+ if result.get("failed"):
+ if result['result'] == '[Errno 2] No such file or directory':
+ raise OpenShiftCheckException(
+ "OcNotFound",
+ "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+ "Has an installation been run on this host yet?"
+ )
+
+ self.register_failure(OpenShiftCheckException(
+ 'OcDiagFailed',
+ 'The {diag} diagnostic reported an error:\n'
+ '{error}'.format(diag=diagnostic, error=result['result'])
+ ))
+ return False
+ return True
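The list of diagnostics to run can be overridden with the openshift_check_diagnostics inventory variable; normalize() accepts either a list or a comma-separated string, as this small hypothetical illustration shows:

    from openshift_checks import OpenShiftCheck

    # Both spellings produce the same list of diagnostics to execute:
    as_string = OpenShiftCheck.normalize("ClusterRegistry, ClusterRouter")
    as_list = OpenShiftCheck.normalize(["ClusterRegistry", "ClusterRouter"])
    assert as_string == as_list == ["ClusterRegistry", "ClusterRouter"]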
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index c2792a0fe..87e6146d4 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -1,9 +1,12 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
-from openshift_checks.mixins import NotContainerizedMixin
+"""Check that there is enough disk space in predefined paths."""
+import tempfile
+import os.path
-class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+class DiskAvailability(OpenShiftCheck):
"""Check that recommended disk space is available before a first-time install."""
name = "disk_availability"
@@ -12,54 +15,134 @@ class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
# Values taken from the official installation documentation:
# https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
recommended_disk_space_bytes = {
- "masters": 40 * 10**9,
- "nodes": 15 * 10**9,
- "etcd": 20 * 10**9,
+ '/var': {
+ 'oo_masters_to_config': 40 * 10**9,
+ 'oo_nodes_to_config': 15 * 10**9,
+ 'oo_etcd_to_config': 20 * 10**9,
+ },
+ # Used to copy client binaries into,
+ # see roles/openshift_cli/library/openshift_container_binary_sync.py.
+ '/usr/local/bin': {
+ 'oo_masters_to_config': 1 * 10**9,
+ 'oo_nodes_to_config': 1 * 10**9,
+ 'oo_etcd_to_config': 1 * 10**9,
+ },
+ # Used as temporary storage in several cases.
+ tempfile.gettempdir(): {
+ 'oo_masters_to_config': 1 * 10**9,
+ 'oo_nodes_to_config': 1 * 10**9,
+ 'oo_etcd_to_config': 1 * 10**9,
+ },
}
- @classmethod
- def is_active(cls, task_vars):
+ # recommended disk space for each location under an upgrade context
+ recommended_disk_upgrade_bytes = {
+ '/var': {
+ 'oo_masters_to_config': 10 * 10**9,
+ 'oo_nodes_to_config': 5 * 10 ** 9,
+ 'oo_etcd_to_config': 5 * 10 ** 9,
+ },
+ }
+
+ def is_active(self):
"""Skip hosts that do not have recommended disk space requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
- has_disk_space_recommendation = bool(set(group_names).intersection(cls.recommended_disk_space_bytes))
- return super(DiskAvailability, cls).is_active(task_vars) and has_disk_space_recommendation
-
- def run(self, tmp, task_vars):
- group_names = get_var(task_vars, "group_names")
- ansible_mounts = get_var(task_vars, "ansible_mounts")
-
- min_free_bytes = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
- free_bytes = self.openshift_available_disk(ansible_mounts)
-
- if free_bytes < min_free_bytes:
- return {
- 'failed': True,
- 'msg': (
- 'Available disk space ({:.1f} GB) for the volume containing '
- '"/var" is below minimum recommended space ({:.1f} GB)'
- ).format(float(free_bytes) / 10**9, float(min_free_bytes) / 10**9)
+ group_names = self.get_var("group_names", default=[])
+ active_groups = set()
+ for recommendation in self.recommended_disk_space_bytes.values():
+ active_groups.update(recommendation.keys())
+ has_disk_space_recommendation = bool(active_groups.intersection(group_names))
+ return super(DiskAvailability, self).is_active() and has_disk_space_recommendation
+
+ def run(self):
+ group_names = self.get_var("group_names")
+ user_config = self.get_var("openshift_check_min_host_disk_gb", default={})
+ try:
+ # For backwards-compatibility, if openshift_check_min_host_disk_gb
+ # is a number, then it overrides the required config for '/var'.
+ number = float(user_config)
+ user_config = {
+ '/var': {
+ 'oo_masters_to_config': number,
+ 'oo_nodes_to_config': number,
+ 'oo_etcd_to_config': number,
+ },
}
+ except TypeError:
+ # If it is not a number, then it should be a nested dict.
+ pass
- return {}
+ self.register_log("recommended thresholds", self.recommended_disk_space_bytes)
+ if user_config:
+ self.register_log("user-configured thresholds", user_config)
+
+ # TODO: as suggested in
+ # https://github.com/openshift/openshift-ansible/pull/4436#discussion_r122180021,
+ # maybe we could support checking disk availability in paths that are
+ # not part of the official recommendation but present in the user
+ # configuration.
+ for path, recommendation in self.recommended_disk_space_bytes.items():
+ free_bytes = self.free_bytes(path)
+ recommended_bytes = max(recommendation.get(name, 0) for name in group_names)
- @staticmethod
- def openshift_available_disk(ansible_mounts):
- """Determine the available disk space for an OpenShift installation.
+ config = user_config.get(path, {})
+ # NOTE: the user config is in GB, but we compare bytes, thus the
+ # conversion.
+ config_bytes = max(config.get(name, 0) for name in group_names) * 10**9
+ recommended_bytes = config_bytes or recommended_bytes
- ansible_mounts should be a list of dicts like the 'setup' Ansible module
- returns.
- """
- # priority list in descending order
- supported_mnt_paths = ["/var", "/"]
- available_mnts = {mnt.get("mount"): mnt for mnt in ansible_mounts}
+ # if an "upgrade" context is set, update the minimum disk requirement
+ # as this signifies an in-place upgrade - the node might have the
+ # required total disk space, but some of that space may already be
+ # in use by the existing OpenShift deployment.
+ context = self.get_var("r_openshift_health_checker_playbook_context", default="")
+ if context == "upgrade":
+ recommended_upgrade_paths = self.recommended_disk_upgrade_bytes.get(path, {})
+ if recommended_upgrade_paths:
+ recommended_bytes = config_bytes or max(recommended_upgrade_paths.get(name, 0)
+ for name in group_names)
+ if free_bytes < recommended_bytes:
+ free_gb = float(free_bytes) / 10**9
+ recommended_gb = float(recommended_bytes) / 10**9
+ msg = (
+ 'Available disk space in "{}" ({:.1f} GB) '
+ 'is below minimum recommended ({:.1f} GB)'
+ ).format(path, free_gb, recommended_gb)
+
+ # warn if check failed under an "upgrade" context
+ # due to limits imposed by the user config
+ if config_bytes and context == "upgrade":
+ msg += ('\n\nMake sure to account for decreased disk space during an upgrade\n'
+ 'due to an existing OpenShift deployment. Please check the value of\n'
+ ' openshift_check_min_host_disk_gb={}\n'
+ 'in your Ansible inventory, and lower the recommended disk space availability\n'
+ 'if necessary for this upgrade.').format(config_bytes)
+
+ self.register_failure(msg)
+
+ return {}
+
+ def find_ansible_submounts(self, path):
+ """Return a list of ansible_mounts that are below the given path."""
+ base = os.path.join(path, "")
+ return [
+ mount
+ for mount in self.get_var("ansible_mounts")
+ if mount["mount"].startswith(base)
+ ]
+
+ def free_bytes(self, path):
+ """Return the size available in path based on ansible_mounts."""
+ submounts = sum(mnt.get('size_available', 0) for mnt in self.find_ansible_submounts(path))
+ mount = self.find_ansible_mount(path)
try:
- for path in supported_mnt_paths:
- if path in available_mnts:
- return available_mnts[path]["size_available"]
+ return mount['size_available'] + submounts
except KeyError:
- pass
-
- paths = ''.join(sorted(available_mnts)) or 'none'
- msg = "Unable to determine available disk space. Paths mounted: {}.".format(paths)
- raise OpenShiftCheckException(msg)
+ raise OpenShiftCheckException(
+ 'Unable to retrieve disk availability for "{path}".\n'
+ 'Ansible facts included a matching mount point for this path:\n'
+ ' {mount}\n'
+ 'however it is missing the size_available field.\n'
+ 'To investigate, you can inspect the output of `ansible -m setup <host>`'
+ ''.format(path=path, mount=mount)
+ )
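For reference, a sketch of the two shapes the check accepts for openshift_check_min_host_disk_gb (as seen in task_vars) under the backwards-compatibility logic above; the numbers are examples only. Values are given in GB and converted to bytes (multiplied by 10**9) before comparison:

    # Simple form: a single number applies to '/var' for all host groups.
    openshift_check_min_host_disk_gb = 50

    # Nested form: per-path and per-group thresholds.
    openshift_check_min_host_disk_gb = {
        "/var": {
            "oo_masters_to_config": 50,
            "oo_nodes_to_config": 20,
            "oo_etcd_to_config": 25,
        },
        "/usr/local/bin": {
            "oo_masters_to_config": 2,
        },
    }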
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index cce289b95..5beb20503 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -1,179 +1,245 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
-
-
-class DockerImageAvailability(OpenShiftCheck):
+"""Check that required Docker images are available."""
+
+from pipes import quote
+from ansible.module_utils import six
+from openshift_checks import OpenShiftCheck
+from openshift_checks.mixins import DockerHostMixin
+
+
+NODE_IMAGE_SUFFIXES = ["haproxy-router", "docker-registry", "deployer", "pod"]
+DEPLOYMENT_IMAGE_INFO = {
+ "origin": {
+ "namespace": "openshift",
+ "name": "origin",
+ "registry_console_image": "cockpit/kubernetes",
+ },
+ "openshift-enterprise": {
+ "namespace": "openshift3",
+ "name": "ose",
+ "registry_console_image": "registry.access.redhat.com/openshift3/registry-console",
+ },
+}
+
+
+class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
"""Check that required Docker images are available.
- This check attempts to ensure that required docker images are
- either present locally, or able to be pulled down from available
- registries defined in a host machine.
+ Determine docker images that an install would require and check that they
+ are either present in the host's docker index, or available for the host to pull
+ with known registries as defined in our inventory file (or defaults).
"""
name = "docker_image_availability"
tags = ["preflight"]
+ # we use python-docker-py to check local docker for images, and skopeo
+ # to look for images available remotely without waiting to pull them.
+ dependencies = ["python-docker-py", "skopeo"]
+ # command for checking if remote registries have an image, without docker pull
+ skopeo_command = "timeout 10 skopeo inspect --tls-verify={tls} {creds} docker://{registry}/{image}"
+ skopeo_example_command = "skopeo inspect [--tls-verify=false] [--creds=<user>:<pass>] docker://<registry>/<image>"
+
+ def __init__(self, *args, **kwargs):
+ super(DockerImageAvailability, self).__init__(*args, **kwargs)
+
+ self.registries = dict(
+ # set of registries that need to be checked insecurely (note: not accounting for CIDR entries)
+ insecure=set(self.ensure_list("openshift_docker_insecure_registries")),
+ # set of registries that should never be queried even if given in the image
+ blocked=set(self.ensure_list("openshift_docker_blocked_registries")),
+ )
- skopeo_image = "openshift/openshift-ansible"
-
- # FIXME(juanvallejo): we should consider other possible values of
- # `deployment_type` (the key here). See
- # https://github.com/openshift/openshift-ansible/blob/8e26f8c/roles/openshift_repos/vars/main.yml#L7
- docker_image_base = {
- "origin": {
- "repo": "openshift",
- "image": "origin",
- },
- "openshift-enterprise": {
- "repo": "openshift3",
- "image": "ose",
- },
- }
-
- def run(self, tmp, task_vars):
- required_images = self.required_images(task_vars)
- missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
-
- # exit early if all images were found locally
- if not missing_images:
- return {"changed": False}
-
- msg, failed, changed = self.update_skopeo_image(task_vars)
-
- # exit early if Skopeo update fails
+ # ordered list of registries (according to inventory vars) that docker will try for unscoped images
+ regs = self.ensure_list("openshift_docker_additional_registries")
+ # currently one of these registries is added whether the user wants it or not.
+ deployment_type = self.get_var("openshift_deployment_type")
+ if deployment_type == "origin" and "docker.io" not in regs:
+ regs.append("docker.io")
+ elif deployment_type == 'openshift-enterprise' and "registry.access.redhat.com" not in regs:
+ regs.append("registry.access.redhat.com")
+ self.registries["configured"] = regs
+
+ # for the oreg_url registry there may be credentials specified
+ components = self.get_var("oreg_url", default="").split('/')
+ self.registries["oreg"] = "" if len(components) < 3 else components[0]
+
+ # Retrieve and template registry credentials, if provided
+ self.skopeo_command_creds = ""
+ oreg_auth_user = self.get_var('oreg_auth_user', default='')
+ oreg_auth_password = self.get_var('oreg_auth_password', default='')
+ if oreg_auth_user != '' and oreg_auth_password != '':
+ if self._templar is not None:
+ oreg_auth_user = self._templar.template(oreg_auth_user)
+ oreg_auth_password = self._templar.template(oreg_auth_password)
+ self.skopeo_command_creds = "--creds={}:{}".format(quote(oreg_auth_user), quote(oreg_auth_password))
+
+ # record whether we could reach a registry or not (and remember results)
+ self.reachable_registries = {}
+
+ def is_active(self):
+ """Skip hosts with unsupported deployment types."""
+ deployment_type = self.get_var("openshift_deployment_type")
+ has_valid_deployment_type = deployment_type in DEPLOYMENT_IMAGE_INFO
+
+ return super(DockerImageAvailability, self).is_active() and has_valid_deployment_type
+
+ def run(self):
+ msg, failed = self.ensure_dependencies()
if failed:
return {
"failed": True,
- "changed": changed,
- "msg": "Failed to update Skopeo image ({img_name}). {msg}".format(img_name=self.skopeo_image, msg=msg),
+ "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
}
- registries = self.known_docker_registries(task_vars)
- available_images = self.available_images(missing_images, registries, task_vars)
- unavailable_images = set(missing_images) - set(available_images)
-
- if unavailable_images:
- return {
- "failed": True,
- "msg": (
- "One or more required images are not available: {}.\n"
- "Configured registries: {}"
- ).format(", ".join(sorted(unavailable_images)), ", ".join(registries)),
- "changed": changed,
- }
+ required_images = self.required_images()
+ missing_images = set(required_images) - set(self.local_images(required_images))
- return {"changed": changed}
-
- def required_images(self, task_vars):
- deployment_type = get_var(task_vars, "deployment_type")
- # FIXME(juanvallejo): we should handle gracefully with a proper error
- # message when given an unexpected value for `deployment_type`.
- image_base_name = self.docker_image_base[deployment_type]
-
- openshift_release = get_var(task_vars, "openshift_release")
- # FIXME(juanvallejo): this variable is not required when the
- # installation is non-containerized. The example inventories have it
- # commented out. We should handle gracefully and with a proper error
- # message when this variable is required and not set.
- openshift_image_tag = get_var(task_vars, "openshift_image_tag")
-
- is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
-
- if is_containerized:
- images = set(self.containerized_docker_images(image_base_name, openshift_release))
- else:
- images = set(self.rpm_docker_images(image_base_name, openshift_release))
-
- # append images with qualified image tags to our list of required images.
- # these are images with a (v0.0.0.0) tag, rather than a standard release
- # format tag (v0.0). We want to check this set in both containerized and
- # non-containerized installations.
- images.update(
- self.qualified_docker_images(self.image_from_base_name(image_base_name), "v" + openshift_image_tag)
- )
+ # exit early if all images were found locally
+ if not missing_images:
+ return {}
- return images
+ available_images = self.available_images(missing_images)
+ unavailable_images = set(missing_images) - set(available_images)
- def local_images(self, images, task_vars):
+ if unavailable_images:
+ unreachable = [reg for reg, reachable in self.reachable_registries.items() if not reachable]
+ unreachable_msg = "Failed connecting to: {}\n".format(", ".join(unreachable))
+ blocked_msg = "Blocked registries: {}\n".format(", ".join(self.registries["blocked"]))
+ msg = (
+ "One or more required container images are not available:\n {missing}\n"
+ "Checked with: {cmd}\n"
+ "Default registries searched: {registries}\n"
+ "{blocked}"
+ "{unreachable}"
+ ).format(
+ missing=",\n ".join(sorted(unavailable_images)),
+ cmd=self.skopeo_example_command,
+ registries=", ".join(self.registries["configured"]),
+ blocked=blocked_msg if self.registries["blocked"] else "",
+ unreachable=unreachable_msg if unreachable else "",
+ )
+
+ return dict(failed=True, msg=msg)
+
+ return {}
+
+ def required_images(self):
+ """
+ Determine which images we expect to need for this host.
+ Returns: a set of required images like 'openshift/origin:v3.6'
+
+ The thorny issue of determining the image names from the variables is under consideration
+ via https://github.com/openshift/openshift-ansible/issues/4415
+
+ For now we operate as follows:
+ * For containerized components (master, node, ...) we look at the deployment type and
+ use openshift/origin or openshift3/ose as the base for those component images. The
+ version is openshift_image_tag as determined by the openshift_version role.
+ * For OpenShift-managed infrastructure (router, registry...) we use oreg_url if
+ it is defined; otherwise we again use the base that depends on the deployment type.
+ Registry is not included in constructed images. It may be in oreg_url or etcd image.
+ """
+ required = set()
+ deployment_type = self.get_var("openshift_deployment_type")
+ host_groups = self.get_var("group_names")
+ # containerized etcd may not have openshift_image_tag, see bz 1466622
+ image_tag = self.get_var("openshift_image_tag", default="latest")
+ image_info = DEPLOYMENT_IMAGE_INFO[deployment_type]
+
+ # template for images that run on top of OpenShift
+ image_url = "{}/{}-{}:{}".format(image_info["namespace"], image_info["name"], "${component}", "${version}")
+ image_url = self.get_var("oreg_url", default="") or image_url
+ if 'oo_nodes_to_config' in host_groups:
+ for suffix in NODE_IMAGE_SUFFIXES:
+ required.add(image_url.replace("${component}", suffix).replace("${version}", image_tag))
+ # The registry-console is for some reason not prefixed with ose- like the other components.
+ # Nor is it versioned the same, so just look for latest.
+ # Also a completely different name is used for Origin.
+ required.add(image_info["registry_console_image"])
+
+ # images for containerized components
+ if self.get_var("openshift", "common", "is_containerized"):
+ components = set()
+ if 'oo_nodes_to_config' in host_groups:
+ components.update(["node", "openvswitch"])
+ if 'oo_masters_to_config' in host_groups: # name is "origin" or "ose"
+ components.add(image_info["name"])
+ for component in components:
+ required.add("{}/{}:{}".format(image_info["namespace"], component, image_tag))
+ if 'oo_etcd_to_config' in host_groups: # special case, note it is the same for origin/enterprise
+ required.add("registry.access.redhat.com/rhel7/etcd") # and no image tag
+
+ return required
+
+ def local_images(self, images):
"""Filter a list of images and return those available locally."""
+ found_images = []
+ for image in images:
+ # docker could have the image name as-is or prefixed with any registry
+ imglist = [image] + [reg + "/" + image for reg in self.registries["configured"]]
+ if self.is_image_local(imglist):
+ found_images.append(image)
+ return found_images
+
+ def is_image_local(self, image):
+ """Check if image is already in local docker index."""
+ result = self.execute_module("docker_image_facts", {"name": image})
+ return bool(result.get("images")) and not result.get("failed")
+
+ def ensure_list(self, registry_param):
+ """Return the task var as a list."""
+ # https://bugzilla.redhat.com/show_bug.cgi?id=1497274
+ # If the result was a string type, place it into a list. We must do this
+ # as using list() on a string will split the string into its characters.
+ # Otherwise cast to a list as was done previously.
+ registry = self.get_var(registry_param, default=[])
+ if not isinstance(registry, six.string_types):
+ return list(registry)
+ return self.normalize(registry)
+
+ def available_images(self, images):
+ """Search remotely for images. Returns: list of images found."""
return [
image for image in images
- if self.is_image_local(image, task_vars)
+ if self.is_available_skopeo_image(image)
]
- def is_image_local(self, image, task_vars):
- result = self.module_executor("docker_image_facts", {"name": image}, task_vars)
- if result.get("failed", False):
- return False
-
- return bool(result.get("images", []))
+ def is_available_skopeo_image(self, image):
+ """Use Skopeo to determine if required image exists in known registry(s)."""
+ registries = self.registries["configured"]
+ # If image already includes a registry, only use that.
+ # NOTE: This logic would incorrectly identify images that do not use a namespace, e.g.
+ # registry.access.redhat.com/rhel7 as if the registry were a namespace.
+ # It's not clear that there's any way to distinguish them, but fortunately
+ # the current set of images all look like [registry/]namespace/name[:version].
+ if image.count("/") > 1:
+ registry, image = image.split("/", 1)
+ registries = [registry]
- def known_docker_registries(self, task_vars):
- result = self.module_executor("docker_info", {}, task_vars)
-
- if result.get("failed", False):
- return []
-
- # FIXME(juanvallejo): wrong default type, result["info"] is expected to
- # contain a dictionary (see how we call `docker_info.get` below).
- docker_info = result.get("info", "")
- return [registry.get("Name", "") for registry in docker_info.get("Registries", {})]
-
- def available_images(self, images, registries, task_vars):
- """Inspect existing images using Skopeo and return all images successfully inspected."""
- return [
- image for image in images
- if self.is_image_available(image, registries, task_vars)
- ]
-
- def is_image_available(self, image, registries, task_vars):
for registry in registries:
- if self.is_available_skopeo_image(image, registry, task_vars):
+ if registry in self.registries["blocked"]:
+ continue # blocked will never be consulted
+ if registry not in self.reachable_registries:
+ self.reachable_registries[registry] = self.connect_to_registry(registry)
+ if not self.reachable_registries[registry]:
+ continue # do not keep trying unreachable registries
+
+ args = dict(registry=registry, image=image)
+ args["tls"] = "false" if registry in self.registries["insecure"] else "true"
+ args["creds"] = self.skopeo_command_creds if registry == self.registries["oreg"] else ""
+
+ result = self.execute_module_with_retries("command", {"_raw_params": self.skopeo_command.format(**args)})
+ if result.get("rc", 0) == 0 and not result.get("failed"):
return True
+ if result.get("rc") == 124: # RC 124 == timed out; mark unreachable
+ self.reachable_registries[registry] = False
return False
- def is_available_skopeo_image(self, image, registry, task_vars):
- """Uses Skopeo to determine if required image exists in a given registry."""
-
- cmd_str = "skopeo inspect docker://{registry}/{image}".format(
- registry=registry,
- image=image,
- )
-
- args = {
- "name": "skopeo_inspect",
- "image": self.skopeo_image,
- "command": cmd_str,
- "detach": False,
- "cleanup": True,
- }
- result = self.module_executor("docker_container", args, task_vars)
- return result.get("failed", False)
-
- def containerized_docker_images(self, base_name, version):
- return [
- "{image}:{version}".format(image=self.image_from_base_name(base_name), version=version)
- ]
-
- @staticmethod
- def rpm_docker_images(base, version):
- return [
- "{image_repo}/registry-console:{version}".format(image_repo=base["repo"], version=version)
- ]
-
- @staticmethod
- def qualified_docker_images(image_name, version):
- return [
- "{}-{}:{}".format(image_name, component, version)
- for component in "haproxy-router docker-registry deployer pod".split()
- ]
-
- @staticmethod
- def image_from_base_name(base):
- return "".join([base["repo"], "/", base["image"]])
-
- # ensures that the skopeo docker image exists, and updates it
- # with latest if image was already present locally.
- def update_skopeo_image(self, task_vars):
- result = self.module_executor("docker_image", {"name": self.skopeo_image}, task_vars)
- return result.get("msg", ""), result.get("failed", False), result.get("changed", False)
+ def connect_to_registry(self, registry):
+ """Use ansible wait_for module to test connectivity from host to registry. Returns bool."""
+ # test a simple TCP connection
+ host, _, port = registry.partition(":")
+ port = port or 443
+ args = dict(host=host, port=port, state="started", timeout=30)
+ result = self.execute_module("wait_for", args)
+ return result.get("rc", 0) == 0 and not result.get("failed")
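A small sketch of how the skopeo probe command above gets rendered for one registry/image pair; the registry, image, and credentials are hypothetical:

    skopeo_command = "timeout 10 skopeo inspect --tls-verify={tls} {creds} docker://{registry}/{image}"
    args = dict(
        registry="registry.example.com:5000",  # from openshift_docker_additional_registries
        image="openshift3/ose-pod:v3.6.0",
        tls="false",                            # registry listed as insecure
        creds="--creds=user:secret",            # only filled in for the oreg_url registry
    )
    print(skopeo_command.format(**args))
    # timeout 10 skopeo inspect --tls-verify=false --creds=user:secret \
    #     docker://registry.example.com:5000/openshift3/ose-pod:v3.6.0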
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
new file mode 100644
index 000000000..6808d8b2f
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -0,0 +1,276 @@
+"""Check Docker storage driver and usage."""
+import json
+import re
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+from openshift_checks.mixins import DockerHostMixin
+
+
+class DockerStorage(DockerHostMixin, OpenShiftCheck):
+ """Check Docker storage driver compatibility.
+
+ This check ensures that Docker is using a supported storage driver,
+ and that loopback is not being used (if using devicemapper).
+ Also that storage usage is not above threshold.
+ """
+
+ name = "docker_storage"
+ tags = ["health", "preflight"]
+
+ dependencies = ["python-docker-py"]
+ storage_drivers = ["devicemapper", "overlay", "overlay2"]
+ max_thinpool_data_usage_percent = 90.0
+ max_thinpool_meta_usage_percent = 90.0
+ max_overlay_usage_percent = 90.0
+
+ # TODO(lmeyer): mention these in the output when check fails
+ configuration_variables = [
+ (
+ "max_thinpool_data_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for data. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent),
+ ),
+ (
+ "max_thinpool_meta_usage_percent",
+ "For 'devicemapper' storage driver, usage threshold percentage for metadata. "
+ "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent),
+ ),
+ (
+ "max_overlay_usage_percent",
+ "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. "
+ "Format: float. Default: {:.1f}".format(max_overlay_usage_percent),
+ ),
+ ]
+
+ def run(self):
+ msg, failed = self.ensure_dependencies()
+ if failed:
+ return {
+ "failed": True,
+ "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg
+ }
+
+ # attempt to get the docker info hash from the API
+ docker_info = self.execute_module("docker_info", {})
+ if docker_info.get("failed"):
+ return {"failed": True,
+ "msg": "Failed to query Docker API. Is docker running on this host?"}
+ if not docker_info.get("info"): # this would be very strange
+ return {"failed": True,
+ "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}
+ docker_info = docker_info["info"]
+
+ # check if the storage driver we saw is valid
+ driver = docker_info.get("Driver", "[NONE]")
+ if driver not in self.storage_drivers:
+ msg = (
+ "Detected unsupported Docker storage driver '{driver}'.\n"
+ "Supported storage drivers are: {drivers}"
+ ).format(driver=driver, drivers=', '.join(self.storage_drivers))
+ return {"failed": True, "msg": msg}
+
+ # driver status info is a list of tuples; convert to dict and validate based on driver
+ driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])}
+
+ result = {}
+
+ if driver == "devicemapper":
+ result = self.check_devicemapper_support(driver_status)
+
+ if driver in ['overlay', 'overlay2']:
+ result = self.check_overlay_support(docker_info, driver_status)
+
+ return result
+
+ def check_devicemapper_support(self, driver_status):
+ """Check if dm storage driver is supported as configured. Return: result dict."""
+ if driver_status.get("Data loop file"):
+ msg = (
+ "Use of loopback devices with the Docker devicemapper storage driver\n"
+ "(the default storage configuration) is unsupported in production.\n"
+ "Please use docker-storage-setup to configure a backing storage volume.\n"
+ "See http://red.ht/2rNperO for further information."
+ )
+ return {"failed": True, "msg": msg}
+ result = self.check_dm_usage(driver_status)
+ return result
+
+ def check_dm_usage(self, driver_status):
+ """Check usage thresholds for Docker dm storage driver. Return: result dict.
+ Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
+ implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
+ devicemapper storage. The LV is "thin" because it does not use all available storage
+ from its VG, instead expanding as needed; so to determine available space, we gather
+ current usage as the Docker API reports for the driver as well as space available for
+ expansion in the pool's VG.
+ Usage within the LV is divided into pools allocated to data and metadata, either of which
+ could run out of space first; so we check both.
+ """
+ vals = dict(
+ vg_free=self.get_vg_free(driver_status.get("Pool Name")),
+ data_used=driver_status.get("Data Space Used"),
+ data_total=driver_status.get("Data Space Total"),
+ metadata_used=driver_status.get("Metadata Space Used"),
+ metadata_total=driver_status.get("Metadata Space Total"),
+ )
+
+ # convert all human-readable strings to bytes
+ for key, value in vals.copy().items():
+ try:
+ vals[key + "_bytes"] = self.convert_to_bytes(value)
+ except ValueError as err: # unlikely to hit this from API info, but just to be safe
+ return {
+ "failed": True,
+ "values": vals,
+ "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err))
+ }
+
+ # determine the threshold percentages which usage should not exceed
+ for name, default in [("data", self.max_thinpool_data_usage_percent),
+ ("metadata", self.max_thinpool_meta_usage_percent)]:
+ percent = self.get_var("max_thinpool_" + name + "_usage_percent", default=default)
+ try:
+ vals[name + "_threshold"] = float(percent)
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent)
+ }
+
+ # test whether the thresholds are exceeded
+ messages = []
+ for name in ["data", "metadata"]:
+ vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / (
+ vals[name + "_total_bytes"] + vals["vg_free_bytes"])
+ if vals[name + "_pct_used"] > vals[name + "_threshold"]:
+ messages.append(
+ "Docker thinpool {name} usage percentage {pct:.1f} "
+ "is higher than threshold {thresh:.1f}.".format(
+ name=name,
+ pct=vals[name + "_pct_used"],
+ thresh=vals[name + "_threshold"],
+ ))
+ vals["failed"] = True
+
+ vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
+ return vals
+
+ def get_vg_free(self, pool):
+ """Determine which VG to examine according to the pool name. Return: size vgs reports.
+ Pool name is the only indicator currently available from the Docker API driver info.
+ We assume a name that looks like "vg--name-docker--pool";
+ vg and lv names with inner hyphens doubled, joined by a hyphen.
+ """
+ match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen
+ if not match: # unlikely, but... be clear if we assumed wrong
+ raise OpenShiftCheckException(
+ "This host's Docker reports it is using a storage pool named '{}'.\n"
+ "However this name does not have the expected format of 'vgname-lvname'\n"
+ "so the available storage in the VG cannot be determined.".format(pool)
+ )
+ vg_name = match.groups()[0].replace("--", "-")
+ vgs_cmd = "/sbin/vgs --noheadings -o vg_free --units g --select vg_name=" + vg_name
+ # should return free space like " 12.00g" if the VG exists; empty if it does not
+
+ ret = self.execute_module("command", {"_raw_params": vgs_cmd})
+ if ret.get("failed") or ret.get("rc", 0) != 0:
+ raise OpenShiftCheckException(
+ "Is LVM installed? Failed to run /sbin/vgs "
+ "to determine docker storage usage:\n" + ret.get("msg", "")
+ )
+ size = ret.get("stdout", "").strip()
+ if not size:
+ raise OpenShiftCheckException(
+ "This host's Docker reports it is using a storage pool named '{pool}'.\n"
+ "which we expect to come from local VG '{vg}'.\n"
+ "However, /sbin/vgs did not find this VG. Is Docker for this host"
+ "running and using the storage on the host?".format(pool=pool, vg=vg_name)
+ )
+ return size
+
+ @staticmethod
+ def convert_to_bytes(string):
+ """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes."""
+ units = dict(
+ b=1,
+ k=1024,
+ m=1024**2,
+ g=1024**3,
+ t=1024**4,
+ p=1024**5,
+ )
+ string = string or ""
+ match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string) # float followed by optional unit
+ if not match:
+ raise ValueError("Cannot convert to a byte size: " + string)
+
+ number, unit = match.groups()
+ multiplier = 1 if not unit else units.get(unit.lower())
+ if not multiplier:
+ raise ValueError("Cannot convert to a byte size: " + string)
+
+ return float(number) * multiplier
+
+ def check_overlay_support(self, docker_info, driver_status):
+ """Check if overlay storage driver is supported for this host. Return: result dict."""
+ # check for xfs as backing store
+ backing_fs = driver_status.get("Backing Filesystem", "[NONE]")
+ if backing_fs != "xfs":
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n"
+ "'xfs' as the backing storage, but this host's storage is type '{fs}'."
+ ).format(fs=backing_fs)
+ return {"failed": True, "msg": msg}
+
+ # check support for OS and kernel version
+ o_s = docker_info.get("OperatingSystem", "[NONE]")
+ if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s:
+ # keep it simple, only check enterprise kernel versions; assume everyone else is good
+ kernel = docker_info.get("KernelVersion", "[NONE]")
+ kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)]
+ if kernel_arr < [3, 10, 0, 514]: # rhel < 7.3
+ msg = (
+ "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n"
+ "kernel version 3.10.0-514; but Docker reports kernel version {version}."
+ ).format(version=kernel)
+ return {"failed": True, "msg": msg}
+ # NOTE: we could check for --selinux-enabled here but docker won't even start with
+ # that option until it's supported in the kernel so we don't need to.
+
+ return self.check_overlay_usage(docker_info)
+
+ def check_overlay_usage(self, docker_info):
+ """Check disk usage on OverlayFS backing store volume. Return: result dict."""
+ path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"]
+
+ threshold = self.get_var("max_overlay_usage_percent", default=self.max_overlay_usage_percent)
+ try:
+ threshold = float(threshold)
+ except ValueError:
+ return {
+ "failed": True,
+ "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold),
+ }
+
+ mount = self.find_ansible_mount(path)
+ try:
+ free_bytes = mount['size_available']
+ total_bytes = mount['size_total']
+ usage = 100.0 * (total_bytes - free_bytes) / total_bytes
+ except (KeyError, ZeroDivisionError):
+ return {
+ "failed": True,
+ "msg": "The ansible_mount found for path {} is invalid.\n"
+ "This is likely to be an Ansible bug. The record was:\n"
+ "{}".format(path, json.dumps(mount, indent=2)),
+ }
+
+ if usage > threshold:
+ return {
+ "failed": True,
+ "msg": (
+ "For Docker OverlayFS mount point {path},\n"
+ "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}."
+ ).format(path=mount["mount"], pct=usage, thresh=threshold)
+ }
+
+ return {}
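Two of the helpers above are easiest to understand with concrete values; a small sketch, assuming the file is importable as openshift_checks.docker_storage (the pool name is a hypothetical docker-storage-setup default):

    import re

    from openshift_checks.docker_storage import DockerStorage

    # Human-readable sizes from the Docker API become float byte counts
    # (binary units are assumed).
    assert DockerStorage.convert_to_bytes("10.3 G") == 10.3 * 1024 ** 3

    # get_vg_free() recovers the VG name from a pool name such as
    # "docker--vg-docker--pool": doubled inner hyphens collapse back to one.
    match = re.match(r'((?:[^-]|--)+)-(?!-)', "docker--vg-docker--pool")
    assert match.groups()[0].replace("--", "-") == "docker-vg"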
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
new file mode 100644
index 000000000..f4296753a
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py
@@ -0,0 +1,72 @@
+"""
+Ansible module for determining if the size of OpenShift image data exceeds a specified limit in an etcd cluster.
+"""
+
+from openshift_checks import OpenShiftCheck
+
+
+class EtcdImageDataSize(OpenShiftCheck):
+ """Check that total size of OpenShift image data does not exceed the recommended limit in an etcd cluster"""
+
+ name = "etcd_imagedata_size"
+ tags = ["etcd"]
+
+ def run(self):
+ etcd_mountpath = self.find_ansible_mount("/var/lib/etcd")
+ etcd_avail_diskspace = etcd_mountpath["size_available"]
+ etcd_total_diskspace = etcd_mountpath["size_total"]
+
+ etcd_imagedata_size_limit = self.get_var(
+ "etcd_max_image_data_size_bytes",
+ default=int(0.5 * float(etcd_total_diskspace - etcd_avail_diskspace))
+ )
+
+ etcd_is_ssl = self.get_var("openshift", "master", "etcd_use_ssl", default=False)
+ etcd_port = self.get_var("openshift", "master", "etcd_port", default=2379)
+ etcd_hosts = self.get_var("openshift", "master", "etcd_hosts")
+
+ config_base = self.get_var("openshift", "common", "config_base")
+
+ cert = self.get_var("etcd_client_cert", default=config_base + "/master/master.etcd-client.crt")
+ key = self.get_var("etcd_client_key", default=config_base + "/master/master.etcd-client.key")
+ ca_cert = self.get_var("etcd_client_ca_cert", default=config_base + "/master/master.etcd-ca.crt")
+
+ for etcd_host in list(etcd_hosts):
+ args = {
+ "size_limit_bytes": etcd_imagedata_size_limit,
+ "paths": ["/openshift.io/images", "/openshift.io/imagestreams"],
+ "host": etcd_host,
+ "port": etcd_port,
+ "protocol": "https" if etcd_is_ssl else "http",
+ "version_prefix": "/v2",
+ "allow_redirect": True,
+ "ca_cert": ca_cert,
+ "cert": {
+ "cert": cert,
+ "key": key,
+ },
+ }
+
+ etcdkeysize = self.execute_module("etcdkeysize", args)
+
+ if etcdkeysize.get("rc", 0) != 0 or etcdkeysize.get("failed"):
+ msg = 'Failed to retrieve stats for etcd host "{host}": {reason}'
+ reason = etcdkeysize.get("msg")
+ if etcdkeysize.get("module_stderr"):
+ reason = etcdkeysize["module_stderr"]
+
+ msg = msg.format(host=etcd_host, reason=reason)
+ return {"failed": True, "msg": msg}
+
+ if etcdkeysize["size_limit_exceeded"]:
+ limit = self._to_gigabytes(etcd_imagedata_size_limit)
+ msg = ("The size of OpenShift image data stored in etcd host "
+ "\"{host}\" exceeds the maximum recommended limit of {limit:.2f} GB. "
+ "Use the `oadm prune images` command to cleanup unused Docker images.")
+ return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)}
+
+ return {}
+
+ @staticmethod
+ def _to_gigabytes(byte_size):
+ return float(byte_size) / 10.0**9
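
When etcd_max_image_data_size_bytes is not set in the inventory, the default limit above works out to half of the space currently consumed on the /var/lib/etcd mount. A short worked example with invented disk figures:

    # Invented figures for the filesystem backing /var/lib/etcd.
    size_total = 50 * 10**9       # 50 GB volume
    size_available = 30 * 10**9   # 30 GB still free, so 20 GB in use
    default_limit = int(0.5 * float(size_total - size_available))
    print(default_limit)                    # 10000000000 -> 10 GB limit
    print(float(default_limit) / 10.0**9)   # 10.0, as _to_gigabytes would report it
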
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
new file mode 100644
index 000000000..8b20ccb49
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py
@@ -0,0 +1,44 @@
+"""Check that scans journalctl for messages caused as a symptom of increased etcd traffic."""
+
+from openshift_checks import OpenShiftCheck
+
+
+class EtcdTraffic(OpenShiftCheck):
+ """Check if host is being affected by an increase in etcd traffic."""
+
+ name = "etcd_traffic"
+ tags = ["health", "etcd"]
+
+ def is_active(self):
+ """Skip hosts that do not have etcd in their group names."""
+ group_names = self.get_var("group_names", default=[])
+ valid_group_names = "oo_etcd_to_config" in group_names
+
+ version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+ valid_version = version in ((3, 4), (3, 5))
+
+ return super(EtcdTraffic, self).is_active() and valid_group_names and valid_version
+
+ def run(self):
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ unit = "etcd_container" if is_containerized else "etcd"
+
+ log_matchers = [{
+ "start_regexp": r"Starting Etcd Server",
+ "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
+ "unit": unit
+ }]
+
+ match = self.execute_module("search_journalctl", {"log_matchers": log_matchers})
+
+ if match.get("matched"):
+ msg = ("Higher than normal etcd traffic detected.\n"
+ "OpenShift 3.4 introduced an increase in etcd traffic.\n"
+ "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n"
+ "Please refer to https://access.redhat.com/solutions/2916381 for more information.")
+ return {"failed": True, "msg": msg}
+
+ if match.get("failed"):
+ return {"failed": True, "msg": "\n".join(match.get("errors"))}
+
+ return {}
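
The matcher above is handed to the search_journalctl module, which only reports back matched or failed. A rough in-memory illustration of what a matcher is meant to express, namely finding the symptom regexp in the unit's log after its most recent start, assuming that reading of the module (its implementation is outside this hunk):

    import re

    # Rough illustration only; the real search_journalctl module reads journald for the unit.
    matcher = {
        "start_regexp": r"Starting Etcd Server",
        "regexp": r"etcd: sync duration of [^,]+, expected less than 1s",
    }
    lines = [
        "Starting Etcd Server",
        "etcd: sync duration of 2.3s, expected less than 1s",
    ]
    start = max(i for i, line in enumerate(lines) if re.search(matcher["start_regexp"], line))
    matched = any(re.search(matcher["regexp"], line) for line in lines[start:])
    print(matched)  # True -> the check would fail with the etcd traffic warning
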
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
new file mode 100644
index 000000000..3d75da6f9
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py
@@ -0,0 +1,47 @@
+"""A health check for OpenShift clusters."""
+
+from openshift_checks import OpenShiftCheck
+
+
+class EtcdVolume(OpenShiftCheck):
+ """Ensures etcd storage usage does not exceed a given threshold."""
+
+ name = "etcd_volume"
+ tags = ["etcd", "health"]
+
+ # Default device usage threshold. Value should be in the range [0, 100].
+ default_threshold_percent = 90
+ # Where to find etcd data
+ etcd_mount_path = "/var/lib/etcd"
+
+ def is_active(self):
+ etcd_hosts = (
+ self.get_var("groups", "oo_etcd_to_config", default=[]) or
+ self.get_var("groups", "oo_masters_to_config", default=[]) or
+ []
+ )
+ is_etcd_host = self.get_var("ansible_host") in etcd_hosts
+ return super(EtcdVolume, self).is_active() and is_etcd_host
+
+ def run(self):
+ mount_info = self.find_ansible_mount(self.etcd_mount_path)
+ available = mount_info["size_available"]
+ total = mount_info["size_total"]
+ used = total - available
+
+ threshold = self.get_var(
+ "etcd_device_usage_threshold_percent",
+ default=self.default_threshold_percent
+ )
+
+ used_percent = 100.0 * used / total
+
+ if used_percent > threshold:
+ device = mount_info.get("device", "unknown")
+ mount = mount_info.get("mount", "unknown")
+ msg = "etcd storage usage ({:.1f}%) is above threshold ({:.1f}%). Device: {}, mount: {}.".format(
+ used_percent, threshold, device, mount
+ )
+ return {"failed": True, "msg": msg}
+
+ return {}
diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
new file mode 100644
index 000000000..b27f97172
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -0,0 +1,43 @@
+"""Check for an aggregated logging Curator deployment"""
+
+from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck
+
+
+class Curator(LoggingCheck):
+ """Check for an aggregated logging Curator deployment"""
+
+ name = "curator"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ curator_pods = self.get_pods_for_component("curator")
+ self.check_curator(curator_pods)
+ # TODO(lmeyer): run it all again for the ops cluster
+
+ return {}
+
+ def check_curator(self, pods):
+ """Check to see if curator is up and working. Returns: error string"""
+ if not pods:
+ raise OpenShiftCheckException(
+ "MissingComponentPods",
+ "There are no Curator pods for the logging stack,\n"
+ "so nothing will prune Elasticsearch indexes.\n"
+ "Is Curator correctly deployed?"
+ )
+
+ not_running = self.not_running_pods(pods)
+ if len(not_running) == len(pods):
+ raise OpenShiftCheckException(
+ "CuratorNotRunning",
+ "The Curator pod is not currently in a running state,\n"
+ "so Elasticsearch indexes may increase without bound."
+ )
+ if len(pods) - len(not_running) > 1:
+ raise OpenShiftCheckException(
+ "TooManyCurators",
+ "There is more than one Curator pod running. This should not normally happen.\n"
+ "Although this doesn't cause any problems, you may want to investigate."
+ )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
new file mode 100644
index 000000000..986a01f38
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -0,0 +1,212 @@
+"""Check for an aggregated logging Elasticsearch deployment"""
+
+import json
+import re
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Elasticsearch(LoggingCheck):
+ """Check for an aggregated logging Elasticsearch deployment"""
+
+ name = "elasticsearch"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ es_pods = self.get_pods_for_component("es")
+ self.check_elasticsearch(es_pods)
+ # TODO(lmeyer): run it all again for the ops cluster
+
+ return {}
+
+ def check_elasticsearch(self, es_pods):
+ """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors."""
+ running_pods, errors = self.running_elasticsearch_pods(es_pods)
+ pods_by_name = {
+ pod['metadata']['name']: pod for pod in running_pods
+ # Filter out pods that are not members of a DC
+ if pod['metadata'].get('labels', {}).get('deploymentconfig')
+ }
+ if not pods_by_name:
+ # nothing running, cannot run the rest of the check
+ errors.append(OpenShiftCheckException(
+ 'NoRunningPods',
+ 'No logging Elasticsearch pods were found running, so no logs are being aggregated.'
+ ))
+ raise OpenShiftCheckExceptionList(errors)
+
+ errors += self.check_elasticsearch_masters(pods_by_name)
+ errors += self.check_elasticsearch_node_list(pods_by_name)
+ errors += self.check_es_cluster_health(pods_by_name)
+ errors += self.check_elasticsearch_diskspace(pods_by_name)
+ if errors:
+ raise OpenShiftCheckExceptionList(errors)
+
+ def running_elasticsearch_pods(self, es_pods):
+ """Returns: list of running pods, list of errors about non-running pods"""
+ not_running = self.not_running_pods(es_pods)
+ running_pods = [pod for pod in es_pods if pod not in not_running]
+ if not_running:
+ return running_pods, [OpenShiftCheckException(
+ 'PodNotRunning',
+ 'The following Elasticsearch pods are defined but not running:\n'
+ '{pods}'.format(pods=''.join(
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+ for pod in not_running
+ ))
+ )]
+ return running_pods, []
+
+ @staticmethod
+ def _build_es_curl_cmd(pod_name, url):
+ base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
+ return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
+
+ def check_elasticsearch_masters(self, pods_by_name):
+ """Check that Elasticsearch masters are sane. Returns: list of errors"""
+ es_master_names = set()
+ errors = []
+ for pod_name in pods_by_name.keys():
+ # Compare what each ES node reports as master and compare for split brain
+ get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
+ master_name_str = self.exec_oc(get_master_cmd, [], save_as_name="get_master_names.json")
+ master_names = (master_name_str or '').split(' ')
+ if len(master_names) > 1:
+ es_master_names.add(master_names[1])
+ else:
+ errors.append(OpenShiftCheckException(
+ 'NoMasterName',
+ 'Elasticsearch {pod} gave unexpected response when asked master name:\n'
+ ' {response}'.format(pod=pod_name, response=master_name_str)
+ ))
+
+ if not es_master_names:
+ errors.append(OpenShiftCheckException(
+ 'NoMasterFound',
+ 'No logging Elasticsearch masters were found.'
+ ))
+ return errors
+
+ if len(es_master_names) > 1:
+ errors.append(OpenShiftCheckException(
+ 'SplitBrainMasters',
+ 'Found multiple Elasticsearch masters according to the pods:\n'
+ '{master_list}\n'
+ 'This implies that the masters have "split brain" and are not correctly\n'
+ 'replicating data for the logging cluster. Log loss is likely to occur.'
+ .format(master_list='\n'.join(' ' + master for master in es_master_names))
+ ))
+
+ return errors
+
+ def check_elasticsearch_node_list(self, pods_by_name):
+ """Check that reported ES masters are accounted for by pods. Returns: list of errors"""
+
+ if not pods_by_name:
+ return [OpenShiftCheckException(
+ 'MissingComponentPods',
+ 'No logging Elasticsearch pods were found.'
+ )]
+
+ # get ES cluster nodes
+ node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
+ cluster_node_data = self.exec_oc(node_cmd, [], save_as_name="get_es_nodes.json")
+ try:
+ cluster_nodes = json.loads(cluster_node_data)['nodes']
+ except (ValueError, KeyError):
+ return [OpenShiftCheckException(
+ 'MissingNodeList',
+ 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
+ cluster_node_data
+ )]
+
+ # Try to match all ES-reported node hosts to known pods.
+ errors = []
+ for node in cluster_nodes.values():
+ # Note that with 1.4/3.4 the pod IP may be used as the master name
+ if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
+ for pod_name, pod in pods_by_name.items()):
+ errors.append(OpenShiftCheckException(
+ 'EsPodNodeMismatch',
+ 'The Elasticsearch cluster reports a member node "{node}"\n'
+ 'that does not correspond to any known ES pod.'.format(node=node['host'])
+ ))
+
+ return errors
+
+ def check_es_cluster_health(self, pods_by_name):
+ """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
+ errors = []
+ for pod_name in pods_by_name.keys():
+ cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
+ cluster_health_data = self.exec_oc(cluster_health_cmd, [], save_as_name='get_es_health.json')
+ try:
+ health_res = json.loads(cluster_health_data)
+ if not health_res or not health_res.get('status'):
+ raise ValueError()
+ except ValueError:
+ errors.append(OpenShiftCheckException(
+ 'BadEsResponse',
+ 'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
+ 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
+ ))
+ continue
+
+ if health_res['status'] not in ['green', 'yellow']:
+ errors.append(OpenShiftCheckException(
+ 'EsClusterHealthRed',
+ 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
+ ))
+
+ return errors
+
+ def check_elasticsearch_diskspace(self, pods_by_name):
+ """
+ Exec into an ES pod and query the diskspace on the persistent volume.
+ Returns: list of errors
+ """
+ errors = []
+ for pod_name in pods_by_name.keys():
+ df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
+ disk_output = self.exec_oc(df_cmd, [], save_as_name='get_pv_diskspace.json')
+ lines = disk_output.splitlines()
+ # expecting one header looking like 'IUse% Use%' and one body line
+ body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
+ if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
+ errors.append(OpenShiftCheckException(
+ 'BadDfResponse',
+ 'Could not retrieve storage usage from logging ES pod "{pod}".\n'
+ 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
+ ))
+ continue
+ inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
+
+ inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
+ if int(inode_pct) >= int(inode_pct_thresh):
+ errors.append(OpenShiftCheckException(
+ 'InodeUsageTooHigh',
+ 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}, greater than threshold {limit}.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(inode_pct),
+ limit=str(inode_pct_thresh),
+ param='openshift_check_efk_es_inode_pct',
+ )))
+ disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
+ if int(disk_pct) >= int(disk_pct_thresh):
+ errors.append(OpenShiftCheckException(
+ 'DiskUsageTooHigh',
+ 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
+ ' is {pct}, greater than threshold {limit}.\n'
+ ' Note: threshold can be specified in inventory with {param}'.format(
+ pod=pod_name,
+ pct=str(disk_pct),
+ limit=str(disk_pct_thresh),
+ param='openshift_check_efk_es_storage_pct',
+ )))
+
+ return errors
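
check_elasticsearch_diskspace expects exactly two lines back from `df --output=ipcent,pcent`: one header and one data row, which body_re picks apart. A short sketch of that parse against sample output (numbers invented):

    import re

    # Sample `df --output=ipcent,pcent /elasticsearch/persistent` output; values invented.
    disk_output = "IUse% Use%\n  5%  91%\n"
    body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'

    lines = disk_output.splitlines()
    inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
    print(inode_pct, disk_pct)  # '5' '91' -> 91 >= default storage threshold of 80, DiskUsageTooHigh
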
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
new file mode 100644
index 000000000..3b192a281
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -0,0 +1,154 @@
+"""Check for an aggregated logging Fluentd deployment"""
+
+import json
+
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Fluentd(LoggingCheck):
+ """Check for an aggregated logging Fluentd deployment"""
+
+ name = "fluentd"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check the Fluentd deployment and raise an error if any problems are found."""
+
+ fluentd_pods = self.get_pods_for_component("fluentd")
+ self.check_fluentd(fluentd_pods)
+ return {}
+
+ def check_fluentd(self, pods):
+ """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found."""
+
+ node_selector = self.get_var(
+ 'openshift_logging_fluentd_nodeselector',
+ default='logging-infra-fluentd=true'
+ )
+
+ nodes_by_name = self.get_nodes_by_name()
+ fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+
+ errors = []
+ errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
+ errors += self.check_nodes_have_fluentd(pods, fluentd_nodes)
+ errors += self.check_fluentd_pods_running(pods)
+
+ # Make sure there are no extra fluentd pods
+ if len(pods) > len(fluentd_nodes):
+ errors.append(OpenShiftCheckException(
+ 'TooManyFluentdPods',
+ 'There are more Fluentd pods running than nodes labeled.\n'
+ 'This may not cause problems with logging but it likely indicates something wrong.'
+ ))
+
+ if errors:
+ raise OpenShiftCheckExceptionList(errors)
+
+ def get_nodes_by_name(self):
+ """Retrieve all the node definitions. Returns: dict(name: node)"""
+ nodes_json = self.exec_oc("get nodes -o json", [])
+ try:
+ nodes = json.loads(nodes_json)
+ except ValueError: # no valid json - should not happen
+ raise OpenShiftCheckException(
+ "BadOcNodeList",
+ "Could not obtain a list of nodes to validate fluentd.\n"
+ "Output from oc get:\n" + nodes_json
+ )
+ if not nodes or not nodes.get('items'): # also should not happen
+ raise OpenShiftCheckException(
+ "NoNodesDefined",
+ "No nodes appear to be defined according to the API."
+ )
+ return {
+ node['metadata']['name']: node
+ for node in nodes['items']
+ }
+
+ @staticmethod
+ def filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+ """Filter to all nodes with fluentd label. Returns dict(name: node)"""
+ label, value = node_selector.split('=', 1)
+ fluentd_nodes = {
+ name: node for name, node in nodes_by_name.items()
+ if node['metadata']['labels'].get(label) == value
+ }
+ if not fluentd_nodes:
+ raise OpenShiftCheckException(
+ 'NoNodesLabeled',
+ 'There are no nodes with the fluentd label {label}.\n'
+ 'This means no logs will be aggregated from the nodes.'.format(label=node_selector)
+ )
+ return fluentd_nodes
+
+ def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
+ """Note if nodes are not labeled as expected. Returns: error list"""
+ intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
+ if not intended_nodes or '--all' in intended_nodes:
+ intended_nodes = nodes_by_name.keys()
+ nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
+ if nodes_missing_labels:
+ return [OpenShiftCheckException(
+ 'NodesUnlabeled',
+ 'The following nodes are supposed to be labeled with {label} but are not:\n'
+ ' {nodes}\n'
+ 'Fluentd will not aggregate logs from these nodes.'.format(
+ label=node_selector, nodes=', '.join(nodes_missing_labels)
+ ))]
+
+ return []
+
+ @staticmethod
+ def check_nodes_have_fluentd(pods, fluentd_nodes):
+ """Make sure fluentd is on all the labeled nodes. Returns: error list"""
+ unmatched_nodes = fluentd_nodes.copy()
+ node_names_by_label = {
+ node['metadata']['labels']['kubernetes.io/hostname']: name
+ for name, node in fluentd_nodes.items()
+ }
+ node_names_by_internal_ip = {
+ address['address']: name
+ for name, node in fluentd_nodes.items()
+ for address in node['status']['addresses']
+ if address['type'] == "InternalIP"
+ }
+ for pod in pods:
+ for name in [
+ pod['spec']['nodeName'],
+ node_names_by_internal_ip.get(pod['spec']['nodeName']),
+ node_names_by_label.get(pod.get('spec', {}).get('host')),
+ ]:
+ unmatched_nodes.pop(name, None)
+ if unmatched_nodes:
+ return [OpenShiftCheckException(
+ 'MissingFluentdPod',
+ 'The following nodes are supposed to have a Fluentd pod but do not:\n'
+ ' {nodes}\n'
+ 'These nodes will not have their logs aggregated.'.format(
+ nodes='\n '.join(unmatched_nodes.keys())
+ ))]
+
+ return []
+
+ def check_fluentd_pods_running(self, pods):
+ """Make sure all fluentd pods are running. Returns: error string"""
+ not_running = super(Fluentd, self).not_running_pods(pods)
+ if not_running:
+ return [OpenShiftCheckException(
+ 'FluentdNotRunning',
+ 'The following Fluentd pods are supposed to be running but are not:\n'
+ ' {pods}\n'
+ 'These pods will not aggregate logs from their nodes.'.format(
+ pods='\n'.join(
+ " {name} ({host})".format(
+ name=pod['metadata']['name'],
+ host=pod['spec'].get('host', 'None')
+ )
+ for pod in not_running
+ )
+ ))]
+
+ return []
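
check_nodes_have_fluentd counts a labeled node as covered if any of three keys derived from a pod matches it: the pod's nodeName, that nodeName resolved through the node's InternalIP address, or the pod's spec.host resolved through the kubernetes.io/hostname label. A reduced sketch with one fabricated node and pod:

    # Fabricated node and pod records, trimmed to the fields the matching logic reads.
    fluentd_nodes = {
        "node-1.example.com": {
            "metadata": {"labels": {"kubernetes.io/hostname": "node-1.example.com"}},
            "status": {"addresses": [{"type": "InternalIP", "address": "10.0.0.11"}]},
        },
    }
    pods = [{"spec": {"nodeName": "10.0.0.11", "host": "node-1.example.com"}}]

    by_ip = {
        addr["address"]: name
        for name, node in fluentd_nodes.items()
        for addr in node["status"]["addresses"]
        if addr["type"] == "InternalIP"
    }
    unmatched = dict(fluentd_nodes)
    for pod in pods:
        for name in (pod["spec"]["nodeName"], by_ip.get(pod["spec"]["nodeName"])):
            unmatched.pop(name, None)
    print(sorted(unmatched))  # [] -> every labeled node has a fluentd pod
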
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
new file mode 100644
index 000000000..e93cc9028
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -0,0 +1,131 @@
+"""
+Module for performing checks on a Fluentd logging deployment configuration
+"""
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class FluentdConfig(LoggingCheck):
+ """Module that checks logging configuration of an integrated logging Fluentd deployment"""
+ name = "fluentd_config"
+ tags = ["health"]
+
+ def is_active(self):
+ logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+
+ try:
+ version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+ except ValueError:
+ # if failed to parse OpenShift version, perform check anyway (if logging enabled)
+ return logging_deployed
+
+ return logging_deployed and version < (3, 6)
+
+ def run(self):
+ """Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
+ config_error = self.check_logging_config()
+ if config_error:
+ msg = ("The following Fluentd logging configuration problem was found:"
+ "\n{}".format(config_error))
+ return {"failed": True, "msg": msg}
+
+ return {}
+
+ def check_logging_config(self):
+ """Ensure that the configured Docker logging driver matches fluentd settings.
+ This means that, at least for now, if the following condition is met:
+
+ openshift_logging_fluentd_use_journal == True
+
+ then the value of the configured Docker logging driver should be "journald".
+ Otherwise, the value of the Docker logging driver should be "json-file".
+ Returns an error string if the above condition is not met, or None otherwise."""
+ use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True)
+
+ # if check is running on a master, retrieve all running pods
+ # and check any pod's container for the env var "USE_JOURNAL"
+ group_names = self.get_var("group_names")
+ if "oo_masters_to_config" in group_names:
+ use_journald = self.check_fluentd_env_var()
+
+ docker_info = self.execute_module("docker_info", {})
+ try:
+ logging_driver = docker_info["info"]["LoggingDriver"]
+ except KeyError:
+ return "Unable to determine Docker logging driver."
+
+ recommended_logging_driver = "journald"
+ error = None
+
+ # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver`
+ # option as an inventory file variable, or adding the log driver value as part of the
+ # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that
+ # can be passed to the Docker binary; the only other recommendation that can be made, would be
+ # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running
+ # individual containers.
+ if use_journald and logging_driver != "journald":
+ error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n'
+ 'This differs from your Docker configuration, which has been set to use "{driver}" '
+ 'as the default method of storing logs.\n'
+ 'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+ 'from your Docker containers.').format(driver=logging_driver)
+ elif not use_journald and logging_driver != "json-file":
+ recommended_logging_driver = "json-file"
+ error = ('Your Fluentd configuration is set to aggregate Docker container logs from '
+ 'individual json log files per container.\n '
+ 'This differs from your Docker configuration, which has been set to use '
+ '"{driver}" as the default method of storing logs.\n'
+ 'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+ 'from your Docker containers.').format(driver=logging_driver)
+
+ if error:
+ error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n'
+ ' openshift_docker_options="--log-driver={driver}"\n\n'
+ 'Alternatively, you can add the following option to your Docker configuration, located in '
+ '"/etc/docker/daemon.json":\n\n'
+ '{{ "log-driver": "{driver}" }}\n\n'
+ 'See https://docs.docker.com/engine/admin/logging/json-file '
+ 'for more information.').format(driver=recommended_logging_driver)
+
+ return error
+
+ def check_fluentd_env_var(self):
+ """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod."""
+ running_pods = self.running_fluentd_pods()
+
+ try:
+ pod_containers = running_pods[0]["spec"]["containers"]
+ except KeyError:
+ return "Unable to detect running containers on selected Fluentd pod."
+
+ if not pod_containers:
+ msg = ('There are no running containers on selected Fluentd pod "{}".\n'
+ 'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", ""))
+ raise OpenShiftCheckException(msg)
+
+ pod_env = pod_containers[0].get("env")
+ if not pod_env:
+ msg = ('There are no environment variables set on the Fluentd container "{}".\n'
+ 'Unable to calculate expected logging driver.').format(pod_containers[0].get("name"))
+ raise OpenShiftCheckException(msg)
+
+ for env in pod_env:
+ if env["name"] == "USE_JOURNAL":
+ return env.get("value", "false") != "false"
+
+ return False
+
+ def running_fluentd_pods(self):
+ """Return a list of running fluentd pods."""
+ fluentd_pods = self.get_pods_for_component("fluentd")
+
+ running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
+ if not running_fluentd_pods:
+ raise OpenShiftCheckException(
+ 'No Fluentd pods were found to be in the "Running" state. '
+ 'At least one Fluentd pod is required in order to perform this check.'
+ )
+
+ return running_fluentd_pods
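
In short: journald aggregation expects Docker's log driver to be "journald", and file-based aggregation expects "json-file", which is exactly the pairing the error text recommends setting via openshift_docker_options or /etc/docker/daemon.json. The decision reduced to a one-liner, for illustration only:

    # Illustration of the Docker log driver the check expects for a given fluentd journal setting.
    def expected_docker_log_driver(use_journald):
        return "journald" if use_journald else "json-file"

    assert expected_docker_log_driver(True) == "journald"
    assert expected_docker_log_driver(False) == "json-file"
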
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
new file mode 100644
index 000000000..3b1cf8baa
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -0,0 +1,226 @@
+"""
+Module for performing checks on a Kibana logging deployment
+"""
+
+import json
+import ssl
+
+try:
+ from urllib2 import HTTPError, URLError
+ import urllib2
+except ImportError:
+ from urllib.error import HTTPError, URLError
+ import urllib.request as urllib2
+
+from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException
+
+
+class Kibana(LoggingCheck):
+ """Module that checks an integrated logging Kibana deployment"""
+
+ name = "kibana"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Check various things and gather errors. Returns: result as hash"""
+
+ kibana_pods = self.get_pods_for_component("kibana")
+ self.check_kibana(kibana_pods)
+ self.check_kibana_route()
+ # TODO(lmeyer): run it all again for the ops cluster
+
+ return {}
+
+ def _verify_url_internal(self, url):
+ """
+ Try to reach a URL from the host.
+ Returns: success (bool), reason (for failure)
+ """
+ args = dict(
+ url=url,
+ follow_redirects='none',
+ validate_certs='no', # likely to be signed with internal CA
+ # TODO(lmeyer): give users option to validate certs
+ status_code=302,
+ )
+ result = self.execute_module('uri', args)
+ if result.get('failed'):
+ return result['msg']
+ return None
+
+ @staticmethod
+ def _verify_url_external(url):
+ """
+ Try to reach a URL from ansible control host.
+ Raise an OpenShiftCheckException if anything goes wrong.
+ """
+ # This actually checks from the ansible control host, which may or may not
+ # really be "external" to the cluster.
+
+ # Disable SSL cert validation to work around internally signed certs
+ ctx = ssl.create_default_context()
+ ctx.check_hostname = False # or setting CERT_NONE is refused
+ ctx.verify_mode = ssl.CERT_NONE
+
+ # Verify that the url is returning a valid response
+ try:
+ # We only care if the url connects and responds
+ return_code = urllib2.urlopen(url, context=ctx).getcode()
+ except HTTPError as httperr:
+ return httperr.reason
+ except URLError as urlerr:
+ return str(urlerr)
+
+ # there appears to be no way to prevent urlopen from following redirects
+ if return_code != 200:
+ return 'Expected success (200) but got return code {}'.format(int(return_code))
+
+ return None
+
+ def check_kibana(self, pods):
+ """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not."""
+
+ if not pods:
+ raise OpenShiftCheckException(
+ "MissingComponentPods",
+ "There are no Kibana pods deployed, so no access to the logging UI."
+ )
+
+ not_running = self.not_running_pods(pods)
+ if len(not_running) == len(pods):
+ raise OpenShiftCheckException(
+ "NoRunningPods",
+ "No Kibana pod is in a running state, so there is no access to the logging UI."
+ )
+ elif not_running:
+ raise OpenShiftCheckException(
+ "PodNotRunning",
+ "The following Kibana pods are not currently in a running state:\n"
+ " {pods}\n"
+ "However at least one is, so service may not be impacted.".format(
+ pods="\n ".join(pod['metadata']['name'] for pod in not_running)
+ )
+ )
+
+ def _get_kibana_url(self):
+ """
+ Get kibana route or report error.
+ Returns: url
+ """
+
+ # Get logging url
+ get_route = self.exec_oc("get route logging-kibana -o json", [])
+ if not get_route:
+ raise OpenShiftCheckException(
+ 'no_route_exists',
+ 'No route is defined for Kibana in the logging namespace,\n'
+ 'so the logging stack is not accessible. Is logging deployed?\n'
+ 'Did something remove the logging-kibana route?'
+ )
+
+ try:
+ route = json.loads(get_route)
+ # check that the route has been accepted by a router
+ ingress = route["status"]["ingress"]
+ except (ValueError, KeyError):
+ raise OpenShiftCheckException(
+ 'get_route_failed',
+ '"oc get route" returned an unexpected response:\n' + get_route
+ )
+
+ # ingress can be null if there is no router, or empty if not routed
+ if not ingress or not ingress[0]:
+ raise OpenShiftCheckException(
+ 'route_not_accepted',
+ 'The logging-kibana route is not being routed by any router.\n'
+ 'Is the router deployed and working?'
+ )
+
+ host = route.get("spec", {}).get("host")
+ if not host:
+ raise OpenShiftCheckException(
+ 'route_missing_host',
+ 'The logging-kibana route has no hostname defined,\n'
+ 'which should never happen. Did something alter its definition?'
+ )
+
+ return 'https://{}/'.format(host)
+
+ def check_kibana_route(self):
+ """
+ Check to see if kibana route is up and working.
+ Raises exception if not.
+ """
+
+ kibana_url = self._get_kibana_url()
+
+ # first, check that kibana is reachable from the master.
+ error = self._verify_url_internal(kibana_url)
+ if error:
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ raise OpenShiftCheckException(
+ 'FailedToConnectInternal',
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url)
+ )
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ raise OpenShiftCheckException(
+ 'FailedToResolveInternal',
+ 'Failed to connect from this master to Kibana URL {url}\n'
+ 'because the hostname does not resolve.\n'
+ 'Is DNS configured for the Kibana hostname?'.format(url=kibana_url)
+ )
+ elif 'Status code was not' in error:
+ raise OpenShiftCheckException(
+ 'WrongReturnCodeInternal',
+ 'A request from this master to the Kibana URL {url}\n'
+ 'did not return the correct status code (302).\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues. The output was:\n'
+ ' {error}'.format(url=kibana_url, error=error)
+ )
+ raise OpenShiftCheckException(
+ 'MiscRouteErrorInternal',
+ 'Error validating the logging Kibana route internally:\n' + error
+ )
+
+ # in production we would like the kibana route to work from outside the
+ # cluster too; but that may not be the case, so allow disabling just this part.
+ if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true":
+ return
+ error = self._verify_url_external(kibana_url)
+
+ if not error:
+ return
+
+ error_fmt = (
+ 'Error validating the logging Kibana route:\n{error}\n'
+ 'To disable external Kibana route validation, set the variable:\n'
+ ' openshift_check_efk_kibana_external=False'
+ )
+ if 'urlopen error [Errno 111] Connection refused' in error:
+ msg = (
+ 'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+ 'Is the router for the Kibana hostname exposed externally?'
+ ).format(url=kibana_url)
+ raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg))
+ elif 'urlopen error [Errno -2] Name or service not known' in error:
+ msg = (
+ 'Failed to resolve the Kibana hostname in {url}\n'
+ 'from the Ansible control host.\n'
+ 'Is DNS configured to resolve this Kibana hostname externally?'
+ ).format(url=kibana_url)
+ raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg))
+ elif 'Expected success (200)' in error:
+ msg = (
+ 'A request to Kibana at {url}\n'
+ 'returned the wrong error code:\n'
+ ' {error}\n'
+ 'This could mean that Kibana is malfunctioning, the hostname is\n'
+ 'resolving incorrectly, or other network issues.'
+ ).format(url=kibana_url, error=error)
+ raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg))
+ raise OpenShiftCheckException(
+ 'MiscRouteError',
+ 'Error validating the logging Kibana route externally:\n' + error
+ )
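
_get_kibana_url only reads two parts of the route object: spec.host to build the URL and status.ingress to confirm a router has admitted the route. A trimmed example of the JSON shape it expects, with invented values:

    import json

    # Trimmed, invented example of `oc get route logging-kibana -o json` output.
    get_route = json.dumps({
        "spec": {"host": "kibana.apps.example.com"},
        "status": {"ingress": [{"host": "kibana.apps.example.com"}]},
    })

    route = json.loads(get_route)
    if route["status"]["ingress"] and route.get("spec", {}).get("host"):
        print("https://{}/".format(route["spec"]["host"]))  # https://kibana.apps.example.com/
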
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
new file mode 100644
index 000000000..05ba73ca1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -0,0 +1,101 @@
+"""
+Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack
+"""
+
+import json
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+class MissingComponentPods(OpenShiftCheckException):
+ """Raised when a component has no pods in the namespace."""
+ pass
+
+
+class CouldNotUseOc(OpenShiftCheckException):
+ """Raised when ocutil has a failure running oc."""
+ pass
+
+
+class LoggingCheck(OpenShiftCheck):
+ """Base class for OpenShift aggregated logging component checks"""
+
+ # FIXME: this should not be listed as a check, since it is not meant to be
+ # run by itself.
+
+ name = "logging"
+
+ def is_active(self):
+ logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False)
+ return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master()
+
+ def run(self):
+ return {}
+
+ def get_pods_for_component(self, logging_component):
+ """Get all pods for a given component. Returns: list of pods."""
+ pod_output = self.exec_oc(
+ "get pods -l component={} -o json".format(logging_component),
+ [],
+ )
+ try:
+ pods = json.loads(pod_output) # raises ValueError if deserialize fails
+ if not pods or not pods.get('items'): # also a broken response, treat the same
+ raise ValueError()
+ except ValueError:
+ # successful run but non-parsing data generally means there were no pods to be found
+ raise MissingComponentPods(
+ 'There are no "{}" component pods in the "{}" namespace.\n'
+ 'Is logging deployed?'.format(logging_component, self.logging_namespace())
+ )
+
+ return pods['items']
+
+ @staticmethod
+ def not_running_pods(pods):
+ """Returns: list of pods not in a ready and running state"""
+ return [
+ pod for pod in pods
+ if not pod.get("status", {}).get("containerStatuses") or any(
+ container['ready'] is False
+ for container in pod['status']['containerStatuses']
+ ) or not any(
+ condition['type'] == 'Ready' and condition['status'] == 'True'
+ for condition in pod['status'].get('conditions', [])
+ )
+ ]
+
+ def logging_namespace(self):
+ """Returns the namespace in which logging is configured to deploy."""
+ return self.get_var("openshift_logging_namespace", default="logging")
+
+ def exec_oc(self, cmd_str="", extra_args=None, save_as_name=None):
+ """
+ Execute an 'oc' command in the remote host.
+ Returns: output of the command,
+ or raises CouldNotUseOc on error
+ """
+ config_base = self.get_var("openshift", "common", "config_base")
+ args = {
+ "namespace": self.logging_namespace(),
+ "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+ "cmd": cmd_str,
+ "extra_args": list(extra_args) if extra_args else [],
+ }
+
+ result = self.execute_module("ocutil", args, save_as_name=save_as_name)
+ if result.get("failed"):
+ if result['result'] == '[Errno 2] No such file or directory':
+ raise CouldNotUseOc(
+ "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+ "Has an installation been run on this host yet?"
+ )
+
+ raise CouldNotUseOc(
+ 'Unexpected error using `oc` to validate the logging stack components.\n'
+ 'Error executing `oc {cmd}`:\n'
+ '{error}'.format(cmd=args['cmd'], error=result['result'])
+ )
+
+ return result.get("result", "")
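
not_running_pods treats a pod as running only when it has containerStatuses with every container ready and a Ready condition whose status is "True"; anything else lands in the not-running list. A minimal fabricated pod that would pass that test:

    # Minimal fabricated pod record that not_running_pods() would treat as running.
    pod = {
        "metadata": {"name": "logging-es-abc123"},
        "status": {
            "containerStatuses": [{"ready": True}],
            "conditions": [{"type": "Ready", "status": "True"}],
        },
    }

    running = bool(pod["status"].get("containerStatuses")) and all(
        c["ready"] for c in pod["status"]["containerStatuses"]
    ) and any(
        cond["type"] == "Ready" and cond["status"] == "True"
        for cond in pod["status"].get("conditions", [])
    )
    print(running)  # True -> would not appear in the not_running list
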
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
new file mode 100644
index 000000000..cacdf4213
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -0,0 +1,129 @@
+"""
+Check for ensuring logs from pods can be queried in a reasonable amount of time.
+"""
+
+import json
+import time
+
+from uuid import uuid4
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+ES_CMD_TIMEOUT_SECONDS = 30
+
+
+class LoggingIndexTime(LoggingCheck):
+ """Check that pod logs are aggregated and indexed in ElasticSearch within a reasonable amount of time."""
+ name = "logging_index_time"
+ tags = ["health", "logging"]
+
+ def run(self):
+ """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs."""
+ try:
+ log_index_timeout = int(
+ self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
+ )
+ except ValueError:
+ raise OpenShiftCheckException(
+ 'InvalidTimeout',
+ 'Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
+ 'Value must be an integer representing an amount in seconds.'
+ )
+
+ running_component_pods = dict()
+
+ # get all component pods
+ for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']):
+ pods = self.get_pods_for_component(component)
+ running_pods = self.running_pods(pods)
+
+ if not running_pods:
+ raise OpenShiftCheckException(
+ component + 'NoRunningPods',
+ 'No {} pods in the "Running" state were found.'
+ 'At least one pod is required in order to perform this check.'.format(name)
+ )
+
+ running_component_pods[component] = running_pods
+
+ uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0])
+ self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout)
+ return {}
+
+ def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs):
+ """Retry an Elasticsearch query every second until query success, or a defined
+ length of time has passed."""
+ deadline = time.time() + timeout_secs
+ interval = 1
+ while not self.query_es_from_es(es_pod, uuid):
+ if time.time() + interval > deadline:
+ raise OpenShiftCheckException(
+ "NoMatchFound",
+ "expecting match in Elasticsearch for message with uuid {}, "
+ "but no matches were found after {}s.".format(uuid, timeout_secs)
+ )
+ time.sleep(interval)
+
+ def curl_kibana_with_uuid(self, kibana_pod):
+ """curl Kibana with a unique uuid."""
+ uuid = self.generate_uuid()
+ pod_name = kibana_pod["metadata"]["name"]
+ exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"
+ exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid)
+
+ error_str = self.exec_oc(exec_cmd, [])
+
+ try:
+ error_code = json.loads(error_str)["statusCode"]
+ except (KeyError, ValueError):
+ raise OpenShiftCheckException(
+ 'kibanaInvalidResponse',
+ 'invalid response returned from Kibana request:\n'
+ 'Command: {}\nResponse: {}'.format(exec_cmd, error_str)
+ )
+
+ if error_code != 404:
+ raise OpenShiftCheckException(
+ 'kibanaInvalidReturnCode',
+ 'invalid error code returned from Kibana request.\n'
+ 'Expecting error code "404", but got "{}" instead.'.format(error_code)
+ )
+
+ return uuid
+
+ def query_es_from_es(self, es_pod, uuid):
+ """curl the Elasticsearch pod and look for a unique uuid in its logs."""
+ pod_name = es_pod["metadata"]["name"]
+ exec_cmd = (
+ "exec {pod_name} -- curl --max-time 30 -s -f "
+ "--cacert /etc/elasticsearch/secret/admin-ca "
+ "--cert /etc/elasticsearch/secret/admin-cert "
+ "--key /etc/elasticsearch/secret/admin-key "
+ "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
+ )
+ exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid)
+ result = self.exec_oc(exec_cmd, [], save_as_name="query_for_uuid.json")
+
+ try:
+ count = json.loads(result)["count"]
+ except (KeyError, ValueError):
+ raise OpenShiftCheckException(
+ 'esInvalidResponse',
+ 'Invalid response from Elasticsearch query:\n'
+ ' {}\n'
+ 'Response was:\n{}'.format(exec_cmd, result)
+ )
+
+ return count
+
+ @staticmethod
+ def running_pods(pods):
+ """Filter pods that are running."""
+ return [pod for pod in pods if pod['status']['phase'] == 'Running']
+
+ @staticmethod
+ def generate_uuid():
+ """Wrap uuid generator. Allows for testing with expected values."""
+ return str(uuid4())
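
The whole probe is: request a throwaway UUID path from Kibana (expecting a 404), then poll Elasticsearch once a second for a message count matching that UUID until it shows up or the timeout expires. The polling part, reduced to a standalone loop with a stubbed query, illustrative only:

    import time

    def wait_for_match(query, timeout_secs, interval=1):
        # Same shape as wait_until_cmd_or_err above: retry until success or deadline.
        deadline = time.time() + timeout_secs
        while not query():
            if time.time() + interval > deadline:
                raise RuntimeError("no match found after {}s".format(timeout_secs))
            time.sleep(interval)

    attempts = {"count": 0}

    def fake_query():
        # Stub standing in for query_es_from_es(); succeeds on the third poll.
        attempts["count"] += 1
        return attempts["count"] >= 3

    wait_for_match(fake_query, timeout_secs=30)
    print("matched after {} polls".format(attempts["count"]))  # matched after 3 polls
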
diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py
index 28805dc37..e7a8ec976 100644
--- a/roles/openshift_health_checker/openshift_checks/memory_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py
@@ -1,5 +1,8 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that recommended memory is available."""
+from openshift_checks import OpenShiftCheck
+
+MIB = 2**20
+GIB = 2**30
class MemoryAvailability(OpenShiftCheck):
@@ -11,33 +14,36 @@ class MemoryAvailability(OpenShiftCheck):
# Values taken from the official installation documentation:
# https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
recommended_memory_bytes = {
- "masters": 16 * 10**9,
- "nodes": 8 * 10**9,
- "etcd": 20 * 10**9,
+ "oo_masters_to_config": 16 * GIB,
+ "oo_nodes_to_config": 8 * GIB,
+ "oo_etcd_to_config": 8 * GIB,
}
+ # https://access.redhat.com/solutions/3006511 part of physical RAM is reserved and excluded from memtotal
+ memtotal_adjustment = 1 * GIB
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
"""Skip hosts that do not have recommended memory requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
- has_memory_recommendation = bool(set(group_names).intersection(cls.recommended_memory_bytes))
- return super(MemoryAvailability, cls).is_active(task_vars) and has_memory_recommendation
+ group_names = self.get_var("group_names", default=[])
+ has_memory_recommendation = bool(set(group_names).intersection(self.recommended_memory_bytes))
+ return super(MemoryAvailability, self).is_active() and has_memory_recommendation
- def run(self, tmp, task_vars):
- group_names = get_var(task_vars, "group_names")
- total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * 10**6
+ def run(self):
+ group_names = self.get_var("group_names")
+ total_memory_bytes = self.get_var("ansible_memtotal_mb") * MIB
- min_memory_bytes = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+ recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+ configured_min = float(self.get_var("openshift_check_min_host_memory_gb", default=0)) * GIB
+ min_memory_bytes = configured_min or recommended_min
- if total_memory_bytes < min_memory_bytes:
+ if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes:
return {
'failed': True,
'msg': (
- 'Available memory ({available:.1f} GB) '
- 'below recommended value ({recommended:.1f} GB)'
+ 'Available memory ({available:.1f} GiB) is too far '
+ 'below recommended value ({recommended:.1f} GiB)'
).format(
- available=float(total_memory_bytes) / 10**9,
- recommended=float(min_memory_bytes) / 10**9,
+ available=float(total_memory_bytes) / GIB,
+ recommended=float(min_memory_bytes) / GIB,
),
}
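
Because of the 1 GiB memtotal_adjustment, a host can report somewhat less than the recommended figure and still pass. A worked example for a master node whose kernel reports 15667 MiB (an illustrative value for a nominal 16 GB host):

    MIB = 2**20
    GIB = 2**30

    recommended = 16 * GIB                # oo_masters_to_config
    ansible_memtotal_mb = 15667           # example: a "16 GB" host after kernel/firmware reservations
    total_memory_bytes = ansible_memtotal_mb * MIB   # roughly 15.3 GiB
    memtotal_adjustment = 1 * GIB

    print(total_memory_bytes + memtotal_adjustment >= recommended)  # True -> check passes
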
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
index 20d160eaf..cfbdea303 100644
--- a/roles/openshift_health_checker/openshift_checks/mixins.py
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -1,15 +1,54 @@
-# pylint: disable=missing-docstring,too-few-public-methods
"""
Mixin classes meant to be used with subclasses of OpenShiftCheck.
"""
-from openshift_checks import get_var
-
class NotContainerizedMixin(object):
"""Mixin for checks that are only active when not in containerized mode."""
+ # permanent # pylint: disable=too-few-public-methods
+ # Reason: The mixin is not intended to stand on its own as a class.
+
+ def is_active(self):
+ """Only run on non-containerized hosts."""
+ is_containerized = self.get_var("openshift", "common", "is_containerized")
+ return super(NotContainerizedMixin, self).is_active() and not is_containerized
+
+
+class DockerHostMixin(object):
+ """Mixin for checks that are only active on hosts that require Docker."""
+
+ dependencies = []
+
+ def is_active(self):
+ """Only run on hosts that depend on Docker."""
+ group_names = set(self.get_var("group_names", default=[]))
+ needs_docker = set(["oo_nodes_to_config"])
+ if self.get_var("openshift.common.is_containerized"):
+ needs_docker.update(["oo_masters_to_config", "oo_etcd_to_config"])
+ return super(DockerHostMixin, self).is_active() and bool(group_names.intersection(needs_docker))
+
+ def ensure_dependencies(self):
+ """
+ Ensure that docker-related packages exist, but not on atomic hosts
+ (which would not be able to install but should already have them).
+ Returns: msg, failed
+ """
+ if self.get_var("openshift", "common", "is_atomic"):
+ return "", False
- @classmethod
- def is_active(cls, task_vars):
- is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
- return super(NotContainerizedMixin, cls).is_active(task_vars) and not is_containerized
+ # NOTE: we would use the "package" module but it's actually an action plugin
+ # and it's not clear how to invoke one of those. This is about the same anyway:
+ result = self.execute_module_with_retries(
+ self.get_var("ansible_pkg_mgr", default="yum"),
+ {"name": self.dependencies, "state": "present"},
+ )
+ msg = result.get("msg", "")
+ if result.get("failed"):
+ if "No package matching" in msg:
+ msg = "Ensure that all required dependencies can be installed via `yum`.\n"
+ msg = (
+ "Unable to install required packages on this host:\n"
+ " {deps}\n{msg}"
+ ).format(deps=',\n '.join(self.dependencies), msg=msg)
+ failed = result.get("failed", False) or result.get("rc", 0) != 0
+ return msg, failed
diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py
new file mode 100644
index 000000000..416805c4d
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py
@@ -0,0 +1,54 @@
+"""
+Check that the installed version of Open vSwitch is compatible with the
+currently installed version of OpenShift.
+"""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+from openshift_checks.mixins import NotContainerizedMixin
+
+
+class OvsVersion(NotContainerizedMixin, OpenShiftCheck):
+ """Check that packages in a package_list are installed on the host
+ and are the correct version as determined by an OpenShift installation.
+ """
+
+ name = "ovs_version"
+ tags = ["health"]
+
+ openshift_to_ovs_version = {
+ "3.6": ["2.6", "2.7"],
+ "3.5": ["2.6", "2.7"],
+ "3.4": "2.4",
+ }
+
+ def is_active(self):
+ """Skip hosts that do not have package requirements."""
+ group_names = self.get_var("group_names", default=[])
+ master_or_node = 'oo_masters_to_config' in group_names or 'oo_nodes_to_config' in group_names
+ return super(OvsVersion, self).is_active() and master_or_node
+
+ def run(self):
+ args = {
+ "package_list": [
+ {
+ "name": "openvswitch",
+ "version": self.get_required_ovs_version(),
+ },
+ ],
+ }
+ return self.execute_module("rpm_version", args)
+
+ def get_required_ovs_version(self):
+ """Return the correct Open vSwitch version for the current OpenShift version"""
+ openshift_version_tuple = self.get_major_minor_version(self.get_var("openshift_image_tag"))
+
+ if openshift_version_tuple < (3, 5):
+ return self.openshift_to_ovs_version["3.4"]
+
+ openshift_version = ".".join(str(x) for x in openshift_version_tuple)
+ ovs_version = self.openshift_to_ovs_version.get(openshift_version)
+ if ovs_version:
+ return ovs_version
+
+ msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(openshift_version))
diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py
index a7eb720fd..090e438ff 100644
--- a/roles/openshift_health_checker/openshift_checks/package_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/package_availability.py
@@ -1,5 +1,6 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that required RPM packages are available."""
+
+from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import NotContainerizedMixin
@@ -9,26 +10,27 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
name = "package_availability"
tags = ["preflight"]
- @classmethod
- def is_active(cls, task_vars):
- return super(PackageAvailability, cls).is_active(task_vars) and task_vars["ansible_pkg_mgr"] == "yum"
+ def is_active(self):
+ """Run only when yum is the package manager as the code is specific to it."""
+ return super(PackageAvailability, self).is_active() and self.get_var("ansible_pkg_mgr") == "yum"
- def run(self, tmp, task_vars):
- rpm_prefix = get_var(task_vars, "openshift", "common", "service_type")
- group_names = get_var(task_vars, "group_names", default=[])
+ def run(self):
+ rpm_prefix = self.get_var("openshift", "common", "service_type")
+ group_names = self.get_var("group_names", default=[])
packages = set()
- if "masters" in group_names:
+ if "oo_masters_to_config" in group_names:
packages.update(self.master_packages(rpm_prefix))
- if "nodes" in group_names:
+ if "oo_nodes_to_config" in group_names:
packages.update(self.node_packages(rpm_prefix))
args = {"packages": sorted(set(packages))}
- return self.execute_module("check_yum_update", args, tmp, task_vars)
+ return self.execute_module_with_retries("check_yum_update", args)
@staticmethod
def master_packages(rpm_prefix):
+ """Return a list of RPMs that we expect a master install to have available."""
return [
"{rpm_prefix}".format(rpm_prefix=rpm_prefix),
"{rpm_prefix}-clients".format(rpm_prefix=rpm_prefix),
@@ -36,8 +38,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
"bash-completion",
"cockpit-bridge",
"cockpit-docker",
- "cockpit-kubernetes",
- "cockpit-shell",
+ "cockpit-system",
"cockpit-ws",
"etcd",
"httpd-tools",
@@ -45,6 +46,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
@staticmethod
def node_packages(rpm_prefix):
+ """Return a list of RPMs that we expect a node install to have available."""
return [
"{rpm_prefix}".format(rpm_prefix=rpm_prefix),
"{rpm_prefix}-node".format(rpm_prefix=rpm_prefix),
diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py
index fd0c0a755..8464e8a5e 100644
--- a/roles/openshift_health_checker/openshift_checks/package_update.py
+++ b/roles/openshift_health_checker/openshift_checks/package_update.py
@@ -1,14 +1,14 @@
-# pylint: disable=missing-docstring
+"""Check that a yum update would not run into conflicts with available packages."""
from openshift_checks import OpenShiftCheck
from openshift_checks.mixins import NotContainerizedMixin
class PackageUpdate(NotContainerizedMixin, OpenShiftCheck):
- """Check that there are no conflicts in RPM packages."""
+ """Check that a yum update would not run into conflicts with available packages."""
name = "package_update"
tags = ["preflight"]
- def run(self, tmp, task_vars):
+ def run(self):
args = {"packages": []}
- return self.execute_module("check_yum_update", args, tmp, task_vars)
+ return self.execute_module_with_retries("check_yum_update", args)
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
index 682f6bd40..2f09b22fc 100644
--- a/roles/openshift_health_checker/openshift_checks/package_version.py
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -1,5 +1,8 @@
-# pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+"""Check that available RPM packages match the required versions."""
+
+import re
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
from openshift_checks.mixins import NotContainerizedMixin
@@ -9,17 +12,116 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
name = "package_version"
tags = ["preflight"]
- @classmethod
- def is_active(cls, task_vars):
+ # NOTE: versions outside those specified are mapped to least/greatest
+ openshift_to_ovs_version = {
+ (3, 4): "2.4",
+ (3, 5): ["2.6", "2.7"],
+ (3, 6): ["2.6", "2.7"],
+ }
+
+ openshift_to_docker_version = {
+ (3, 1): "1.8",
+ (3, 2): "1.10",
+ (3, 3): "1.10",
+ (3, 4): "1.12",
+ (3, 5): "1.12",
+ (3, 6): "1.12",
+ }
+
+ # map major OpenShift release versions across releases to a common major version
+ map_major_release_version = {
+ 1: 3,
+ }
+
+ def is_active(self):
"""Skip hosts that do not have package requirements."""
- group_names = get_var(task_vars, "group_names", default=[])
- master_or_node = 'masters' in group_names or 'nodes' in group_names
- return super(PackageVersion, cls).is_active(task_vars) and master_or_node
+ group_names = self.get_var("group_names", default=[])
+ master_or_node = 'oo_masters_to_config' in group_names or 'oo_nodes_to_config' in group_names
+ return super(PackageVersion, self).is_active() and master_or_node
+
+ def run(self):
+ rpm_prefix = self.get_var("openshift", "common", "service_type")
+ openshift_release = self.get_var("openshift_release", default='')
+ deployment_type = self.get_var("openshift_deployment_type")
+ check_multi_minor_release = deployment_type in ['openshift-enterprise']
- def run(self, tmp, task_vars):
args = {
- "requested_openshift_release": get_var(task_vars, "openshift_release", default=''),
- "openshift_deployment_type": get_var(task_vars, "openshift_deployment_type"),
- "rpm_prefix": get_var(task_vars, "openshift", "common", "service_type"),
+ "package_mgr": self.get_var("ansible_pkg_mgr"),
+ "package_list": [
+ {
+ "name": "openvswitch",
+ "version": self.get_required_ovs_version(),
+ "check_multi": False,
+ },
+ {
+ "name": "docker",
+ "version": self.get_required_docker_version(),
+ "check_multi": False,
+ },
+ {
+ "name": "{}".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ {
+ "name": "{}-master".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ {
+ "name": "{}-node".format(rpm_prefix),
+ "version": openshift_release,
+ "check_multi": check_multi_minor_release,
+ },
+ ],
}
- return self.execute_module("aos_version", args, tmp, task_vars)
+
+ return self.execute_module_with_retries("aos_version", args)
+
+ def get_required_ovs_version(self):
+ """Return the correct Open vSwitch version(s) for the current OpenShift version."""
+ openshift_version = self.get_openshift_version_tuple()
+
+ earliest = min(self.openshift_to_ovs_version)
+ latest = max(self.openshift_to_ovs_version)
+ if openshift_version < earliest:
+ return self.openshift_to_ovs_version[earliest]
+ if openshift_version > latest:
+ return self.openshift_to_ovs_version[latest]
+
+ ovs_version = self.openshift_to_ovs_version.get(openshift_version)
+ if not ovs_version:
+ msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version)))
+
+ return ovs_version
+
+ def get_required_docker_version(self):
+ """Return the correct Docker version(s) for the current OpenShift version."""
+ openshift_version = self.get_openshift_version_tuple()
+
+ earliest = min(self.openshift_to_docker_version)
+ latest = max(self.openshift_to_docker_version)
+ if openshift_version < earliest:
+ return self.openshift_to_docker_version[earliest]
+ if openshift_version > latest:
+ return self.openshift_to_docker_version[latest]
+
+ docker_version = self.openshift_to_docker_version.get(openshift_version)
+ if not docker_version:
+ msg = "There is no recommended version of Docker for the current version of OpenShift: {}"
+ raise OpenShiftCheckException(msg.format(".".join(str(comp) for comp in openshift_version)))
+
+ return docker_version
+
+ def get_openshift_version_tuple(self):
+ """Return received image tag as a normalized (X, Y) minor version tuple."""
+ version = self.get_var("openshift_image_tag")
+ comps = [int(component) for component in re.findall(r'\d+', version)]
+
+ if len(comps) < 2:
+ msg = "An invalid version of OpenShift was found for this host: {}"
+ raise OpenShiftCheckException(msg.format(version))
+
+ comps[0] = self.map_major_release_version.get(comps[0], comps[0])
+ return tuple(comps[0:2])
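The helpers above normalize openshift_image_tag into an (X, Y) tuple and clamp versions outside the mapped range to the nearest known release. A minimal standalone sketch of that lookup, reusing the OVS mapping shown in this diff (the assertions are illustrative, not part of the change):

    import re

    OPENSHIFT_TO_OVS = {
        (3, 4): "2.4",
        (3, 5): ["2.6", "2.7"],
        (3, 6): ["2.6", "2.7"],
    }
    MAP_MAJOR = {1: 3}  # origin 1.x is treated as the common 3.x numbering

    def version_tuple(image_tag):
        # normalize e.g. "v1.5.0.1" -> (3, 5)
        comps = [int(c) for c in re.findall(r'\d+', image_tag)]
        comps[0] = MAP_MAJOR.get(comps[0], comps[0])
        return tuple(comps[:2])

    def required_ovs(openshift_version):
        # versions outside the mapped range fall back to the earliest/latest entry
        earliest, latest = min(OPENSHIFT_TO_OVS), max(OPENSHIFT_TO_OVS)
        if openshift_version < earliest:
            return OPENSHIFT_TO_OVS[earliest]
        if openshift_version > latest:
            return OPENSHIFT_TO_OVS[latest]
        return OPENSHIFT_TO_OVS[openshift_version]

    assert version_tuple("v1.5.0.1") == (3, 5)
    assert required_ovs((3, 3)) == "2.4"            # below range -> earliest
    assert required_ovs((3, 9)) == ["2.6", "2.7"]   # above range -> latest
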
diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py
index 2693ae37b..40ad27d5d 100644
--- a/roles/openshift_health_checker/test/action_plugin_test.py
+++ b/roles/openshift_health_checker/test/action_plugin_test.py
@@ -3,10 +3,12 @@ import pytest
from ansible.playbook.play_context import PlayContext
from openshift_health_check import ActionModule, resolve_checks
-from openshift_checks import OpenShiftCheckException
+from openshift_health_check import copy_remote_file_to_dir, write_result_to_output_dir, write_to_output_file
+from openshift_checks import OpenShiftCheckException, FileToSave
-def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None):
+def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None,
+ run_logs=None, run_files=None, changed=False, get_var_return=None):
"""Returns a new class that is compatible with OpenShiftCheck for testing."""
_name, _tags = name, tags
@@ -15,18 +17,30 @@ def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, ru
name = _name
tags = _tags or []
- def __init__(self, execute_module=None):
- pass
+ def __init__(self, **_):
+ self.changed = False
+ self.failures = []
+ self.logs = run_logs or []
+ self.files_to_save = run_files or []
- @classmethod
- def is_active(cls, task_vars):
+ def is_active(self):
+ if isinstance(is_active, Exception):
+ raise is_active
return is_active
- def run(self, tmp, task_vars):
+ def run(self):
+ self.changed = changed
if run_exception is not None:
raise run_exception
return run_return
+ def get_var(*args, **_):
+ return get_var_return
+
+ def register_failure(self, exc):
+ self.failures.append(OpenShiftCheckException(str(exc)))
+ return
+
return FakeCheck
@@ -59,7 +73,7 @@ def failed(result, msg_has=None):
if msg_has is not None:
assert 'msg' in result
for term in msg_has:
- assert term in result['msg']
+ assert term.lower() in result['msg'].lower()
return result.get('failed', False)
@@ -67,6 +81,7 @@ def changed(result):
return result.get('changed', False)
+# tests whether task is skipped, not individual checks
def skipped(result):
return result.get('skipped', False)
@@ -78,7 +93,9 @@ def skipped(result):
None,
{},
])
-def test_action_plugin_missing_openshift_facts(plugin, task_vars):
+def test_action_plugin_missing_openshift_facts(plugin, task_vars, monkeypatch):
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {})
+ monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
result = plugin.run(tmp=None, task_vars=task_vars)
assert failed(result, msg_has=['openshift_facts'])
@@ -92,25 +109,59 @@ def test_action_plugin_cannot_load_checks_with_the_same_name(plugin, task_vars,
result = plugin.run(tmp=None, task_vars=task_vars)
- assert failed(result, msg_has=['unique', 'duplicate_name', 'FakeCheck'])
+ assert failed(result, msg_has=['duplicate', 'duplicate_name', 'FakeCheck'])
-def test_action_plugin_skip_non_active_checks(plugin, task_vars, monkeypatch):
- checks = [fake_check(is_active=False)]
+@pytest.mark.parametrize('is_active, skipped_reason', [
+ (False, "Not active for this host"),
+ (Exception("borked"), "exception"),
+])
+def test_action_plugin_skip_non_active_checks(is_active, skipped_reason, plugin, task_vars, monkeypatch):
+ checks = [fake_check(is_active=is_active)]
monkeypatch.setattr('openshift_checks.OpenShiftCheck.subclasses', classmethod(lambda cls: checks))
result = plugin.run(tmp=None, task_vars=task_vars)
- assert result['checks']['fake_check'] == {'skipped': True}
+ assert result['checks']['fake_check'].get('skipped')
+ assert skipped_reason in result['checks']['fake_check'].get('skipped_reason')
assert not failed(result)
assert not changed(result)
assert not skipped(result)
+@pytest.mark.parametrize('to_disable', [
+ 'fake_check',
+ ['fake_check', 'spam'],
+ '*,spam,eggs',
+])
+def test_action_plugin_skip_disabled_checks(to_disable, plugin, task_vars, monkeypatch):
+ checks = [fake_check('fake_check', is_active=True)]
+ monkeypatch.setattr('openshift_checks.OpenShiftCheck.subclasses', classmethod(lambda cls: checks))
+
+ task_vars['openshift_disable_check'] = to_disable
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert result['checks']['fake_check'] == dict(skipped=True, skipped_reason="Disabled by user request")
+ assert not failed(result)
+ assert not changed(result)
+ assert not skipped(result)
+
+
+def test_action_plugin_run_list_checks(monkeypatch):
+ task = FakeTask('openshift_health_check', {'checks': []})
+ plugin = ActionModule(task, None, PlayContext(), None, None, None)
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {})
+ result = plugin.run()
+
+ assert failed(result, msg_has="Available checks")
+ assert not changed(result)
+ assert not skipped(result)
+
+
def test_action_plugin_run_check_ok(plugin, task_vars, monkeypatch):
check_return_value = {'ok': 'test'}
- check_class = fake_check(run_return=check_return_value)
- monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ check_class = fake_check(run_return=check_return_value, run_files=[None])
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {'fake_check': check_class()})
monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
result = plugin.run(tmp=None, task_vars=task_vars)
@@ -122,23 +173,24 @@ def test_action_plugin_run_check_ok(plugin, task_vars, monkeypatch):
def test_action_plugin_run_check_changed(plugin, task_vars, monkeypatch):
- check_return_value = {'ok': 'test', 'changed': True}
- check_class = fake_check(run_return=check_return_value)
- monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ check_return_value = {'ok': 'test'}
+ check_class = fake_check(run_return=check_return_value, changed=True)
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {'fake_check': check_class()})
monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
result = plugin.run(tmp=None, task_vars=task_vars)
assert result['checks']['fake_check'] == check_return_value
+ assert changed(result['checks']['fake_check'])
assert not failed(result)
assert changed(result)
assert not skipped(result)
def test_action_plugin_run_check_fail(plugin, task_vars, monkeypatch):
- check_return_value = {'failed': True}
+ check_return_value = {'failed': True, 'msg': 'this is a failure'}
check_class = fake_check(run_return=check_return_value)
- monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {'fake_check': check_class()})
monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
result = plugin.run(tmp=None, task_vars=task_vars)
@@ -149,17 +201,55 @@ def test_action_plugin_run_check_fail(plugin, task_vars, monkeypatch):
assert not skipped(result)
-def test_action_plugin_run_check_exception(plugin, task_vars, monkeypatch):
+@pytest.mark.parametrize('exc_class, expect_traceback', [
+ (OpenShiftCheckException, False),
+ (Exception, True),
+])
+def test_action_plugin_run_check_exception(plugin, task_vars, exc_class, expect_traceback, monkeypatch):
exception_msg = 'fake check has an exception'
- run_exception = OpenShiftCheckException(exception_msg)
- check_class = fake_check(run_exception=run_exception)
- monkeypatch.setattr(plugin, 'load_known_checks', lambda: {'fake_check': check_class()})
+ run_exception = exc_class(exception_msg)
+ check_class = fake_check(run_exception=run_exception, changed=True)
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {'fake_check': check_class()})
monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
result = plugin.run(tmp=None, task_vars=task_vars)
assert failed(result['checks']['fake_check'], msg_has=exception_msg)
+ assert expect_traceback == ("Traceback" in result['checks']['fake_check']['msg'])
assert failed(result, msg_has=['failed'])
+ assert changed(result['checks']['fake_check'])
+ assert changed(result)
+ assert not skipped(result)
+
+
+def test_action_plugin_run_check_output_dir(plugin, task_vars, tmpdir, monkeypatch):
+ check_class = fake_check(
+ run_return={},
+ run_logs=[('thing', 'note')],
+ run_files=[
+ FileToSave('save.file', 'contents', None),
+ FileToSave('save.file', 'duplicate', None),
+ FileToSave('copy.file', None, 'foo'), # note: copy runs execute_module => exception
+ ],
+ )
+ task_vars['openshift_checks_output_dir'] = str(tmpdir)
+ check_class.get_var = lambda self, name, **_: task_vars.get(name)
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {'fake_check': check_class()})
+ monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])
+
+ plugin.run(tmp=None, task_vars=task_vars)
+ assert any(path.basename == task_vars['ansible_host'] for path in tmpdir.listdir())
+ assert any(path.basename == 'fake_check.log.json' for path in tmpdir.visit())
+ assert any(path.basename == 'save.file' for path in tmpdir.visit())
+ assert any(path.basename == 'save.file.2' for path in tmpdir.visit())
+
+
+def test_action_plugin_resolve_checks_exception(plugin, task_vars, monkeypatch):
+ monkeypatch.setattr(plugin, 'load_known_checks', lambda *_: {})
+
+ result = plugin.run(tmp=None, task_vars=task_vars)
+
+ assert failed(result, msg_has=['unknown', 'name'])
assert not changed(result)
assert not skipped(result)
@@ -190,24 +280,21 @@ def test_resolve_checks_ok(names, all_checks, expected):
assert resolve_checks(names, all_checks) == expected
-@pytest.mark.parametrize('names,all_checks,words_in_exception,words_not_in_exception', [
+@pytest.mark.parametrize('names,all_checks,words_in_exception', [
(
['testA', 'testB'],
[],
['check', 'name', 'testA', 'testB'],
- ['tag', 'group', '@'],
),
(
['@group'],
[],
['tag', 'name', 'group'],
- ['check', '@'],
),
(
['testA', 'testB', '@group'],
[],
['check', 'name', 'testA', 'testB', 'tag', 'group'],
- ['@'],
),
(
['testA', 'testB', '@group'],
@@ -217,13 +304,45 @@ def test_resolve_checks_ok(names, all_checks, expected):
fake_check('from_group_2', ['preflight', 'group']),
],
['check', 'name', 'testA', 'testB'],
- ['tag', 'group', '@'],
),
])
-def test_resolve_checks_failure(names, all_checks, words_in_exception, words_not_in_exception):
+def test_resolve_checks_failure(names, all_checks, words_in_exception):
with pytest.raises(Exception) as excinfo:
resolve_checks(names, all_checks)
for word in words_in_exception:
assert word in str(excinfo.value)
- for word in words_not_in_exception:
- assert word not in str(excinfo.value)
+
+
+@pytest.mark.parametrize('give_output_dir, result, expect_file', [
+ (False, None, False),
+ (True, dict(content="c3BhbQo=", encoding="base64"), True),
+ (True, dict(content="encoding error", encoding="base64"), False),
+ (True, dict(content="spam", no_encoding=None), True),
+ (True, dict(failed=True, msg="could not slurp"), False),
+])
+def test_copy_remote_file_to_dir(give_output_dir, result, expect_file, tmpdir):
+ check = fake_check()()
+ check.execute_module = lambda *args, **_: result
+ copy_remote_file_to_dir(check, "remote_file", str(tmpdir) if give_output_dir else "", "local_file")
+ assert expect_file == any(path.basename == "local_file" for path in tmpdir.listdir())
+
+
+def test_write_to_output_exceptions(tmpdir, monkeypatch, capsys):
+
+ class Spam(object):
+ def __str__(self):
+ raise Exception("break str")
+
+ test = {1: object(), 2: Spam()}
+ test[3] = test
+ write_result_to_output_dir(str(tmpdir), test)
+ assert "Error writing" in test["output_files"]
+
+ output_dir = tmpdir.join("eggs")
+ output_dir.write("spam") # so now it's not a dir
+ write_to_output_file(str(output_dir), "somefile", "somedata")
+ assert "Could not write" in capsys.readouterr()[1]
+
+ monkeypatch.setattr("openshift_health_check.prepare_output_dir", lambda *_: False)
+ write_result_to_output_dir(str(tmpdir), test)
+ assert "Error creating" in test["output_files"]
diff --git a/roles/openshift_health_checker/test/aos_version_test.py b/roles/openshift_health_checker/test/aos_version_test.py
index 39c86067a..4100f6c70 100644
--- a/roles/openshift_health_checker/test/aos_version_test.py
+++ b/roles/openshift_health_checker/test/aos_version_test.py
@@ -4,104 +4,183 @@ import aos_version
from collections import namedtuple
Package = namedtuple('Package', ['name', 'version'])
-expected_pkgs = set(['spam', 'eggs'])
+expected_pkgs = {
+ "spam": {
+ "name": "spam",
+ "version": "3.2.1",
+ "check_multi": False,
+ },
+ "eggs": {
+ "name": "eggs",
+ "version": "3.2.1",
+ "check_multi": False,
+ },
+}
-@pytest.mark.parametrize('pkgs, requested_release, expect_not_found', [
+@pytest.mark.parametrize('pkgs,expected_pkgs_dict', [
(
- [],
- '3.2.1',
- expected_pkgs, # none found
+ # all found
+ [Package('spam', '3.2.1'), Package('eggs', '3.2.1')],
+ expected_pkgs,
),
(
- [Package('spam', '3.2.1')],
- '3.2',
- ['eggs'], # completely missing
+ # found with more specific version
+ [Package('spam', '3.2.1'), Package('eggs', '3.2.1.5')],
+ expected_pkgs,
),
(
- [Package('spam', '3.2.1'), Package('eggs', '3.3.2')],
- '3.2',
- ['eggs'], # not the right version
+ [Package('ovs', '2.6'), Package('ovs', '2.4')],
+ {
+ "ovs": {
+ "name": "ovs",
+ "version": ["2.6", "2.7"],
+ "check_multi": False,
+ }
+ },
),
(
- [Package('spam', '3.2.1'), Package('eggs', '3.2.1')],
- '3.2',
- [], # all found
+ [Package('ovs', '2.7')],
+ {
+ "ovs": {
+ "name": "ovs",
+ "version": ["2.6", "2.7"],
+ "check_multi": False,
+ }
+ },
),
+])
+def test_check_precise_version_found(pkgs, expected_pkgs_dict):
+ aos_version._check_precise_version_found(pkgs, expected_pkgs_dict)
+
+
+@pytest.mark.parametrize('pkgs,expect_not_found', [
(
- [Package('spam', '3.2.1'), Package('eggs', '3.2.1.5')],
- '3.2.1',
- [], # found with more specific version
+ [],
+ {
+ "spam": {
+ "name": "spam",
+ "version": "3.2.1",
+ "check_multi": False,
+ },
+ "eggs": {
+ "name": "eggs",
+ "version": "3.2.1",
+ "check_multi": False,
+ }
+ }, # none found
+ ),
+ (
+ [Package('spam', '3.2.1')],
+ {
+ "eggs": {
+ "name": "eggs",
+ "version": "3.2.1",
+ "check_multi": False,
+ }
+ }, # completely missing
+ ),
+ (
+ [Package('spam', '3.2.1'), Package('eggs', '3.3.2')],
+ {
+ "eggs": {
+ "name": "eggs",
+ "version": "3.2.1",
+ "check_multi": False,
+ }
+ }, # not the right version
),
(
[Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5')],
- '3.2.1',
- ['spam'], # eggs found with multiple versions
+ {
+ "spam": {
+ "name": "spam",
+ "version": "3.2.1",
+ "check_multi": False,
+ }
+ }, # eggs found with multiple versions
),
])
-def test_check_pkgs_for_precise_version(pkgs, requested_release, expect_not_found):
- if expect_not_found:
- with pytest.raises(aos_version.PreciseVersionNotFound) as e:
- aos_version._check_precise_version_found(pkgs, expected_pkgs, requested_release)
- assert set(expect_not_found) == set(e.value.problem_pkgs)
- else:
- aos_version._check_precise_version_found(pkgs, expected_pkgs, requested_release)
+def test_check_precise_version_found_fail(pkgs, expect_not_found):
+ with pytest.raises(aos_version.PreciseVersionNotFound) as e:
+ aos_version._check_precise_version_found(pkgs, expected_pkgs)
+ assert list(expect_not_found.values()) == e.value.problem_pkgs
-@pytest.mark.parametrize('pkgs, requested_release, expect_higher', [
+@pytest.mark.parametrize('pkgs,expected_pkgs_dict', [
(
[],
- '3.2.1',
- [],
+ expected_pkgs,
),
(
- [Package('spam', '3.2.1')],
- '3.2',
- [], # more precise but not strictly higher
+ # more precise but not strictly higher
+ [Package('spam', '3.2.1.9')],
+ expected_pkgs,
+ ),
+ (
+ [Package('ovs', '2.7')],
+ {
+ "ovs": {
+ "name": "ovs",
+ "version": ["2.6", "2.7"],
+ "check_multi": False,
+ }
+ },
),
+])
+def test_check_higher_version_found(pkgs, expected_pkgs_dict):
+ aos_version._check_higher_version_found(pkgs, expected_pkgs_dict)
+
+
+@pytest.mark.parametrize('pkgs,expected_pkgs_dict,expect_higher', [
(
[Package('spam', '3.3')],
- '3.2.1',
+ expected_pkgs,
['spam-3.3'], # lower precision, but higher
),
(
[Package('spam', '3.2.1'), Package('eggs', '3.3.2')],
- '3.2',
+ expected_pkgs,
['eggs-3.3.2'], # one too high
),
(
[Package('eggs', '1.2.3'), Package('eggs', '3.2.1.5'), Package('eggs', '3.4')],
- '3.2.1',
+ expected_pkgs,
['eggs-3.4'], # multiple versions, one is higher
),
(
[Package('eggs', '3.2.1'), Package('eggs', '3.4'), Package('eggs', '3.3')],
- '3.2.1',
+ expected_pkgs,
['eggs-3.4'], # multiple versions, two are higher
),
+ (
+ [Package('ovs', '2.8')],
+ {
+ "ovs": {
+ "name": "ovs",
+ "version": ["2.6", "2.7"],
+ "check_multi": False,
+ }
+ },
+ ['ovs-2.8'],
+ ),
])
-def test_check_pkgs_for_greater_version(pkgs, requested_release, expect_higher):
- if expect_higher:
- with pytest.raises(aos_version.FoundHigherVersion) as e:
- aos_version._check_higher_version_found(pkgs, expected_pkgs, requested_release)
- assert set(expect_higher) == set(e.value.problem_pkgs)
- else:
- aos_version._check_higher_version_found(pkgs, expected_pkgs, requested_release)
+def test_check_higher_version_found_fail(pkgs, expected_pkgs_dict, expect_higher):
+ with pytest.raises(aos_version.FoundHigherVersion) as e:
+ aos_version._check_higher_version_found(pkgs, expected_pkgs_dict)
+ assert set(expect_higher) == set(e.value.problem_pkgs)
-@pytest.mark.parametrize('pkgs, expect_to_flag_pkgs', [
- (
- [],
- [],
- ),
- (
- [Package('spam', '3.2.1')],
- [],
- ),
- (
- [Package('spam', '3.2.1'), Package('eggs', '3.2.2')],
- [],
- ),
+@pytest.mark.parametrize('pkgs', [
+ [],
+ [Package('spam', '3.2.1')],
+ [Package('spam', '3.2.1'), Package('eggs', '3.2.2')],
+])
+def test_check_multi_minor_release(pkgs):
+ aos_version._check_multi_minor_release(pkgs, expected_pkgs)
+
+
+@pytest.mark.parametrize('pkgs,expect_to_flag_pkgs', [
(
[Package('spam', '3.2.1'), Package('spam', '3.3.2')],
['spam'],
@@ -111,10 +190,7 @@ def test_check_pkgs_for_greater_version(pkgs, requested_release, expect_higher):
['eggs'],
),
])
-def test_check_pkgs_for_multi_release(pkgs, expect_to_flag_pkgs):
- if expect_to_flag_pkgs:
- with pytest.raises(aos_version.FoundMultiRelease) as e:
- aos_version._check_multi_minor_release(pkgs, expected_pkgs)
- assert set(expect_to_flag_pkgs) == set(e.value.problem_pkgs)
- else:
+def test_check_multi_minor_release_fail(pkgs, expect_to_flag_pkgs):
+ with pytest.raises(aos_version.FoundMultiRelease) as e:
aos_version._check_multi_minor_release(pkgs, expected_pkgs)
+ assert set(expect_to_flag_pkgs) == set(e.value.problem_pkgs)
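In the rewritten tests, each expected package carries its own "version" (a single string or a list of acceptable versions) and a "check_multi" flag, replacing the old shared requested_release argument. A small sketch of the matching rule those cases imply, assuming simple dotted-prefix comparison (this is not the aos_version implementation itself):

    def version_matches(installed, wanted):
        # '3.2.1.5' satisfies '3.2.1'; a list means any listed version is acceptable
        wanted_list = wanted if isinstance(wanted, list) else [wanted]
        installed_parts = installed.split('.')
        for want in wanted_list:
            want_parts = want.split('.')
            if installed_parts[:len(want_parts)] == want_parts:
                return True
        return False

    assert version_matches('3.2.1.5', '3.2.1')        # more specific is still a match
    assert version_matches('2.7', ['2.6', '2.7'])     # any one of the listed versions
    assert not version_matches('3.3.2', '3.2.1')      # wrong minor release
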
diff --git a/roles/openshift_health_checker/test/conftest.py b/roles/openshift_health_checker/test/conftest.py
index 3cbd65507..244a1f0fa 100644
--- a/roles/openshift_health_checker/test/conftest.py
+++ b/roles/openshift_health_checker/test/conftest.py
@@ -7,5 +7,6 @@ openshift_health_checker_path = os.path.dirname(os.path.dirname(__file__))
sys.path[1:1] = [
openshift_health_checker_path,
os.path.join(openshift_health_checker_path, 'action_plugins'),
+ os.path.join(openshift_health_checker_path, 'callback_plugins'),
os.path.join(openshift_health_checker_path, 'library'),
]
diff --git a/roles/openshift_health_checker/test/curator_test.py b/roles/openshift_health_checker/test/curator_test.py
new file mode 100644
index 000000000..62c680b74
--- /dev/null
+++ b/roles/openshift_health_checker/test/curator_test.py
@@ -0,0 +1,57 @@
+import pytest
+
+from openshift_checks.logging.curator import Curator, OpenShiftCheckException
+
+
+plain_curator_pod = {
+ "metadata": {
+ "labels": {"component": "curator", "deploymentconfig": "logging-curator"},
+ "name": "logging-curator-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "podIP": "10.10.10.10",
+ }
+}
+
+not_running_curator_pod = {
+ "metadata": {
+ "labels": {"component": "curator", "deploymentconfig": "logging-curator"},
+ "name": "logging-curator-2",
+ },
+ "status": {
+ "containerStatuses": [{"ready": False}],
+ "conditions": [{"status": "False", "type": "Ready"}],
+ "podIP": "10.10.10.10",
+ }
+}
+
+
+def test_get_curator_pods():
+ check = Curator()
+ check.get_pods_for_component = lambda *_: [plain_curator_pod]
+ result = check.run()
+ assert "failed" not in result or not result["failed"]
+
+
+@pytest.mark.parametrize('pods, expect_error', [
+ (
+ [],
+ 'MissingComponentPods',
+ ),
+ (
+ [not_running_curator_pod],
+ 'CuratorNotRunning',
+ ),
+ (
+ [plain_curator_pod, plain_curator_pod],
+ 'TooManyCurators',
+ ),
+])
+def test_get_curator_pods_fail(pods, expect_error):
+ check = Curator()
+ check.get_pods_for_component = lambda *_: pods
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.run()
+ assert excinfo.value.name == expect_error
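These tests assert on excinfo.value.name rather than on message text. The OpenShiftCheckException constructor is not shown in this diff, so the sketch below uses a stand-in exception class only to illustrate the pattern of asserting on a stable error name:

    import pytest

    class NamedCheckError(Exception):
        # stand-in for a check exception that carries a stable error name
        def __init__(self, name, msg):
            super(NamedCheckError, self).__init__(msg)
            self.name = name

    def find_single_pod(pods):
        if not pods:
            raise NamedCheckError('MissingComponentPods', 'no pods found')
        if len(pods) > 1:
            raise NamedCheckError('TooManyCurators', 'more than one pod found')
        return pods[0]

    def test_missing_pods():
        with pytest.raises(NamedCheckError) as excinfo:
            find_single_pod([])
        assert excinfo.value.name == 'MissingComponentPods'
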
diff --git a/roles/openshift_health_checker/test/diagnostics_test.py b/roles/openshift_health_checker/test/diagnostics_test.py
new file mode 100644
index 000000000..800889fa7
--- /dev/null
+++ b/roles/openshift_health_checker/test/diagnostics_test.py
@@ -0,0 +1,50 @@
+import pytest
+
+from openshift_checks.diagnostics import DiagnosticCheck, OpenShiftCheckException
+
+
+@pytest.fixture()
+def task_vars():
+ return dict(
+ openshift=dict(
+ common=dict(config_base="/etc/origin/")
+ )
+ )
+
+
+def test_module_succeeds(task_vars):
+ check = DiagnosticCheck(lambda *_: {"result": "success"}, task_vars)
+ check.is_first_master = lambda: True
+ assert check.is_active()
+ check.exec_diagnostic("spam")
+ assert not check.failures
+
+
+def test_oc_not_there(task_vars):
+ def exec_module(*_):
+ return {"failed": True, "result": "[Errno 2] No such file or directory"}
+
+ check = DiagnosticCheck(exec_module, task_vars)
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.exec_diagnostic("spam")
+ assert excinfo.value.name == "OcNotFound"
+
+
+def test_module_fails(task_vars):
+ def exec_module(*_):
+ return {"failed": True, "result": "something broke"}
+
+ check = DiagnosticCheck(exec_module, task_vars)
+ check.exec_diagnostic("spam")
+ assert check.failures and check.failures[0].name == "OcDiagFailed"
+
+
+def test_names_executed(task_vars):
+ task_vars["openshift_check_diagnostics"] = diagnostics = "ConfigContexts,spam,,eggs"
+
+ def exec_module(module, args, *_):
+ assert "extra_args" in args
+ assert args["extra_args"][0] in diagnostics
+ return {"result": "success"}
+
+ DiagnosticCheck(exec_module, task_vars).run()
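test_names_executed passes openshift_check_diagnostics as a comma-separated string containing an empty entry and only checks that each executed diagnostic name appears in that string. A sketch of one plausible normalization of such a variable, assuming empty entries are dropped (the diff shows only the test side, so this is an assumption about the check's input handling):

    def normalize_diagnostics(value):
        # accept a comma-separated string or a list; drop empty entries
        if isinstance(value, str):
            value = value.split(',')
        return [name.strip() for name in value if name and name.strip()]

    assert normalize_diagnostics("ConfigContexts,spam,,eggs") == ["ConfigContexts", "spam", "eggs"]
    assert normalize_diagnostics(["AnalyzeLogs"]) == ["AnalyzeLogs"]
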
diff --git a/roles/openshift_health_checker/test/disk_availability_test.py b/roles/openshift_health_checker/test/disk_availability_test.py
index 970b474d7..7acdb40ec 100644
--- a/roles/openshift_health_checker/test/disk_availability_test.py
+++ b/roles/openshift_health_checker/test/disk_availability_test.py
@@ -3,95 +3,136 @@ import pytest
from openshift_checks.disk_availability import DiskAvailability, OpenShiftCheckException
-@pytest.mark.parametrize('group_names,is_containerized,is_active', [
- (['masters'], False, True),
- # ensure check is skipped on containerized installs
- (['masters'], True, False),
- (['nodes'], False, True),
- (['etcd'], False, True),
- (['masters', 'nodes'], False, True),
- (['masters', 'etcd'], False, True),
- ([], False, False),
- (['lb'], False, False),
- (['nfs'], False, False),
+@pytest.mark.parametrize('group_names,is_active', [
+ (['oo_masters_to_config'], True),
+ (['oo_nodes_to_config'], True),
+ (['oo_etcd_to_config'], True),
+ (['oo_masters_to_config', 'oo_nodes_to_config'], True),
+ (['oo_masters_to_config', 'oo_etcd_to_config'], True),
+ ([], False),
+ (['lb'], False),
+ (['nfs'], False),
])
-def test_is_active(group_names, is_containerized, is_active):
+def test_is_active(group_names, is_active):
task_vars = dict(
group_names=group_names,
- openshift=dict(common=dict(is_containerized=is_containerized)),
)
- assert DiskAvailability.is_active(task_vars=task_vars) == is_active
+ assert DiskAvailability(None, task_vars).is_active() == is_active
-@pytest.mark.parametrize('ansible_mounts,extra_words', [
- ([], ['none']), # empty ansible_mounts
- ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths
- ([{'mount': '/var'}], ['/var']), # missing size_available
+@pytest.mark.parametrize('desc, ansible_mounts, expect_chunks', [
+ (
+ 'empty ansible_mounts',
+ [],
+ ['determine mount point', 'none'],
+ ),
+ (
+ 'missing relevant mount paths',
+ [{'mount': '/mnt'}],
+ ['determine mount point', '/mnt'],
+ ),
+ (
+ 'missing size_available',
+ [{'mount': '/var'}, {'mount': '/usr'}, {'mount': '/tmp'}],
+ ['missing', 'size_available'],
+ ),
])
-def test_cannot_determine_available_disk(ansible_mounts, extra_words):
+def test_cannot_determine_available_disk(desc, ansible_mounts, expect_chunks):
task_vars = dict(
- group_names=['masters'],
+ group_names=['oo_masters_to_config'],
ansible_mounts=ansible_mounts,
)
- check = DiskAvailability(execute_module=fake_execute_module)
with pytest.raises(OpenShiftCheckException) as excinfo:
- check.run(tmp=None, task_vars=task_vars)
+ DiskAvailability(fake_execute_module, task_vars).run()
- for word in 'determine available disk'.split() + extra_words:
- assert word in str(excinfo.value)
+ for chunk in expect_chunks:
+ assert chunk in str(excinfo.value)
-@pytest.mark.parametrize('group_names,ansible_mounts', [
+@pytest.mark.parametrize('group_names,configured_min,ansible_mounts', [
(
- ['masters'],
+ ['oo_masters_to_config'],
+ 0,
[{
'mount': '/',
'size_available': 40 * 10**9 + 1,
}],
),
(
- ['nodes'],
+ ['oo_nodes_to_config'],
+ 0,
[{
'mount': '/',
'size_available': 15 * 10**9 + 1,
}],
),
(
- ['etcd'],
+ ['oo_etcd_to_config'],
+ 0,
[{
'mount': '/',
'size_available': 20 * 10**9 + 1,
}],
),
(
- ['etcd'],
+ ['oo_etcd_to_config'],
+ 1, # configure lower threshold
+ [{
+ 'mount': '/',
+ 'size_available': 1 * 10**9 + 1, # way smaller than recommended
+ }],
+ ),
+ (
+ ['oo_etcd_to_config'],
+ 0,
[{
# not enough space on / ...
'mount': '/',
- 'size_available': 0,
+ 'size_available': 2 * 10**9,
}, {
# ... but enough on /var
'mount': '/var',
'size_available': 20 * 10**9 + 1,
}],
),
+ (
+ ['oo_masters_to_config'],
+ 0,
+ [{
+ 'mount': '/',
+ 'size_available': 2 * 10**9,
+ }, { # not enough directly on /var
+ 'mount': '/var',
+ 'size_available': 10 * 10**9 + 1,
+ }, {
+ # but subdir mounts add up to enough
+ 'mount': '/var/lib/docker',
+ 'size_available': 20 * 10**9 + 1,
+ }, {
+ 'mount': '/var/lib/origin',
+ 'size_available': 20 * 10**9 + 1,
+ }],
+ ),
])
-def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
+def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansible_mounts):
task_vars = dict(
group_names=group_names,
+ openshift_check_min_host_disk_gb=configured_min,
ansible_mounts=ansible_mounts,
)
- check = DiskAvailability(execute_module=fake_execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ check = DiskAvailability(fake_execute_module, task_vars)
+ check.run()
- assert not result.get('failed', False)
+ assert not check.failures
-@pytest.mark.parametrize('group_names,ansible_mounts,extra_words', [
+@pytest.mark.parametrize('name,group_names,configured_min,ansible_mounts,expect_chunks', [
(
- ['masters'],
+ 'test with no space available',
+ ['oo_masters_to_config'],
+ 0,
[{
'mount': '/',
'size_available': 1,
@@ -99,7 +140,19 @@ def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
['0.0 GB'],
),
(
- ['nodes'],
+ 'test with a higher configured required value',
+ ['oo_masters_to_config'],
+ 100, # set a higher threshold
+ [{
+ 'mount': '/',
+ 'size_available': 50 * 10**9, # would normally be enough...
+ }],
+ ['100.0 GB'],
+ ),
+ (
+ 'test with 1GB available, but "0" GB space requirement',
+ ['oo_nodes_to_config'],
+ 0,
[{
'mount': '/',
'size_available': 1 * 10**9,
@@ -107,7 +160,9 @@ def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
['1.0 GB'],
),
(
- ['etcd'],
+ 'test with no space available, but "0" GB space requirement',
+ ['oo_etcd_to_config'],
+ 0,
[{
'mount': '/',
'size_available': 1,
@@ -115,16 +170,19 @@ def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
['0.0 GB'],
),
(
- ['nodes', 'masters'],
+ 'test with enough space for a node, but not for a master',
+ ['oo_nodes_to_config', 'oo_masters_to_config'],
+ 0,
[{
'mount': '/',
- # enough space for a node, not enough for a master
'size_available': 15 * 10**9 + 1,
}],
['15.0 GB'],
),
(
- ['etcd'],
+ 'test failure with enough space on "/", but not enough on "/var"',
+ ['oo_etcd_to_config'],
+ 0,
[{
# enough space on / ...
'mount': '/',
@@ -136,19 +194,74 @@ def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
}],
['0.0 GB'],
),
-])
-def test_fails_with_insufficient_disk_space(group_names, ansible_mounts, extra_words):
+], ids=lambda argval: argval[0])
+def test_fails_with_insufficient_disk_space(name, group_names, configured_min, ansible_mounts, expect_chunks):
+ task_vars = dict(
+ group_names=group_names,
+ openshift_check_min_host_disk_gb=configured_min,
+ ansible_mounts=ansible_mounts,
+ )
+
+ check = DiskAvailability(fake_execute_module, task_vars)
+ check.run()
+
+ assert check.failures
+ for chunk in 'below recommended'.split() + expect_chunks:
+ assert chunk in str(check.failures[0])
+
+
+@pytest.mark.parametrize('name,group_names,context,ansible_mounts,failed,extra_words', [
+ (
+ 'test without enough space for master under "upgrade" context',
+ ['oo_nodes_to_config', 'oo_masters_to_config'],
+ "upgrade",
+ [{
+ 'mount': '/',
+ 'size_available': 1 * 10**9 + 1,
+ 'size_total': 21 * 10**9 + 1,
+ }],
+ True,
+ ["1.0 GB"],
+ ),
+ (
+ 'test with enough space for master under "upgrade" context',
+ ['oo_nodes_to_config', 'oo_masters_to_config'],
+ "upgrade",
+ [{
+ 'mount': '/',
+ 'size_available': 10 * 10**9 + 1,
+ 'size_total': 21 * 10**9 + 1,
+ }],
+ False,
+ [],
+ ),
+ (
+ 'test with not enough space for master, and non-upgrade context',
+ ['oo_nodes_to_config', 'oo_masters_to_config'],
+ "health",
+ [{
+ 'mount': '/',
+ # not enough space for a master,
+ # "health" context should not lower requirement
+ 'size_available': 20 * 10**9 + 1,
+ }],
+ True,
+ ["20.0 GB", "below minimum"],
+ ),
+], ids=lambda argval: argval[0])
+def test_min_required_space_changes_with_upgrade_context(name, group_names, context, ansible_mounts, failed, extra_words):
task_vars = dict(
+ r_openshift_health_checker_playbook_context=context,
group_names=group_names,
ansible_mounts=ansible_mounts,
)
- check = DiskAvailability(execute_module=fake_execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ check = DiskAvailability(fake_execute_module, task_vars)
+ check.run()
- assert result['failed']
- for word in 'below recommended'.split() + extra_words:
- assert word in result['msg']
+ assert bool(check.failures) == failed
+ for word in extra_words:
+ assert word in str(check.failures[0])
def fake_execute_module(*args):
diff --git a/roles/openshift_health_checker/test/docker_image_availability_test.py b/roles/openshift_health_checker/test/docker_image_availability_test.py
index 2a9c32f77..dec99e5db 100644
--- a/roles/openshift_health_checker/test/docker_image_availability_test.py
+++ b/roles/openshift_health_checker/test/docker_image_availability_test.py
@@ -3,26 +3,261 @@ import pytest
from openshift_checks.docker_image_availability import DockerImageAvailability
-@pytest.mark.xfail(strict=True) # TODO: remove this once this test is fully implemented.
-@pytest.mark.parametrize('task_vars,expected_result', [
- (
- dict(
- openshift=dict(common=dict(
+@pytest.fixture()
+def task_vars():
+ return dict(
+ openshift=dict(
+ common=dict(
service_type='origin',
is_containerized=False,
- )),
- openshift_release='v3.5',
- deployment_type='origin',
- openshift_image_tag='', # FIXME: should not be required
+ is_atomic=False,
+ ),
+ docker=dict(),
+ ),
+ openshift_deployment_type='origin',
+ openshift_image_tag='',
+ group_names=['oo_nodes_to_config', 'oo_masters_to_config'],
+ )
+
+
+@pytest.mark.parametrize('deployment_type, is_containerized, group_names, expect_active', [
+ ("invalid", True, [], False),
+ ("", True, [], False),
+ ("origin", False, [], False),
+ ("openshift-enterprise", False, [], False),
+ ("origin", False, ["oo_nodes_to_config", "oo_masters_to_config"], True),
+ ("openshift-enterprise", False, ["oo_etcd_to_config"], False),
+ ("origin", True, ["nfs"], False),
+ ("openshift-enterprise", True, ["lb"], False),
+])
+def test_is_active(task_vars, deployment_type, is_containerized, group_names, expect_active):
+ task_vars['openshift_deployment_type'] = deployment_type
+ task_vars['openshift']['common']['is_containerized'] = is_containerized
+ task_vars['group_names'] = group_names
+ assert DockerImageAvailability(None, task_vars).is_active() == expect_active
+
+
+@pytest.mark.parametrize("is_containerized,is_atomic", [
+ (True, True),
+ (False, False),
+ (True, False),
+ (False, True),
+])
+def test_all_images_available_locally(task_vars, is_containerized, is_atomic):
+ def execute_module(module_name, module_args, *_):
+ if module_name == "yum":
+ return {}
+
+ assert module_name == "docker_image_facts"
+ assert 'name' in module_args
+ assert module_args['name']
+ return {
+ 'images': [module_args['name']],
+ }
+
+ task_vars['openshift']['common']['is_containerized'] = is_containerized
+ task_vars['openshift']['common']['is_atomic'] = is_atomic
+ result = DockerImageAvailability(execute_module, task_vars).run()
+
+ assert not result.get('failed', False)
+
+
+@pytest.mark.parametrize("available_locally", [
+ False,
+ True,
+])
+def test_all_images_available_remotely(task_vars, available_locally):
+ def execute_module(module_name, *_):
+ if module_name == 'docker_image_facts':
+ return {'images': [], 'failed': available_locally}
+ return {}
+
+ task_vars['openshift_docker_additional_registries'] = ["docker.io", "registry.access.redhat.com"]
+ task_vars['openshift_image_tag'] = 'v3.4'
+ check = DockerImageAvailability(execute_module, task_vars)
+ check._module_retry_interval = 0
+ result = check.run()
+
+ assert not result.get('failed', False)
+
+
+def test_all_images_unavailable(task_vars):
+ def execute_module(module_name=None, *args):
+ if module_name == "wait_for":
+ return {}
+ elif module_name == "command":
+ return {'failed': True}
+
+ return {} # docker_image_facts failure
+
+ task_vars['openshift_docker_additional_registries'] = ["docker.io"]
+ task_vars['openshift_deployment_type'] = "openshift-enterprise"
+ task_vars['openshift_image_tag'] = 'latest'
+ check = DockerImageAvailability(execute_module, task_vars)
+ check._module_retry_interval = 0
+ actual = check.run()
+
+ assert actual['failed']
+ assert "required container images are not available" in actual['msg']
+
+
+@pytest.mark.parametrize("message,extra_words", [
+ (
+ "docker image update failure",
+ ["docker image update failure"],
+ ),
+ (
+ "No package matching 'skopeo' found available, installed or updated",
+ ["dependencies can be installed via `yum`"]
+ ),
+])
+def test_skopeo_update_failure(task_vars, message, extra_words):
+ def execute_module(module_name=None, *_):
+ if module_name == "yum":
+ return {
+ "failed": True,
+ "msg": message,
+ }
+
+ return {}
+
+ task_vars['openshift_docker_additional_registries'] = ["unknown.io"]
+ task_vars['openshift_deployment_type'] = "openshift-enterprise"
+ check = DockerImageAvailability(execute_module, task_vars)
+ check._module_retry_interval = 0
+ actual = check.run()
+
+ assert actual["failed"]
+ for word in extra_words:
+ assert word in actual["msg"]
+
+
+@pytest.mark.parametrize(
+ "image, registries, connection_test_failed, skopeo_failed, "
+ "expect_success, expect_registries_reached", [
+ (
+ "spam/eggs:v1", ["test.reg"],
+ True, True,
+ False,
+ {"test.reg": False, "docker.io": False},
+ ),
+ (
+ "spam/eggs:v1", ["test.reg"],
+ False, True,
+ False,
+ {"test.reg": True, "docker.io": True},
+ ),
+ (
+ "eggs.reg/spam/eggs:v1", ["test.reg"],
+ False, False,
+ True,
+ {"eggs.reg": True},
),
- {'changed': False},
+ ])
+def test_registry_availability(image, registries, connection_test_failed, skopeo_failed,
+ expect_success, expect_registries_reached):
+ def execute_module(module_name=None, *_):
+ if module_name == "wait_for":
+ return dict(msg="msg", failed=connection_test_failed)
+ elif module_name == "command":
+ return dict(msg="msg", failed=skopeo_failed)
+
+ tv = task_vars()
+ tv.update({"openshift_docker_additional_registries": registries})
+ check = DockerImageAvailability(execute_module, tv)
+ check._module_retry_interval = 0
+
+ available = check.is_available_skopeo_image(image)
+ assert available == expect_success
+ assert expect_registries_reached == check.reachable_registries
+
+
+@pytest.mark.parametrize("deployment_type, is_containerized, groups, oreg_url, expected", [
+ ( # standard set of stuff required on nodes
+ "origin", False, ['oo_nodes_to_config'], "",
+ set([
+ 'openshift/origin-pod:vtest',
+ 'openshift/origin-deployer:vtest',
+ 'openshift/origin-docker-registry:vtest',
+ 'openshift/origin-haproxy-router:vtest',
+ 'cockpit/kubernetes', # origin version of registry-console
+ ])
+ ),
+ ( # set a different URL for images
+ "origin", False, ['oo_nodes_to_config'], 'foo.io/openshift/origin-${component}:${version}',
+ set([
+ 'foo.io/openshift/origin-pod:vtest',
+ 'foo.io/openshift/origin-deployer:vtest',
+ 'foo.io/openshift/origin-docker-registry:vtest',
+ 'foo.io/openshift/origin-haproxy-router:vtest',
+ 'cockpit/kubernetes', # AFAICS this is not built from the URL
+ ])
+ ),
+ (
+ "origin", True, ['oo_nodes_to_config', 'oo_masters_to_config', 'oo_etcd_to_config'], "",
+ set([
+ # images running on top of openshift
+ 'openshift/origin-pod:vtest',
+ 'openshift/origin-deployer:vtest',
+ 'openshift/origin-docker-registry:vtest',
+ 'openshift/origin-haproxy-router:vtest',
+ 'cockpit/kubernetes',
+ # containerized component images
+ 'openshift/origin:vtest',
+ 'openshift/node:vtest',
+ 'openshift/openvswitch:vtest',
+ 'registry.access.redhat.com/rhel7/etcd',
+ ])
+ ),
+ ( # enterprise images
+ "openshift-enterprise", True, ['oo_nodes_to_config'], 'foo.io/openshift3/ose-${component}:f13ac45',
+ set([
+ 'foo.io/openshift3/ose-pod:f13ac45',
+ 'foo.io/openshift3/ose-deployer:f13ac45',
+ 'foo.io/openshift3/ose-docker-registry:f13ac45',
+ 'foo.io/openshift3/ose-haproxy-router:f13ac45',
+ # registry-console is not constructed/versioned the same as the others.
+ 'registry.access.redhat.com/openshift3/registry-console',
+ # containerized images aren't built from oreg_url
+ 'openshift3/node:vtest',
+ 'openshift3/openvswitch:vtest',
+ ])
+ ),
+ (
+ "openshift-enterprise", True, ['oo_etcd_to_config', 'lb'], 'foo.io/openshift3/ose-${component}:f13ac45',
+ set([
+ 'registry.access.redhat.com/rhel7/etcd',
+ # lb does not yet come in a containerized version
+ ])
),
- # TODO: add more parameters here to test the multiple possible inputs that affect behavior.
+
])
-def test_docker_image_availability(task_vars, expected_result):
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
- return {'info': {}} # TODO: this will vary depending on input parameters.
+def test_required_images(deployment_type, is_containerized, groups, oreg_url, expected):
+ task_vars = dict(
+ openshift=dict(
+ common=dict(
+ is_containerized=is_containerized,
+ is_atomic=False,
+ ),
+ ),
+ openshift_deployment_type=deployment_type,
+ group_names=groups,
+ oreg_url=oreg_url,
+ openshift_image_tag='vtest',
+ )
- check = DockerImageAvailability(execute_module=execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
- assert result == expected_result
+ assert expected == DockerImageAvailability(task_vars=task_vars).required_images()
+
+
+def test_containerized_etcd():
+ task_vars = dict(
+ openshift=dict(
+ common=dict(
+ is_containerized=True,
+ ),
+ ),
+ openshift_deployment_type="origin",
+ group_names=['oo_etcd_to_config'],
+ )
+ expected = set(['registry.access.redhat.com/rhel7/etcd'])
+ assert expected == DockerImageAvailability(task_vars=task_vars).required_images()
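Several required_images cases above feed an oreg_url containing ${component} and ${version} placeholders and expect them to be expanded per image. The check's own substitution code is not part of this diff; the sketch below only demonstrates how those URL patterns expand, using string.Template, which happens to use the same ${name} placeholder syntax:

    from string import Template

    def expand_oreg_url(oreg_url, component, version):
        # e.g. 'foo.io/openshift/origin-${component}:${version}' -> per-component image name
        return Template(oreg_url).safe_substitute(component=component, version=version)

    assert expand_oreg_url('foo.io/openshift/origin-${component}:${version}',
                           'deployer', 'vtest') == 'foo.io/openshift/origin-deployer:vtest'
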
diff --git a/roles/openshift_health_checker/test/docker_storage_test.py b/roles/openshift_health_checker/test/docker_storage_test.py
new file mode 100644
index 000000000..8fa68c378
--- /dev/null
+++ b/roles/openshift_health_checker/test/docker_storage_test.py
@@ -0,0 +1,305 @@
+import pytest
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.docker_storage import DockerStorage
+
+
+@pytest.mark.parametrize('is_containerized, group_names, is_active', [
+ (False, ["oo_masters_to_config", "oo_etcd_to_config"], False),
+ (False, ["oo_masters_to_config", "oo_nodes_to_config"], True),
+ (True, ["oo_etcd_to_config"], True),
+])
+def test_is_active(is_containerized, group_names, is_active):
+ task_vars = dict(
+ openshift=dict(common=dict(is_containerized=is_containerized)),
+ group_names=group_names,
+ )
+ assert DockerStorage(None, task_vars).is_active() == is_active
+
+
+def non_atomic_task_vars():
+ return {"openshift": {"common": {"is_atomic": False}}}
+
+
+@pytest.mark.parametrize('docker_info, failed, expect_msg', [
+ (
+ dict(failed=True, msg="Error connecting: Error while fetching server API version"),
+ True,
+ ["Is docker running on this host?"],
+ ),
+ (
+ dict(msg="I have no info"),
+ True,
+ ["missing info"],
+ ),
+ (
+ dict(info={
+ "Driver": "devicemapper",
+ "DriverStatus": [("Pool Name", "docker-docker--pool")],
+ }),
+ False,
+ [],
+ ),
+ (
+ dict(info={
+ "Driver": "devicemapper",
+ "DriverStatus": [("Data loop file", "true")],
+ }),
+ True,
+ ["loopback devices with the Docker devicemapper storage driver"],
+ ),
+ (
+ dict(info={
+ "Driver": "overlay2",
+ "DriverStatus": [("Backing Filesystem", "xfs")],
+ }),
+ False,
+ [],
+ ),
+ (
+ dict(info={
+ "Driver": "overlay",
+ "DriverStatus": [("Backing Filesystem", "btrfs")],
+ }),
+ True,
+ ["storage is type 'btrfs'", "only supported with\n'xfs'"],
+ ),
+ (
+ dict(info={
+ "Driver": "overlay2",
+ "DriverStatus": [("Backing Filesystem", "xfs")],
+ "OperatingSystem": "Red Hat Enterprise Linux Server release 7.2 (Maipo)",
+ "KernelVersion": "3.10.0-327.22.2.el7.x86_64",
+ }),
+ True,
+ ["Docker reports kernel version 3.10.0-327"],
+ ),
+ (
+ dict(info={
+ "Driver": "overlay",
+ "DriverStatus": [("Backing Filesystem", "xfs")],
+ "OperatingSystem": "CentOS",
+ "KernelVersion": "3.10.0-514",
+ }),
+ False,
+ [],
+ ),
+ (
+ dict(info={
+ "Driver": "unsupported",
+ }),
+ True,
+ ["unsupported Docker storage driver"],
+ ),
+])
+def test_check_storage_driver(docker_info, failed, expect_msg):
+ def execute_module(module_name, *_):
+ if module_name == "yum":
+ return {}
+ if module_name != "docker_info":
+ raise ValueError("not expecting module " + module_name)
+ return docker_info
+
+ check = DockerStorage(execute_module, non_atomic_task_vars())
+ check.check_dm_usage = lambda status: dict() # stub out for this test
+ check.check_overlay_usage = lambda info: dict() # stub out for this test
+ result = check.run()
+
+ if failed:
+ assert result["failed"]
+ else:
+ assert not result.get("failed", False)
+
+ for word in expect_msg:
+ assert word in result["msg"]
+
+
+enough_space = {
+ "Pool Name": "docker--vg-docker--pool",
+ "Data Space Used": "19.92 MB",
+ "Data Space Total": "8.535 GB",
+ "Metadata Space Used": "40.96 kB",
+ "Metadata Space Total": "25.17 MB",
+}
+
+not_enough_space = {
+ "Pool Name": "docker--vg-docker--pool",
+ "Data Space Used": "10 GB",
+ "Data Space Total": "10 GB",
+ "Metadata Space Used": "42 kB",
+ "Metadata Space Total": "43 kB",
+}
+
+
+@pytest.mark.parametrize('task_vars, driver_status, vg_free, success, expect_msg', [
+ (
+ {"max_thinpool_data_usage_percent": "not a float"},
+ enough_space,
+ "12g",
+ False,
+ ["is not a percentage"],
+ ),
+ (
+ {},
+ {}, # empty values from driver status
+ "bogus", # also does not parse as bytes
+ False,
+ ["Could not interpret", "as bytes"],
+ ),
+ (
+ {},
+ enough_space,
+ "12.00g",
+ True,
+ [],
+ ),
+ (
+ {},
+ not_enough_space,
+ "0.00",
+ False,
+ ["data usage", "metadata usage", "higher than threshold"],
+ ),
+])
+def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg):
+ check = DockerStorage(None, task_vars)
+ check.get_vg_free = lambda pool: vg_free
+ result = check.check_dm_usage(driver_status)
+ result_success = not result.get("failed")
+
+ assert result_success is success
+ for msg in expect_msg:
+ assert msg in result["msg"]
+
+
+@pytest.mark.parametrize('pool, command_returns, raises, returns', [
+ (
+ "foo-bar",
+ { # vgs missing
+ "msg": "[Errno 2] No such file or directory",
+ "failed": True,
+ "cmd": "/sbin/vgs",
+ "rc": 2,
+ },
+ "Failed to run /sbin/vgs",
+ None,
+ ),
+ (
+ "foo", # no hyphen in name - should not happen
+ {},
+ "name does not have the expected format",
+ None,
+ ),
+ (
+ "foo-bar",
+ dict(stdout=" 4.00g\n"),
+ None,
+ "4.00g",
+ ),
+ (
+ "foo-bar",
+ dict(stdout="\n"), # no matching VG
+ "vgs did not find this VG",
+ None,
+ )
+])
+def test_vg_free(pool, command_returns, raises, returns):
+ def execute_module(module_name, *_):
+ if module_name != "command":
+ raise ValueError("not expecting module " + module_name)
+ return command_returns
+
+ check = DockerStorage(execute_module)
+ if raises:
+ with pytest.raises(OpenShiftCheckException) as err:
+ check.get_vg_free(pool)
+ assert raises in str(err.value)
+ else:
+ ret = check.get_vg_free(pool)
+ assert ret == returns
+
+
+@pytest.mark.parametrize('string, expect_bytes', [
+ ("12", 12.0),
+ ("12 k", 12.0 * 1024),
+ ("42.42 MB", 42.42 * 1024**2),
+ ("12g", 12.0 * 1024**3),
+])
+def test_convert_to_bytes(string, expect_bytes):
+ got = DockerStorage.convert_to_bytes(string)
+ assert got == expect_bytes
+
+
+@pytest.mark.parametrize('string', [
+ "bork",
+ "42 Qs",
+])
+def test_convert_to_bytes_error(string):
+ with pytest.raises(ValueError) as err:
+ DockerStorage.convert_to_bytes(string)
+ assert "Cannot convert" in str(err.value)
+ assert string in str(err.value)
+
+
+ansible_mounts_enough = [{
+ 'mount': '/var/lib/docker',
+ 'size_available': 50 * 10**9,
+ 'size_total': 50 * 10**9,
+}]
+ansible_mounts_not_enough = [{
+ 'mount': '/var/lib/docker',
+ 'size_available': 0,
+ 'size_total': 50 * 10**9,
+}]
+ansible_mounts_missing_fields = [dict(mount='/var/lib/docker')]
+ansible_mounts_zero_size = [{
+ 'mount': '/var/lib/docker',
+ 'size_available': 0,
+ 'size_total': 0,
+}]
+
+
+@pytest.mark.parametrize('ansible_mounts, threshold, expect_fail, expect_msg', [
+ (
+ ansible_mounts_enough,
+ None,
+ False,
+ [],
+ ),
+ (
+ ansible_mounts_not_enough,
+ None,
+ True,
+ ["usage percentage", "higher than threshold"],
+ ),
+ (
+ ansible_mounts_not_enough,
+ "bogus percent",
+ True,
+ ["is not a percentage"],
+ ),
+ (
+ ansible_mounts_missing_fields,
+ None,
+ True,
+ ["Ansible bug"],
+ ),
+ (
+ ansible_mounts_zero_size,
+ None,
+ True,
+ ["Ansible bug"],
+ ),
+])
+def test_overlay_usage(ansible_mounts, threshold, expect_fail, expect_msg):
+ task_vars = non_atomic_task_vars()
+ task_vars["ansible_mounts"] = ansible_mounts
+ if threshold is not None:
+ task_vars["max_overlay_usage_percent"] = threshold
+ check = DockerStorage(None, task_vars)
+ docker_info = dict(DockerRootDir="/var/lib/docker", Driver="overlay")
+ result = check.check_overlay_usage(docker_info)
+
+ assert expect_fail == bool(result.get("failed"))
+ for msg in expect_msg:
+ assert msg in result["msg"]
diff --git a/roles/openshift_health_checker/test/elasticsearch_test.py b/roles/openshift_health_checker/test/elasticsearch_test.py
new file mode 100644
index 000000000..3fa5e8929
--- /dev/null
+++ b/roles/openshift_health_checker/test/elasticsearch_test.py
@@ -0,0 +1,203 @@
+import pytest
+import json
+
+from openshift_checks.logging.elasticsearch import Elasticsearch, OpenShiftCheckExceptionList
+
+
+task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin')))
+
+
+def canned_elasticsearch(task_vars=None, exec_oc=None):
+ """Create an Elasticsearch check object with stubbed exec_oc method"""
+ check = Elasticsearch(None, task_vars or {})
+ if exec_oc:
+ check.exec_oc = exec_oc
+ return check
+
+
+def assert_error_in_list(expect_err, errorlist):
+ assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist))
+
+
+def pods_by_name(pods):
+ return {pod['metadata']['name']: pod for pod in pods}
+
+
+plain_es_pod = {
+ "metadata": {
+ "labels": {"component": "es", "deploymentconfig": "logging-es"},
+ "name": "logging-es",
+ },
+ "spec": {},
+ "status": {
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "containerStatuses": [{"ready": True}],
+ "podIP": "10.10.10.10",
+ },
+ "_test_master_name_str": "name logging-es",
+}
+
+split_es_pod = {
+ "metadata": {
+ "labels": {"component": "es", "deploymentconfig": "logging-es-2"},
+ "name": "logging-es-2",
+ },
+ "spec": {},
+ "status": {
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "containerStatuses": [{"ready": True}],
+ "podIP": "10.10.10.10",
+ },
+ "_test_master_name_str": "name logging-es-2",
+}
+
+unready_es_pod = {
+ "metadata": {
+ "labels": {"component": "es", "deploymentconfig": "logging-es-3"},
+ "name": "logging-es-3",
+ },
+ "spec": {},
+ "status": {
+ "conditions": [{"status": "False", "type": "Ready"}],
+ "containerStatuses": [{"ready": False}],
+ "podIP": "10.10.10.10",
+ },
+ "_test_master_name_str": "BAD_NAME_RESPONSE",
+}
+
+
+def test_check_elasticsearch():
+ with pytest.raises(OpenShiftCheckExceptionList) as excinfo:
+ canned_elasticsearch().check_elasticsearch([])
+ assert_error_in_list('NoRunningPods', excinfo.value)
+
+ # canned oc responses to match so all the checks pass
+ def exec_oc(cmd, args, **_):
+ if '_cat/master' in cmd:
+ return 'name logging-es'
+ elif '/_nodes' in cmd:
+ return json.dumps(es_node_list)
+ elif '_cluster/health' in cmd:
+ return '{"status": "green"}'
+ elif ' df ' in cmd:
+ return 'IUse% Use%\n 3% 4%\n'
+ else:
+ raise Exception(cmd)
+
+ check = canned_elasticsearch({}, exec_oc)
+ check.get_pods_for_component = lambda *_: [plain_es_pod]
+ assert {} == check.run()
+
+
+def test_check_running_es_pods():
+ pods, errors = Elasticsearch().running_elasticsearch_pods([plain_es_pod, unready_es_pod])
+ assert plain_es_pod in pods
+ assert_error_in_list('PodNotRunning', errors)
+
+
+def test_check_elasticsearch_masters():
+ pods = [plain_es_pod]
+ check = canned_elasticsearch(task_vars_config_base, lambda *args, **_: plain_es_pod['_test_master_name_str'])
+ assert not check.check_elasticsearch_masters(pods_by_name(pods))
+
+
+@pytest.mark.parametrize('pods, expect_error', [
+ (
+ [],
+ 'NoMasterFound',
+ ),
+ (
+ [unready_es_pod],
+ 'NoMasterName',
+ ),
+ (
+ [plain_es_pod, split_es_pod],
+ 'SplitBrainMasters',
+ ),
+])
+def test_check_elasticsearch_masters_error(pods, expect_error):
+ test_pods = list(pods)
+ check = canned_elasticsearch(task_vars_config_base, lambda *args, **_: test_pods.pop(0)['_test_master_name_str'])
+ assert_error_in_list(expect_error, check.check_elasticsearch_masters(pods_by_name(pods)))
+
+
+es_node_list = {
+ 'nodes': {
+ 'random-es-name': {
+ 'host': 'logging-es',
+ }}}
+
+
+def test_check_elasticsearch_node_list():
+ check = canned_elasticsearch(task_vars_config_base, lambda *args, **_: json.dumps(es_node_list))
+ assert not check.check_elasticsearch_node_list(pods_by_name([plain_es_pod]))
+
+
+@pytest.mark.parametrize('pods, node_list, expect_error', [
+ (
+ [],
+ {},
+ 'MissingComponentPods',
+ ),
+ (
+ [plain_es_pod],
+ {}, # empty list of nodes triggers KeyError
+ 'MissingNodeList',
+ ),
+ (
+ [split_es_pod],
+ es_node_list,
+ 'EsPodNodeMismatch',
+ ),
+])
+def test_check_elasticsearch_node_list_errors(pods, node_list, expect_error):
+ check = canned_elasticsearch(task_vars_config_base, lambda cmd, args, **_: json.dumps(node_list))
+ assert_error_in_list(expect_error, check.check_elasticsearch_node_list(pods_by_name(pods)))
+
+
+def test_check_elasticsearch_cluster_health():
+ test_health_data = [{"status": "green"}]
+ check = canned_elasticsearch(exec_oc=lambda *args, **_: json.dumps(test_health_data.pop(0)))
+ assert not check.check_es_cluster_health(pods_by_name([plain_es_pod]))
+
+
+@pytest.mark.parametrize('pods, health_data, expect_error', [
+ (
+ [plain_es_pod],
+ [{"no-status": "should bomb"}],
+ 'BadEsResponse',
+ ),
+ (
+ [plain_es_pod, split_es_pod],
+ [{"status": "green"}, {"status": "red"}],
+ 'EsClusterHealthRed',
+ ),
+])
+def test_check_elasticsearch_cluster_health_errors(pods, health_data, expect_error):
+ test_health_data = list(health_data)
+ check = canned_elasticsearch(exec_oc=lambda *args, **_: json.dumps(test_health_data.pop(0)))
+ assert_error_in_list(expect_error, check.check_es_cluster_health(pods_by_name(pods)))
+
+
+def test_check_elasticsearch_diskspace():
+ check = canned_elasticsearch(exec_oc=lambda *args, **_: 'IUse% Use%\n 3% 4%\n')
+ assert not check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod]))
+
+
+@pytest.mark.parametrize('disk_data, expect_error', [
+ (
+ 'df: /elasticsearch/persistent: No such file or directory\n',
+ 'BadDfResponse',
+ ),
+ (
+ 'IUse% Use%\n 95% 40%\n',
+ 'InodeUsageTooHigh',
+ ),
+ (
+ 'IUse% Use%\n 3% 94%\n',
+ 'DiskUsageTooHigh',
+ ),
+])
+def test_check_elasticsearch_diskspace_errors(disk_data, expect_error):
+ check = canned_elasticsearch(exec_oc=lambda *args, **_: disk_data)
+ assert_error_in_list(expect_error, check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod])))
diff --git a/roles/openshift_health_checker/test/etcd_imagedata_size_test.py b/roles/openshift_health_checker/test/etcd_imagedata_size_test.py
new file mode 100644
index 000000000..d3aae98f2
--- /dev/null
+++ b/roles/openshift_health_checker/test/etcd_imagedata_size_test.py
@@ -0,0 +1,329 @@
+import pytest
+
+from collections import namedtuple
+from openshift_checks.etcd_imagedata_size import EtcdImageDataSize
+from openshift_checks import OpenShiftCheckException
+from etcdkeysize import check_etcd_key_size
+
+
+def fake_etcd_client(root):
+ fake_nodes = dict()
+ fake_etcd_node(root, fake_nodes)
+
+ clientclass = namedtuple("client", ["read"])
+ return clientclass(lambda key, recursive: fake_etcd_result(fake_nodes[key]))
+
+
+def fake_etcd_result(fake_node):
+ resultclass = namedtuple("result", ["leaves"])
+ if not fake_node.dir:
+ return resultclass([fake_node])
+
+ return resultclass(fake_node.leaves)
+
+
+def fake_etcd_node(node, visited):
+ min_req_fields = ["dir", "key"]
+ fields = list(node)
+ leaves = []
+
+ if node["dir"] and node.get("leaves"):
+ for leaf in node["leaves"]:
+ leaves.append(fake_etcd_node(leaf, visited))
+
+ if len(set(min_req_fields) - set(fields)) > 0:
+ raise ValueError("fake etcd nodes require at least {} fields.".format(min_req_fields))
+
+ if node.get("leaves"):
+ node["leaves"] = leaves
+
+ nodeclass = namedtuple("node", fields)
+ nodeinst = nodeclass(**node)
+ visited[nodeinst.key] = nodeinst
+
+ return nodeinst
+
+
+@pytest.mark.parametrize('ansible_mounts,extra_words', [
+ ([], ['none']), # empty ansible_mounts
+ ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths
+])
+def test_cannot_determine_available_mountpath(ansible_mounts, extra_words):
+ task_vars = dict(
+ ansible_mounts=ansible_mounts,
+ )
+ check = EtcdImageDataSize(fake_execute_module, task_vars)
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.run()
+
+ for word in ['Unable to determine mount point'] + extra_words:
+ assert word in str(excinfo.value)
+
+
+@pytest.mark.parametrize('ansible_mounts,tree,size_limit,should_fail,extra_words', [
+ (
+ # test that default image size limit evals to 1/2 * (total size in use)
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ {"dir": False, "key": "/", "value": "1234"},
+ None,
+ False,
+ [],
+ ),
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 48 * 10**9,
+ }],
+ {"dir": False, "key": "/", "value": "1234"},
+ None,
+ False,
+ [],
+ ),
+ (
+ # set max size limit for image data to be below total node value
+ # total node value is defined as the sum of the value field
+ # from every node
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 48 * 10**9,
+ }],
+ {"dir": False, "key": "/", "value": "12345678"},
+ 7,
+ True,
+ ["exceeds the maximum recommended limit", "0.00 GB"],
+ ),
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 48 * 10**9 - 1,
+ 'size_total': 48 * 10**9,
+ }],
+ {"dir": False, "key": "/", "value": "1234"},
+ None,
+ True,
+ ["exceeds the maximum recommended limit", "0.00 GB"],
+ )
+])
+def test_check_etcd_key_size_calculates_correct_limit(ansible_mounts, tree, size_limit, should_fail, extra_words):
+ def execute_module(module_name, module_args, *_):
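+        # Intercept the etcdkeysize module call and run the real check_etcd_key_size
+        # against the fake tree, so the size limit passed in by the check is exercised.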
+ if module_name != "etcdkeysize":
+ return {
+ "changed": False,
+ }
+
+ client = fake_etcd_client(tree)
+ s, limit_exceeded = check_etcd_key_size(client, tree["key"], module_args["size_limit_bytes"])
+
+ return {"size_limit_exceeded": limit_exceeded}
+
+ task_vars = dict(
+ etcd_max_image_data_size_bytes=size_limit,
+ ansible_mounts=ansible_mounts,
+ openshift=dict(
+ master=dict(etcd_hosts=["localhost"]),
+ common=dict(config_base="/var/lib/origin")
+ )
+ )
+ if size_limit is None:
+ task_vars.pop("etcd_max_image_data_size_bytes")
+
+ check = EtcdImageDataSize(execute_module, task_vars).run()
+
+ if should_fail:
+ assert check["failed"]
+
+ for word in extra_words:
+ assert word in check["msg"]
+ else:
+ assert not check.get("failed", False)
+
+
+@pytest.mark.parametrize('ansible_mounts,tree,root_path,expected_size,extra_words', [
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ # test recursive size check on tree with height > 1
+ {
+ "dir": True,
+ "key": "/",
+ "leaves": [
+ {"dir": False, "key": "/foo1", "value": "1234"},
+ {"dir": False, "key": "/foo2", "value": "1234"},
+ {"dir": False, "key": "/foo3", "value": "1234"},
+ {"dir": False, "key": "/foo4", "value": "1234"},
+ {
+ "dir": True,
+ "key": "/foo5",
+ "leaves": [
+ {"dir": False, "key": "/foo/bar1", "value": "56789"},
+ {"dir": False, "key": "/foo/bar2", "value": "56789"},
+ {"dir": False, "key": "/foo/bar3", "value": "56789"},
+ {
+ "dir": True,
+ "key": "/foo/bar4",
+ "leaves": [
+ {"dir": False, "key": "/foo/bar/baz1", "value": "123"},
+ {"dir": False, "key": "/foo/bar/baz2", "value": "123"},
+ ]
+ },
+ ]
+ },
+ ]
+ },
+ "/",
+ 37,
+ [],
+ ),
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ # test correct sub-tree size calculation
+ {
+ "dir": True,
+ "key": "/",
+ "leaves": [
+ {"dir": False, "key": "/foo1", "value": "1234"},
+ {"dir": False, "key": "/foo2", "value": "1234"},
+ {"dir": False, "key": "/foo3", "value": "1234"},
+ {"dir": False, "key": "/foo4", "value": "1234"},
+ {
+ "dir": True,
+ "key": "/foo5",
+ "leaves": [
+ {"dir": False, "key": "/foo/bar1", "value": "56789"},
+ {"dir": False, "key": "/foo/bar2", "value": "56789"},
+ {"dir": False, "key": "/foo/bar3", "value": "56789"},
+ {
+ "dir": True,
+ "key": "/foo/bar4",
+ "leaves": [
+ {"dir": False, "key": "/foo/bar/baz1", "value": "123"},
+ {"dir": False, "key": "/foo/bar/baz2", "value": "123"},
+ ]
+ },
+ ]
+ },
+ ]
+ },
+ "/foo5",
+ 21,
+ [],
+ ),
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ # test that a non-existing key is handled correctly
+ {
+ "dir": False,
+ "key": "/",
+ "value": "1234",
+ },
+ "/missing",
+ 0,
+ [],
+ ),
+ (
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ # test etcd cycle handling
+ {
+ "dir": True,
+ "key": "/",
+ "leaves": [
+ {"dir": False, "key": "/foo1", "value": "1234"},
+ {"dir": False, "key": "/foo2", "value": "1234"},
+ {"dir": False, "key": "/foo3", "value": "1234"},
+ {"dir": False, "key": "/foo4", "value": "1234"},
+ {
+ "dir": True,
+ "key": "/",
+ "leaves": [
+ {"dir": False, "key": "/foo1", "value": "1"},
+ ],
+ },
+ ]
+ },
+ "/",
+ 16,
+ [],
+ ),
+])
+def test_etcd_key_size_check_calculates_correct_size(ansible_mounts, tree, root_path, expected_size, extra_words):
+ def execute_module(module_name, module_args, *_):
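+        # Run the real key-size calculation against the fake tree and verify the
+        # computed size before handing the limit result back to the check.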
+ if module_name != "etcdkeysize":
+ return {
+ "changed": False,
+ }
+
+ client = fake_etcd_client(tree)
+ size, limit_exceeded = check_etcd_key_size(client, root_path, module_args["size_limit_bytes"])
+
+ assert size == expected_size
+ return {
+ "size_limit_exceeded": limit_exceeded,
+ }
+
+ task_vars = dict(
+ ansible_mounts=ansible_mounts,
+ openshift=dict(
+ master=dict(etcd_hosts=["localhost"]),
+ common=dict(config_base="/var/lib/origin")
+ )
+ )
+
+ check = EtcdImageDataSize(execute_module, task_vars).run()
+ assert not check.get("failed", False)
+
+
+def test_etcdkeysize_module_failure():
+ def execute_module(module_name, *_):
+ if module_name != "etcdkeysize":
+ return {
+ "changed": False,
+ }
+
+ return {
+ "rc": 1,
+ "module_stderr": "failure",
+ }
+
+ task_vars = dict(
+ ansible_mounts=[{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9,
+ }],
+ openshift=dict(
+ master=dict(etcd_hosts=["localhost"]),
+ common=dict(config_base="/var/lib/origin")
+ )
+ )
+
+ check = EtcdImageDataSize(execute_module, task_vars).run()
+
+ assert check["failed"]
+ for word in "Failed to retrieve stats":
+ assert word in check["msg"]
+
+
+def fake_execute_module(*args):
+ raise AssertionError('this function should not be called')
diff --git a/roles/openshift_health_checker/test/etcd_traffic_test.py b/roles/openshift_health_checker/test/etcd_traffic_test.py
new file mode 100644
index 000000000..dd6f4ad81
--- /dev/null
+++ b/roles/openshift_health_checker/test/etcd_traffic_test.py
@@ -0,0 +1,72 @@
+import pytest
+
+from openshift_checks.etcd_traffic import EtcdTraffic
+
+
+@pytest.mark.parametrize('group_names,version,is_active', [
+ (['oo_masters_to_config'], "3.5", False),
+ (['oo_masters_to_config'], "3.6", False),
+ (['oo_nodes_to_config'], "3.4", False),
+ (['oo_etcd_to_config'], "3.4", True),
+ (['oo_etcd_to_config'], "1.5", True),
+ (['oo_etcd_to_config'], "3.1", False),
+ (['oo_masters_to_config', 'oo_nodes_to_config'], "3.5", False),
+ (['oo_masters_to_config', 'oo_etcd_to_config'], "3.5", True),
+ ([], "3.4", False),
+])
+def test_is_active(group_names, version, is_active):
+ task_vars = dict(
+ group_names=group_names,
+ openshift_image_tag=version,
+ )
+ assert EtcdTraffic(task_vars=task_vars).is_active() == is_active
+
+
+@pytest.mark.parametrize('group_names,matched,failed,extra_words', [
+ (["oo_masters_to_config"], True, True, ["Higher than normal", "traffic"]),
+ (["oo_masters_to_config", "oo_etcd_to_config"], False, False, []),
+ (["oo_etcd_to_config"], False, False, []),
+])
+def test_log_matches_high_traffic_msg(group_names, matched, failed, extra_words):
+ def execute_module(module_name, *_):
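+        # Fake the search_journalctl module result: "matched" means the high-traffic
+        # log message pattern was found in the journal.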
+ return {
+ "matched": matched,
+ "failed": failed,
+ }
+
+ task_vars = dict(
+ group_names=group_names,
+ openshift=dict(
+ common=dict(service_type="origin", is_containerized=False),
+ )
+ )
+
+ result = EtcdTraffic(execute_module, task_vars).run()
+
+ for word in extra_words:
+ assert word in result.get("msg", "")
+
+ assert result.get("failed", False) == failed
+
+
+@pytest.mark.parametrize('is_containerized,expected_unit_value', [
+ (False, "etcd"),
+ (True, "etcd_container"),
+])
+def test_systemd_unit_matches_deployment_type(is_containerized, expected_unit_value):
+ task_vars = dict(
+ openshift=dict(
+ common=dict(is_containerized=is_containerized),
+ )
+ )
+
+ def execute_module(module_name, args, *_):
+ assert module_name == "search_journalctl"
+ matchers = args["log_matchers"]
+
+ for matcher in matchers:
+ assert matcher["unit"] == expected_unit_value
+
+ return {"failed": False}
+
+ EtcdTraffic(execute_module, task_vars).run()
diff --git a/roles/openshift_health_checker/test/etcd_volume_test.py b/roles/openshift_health_checker/test/etcd_volume_test.py
new file mode 100644
index 000000000..077cea3ea
--- /dev/null
+++ b/roles/openshift_health_checker/test/etcd_volume_test.py
@@ -0,0 +1,147 @@
+import pytest
+
+from openshift_checks.etcd_volume import EtcdVolume
+from openshift_checks import OpenShiftCheckException
+
+
+@pytest.mark.parametrize('ansible_mounts,extra_words', [
+ ([], ['none']), # empty ansible_mounts
+ ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths
+])
+def test_cannot_determine_available_disk(ansible_mounts, extra_words):
+ task_vars = dict(
+ ansible_mounts=ansible_mounts,
+ )
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ EtcdVolume(fake_execute_module, task_vars).run()
+
+ for word in ['Unable to determine mount point'] + extra_words:
+ assert word in str(excinfo.value)
+
+
+@pytest.mark.parametrize('size_limit,ansible_mounts', [
+ (
+ # if no size limit is specified, expect max usage
+ # limit to default to 90% of size_total
+ None,
+ [{
+ 'mount': '/',
+ 'size_available': 40 * 10**9,
+ 'size_total': 80 * 10**9
+ }],
+ ),
+ (
+ 1,
+ [{
+ 'mount': '/',
+ 'size_available': 30 * 10**9,
+ 'size_total': 30 * 10**9,
+ }],
+ ),
+ (
+ 20000000000,
+ [{
+ 'mount': '/',
+ 'size_available': 20 * 10**9,
+ 'size_total': 40 * 10**9,
+ }],
+ ),
+ (
+ 5000000000,
+ [{
+ # not enough space on / ...
+ 'mount': '/',
+ 'size_available': 0,
+ 'size_total': 0,
+ }, {
+ # not enough space on /var/lib ...
+ 'mount': '/var/lib',
+ 'size_available': 2 * 10**9,
+ 'size_total': 21 * 10**9,
+ }, {
+ # ... but enough on /var/lib/etcd
+ 'mount': '/var/lib/etcd',
+ 'size_available': 36 * 10**9,
+ 'size_total': 40 * 10**9
+ }],
+ )
+])
+def test_succeeds_with_recommended_disk_space(size_limit, ansible_mounts):
+ task_vars = dict(
+ etcd_device_usage_threshold_percent=size_limit,
+ ansible_mounts=ansible_mounts,
+ )
+
+ if task_vars["etcd_device_usage_threshold_percent"] is None:
+ task_vars.pop("etcd_device_usage_threshold_percent")
+
+ result = EtcdVolume(fake_execute_module, task_vars).run()
+
+ assert not result.get('failed', False)
+
+
+@pytest.mark.parametrize('size_limit_percent,ansible_mounts,extra_words', [
+ (
+ # if no size limit is specified, expect max usage
+ # limit to default to 90% of size_total
+ None,
+ [{
+ 'mount': '/',
+ 'size_available': 1 * 10**9,
+ 'size_total': 100 * 10**9,
+ }],
+ ['99.0%'],
+ ),
+ (
+ 70.0,
+ [{
+ 'mount': '/',
+ 'size_available': 1 * 10**6,
+ 'size_total': 5 * 10**9,
+ }],
+ ['100.0%'],
+ ),
+ (
+ 40.0,
+ [{
+ 'mount': '/',
+ 'size_available': 2 * 10**9,
+ 'size_total': 6 * 10**9,
+ }],
+ ['66.7%'],
+ ),
+ (
+ None,
+ [{
+ # enough space on /var ...
+ 'mount': '/var',
+ 'size_available': 20 * 10**9,
+ 'size_total': 20 * 10**9,
+ }, {
+ # .. but not enough on /var/lib
+ 'mount': '/var/lib',
+ 'size_available': 1 * 10**9,
+ 'size_total': 20 * 10**9,
+ }],
+ ['95.0%'],
+ ),
+])
+def test_fails_with_insufficient_disk_space(size_limit_percent, ansible_mounts, extra_words):
+ task_vars = dict(
+ etcd_device_usage_threshold_percent=size_limit_percent,
+ ansible_mounts=ansible_mounts,
+ )
+
+ if task_vars["etcd_device_usage_threshold_percent"] is None:
+ task_vars.pop("etcd_device_usage_threshold_percent")
+
+ result = EtcdVolume(fake_execute_module, task_vars).run()
+
+ assert result['failed']
+ for word in extra_words:
+ assert word in result['msg']
+
+
+def fake_execute_module(*args):
+ raise AssertionError('this function should not be called')
diff --git a/roles/openshift_health_checker/test/fluentd_config_test.py b/roles/openshift_health_checker/test/fluentd_config_test.py
new file mode 100644
index 000000000..b5b4858d6
--- /dev/null
+++ b/roles/openshift_health_checker/test/fluentd_config_test.py
@@ -0,0 +1,348 @@
+import pytest
+
+from openshift_checks.logging.fluentd_config import FluentdConfig, OpenShiftCheckException
+
+
+def canned_fluentd_pod(containers):
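+    # Build a running fluentd pod definition with the given containers, so tests
+    # can vary the USE_JOURNAL environment settings per container.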
+ return {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-1",
+ },
+ "spec": {
+ "host": "node1",
+ "nodeName": "node1",
+ "containers": containers,
+ },
+ "status": {
+ "phase": "Running",
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+ }
+
+
+fluentd_pod = {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-1",
+ },
+ "spec": {
+ "host": "node1",
+ "nodeName": "node1",
+ "containers": [
+ {
+ "name": "container1",
+ "env": [
+ {
+ "name": "USE_JOURNAL",
+ "value": "true",
+ }
+ ],
+ }
+ ],
+ },
+ "status": {
+ "phase": "Running",
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
+not_running_fluentd_pod = {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-2",
+ },
+ "status": {
+ "phase": "Unknown",
+ "containerStatuses": [{"ready": True}, {"ready": False}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
+
+@pytest.mark.parametrize('name, use_journald, logging_driver, extra_words', [
+ (
+ 'test success with use_journald=false, and docker config set to use "json-file"',
+ False,
+ "json-file",
+ [],
+ ),
+], ids=lambda argvals: argvals[0])
+def test_check_logging_config_non_master(name, use_journald, logging_driver, extra_words):
+ def execute_module(module_name, args):
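+        # Only the docker_info module matters for this check; report the configured logging driver.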
+ if module_name == "docker_info":
+ return {
+ "info": {
+ "LoggingDriver": logging_driver,
+ }
+ }
+
+ return {}
+
+ task_vars = dict(
+ group_names=["oo_nodes_to_config", "oo_etcd_to_config"],
+ openshift_logging_fluentd_use_journal=use_journald,
+ openshift=dict(
+ common=dict(config_base=""),
+ ),
+ )
+
+ check = FluentdConfig(execute_module, task_vars)
+ check.execute_module = execute_module
+ error = check.check_logging_config()
+
+ assert error is None
+
+
+@pytest.mark.parametrize('name, use_journald, logging_driver, words', [
+ (
+ 'test failure with use_journald=false, but docker config set to use "journald"',
+ False,
+ "journald",
+ ['json log files', 'has been set to use "journald"'],
+ ),
+ (
+ 'test failure with use_journald=false, but docker config set to use an "unsupported" driver',
+ False,
+ "unsupported",
+ ["json log files", 'has been set to use "unsupported"'],
+ ),
+ (
+ 'test failure with use_journald=true, but docker config set to use "json-file"',
+ True,
+ "json-file",
+ ['logs from "journald"', 'has been set to use "json-file"'],
+ ),
+], ids=lambda argvals: argvals[0])
+def test_check_logging_config_non_master_failed(name, use_journald, logging_driver, words):
+ def execute_module(module_name, args):
+ if module_name == "docker_info":
+ return {
+ "info": {
+ "LoggingDriver": logging_driver,
+ }
+ }
+
+ return {}
+
+ task_vars = dict(
+ group_names=["oo_nodes_to_config", "oo_etcd_to_config"],
+ openshift_logging_fluentd_use_journal=use_journald,
+ openshift=dict(
+ common=dict(config_base=""),
+ ),
+ )
+
+ check = FluentdConfig(execute_module, task_vars)
+ check.execute_module = execute_module
+ error = check.check_logging_config()
+
+ assert error is not None
+ for word in words:
+ assert word in error
+
+
+@pytest.mark.parametrize('name, pods, logging_driver, extra_words', [
+ # use_journald returns false (not using journald), but check succeeds
+ # since docker is set to use json-file
+ (
+ 'test success with use_journald=false, and docker config set to use default driver "json-file"',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [{
+ "name": "USE_JOURNAL",
+ "value": "false",
+ }],
+ },
+ ]
+ )],
+ "json-file",
+ [],
+ ),
+ (
+ 'test success with USE_JOURNAL env var missing and docker config set to use default driver "json-file"',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [{
+ "name": "RANDOM",
+ "value": "value",
+ }],
+ },
+ ]
+ )],
+ "json-file",
+ [],
+ ),
+], ids=lambda argvals: argvals[0])
+def test_check_logging_config_master(name, pods, logging_driver, extra_words):
+ def execute_module(module_name, args):
+ if module_name == "docker_info":
+ return {
+ "info": {
+ "LoggingDriver": logging_driver,
+ }
+ }
+
+ return {}
+
+ task_vars = dict(
+ group_names=["oo_masters_to_config"],
+ openshift=dict(
+ common=dict(config_base=""),
+ ),
+ )
+
+ check = FluentdConfig(execute_module, task_vars)
+ check.execute_module = execute_module
+ check.get_pods_for_component = lambda _: pods
+ error = check.check_logging_config()
+
+ assert error is None
+
+
+@pytest.mark.parametrize('name, pods, logging_driver, words', [
+ (
+ 'test failure with use_journald=false, but docker config set to use "journald"',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [{
+ "name": "USE_JOURNAL",
+ "value": "false",
+ }],
+ },
+ ]
+ )],
+ "journald",
+ ['json log files', 'has been set to use "journald"'],
+ ),
+ (
+ 'test failure with use_journald=true, but docker config set to use "json-file"',
+ [fluentd_pod],
+ "json-file",
+ ['logs from "journald"', 'has been set to use "json-file"'],
+ ),
+ (
+ 'test failure with use_journald=false, but docker set to use an "unsupported" driver',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [{
+ "name": "USE_JOURNAL",
+ "value": "false",
+ }],
+ },
+ ]
+ )],
+ "unsupported",
+ ["json log files", 'has been set to use "unsupported"'],
+ ),
+ (
+ 'test failure with USE_JOURNAL env var missing and docker config set to use "journald"',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [{
+ "name": "RANDOM",
+ "value": "value",
+ }],
+ },
+ ]
+ )],
+ "journald",
+ ["configuration is set to", "json log files"],
+ ),
+], ids=lambda argvals: argvals[0])
+def test_check_logging_config_master_failed(name, pods, logging_driver, words):
+ def execute_module(module_name, args):
+ if module_name == "docker_info":
+ return {
+ "info": {
+ "LoggingDriver": logging_driver,
+ }
+ }
+
+ return {}
+
+ task_vars = dict(
+ group_names=["oo_masters_to_config"],
+ openshift=dict(
+ common=dict(config_base=""),
+ ),
+ )
+
+ check = FluentdConfig(execute_module, task_vars)
+ check.execute_module = execute_module
+ check.get_pods_for_component = lambda _: pods
+ error = check.check_logging_config()
+
+ assert error is not None
+ for word in words:
+ assert word in error
+
+
+@pytest.mark.parametrize('name, pods, response, logging_driver, extra_words', [
+ (
+ 'test OpenShiftCheckException with no running containers',
+ [canned_fluentd_pod([])],
+ {
+ "failed": True,
+ "result": "unexpected",
+ },
+ "json-file",
+ ['no running containers'],
+ ),
+ (
+ 'test OpenShiftCheckException one container and no env vars set',
+ [canned_fluentd_pod(
+ [
+ {
+ "name": "container1",
+ "env": [],
+ },
+ ]
+ )],
+ {
+ "failed": True,
+ "result": "unexpected",
+ },
+ "json-file",
+ ['no environment variables'],
+ ),
+], ids=lambda argvals: argvals[0])
+def test_check_logging_config_master_fails_on_unscheduled_deployment(name, pods, response, logging_driver, extra_words):
+ def execute_module(module_name, args):
+ if module_name == "docker_info":
+ return {
+ "info": {
+ "LoggingDriver": logging_driver,
+ }
+ }
+
+ return {}
+
+ task_vars = dict(
+ group_names=["oo_masters_to_config"],
+ openshift=dict(
+ common=dict(config_base=""),
+ ),
+ )
+
+ check = FluentdConfig(execute_module, task_vars)
+ check.get_pods_for_component = lambda _: pods
+
+ with pytest.raises(OpenShiftCheckException) as error:
+ check.check_logging_config()
+
+    for word in extra_words:
+        assert word in str(error.value)
diff --git a/roles/openshift_health_checker/test/fluentd_test.py b/roles/openshift_health_checker/test/fluentd_test.py
new file mode 100644
index 000000000..e7bf9818b
--- /dev/null
+++ b/roles/openshift_health_checker/test/fluentd_test.py
@@ -0,0 +1,112 @@
+import pytest
+import json
+
+from openshift_checks.logging.fluentd import Fluentd, OpenShiftCheckExceptionList, OpenShiftCheckException
+
+
+def assert_error_in_list(expect_err, errorlist):
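+    # Assert that an error with the expected .name is present in the exception list.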
+ assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist))
+
+
+fluentd_pod_node1 = {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-1",
+ },
+ "spec": {"host": "node1", "nodeName": "node1"},
+ "status": {
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+fluentd_pod_node2_down = {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-2",
+ },
+ "spec": {"host": "node2", "nodeName": "node2"},
+ "status": {
+ "containerStatuses": [{"ready": False}],
+ "conditions": [{"status": "False", "type": "Ready"}],
+ }
+}
+fluentd_node1 = {
+ "metadata": {
+ "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "node1"},
+ "name": "node1",
+ },
+ "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.1"}]},
+}
+fluentd_node2 = {
+ "metadata": {
+ "labels": {"logging-infra-fluentd": "true", "kubernetes.io/hostname": "hostname"},
+ "name": "node2",
+ },
+ "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.2"}]},
+}
+fluentd_node3_unlabeled = {
+ "metadata": {
+ "labels": {"kubernetes.io/hostname": "hostname"},
+ "name": "node3",
+ },
+ "status": {"addresses": [{"type": "InternalIP", "address": "10.10.1.3"}]},
+}
+
+
+def test_get_fluentd_pods():
+ check = Fluentd()
+ check.exec_oc = lambda *_: json.dumps(dict(items=[fluentd_node1]))
+ check.get_pods_for_component = lambda *_: [fluentd_pod_node1]
+ assert not check.run()
+
+
+@pytest.mark.parametrize('pods, nodes, expect_error', [
+ (
+ [],
+ [],
+ 'NoNodesDefined',
+ ),
+ (
+ [],
+ [fluentd_node3_unlabeled],
+ 'NoNodesLabeled',
+ ),
+ (
+ [],
+ [fluentd_node1, fluentd_node3_unlabeled],
+ 'NodesUnlabeled',
+ ),
+ (
+ [],
+ [fluentd_node2],
+ 'MissingFluentdPod',
+ ),
+ (
+ [fluentd_pod_node1, fluentd_pod_node1],
+ [fluentd_node1],
+ 'TooManyFluentdPods',
+ ),
+ (
+ [fluentd_pod_node2_down],
+ [fluentd_node2],
+ 'FluentdNotRunning',
+ ),
+])
+def test_get_fluentd_pods_errors(pods, nodes, expect_error):
+ check = Fluentd()
+ check.exec_oc = lambda *_: json.dumps(dict(items=nodes))
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.check_fluentd(pods)
+ if isinstance(excinfo.value, OpenShiftCheckExceptionList):
+ assert_error_in_list(expect_error, excinfo.value)
+ else:
+ assert expect_error == excinfo.value.name
+
+
+def test_bad_oc_node_list():
+ check = Fluentd()
+ check.exec_oc = lambda *_: "this isn't even json"
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.get_nodes_by_name()
+ assert 'BadOcNodeList' == excinfo.value.name
diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py
new file mode 100644
index 000000000..04a5e89c4
--- /dev/null
+++ b/roles/openshift_health_checker/test/kibana_test.py
@@ -0,0 +1,244 @@
+import pytest
+import json
+
+try:
+ import urllib2
+ from urllib2 import HTTPError, URLError
+except ImportError:
+ from urllib.error import HTTPError, URLError
+ import urllib.request as urllib2
+
+from openshift_checks.logging.kibana import Kibana, OpenShiftCheckException
+
+
+plain_kibana_pod = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+not_running_kibana_pod = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-2",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": False}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
+
+def test_check_kibana():
+ # should run without exception:
+ Kibana().check_kibana([plain_kibana_pod])
+
+
+@pytest.mark.parametrize('pods, expect_error', [
+ (
+ [],
+ "MissingComponentPods",
+ ),
+ (
+ [not_running_kibana_pod],
+ "NoRunningPods",
+ ),
+ (
+ [plain_kibana_pod, not_running_kibana_pod],
+ "PodNotRunning",
+ ),
+])
+def test_check_kibana_error(pods, expect_error):
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ Kibana().check_kibana(pods)
+ assert expect_error == excinfo.value.name
+
+
+@pytest.mark.parametrize('comment, route, expect_error', [
+ (
+ "No route returned",
+ None,
+ "no_route_exists",
+ ),
+
+ (
+ "broken route response",
+ {"status": {}},
+ "get_route_failed",
+ ),
+ (
+ "route with no ingress",
+ {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana",
+ },
+ "status": {
+ "ingress": [],
+ },
+ "spec": {
+ "host": "hostname",
+ }
+ },
+ "route_not_accepted",
+ ),
+
+ (
+ "route with no host",
+ {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana",
+ },
+ "status": {
+ "ingress": [{
+ "status": True,
+ }],
+ },
+ "spec": {},
+ },
+ "route_missing_host",
+ ),
+])
+def test_get_kibana_url_error(comment, route, expect_error):
+ check = Kibana()
+ check.exec_oc = lambda *_: json.dumps(route) if route else ""
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check._get_kibana_url()
+ assert excinfo.value.name == expect_error
+
+
+@pytest.mark.parametrize('comment, route, expect_url', [
+ (
+ "test route that looks fine",
+ {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana",
+ },
+ "status": {
+ "ingress": [{
+ "status": True,
+ }],
+ },
+ "spec": {
+ "host": "hostname",
+ },
+ },
+ "https://hostname/",
+ ),
+])
+def test_get_kibana_url(comment, route, expect_url):
+ check = Kibana()
+ check.exec_oc = lambda *_: json.dumps(route)
+ assert expect_url == check._get_kibana_url()
+
+
+@pytest.mark.parametrize('exec_result, expect', [
+ (
+ 'urlopen error [Errno 111] Connection refused',
+ 'FailedToConnectInternal',
+ ),
+ (
+ 'urlopen error [Errno -2] Name or service not known',
+ 'FailedToResolveInternal',
+ ),
+ (
+ 'Status code was not [302]: HTTP Error 500: Server error',
+ 'WrongReturnCodeInternal',
+ ),
+ (
+ 'bork bork bork',
+ 'MiscRouteErrorInternal',
+ ),
+])
+def test_verify_url_internal_failure(exec_result, expect):
+ check = Kibana(execute_module=lambda *_: dict(failed=True, msg=exec_result))
+ check._get_kibana_url = lambda: 'url'
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.check_kibana_route()
+ assert expect == excinfo.value.name
+
+
+@pytest.mark.parametrize('lib_result, expect', [
+ (
+ HTTPError('url', 500, 'it broke', hdrs=None, fp=None),
+ 'MiscRouteError',
+ ),
+ (
+ URLError('urlopen error [Errno 111] Connection refused'),
+ 'FailedToConnect',
+ ),
+ (
+ URLError('urlopen error [Errno -2] Name or service not known'),
+ 'FailedToResolve',
+ ),
+ (
+ 302,
+ 'WrongReturnCode',
+ ),
+ (
+ 200,
+ None,
+ ),
+])
+def test_verify_url_external_failure(lib_result, expect, monkeypatch):
+
+ class _http_return:
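+        # Mimics the response object returned by urlopen; only getcode() is used here.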
+
+ def __init__(self, code):
+ self.code = code
+
+ def getcode(self):
+ return self.code
+
+ def urlopen(url, context):
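+        # Monkeypatched stand-in for urllib2.urlopen: return a fake response for an
+        # integer status, otherwise raise the configured urllib error.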
+ if type(lib_result) is int:
+ return _http_return(lib_result)
+ raise lib_result
+ monkeypatch.setattr(urllib2, 'urlopen', urlopen)
+
+ check = Kibana()
+ check._get_kibana_url = lambda: 'url'
+ check._verify_url_internal = lambda url: None
+
+ if not expect:
+ check.check_kibana_route()
+ return
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.check_kibana_route()
+ assert expect == excinfo.value.name
+
+
+def test_verify_url_external_skip():
+ check = Kibana(lambda *_: {}, dict(openshift_check_efk_kibana_external="false"))
+ check._get_kibana_url = lambda: 'url'
+ check.check_kibana_route()
+
+
+# this is kind of silly but it adds coverage for the run() method...
+def test_run():
+ pods = ["foo"]
+ ran = dict(check_kibana=False, check_route=False)
+
+ def check_kibana(pod_list):
+ ran["check_kibana"] = True
+ assert pod_list == pods
+
+ def check_kibana_route():
+ ran["check_route"] = True
+
+ check = Kibana()
+ check.get_pods_for_component = lambda *_: pods
+ check.check_kibana = check_kibana
+ check.check_kibana_route = check_kibana_route
+
+ check.run()
+ assert ran["check_kibana"] and ran["check_route"]
diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py
new file mode 100644
index 000000000..59c703214
--- /dev/null
+++ b/roles/openshift_health_checker/test/logging_check_test.py
@@ -0,0 +1,166 @@
+import pytest
+import json
+
+from openshift_checks.logging.logging import LoggingCheck, MissingComponentPods, CouldNotUseOc
+
+task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin')))
+
+
+def canned_loggingcheck(exec_oc=None, execute_module=None):
+ """Create a LoggingCheck object with canned exec_oc method"""
+ check = LoggingCheck(execute_module)
+ if exec_oc:
+ check.exec_oc = exec_oc
+ return check
+
+
+def assert_error(error, expect_error):
+ if expect_error:
+ assert error
+ assert expect_error in error
+ else:
+ assert not error
+
+
+plain_es_pod = {
+ "metadata": {
+ "labels": {"component": "es", "deploymentconfig": "logging-es"},
+ "name": "logging-es",
+ },
+ "status": {
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "containerStatuses": [{"ready": True}],
+ "podIP": "10.10.10.10",
+ },
+ "_test_master_name_str": "name logging-es",
+}
+
+plain_kibana_pod = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
+plain_kibana_pod_no_containerstatus = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-1",
+ },
+ "status": {
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
+fluentd_pod_node1 = {
+ "metadata": {
+ "labels": {"component": "fluentd", "deploymentconfig": "logging-fluentd"},
+ "name": "logging-fluentd-1",
+ },
+ "spec": {"host": "node1", "nodeName": "node1"},
+ "status": {
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ }
+}
+
+plain_curator_pod = {
+ "metadata": {
+ "labels": {"component": "curator", "deploymentconfig": "logging-curator"},
+ "name": "logging-curator-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "podIP": "10.10.10.10",
+ }
+}
+
+
+@pytest.mark.parametrize('problem, expect', [
+ ("[Errno 2] No such file or directory", "supposed to be a master"),
+ ("Permission denied", "Unexpected error using `oc`"),
+])
+def test_oc_failure(problem, expect):
+ def execute_module(module_name, *_):
+ if module_name == "ocutil":
+ return dict(failed=True, result=problem)
+ return dict(changed=False)
+
+ check = LoggingCheck(execute_module, task_vars_config_base)
+
+ with pytest.raises(CouldNotUseOc) as excinfo:
+ check.exec_oc('get foo', [])
+    assert expect in str(excinfo.value)
+
+
+groups_with_first_master = dict(oo_first_master=['this-host'])
+groups_not_a_master = dict(oo_first_master=['other-host'], oo_masters=['other-host'])
+
+
+@pytest.mark.parametrize('groups, logging_deployed, is_active', [
+ (groups_with_first_master, True, True),
+ (groups_with_first_master, False, False),
+ (groups_not_a_master, True, False),
+ (groups_not_a_master, True, False),
+])
+def test_is_active(groups, logging_deployed, is_active):
+ task_vars = dict(
+ ansible_host='this-host',
+ groups=groups,
+ openshift_hosted_logging_deploy=logging_deployed,
+ )
+
+ assert LoggingCheck(None, task_vars).is_active() == is_active
+
+
+@pytest.mark.parametrize('pod_output, expect_pods', [
+ (
+ json.dumps({'items': [plain_es_pod]}),
+ [plain_es_pod],
+ ),
+])
+def test_get_pods_for_component(pod_output, expect_pods):
+ check = canned_loggingcheck(lambda *_: pod_output)
+ pods = check.get_pods_for_component("es")
+ assert pods == expect_pods
+
+
+@pytest.mark.parametrize('exec_oc_output, expect_error', [
+ (
+ 'No resources found.',
+ MissingComponentPods,
+ ),
+ (
+ '{"items": null}',
+ MissingComponentPods,
+ ),
+])
+def test_get_pods_for_component_fail(exec_oc_output, expect_error):
+ check = canned_loggingcheck(lambda *_: exec_oc_output)
+ with pytest.raises(expect_error):
+ check.get_pods_for_component("es")
+
+
+@pytest.mark.parametrize('name, pods, expected_pods', [
+ (
+ 'test single pod found, scheduled, but no containerStatuses field',
+ [plain_kibana_pod_no_containerstatus],
+ [plain_kibana_pod_no_containerstatus],
+ ),
+ (
+ 'set of pods has at least one pod with containerStatuses (scheduled); should still fail',
+ [plain_kibana_pod_no_containerstatus, plain_kibana_pod],
+ [plain_kibana_pod_no_containerstatus],
+ ),
+
+], ids=lambda argvals: argvals[0])
+def test_get_not_running_pods_no_container_status(name, pods, expected_pods):
+ check = canned_loggingcheck(lambda *_: '')
+ result = check.not_running_pods(pods)
+
+ assert result == expected_pods
diff --git a/roles/openshift_health_checker/test/logging_index_time_test.py b/roles/openshift_health_checker/test/logging_index_time_test.py
new file mode 100644
index 000000000..c48ade9b8
--- /dev/null
+++ b/roles/openshift_health_checker/test/logging_index_time_test.py
@@ -0,0 +1,170 @@
+import json
+
+import pytest
+
+from openshift_checks.logging.logging_index_time import LoggingIndexTime, OpenShiftCheckException
+
+
+SAMPLE_UUID = "unique-test-uuid"
+
+
+def canned_loggingindextime(exec_oc=None):
+ """Create a check object with a canned exec_oc method"""
+ check = LoggingIndexTime() # fails if a module is actually invoked
+ if exec_oc:
+ check.exec_oc = exec_oc
+ return check
+
+
+plain_running_elasticsearch_pod = {
+ "metadata": {
+ "labels": {"component": "es", "deploymentconfig": "logging-es-data-master"},
+ "name": "logging-es-data-master-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": True}],
+ "phase": "Running",
+ }
+}
+plain_running_kibana_pod = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-1",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": True}],
+ "phase": "Running",
+ }
+}
+not_running_kibana_pod = {
+ "metadata": {
+ "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"},
+ "name": "logging-kibana-2",
+ },
+ "status": {
+ "containerStatuses": [{"ready": True}, {"ready": False}],
+ "conditions": [{"status": "True", "type": "Ready"}],
+ "phase": "pending",
+ }
+}
+
+
+@pytest.mark.parametrize('pods, expect_pods', [
+ (
+ [not_running_kibana_pod],
+ [],
+ ),
+ (
+ [plain_running_kibana_pod],
+ [plain_running_kibana_pod],
+ ),
+ (
+ [],
+ [],
+ )
+])
+def test_check_running_pods(pods, expect_pods):
+ check = canned_loggingindextime()
+ pods = check.running_pods(pods)
+ assert pods == expect_pods
+
+
+def test_bad_config_param():
+ with pytest.raises(OpenShiftCheckException) as error:
+ LoggingIndexTime(task_vars=dict(openshift_check_logging_index_timeout_seconds="foo")).run()
+ assert 'InvalidTimeout' == error.value.name
+
+
+def test_no_running_pods():
+ check = LoggingIndexTime()
+ check.get_pods_for_component = lambda *_: [not_running_kibana_pod]
+ with pytest.raises(OpenShiftCheckException) as error:
+ check.run()
+ assert 'kibanaNoRunningPods' == error.value.name
+
+
+def test_with_running_pods():
+ check = LoggingIndexTime()
+ check.get_pods_for_component = lambda *_: [plain_running_kibana_pod, plain_running_elasticsearch_pod]
+ check.curl_kibana_with_uuid = lambda *_: SAMPLE_UUID
+ check.wait_until_cmd_or_err = lambda *_: None
+ assert not check.run().get("failed")
+
+
+@pytest.mark.parametrize('name, json_response, uuid, timeout', [
+ (
+ 'valid count in response',
+ {
+ "count": 1,
+ },
+ SAMPLE_UUID,
+ 0.001,
+ ),
+], ids=lambda argval: argval[0])
+def test_wait_until_cmd_or_err_succeeds(name, json_response, uuid, timeout):
+ check = canned_loggingindextime(lambda *args, **_: json.dumps(json_response))
+ check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, uuid, timeout)
+
+
+@pytest.mark.parametrize('name, json_response, timeout, expect_error', [
+ (
+ 'invalid json response',
+ {
+ "invalid_field": 1,
+ },
+ 0.001,
+ 'esInvalidResponse',
+ ),
+ (
+ 'empty response',
+ {},
+ 0.001,
+ 'esInvalidResponse',
+ ),
+ (
+ 'valid response but invalid match count',
+ {
+ "count": 0,
+ },
+ 0.005,
+ 'NoMatchFound',
+ )
+], ids=lambda argval: argval[0])
+def test_wait_until_cmd_or_err(name, json_response, timeout, expect_error):
+ check = canned_loggingindextime(lambda *args, **_: json.dumps(json_response))
+ with pytest.raises(OpenShiftCheckException) as error:
+ check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, SAMPLE_UUID, timeout)
+
+ assert expect_error == error.value.name
+
+
+def test_curl_kibana_with_uuid():
+ check = canned_loggingindextime(lambda *args, **_: json.dumps({"statusCode": 404}))
+ check.generate_uuid = lambda: SAMPLE_UUID
+ assert SAMPLE_UUID == check.curl_kibana_with_uuid(plain_running_kibana_pod)
+
+
+@pytest.mark.parametrize('name, json_response, expect_error', [
+ (
+ 'invalid json response',
+ {
+ "invalid_field": "invalid",
+ },
+ 'kibanaInvalidResponse',
+ ),
+ (
+ 'wrong error code in response',
+ {
+ "statusCode": 500,
+ },
+ 'kibanaInvalidReturnCode',
+ ),
+], ids=lambda argval: argval[0])
+def test_failed_curl_kibana_with_uuid(name, json_response, expect_error):
+ check = canned_loggingindextime(lambda *args, **_: json.dumps(json_response))
+ check.generate_uuid = lambda: SAMPLE_UUID
+
+ with pytest.raises(OpenShiftCheckException) as error:
+ check.curl_kibana_with_uuid(plain_running_kibana_pod)
+
+ assert expect_error == error.value.name
diff --git a/roles/openshift_health_checker/test/memory_availability_test.py b/roles/openshift_health_checker/test/memory_availability_test.py
index e161a5b9e..5ec83dd79 100644
--- a/roles/openshift_health_checker/test/memory_availability_test.py
+++ b/roles/openshift_health_checker/test/memory_availability_test.py
@@ -4,11 +4,11 @@ from openshift_checks.memory_availability import MemoryAvailability
@pytest.mark.parametrize('group_names,is_active', [
- (['masters'], True),
- (['nodes'], True),
- (['etcd'], True),
- (['masters', 'nodes'], True),
- (['masters', 'etcd'], True),
+ (['oo_masters_to_config'], True),
+ (['oo_nodes_to_config'], True),
+ (['oo_etcd_to_config'], True),
+ (['oo_masters_to_config', 'oo_nodes_to_config'], True),
+ (['oo_masters_to_config', 'oo_etcd_to_config'], True),
([], False),
(['lb'], False),
(['nfs'], False),
@@ -17,72 +17,108 @@ def test_is_active(group_names, is_active):
task_vars = dict(
group_names=group_names,
)
- assert MemoryAvailability.is_active(task_vars=task_vars) == is_active
+ assert MemoryAvailability(None, task_vars).is_active() == is_active
-@pytest.mark.parametrize('group_names,ansible_memtotal_mb', [
+@pytest.mark.parametrize('group_names,configured_min,ansible_memtotal_mb', [
(
- ['masters'],
+ ['oo_masters_to_config'],
+ 0,
17200,
),
(
- ['nodes'],
+ ['oo_nodes_to_config'],
+ 0,
8200,
),
(
- ['etcd'],
- 22200,
+ ['oo_nodes_to_config'],
+ 1, # configure lower threshold
+ 2000, # too low for recommended but not for configured
+ ),
+ (
+ ['oo_nodes_to_config'],
+ 2, # configure threshold where adjustment pushes it over
+ 1900,
+ ),
+ (
+ ['oo_etcd_to_config'],
+ 0,
+ 8200,
),
(
- ['masters', 'nodes'],
+ ['oo_masters_to_config', 'oo_nodes_to_config'],
+ 0,
17000,
),
])
-def test_succeeds_with_recommended_memory(group_names, ansible_memtotal_mb):
+def test_succeeds_with_recommended_memory(group_names, configured_min, ansible_memtotal_mb):
task_vars = dict(
group_names=group_names,
+ openshift_check_min_host_memory_gb=configured_min,
ansible_memtotal_mb=ansible_memtotal_mb,
)
- check = MemoryAvailability(execute_module=fake_execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ result = MemoryAvailability(fake_execute_module, task_vars).run()
assert not result.get('failed', False)
-@pytest.mark.parametrize('group_names,ansible_memtotal_mb,extra_words', [
+@pytest.mark.parametrize('group_names,configured_min,ansible_memtotal_mb,extra_words', [
(
- ['masters'],
+ ['oo_masters_to_config'],
+ 0,
0,
- ['0.0 GB'],
+ ['0.0 GiB'],
),
(
- ['nodes'],
+ ['oo_nodes_to_config'],
+ 0,
100,
- ['0.1 GB'],
+ ['0.1 GiB'],
+ ),
+ (
+ ['oo_nodes_to_config'],
+ 24, # configure higher threshold
+ 20 * 1024, # enough to meet recommended but not configured
+ ['20.0 GiB'],
+ ),
+ (
+ ['oo_nodes_to_config'],
+ 24, # configure higher threshold
+ 22 * 1024, # not enough for adjustment to push over threshold
+ ['22.0 GiB'],
),
(
- ['etcd'],
- -1,
- ['0.0 GB'],
+ ['oo_etcd_to_config'],
+ 0,
+ 6 * 1024,
+ ['6.0 GiB'],
),
(
- ['nodes', 'masters'],
+ ['oo_etcd_to_config', 'oo_masters_to_config'],
+ 0,
+ 9 * 1024, # enough memory for etcd, not enough for a master
+ ['9.0 GiB'],
+ ),
+ (
+ ['oo_nodes_to_config', 'oo_masters_to_config'],
+ 0,
# enough memory for a node, not enough for a master
- 11000,
- ['11.0 GB'],
+ 11 * 1024,
+ ['11.0 GiB'],
),
])
-def test_fails_with_insufficient_memory(group_names, ansible_memtotal_mb, extra_words):
+def test_fails_with_insufficient_memory(group_names, configured_min, ansible_memtotal_mb, extra_words):
task_vars = dict(
group_names=group_names,
+ openshift_check_min_host_memory_gb=configured_min,
ansible_memtotal_mb=ansible_memtotal_mb,
)
- check = MemoryAvailability(execute_module=fake_execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
+ result = MemoryAvailability(fake_execute_module, task_vars).run()
- assert result['failed']
+ assert result.get('failed', False)
for word in 'below recommended'.split() + extra_words:
assert word in result['msg']
diff --git a/roles/openshift_health_checker/test/mixins_test.py b/roles/openshift_health_checker/test/mixins_test.py
index 2d83e207d..b1a41ca3c 100644
--- a/roles/openshift_health_checker/test/mixins_test.py
+++ b/roles/openshift_health_checker/test/mixins_test.py
@@ -14,10 +14,10 @@ class NotContainerizedCheck(NotContainerizedMixin, OpenShiftCheck):
(dict(openshift=dict(common=dict(is_containerized=True))), False),
])
def test_is_active(task_vars, expected):
- assert NotContainerizedCheck.is_active(task_vars) == expected
+ assert NotContainerizedCheck(None, task_vars).is_active() == expected
def test_is_active_missing_task_vars():
with pytest.raises(OpenShiftCheckException) as excinfo:
- NotContainerizedCheck.is_active(task_vars={})
+ NotContainerizedCheck().is_active()
assert 'is_containerized' in str(excinfo.value)
diff --git a/roles/openshift_health_checker/test/openshift_check_test.py b/roles/openshift_health_checker/test/openshift_check_test.py
index e3153979c..bc0c3b26c 100644
--- a/roles/openshift_health_checker/test/openshift_check_test.py
+++ b/roles/openshift_health_checker/test/openshift_check_test.py
@@ -1,7 +1,7 @@
import pytest
from openshift_checks import OpenShiftCheck, OpenShiftCheckException
-from openshift_checks import load_checks, get_var
+from openshift_checks import load_checks
# Fixtures
@@ -28,34 +28,23 @@ def test_OpenShiftCheck_init():
name = "test_check"
run = NotImplemented
- # initialization requires at least one argument (apart from self)
- with pytest.raises(TypeError) as excinfo:
- TestCheck()
+ # execute_module required at init if it will be used
+ with pytest.raises(RuntimeError) as excinfo:
+ TestCheck().execute_module("foo")
assert 'execute_module' in str(excinfo.value)
- assert 'module_executor' in str(excinfo.value)
execute_module = object()
# initialize with positional argument
check = TestCheck(execute_module)
- # new recommended name
- assert check.execute_module == execute_module
- # deprecated attribute name
- assert check.module_executor == execute_module
+ assert check._execute_module == execute_module
- # initialize with keyword argument, recommended name
+ # initialize with keyword argument
check = TestCheck(execute_module=execute_module)
- # new recommended name
- assert check.execute_module == execute_module
- # deprecated attribute name
- assert check.module_executor == execute_module
+ assert check._execute_module == execute_module
- # initialize with keyword argument, deprecated name
- check = TestCheck(module_executor=execute_module)
- # new recommended name
- assert check.execute_module == execute_module
- # deprecated attribute name
- assert check.module_executor == execute_module
+ assert check.task_vars == {}
+ assert check.tmp is None
def test_subclasses():
@@ -81,19 +70,76 @@ def test_load_checks():
assert modules
+def dummy_check(task_vars):
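+    # Minimal concrete OpenShiftCheck so instance helpers like get_var can be tested.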
+ class TestCheck(OpenShiftCheck):
+ name = "dummy"
+ run = NotImplemented
+
+ return TestCheck(task_vars=task_vars)
+
+
@pytest.mark.parametrize("keys,expected", [
(("foo",), 42),
(("bar", "baz"), "openshift"),
+ (("bar.baz",), "openshift"),
])
def test_get_var_ok(task_vars, keys, expected):
- assert get_var(task_vars, *keys) == expected
+ assert dummy_check(task_vars).get_var(*keys) == expected
def test_get_var_error(task_vars, missing_keys):
with pytest.raises(OpenShiftCheckException):
- get_var(task_vars, *missing_keys)
+ dummy_check(task_vars).get_var(*missing_keys)
def test_get_var_default(task_vars, missing_keys):
default = object()
- assert get_var(task_vars, *missing_keys, default=default) == default
+ assert dummy_check(task_vars).get_var(*missing_keys, default=default) == default
+
+
+@pytest.mark.parametrize("keys, convert, expected", [
+ (("foo",), str, "42"),
+ (("foo",), float, 42.0),
+ (("bar", "baz"), bool, False),
+])
+def test_get_var_convert(task_vars, keys, convert, expected):
+ assert dummy_check(task_vars).get_var(*keys, convert=convert) == expected
+
+
+def convert_oscexc(_):
+ raise OpenShiftCheckException("known failure")
+
+
+def convert_exc(_):
+ raise Exception("failure unknown")
+
+
+@pytest.mark.parametrize("keys, convert, expect_text", [
+ (("bar", "baz"), int, "Cannot convert"),
+ (("bar.baz",), float, "Cannot convert"),
+ (("foo",), "bogus", "TypeError"),
+ (("foo",), lambda a, b: 1, "TypeError"),
+ (("foo",), lambda a: 1 / 0, "ZeroDivisionError"),
+ (("foo",), convert_oscexc, "known failure"),
+ (("foo",), convert_exc, "failure unknown"),
+])
+def test_get_var_convert_error(task_vars, keys, convert, expect_text):
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ dummy_check(task_vars).get_var(*keys, convert=convert)
+ assert expect_text in str(excinfo.value)
+
+
+def test_register(task_vars):
+ check = dummy_check(task_vars)
+
+ check.register_failure(OpenShiftCheckException("spam"))
+ assert "spam" in str(check.failures[0])
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.register_file("spam") # no file contents specified
+ assert "not specified" in str(excinfo.value)
+
+ # normally execute_module registers the result file; test disabling that
+ check._execute_module = lambda *args, **_: dict()
+ check.execute_module("eggs", module_args={}, register=False)
+ assert not check.files_to_save
diff --git a/roles/openshift_health_checker/test/ovs_version_test.py b/roles/openshift_health_checker/test/ovs_version_test.py
new file mode 100644
index 000000000..5a82a43bf
--- /dev/null
+++ b/roles/openshift_health_checker/test/ovs_version_test.py
@@ -0,0 +1,86 @@
+import pytest
+
+from openshift_checks.ovs_version import OvsVersion, OpenShiftCheckException
+
+
+def test_openshift_version_not_supported():
+ def execute_module(*_):
+ return {}
+
+ openshift_release = '111.7.0'
+
+ task_vars = dict(
+ openshift=dict(common=dict(service_type='origin')),
+ openshift_release=openshift_release,
+ openshift_image_tag='v' + openshift_release,
+ openshift_deployment_type='origin',
+ )
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ OvsVersion(execute_module, task_vars).run()
+
+ assert "no recommended version of Open vSwitch" in str(excinfo.value)
+
+
+def test_invalid_openshift_release_format():
+ def execute_module(*_):
+ return {}
+
+ task_vars = dict(
+ openshift=dict(common=dict(service_type='origin')),
+ openshift_image_tag='v0',
+ openshift_deployment_type='origin',
+ )
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ OvsVersion(execute_module, task_vars).run()
+ assert "invalid version" in str(excinfo.value)
+
+
+@pytest.mark.parametrize('openshift_release,expected_ovs_version', [
+ ("3.5", ["2.6", "2.7"]),
+ ("3.6", ["2.6", "2.7"]),
+ ("3.4", "2.4"),
+ ("3.3", "2.4"),
+ ("1.0", "2.4"),
+])
+def test_ovs_package_version(openshift_release, expected_ovs_version):
+ task_vars = dict(
+ openshift=dict(common=dict(service_type='origin')),
+ openshift_release=openshift_release,
+ openshift_image_tag='v' + openshift_release,
+ )
+ return_value = {} # note: check.execute_module modifies return hash contents
+
+ def execute_module(module_name=None, module_args=None, *_):
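+        # The check should hand rpm_version a package_list pinning openvswitch to the
+        # version(s) recommended for this OpenShift release.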
+ assert module_name == 'rpm_version'
+ assert "package_list" in module_args
+
+ for pkg in module_args["package_list"]:
+ if pkg["name"] == "openvswitch":
+ assert pkg["version"] == expected_ovs_version
+
+ return return_value
+
+ result = OvsVersion(execute_module, task_vars).run()
+ assert result is return_value
+
+
+@pytest.mark.parametrize('group_names,is_containerized,is_active', [
+ (['oo_masters_to_config'], False, True),
+ # ensure check is skipped on containerized installs
+ (['oo_masters_to_config'], True, False),
+ (['oo_nodes_to_config'], False, True),
+ (['oo_masters_to_config', 'oo_nodes_to_config'], False, True),
+ (['oo_masters_to_config', 'oo_etcd_to_config'], False, True),
+ ([], False, False),
+ (['oo_etcd_to_config'], False, False),
+ (['lb'], False, False),
+ (['nfs'], False, False),
+])
+def test_ovs_version_skip_when_not_master_nor_node(group_names, is_containerized, is_active):
+ task_vars = dict(
+ group_names=group_names,
+ openshift=dict(common=dict(is_containerized=is_containerized)),
+ )
+ assert OvsVersion(None, task_vars).is_active() == is_active
diff --git a/roles/openshift_health_checker/test/package_availability_test.py b/roles/openshift_health_checker/test/package_availability_test.py
index f7e916a46..9815acb38 100644
--- a/roles/openshift_health_checker/test/package_availability_test.py
+++ b/roles/openshift_health_checker/test/package_availability_test.py
@@ -14,7 +14,7 @@ def test_is_active(pkg_mgr, is_containerized, is_active):
ansible_pkg_mgr=pkg_mgr,
openshift=dict(common=dict(is_containerized=is_containerized)),
)
- assert PackageAvailability.is_active(task_vars=task_vars) == is_active
+ assert PackageAvailability(None, task_vars).is_active() == is_active
@pytest.mark.parametrize('task_vars,must_have_packages,must_not_have_packages', [
@@ -26,7 +26,7 @@ def test_is_active(pkg_mgr, is_containerized, is_active):
(
dict(
openshift=dict(common=dict(service_type='origin')),
- group_names=['masters'],
+ group_names=['oo_masters_to_config'],
),
set(['origin-master']),
set(['origin-node']),
@@ -34,7 +34,7 @@ def test_is_active(pkg_mgr, is_containerized, is_active):
(
dict(
openshift=dict(common=dict(service_type='atomic-openshift')),
- group_names=['nodes'],
+ group_names=['oo_nodes_to_config'],
),
set(['atomic-openshift-node']),
set(['atomic-openshift-master']),
@@ -42,22 +42,21 @@ def test_is_active(pkg_mgr, is_containerized, is_active):
(
dict(
openshift=dict(common=dict(service_type='atomic-openshift')),
- group_names=['masters', 'nodes'],
+ group_names=['oo_masters_to_config', 'oo_nodes_to_config'],
),
set(['atomic-openshift-master', 'atomic-openshift-node']),
set(),
),
])
def test_package_availability(task_vars, must_have_packages, must_not_have_packages):
- return_value = object()
+ return_value = {}
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(module_name=None, module_args=None, *_):
assert module_name == 'check_yum_update'
assert 'packages' in module_args
assert set(module_args['packages']).issuperset(must_have_packages)
assert not set(module_args['packages']).intersection(must_not_have_packages)
- return return_value
+ return {'foo': return_value}
- check = PackageAvailability(execute_module=execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
- assert result is return_value
+ result = PackageAvailability(execute_module, task_vars).run()
+ assert result['foo'] is return_value
diff --git a/roles/openshift_health_checker/test/package_update_test.py b/roles/openshift_health_checker/test/package_update_test.py
index 5e000cff5..85d3c9cab 100644
--- a/roles/openshift_health_checker/test/package_update_test.py
+++ b/roles/openshift_health_checker/test/package_update_test.py
@@ -2,15 +2,14 @@ from openshift_checks.package_update import PackageUpdate
def test_package_update():
- return_value = object()
+ return_value = {}
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ def execute_module(module_name=None, module_args=None, *_):
assert module_name == 'check_yum_update'
assert 'packages' in module_args
# empty list of packages means "generic check if 'yum update' will work"
assert module_args['packages'] == []
- return return_value
+ return {'foo': return_value}
- check = PackageUpdate(execute_module=execute_module)
- result = check.run(tmp=None, task_vars=None)
- assert result is return_value
+ result = PackageUpdate(execute_module).run()
+ assert result['foo'] is return_value
diff --git a/roles/openshift_health_checker/test/package_version_test.py b/roles/openshift_health_checker/test/package_version_test.py
index 196d9816a..3cf4ce033 100644
--- a/roles/openshift_health_checker/test/package_version_test.py
+++ b/roles/openshift_health_checker/test/package_version_test.py
@@ -1,40 +1,110 @@
import pytest
-from openshift_checks.package_version import PackageVersion
+from openshift_checks.package_version import PackageVersion, OpenShiftCheckException
-def test_package_version():
+def task_vars_for(openshift_release, deployment_type):
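+    # Minimal task_vars for exercising PackageVersion with a given release and deployment type.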
+ return dict(
+ ansible_pkg_mgr='yum',
+ openshift=dict(common=dict(service_type=deployment_type)),
+ openshift_release=openshift_release,
+ openshift_image_tag='v' + openshift_release,
+ openshift_deployment_type=deployment_type,
+ )
+
+
+def test_openshift_version_not_supported():
+ check = PackageVersion(None, task_vars_for("1.2.3", 'origin'))
+ check.get_openshift_version_tuple = lambda: (3, 4, 1) # won't be in the dict
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.get_required_ovs_version()
+ assert "no recommended version of Open vSwitch" in str(excinfo.value)
+
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.get_required_docker_version()
+ assert "no recommended version of Docker" in str(excinfo.value)
+
+
+def test_invalid_openshift_release_format():
task_vars = dict(
+ ansible_pkg_mgr='yum',
openshift=dict(common=dict(service_type='origin')),
- openshift_release='3.5',
+ openshift_image_tag='v0',
openshift_deployment_type='origin',
)
- return_value = object()
- def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None):
+ check = PackageVersion(lambda *_: {}, task_vars)
+ with pytest.raises(OpenShiftCheckException) as excinfo:
+ check.run()
+ assert "invalid version" in str(excinfo.value)
+
+
+@pytest.mark.parametrize('openshift_release', [
+ "111.7.0",
+ "3.7",
+ "3.6",
+ "3.5.1.2.3",
+ "3.5",
+ "3.4",
+ "3.3",
+ "2.1.0",
+])
+def test_package_version(openshift_release):
+
+ return_value = {"foo": object()}
+
+ def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *_):
assert module_name == 'aos_version'
- assert 'requested_openshift_release' in module_args
- assert 'openshift_deployment_type' in module_args
- assert 'rpm_prefix' in module_args
- assert module_args['requested_openshift_release'] == task_vars['openshift_release']
- assert module_args['openshift_deployment_type'] == task_vars['openshift_deployment_type']
- assert module_args['rpm_prefix'] == task_vars['openshift']['common']['service_type']
+ assert "package_list" in module_args
+
+ for pkg in module_args["package_list"]:
+ if "-master" in pkg["name"] or "-node" in pkg["name"]:
+ assert pkg["version"] == task_vars["openshift_release"]
+
+ return return_value
+
+ check = PackageVersion(execute_module, task_vars_for(openshift_release, 'origin'))
+ result = check.run()
+ assert result == return_value
+
+
+@pytest.mark.parametrize('deployment_type,openshift_release,expected_docker_version', [
+ ("origin", "3.5", "1.12"),
+ ("origin", "1.3", "1.10"),
+ ("origin", "1.1", "1.8"),
+ ("openshift-enterprise", "3.4", "1.12"),
+ ("openshift-enterprise", "3.2", "1.10"),
+ ("openshift-enterprise", "3.1", "1.8"),
+])
+def test_docker_package_version(deployment_type, openshift_release, expected_docker_version):
+
+ return_value = {"foo": object()}
+
+ def execute_module(module_name=None, module_args=None, *_):
+ assert module_name == 'aos_version'
+ assert "package_list" in module_args
+
+ for pkg in module_args["package_list"]:
+ if pkg["name"] == "docker":
+ assert pkg["version"] == expected_docker_version
+
return return_value
- check = PackageVersion(execute_module=execute_module)
- result = check.run(tmp=None, task_vars=task_vars)
- assert result is return_value
+ check = PackageVersion(execute_module, task_vars_for(openshift_release, deployment_type))
+ result = check.run()
+ assert result == return_value
@pytest.mark.parametrize('group_names,is_containerized,is_active', [
- (['masters'], False, True),
+ (['oo_masters_to_config'], False, True),
# ensure check is skipped on containerized installs
- (['masters'], True, False),
- (['nodes'], False, True),
- (['masters', 'nodes'], False, True),
- (['masters', 'etcd'], False, True),
+ (['oo_masters_to_config'], True, False),
+ (['oo_nodes_to_config'], False, True),
+ (['oo_masters_to_config', 'oo_nodes_to_config'], False, True),
+ (['oo_masters_to_config', 'oo_etcd_to_config'], False, True),
([], False, False),
- (['etcd'], False, False),
+ (['oo_etcd_to_config'], False, False),
(['lb'], False, False),
(['nfs'], False, False),
])
@@ -43,4 +113,4 @@ def test_package_version_skip_when_not_master_nor_node(group_names, is_container
group_names=group_names,
openshift=dict(common=dict(is_containerized=is_containerized)),
)
- assert PackageVersion.is_active(task_vars=task_vars) == is_active
+ assert PackageVersion(None, task_vars).is_active() == is_active
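
The test_docker_package_version parameters above pin an expected Docker version to each openshift_release. As a rough illustration only (a hypothetical helper inferred from those parameters, not PackageVersion's actual lookup), the implied mapping is something like:

# Hypothetical sketch -- the mapping the test parameters imply, not the check's real code.
DOCKER_FOR_RELEASE = {
    (1, 1): "1.8",
    (1, 3): "1.10",
    (3, 1): "1.8",
    (3, 2): "1.10",
    (3, 4): "1.12",
    (3, 5): "1.12",
}


def required_docker_version(openshift_release):
    """Map an openshift_release string such as '3.5' to its expected Docker version."""
    release = tuple(int(part) for part in openshift_release.split(".")[:2])
    try:
        return DOCKER_FOR_RELEASE[release]
    except KeyError:
        # an unknown release raises, mirroring test_openshift_version_not_supported above
        raise ValueError("no recommended version of Docker for release %s" % openshift_release)


# e.g. required_docker_version("3.5") == "1.12"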
diff --git a/roles/openshift_health_checker/test/rpm_version_test.py b/roles/openshift_health_checker/test/rpm_version_test.py
new file mode 100644
index 000000000..2c1bcf876
--- /dev/null
+++ b/roles/openshift_health_checker/test/rpm_version_test.py
@@ -0,0 +1,82 @@
+import pytest
+import rpm_version
+
+expected_pkgs = {
+ "spam": {
+ "name": "spam",
+ "version": "3.2.1",
+ },
+ "eggs": {
+ "name": "eggs",
+ "version": "3.2.1",
+ },
+}
+
+
+@pytest.mark.parametrize('pkgs, expect_not_found', [
+ (
+ {},
+ ["spam", "eggs"], # none found
+ ),
+ (
+ {"spam": ["3.2.1", "4.5.1"]},
+ ["eggs"], # completely missing
+ ),
+ (
+ {
+ "spam": ["3.2.1", "4.5.1"],
+ "eggs": ["3.2.1"],
+ },
+ [], # all found
+ ),
+])
+def test_check_pkg_found(pkgs, expect_not_found):
+ if expect_not_found:
+ with pytest.raises(rpm_version.RpmVersionException) as e:
+ rpm_version._check_pkg_versions(pkgs, expected_pkgs)
+
+ assert "not found to be installed" in str(e.value)
+ assert set(expect_not_found) == set(e.value.problem_pkgs)
+ else:
+ rpm_version._check_pkg_versions(pkgs, expected_pkgs)
+
+
+@pytest.mark.parametrize('pkgs, expect_not_found', [
+ (
+ {
+ 'spam': ['3.2.1'],
+ 'eggs': ['3.3.2'],
+ },
+ {
+ "eggs": {
+ "required_versions": ["3.2"],
+ "found_versions": ["3.3"],
+ }
+ }, # not the right version
+ ),
+ (
+ {
+ 'spam': ['3.1.2', "3.3.2"],
+ 'eggs': ['3.3.2', "1.2.3"],
+ },
+ {
+ "eggs": {
+ "required_versions": ["3.2"],
+ "found_versions": ["3.3", "1.2"],
+ },
+ "spam": {
+ "required_versions": ["3.2"],
+ "found_versions": ["3.1", "3.3"],
+ }
+ }, # not the right version
+ ),
+])
+def test_check_pkg_version_found(pkgs, expect_not_found):
+ if expect_not_found:
+ with pytest.raises(rpm_version.RpmVersionException) as e:
+ rpm_version._check_pkg_versions(pkgs, expected_pkgs)
+
+ assert "found to be installed with an incorrect version" in str(e.value)
+ assert expect_not_found == e.value.problem_pkgs
+ else:
+ rpm_version._check_pkg_versions(pkgs, expected_pkgs)
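
The two parametrized tests above pin down the contract of rpm_version._check_pkg_versions: a package missing entirely raises with the missing names in problem_pkgs, while a package whose installed major.minor never matches the expected one raises with a required/found breakdown. A minimal sketch consistent with those assertions (an assumption for orientation, not the real rpm_version module):

# Sketch assumed from the tests above, not the actual library module.
class RpmVersionException(Exception):
    def __init__(self, message, problem_pkgs=None):
        super(RpmVersionException, self).__init__(message)
        self.problem_pkgs = problem_pkgs


def _check_pkg_versions(found_pkgs, expected_pkgs):
    # found_pkgs: name -> list of installed version strings
    # expected_pkgs: name -> {"name": ..., "version": ...}
    missing = [name for name in expected_pkgs if name not in found_pkgs]
    if missing:
        raise RpmVersionException("packages not found to be installed", missing)

    wrong_version = {}
    for name, expected in expected_pkgs.items():
        required = ".".join(expected["version"].split(".")[:2])
        found = [".".join(version.split(".")[:2]) for version in found_pkgs[name]]
        if required not in found:
            wrong_version[name] = {"required_versions": [required], "found_versions": found}
    if wrong_version:
        raise RpmVersionException(
            "packages found to be installed with an incorrect version", wrong_version)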
diff --git a/roles/openshift_health_checker/test/search_journalctl_test.py b/roles/openshift_health_checker/test/search_journalctl_test.py
new file mode 100644
index 000000000..724928aa1
--- /dev/null
+++ b/roles/openshift_health_checker/test/search_journalctl_test.py
@@ -0,0 +1,157 @@
+import pytest
+import search_journalctl
+
+
+def canned_search_journalctl(get_log_output=None):
+ """Create a search_journalctl object with canned get_log_output method"""
+ module = search_journalctl
+ if get_log_output:
+ module.get_log_output = get_log_output
+ return module
+
+
+DEFAULT_TIMESTAMP = 1496341364
+
+
+def get_timestamp(modifier=0):
+ return DEFAULT_TIMESTAMP + modifier
+
+
+def get_timestamp_microseconds(modifier=0):
+ return get_timestamp(modifier) * 1000000
+
+
+def create_test_log_object(stamp, msg):
+ return '{{"__REALTIME_TIMESTAMP": "{}", "MESSAGE": "{}"}}'.format(stamp, msg)
+
+
+@pytest.mark.parametrize('name,matchers,log_input,expected_matches,expected_errors', [
+ (
+ 'test with valid params',
+ [
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"test log message",
+ "unit": "test",
+ },
+ ],
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+ ],
+ ["test log message"],
+ [],
+ ),
+ (
+ 'test with invalid json in log input',
+ [
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"test log message",
+ "unit": "test-unit",
+ },
+ ],
+ [
+ '{__REALTIME_TIMESTAMP: ' + str(get_timestamp_microseconds()) + ', "MESSAGE": "test log message"}',
+ ],
+ [],
+ [
+ ["invalid json", "test-unit", "test log message"],
+ ],
+ ),
+ (
+ 'test with invalid regexp',
+ [
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"test [ log message",
+ "unit": "test",
+ },
+ ],
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+ ],
+ [],
+ [
+ ["invalid regular expression"],
+ ],
+ ),
+], ids=lambda argval: argval[0])
+def test_get_log_matches(name, matchers, log_input, expected_matches, expected_errors):
+ def get_log_output(matcher):
+ return log_input
+
+ module = canned_search_journalctl(get_log_output)
+ matched_regexp, errors = module.get_log_matches(matchers, 500, 60 * 60)
+
+ assert set(matched_regexp) == set(expected_matches)
+ assert len(expected_errors) == len(errors)
+
+ for idx, partial_err_set in enumerate(expected_errors):
+ for partial_err_msg in partial_err_set:
+ assert partial_err_msg in errors[idx]
+
+
+@pytest.mark.parametrize('name,matcher,log_count_lim,stamp_lim_seconds,log_input,expected_match', [
+ (
+ 'test with matching log message, but out of bounds of log_count_lim',
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"dummy log message",
+ "unit": "test",
+ },
+ 3,
+ get_timestamp(-100 * 60 * 60),
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"),
+ ],
+ None,
+ ),
+ (
+ 'test with matching log message, but with timestamp too old',
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"dummy log message",
+ "unit": "test",
+ },
+ 100,
+ get_timestamp(-10),
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"),
+ ],
+ None,
+ ),
+ (
+ 'test with matching log message, and timestamp within time limit',
+ {
+ "start_regexp": r"Sample Logs Beginning",
+ "regexp": r"dummy log message",
+ "unit": "test",
+ },
+ 100,
+ get_timestamp(-1010),
+ [
+ create_test_log_object(get_timestamp_microseconds(), "test log message"),
+ create_test_log_object(get_timestamp_microseconds(), "sample log message"),
+ create_test_log_object(get_timestamp_microseconds(), "fake log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+ create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"),
+ ],
+ create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"),
+ ),
+], ids=lambda argval: argval[0])
+def test_find_matches_skips_logs(name, matcher, log_count_lim, stamp_lim_seconds, log_input, expected_match):
+ match = search_journalctl.find_matches(log_input, matcher, log_count_lim, stamp_lim_seconds)
+ assert match == expected_match
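
The find_matches cases above encode the scanning rules the module is expected to follow: examine the journal entries in order up to log_count_lim, stop at entries older than the timestamp limit or at the unit's start message, and return the first entry whose MESSAGE matches the regexp. A minimal sketch of that behaviour (an assumption drawn from the test data, not the plugin's actual implementation):

import json
import re


def find_matches(log_entries, matcher, log_count_lim, stamp_lim_seconds):
    """Sketch only: return the first log entry matching matcher['regexp'], or None."""
    regexp = re.compile(matcher["regexp"])
    start_regexp = re.compile(matcher["start_regexp"])

    for entry in log_entries[:log_count_lim]:
        obj = json.loads(entry)
        # journal timestamps are microseconds since the epoch
        if int(obj["__REALTIME_TIMESTAMP"]) // 1000000 < stamp_lim_seconds:
            break
        if start_regexp.match(obj["MESSAGE"]):
            break
        if regexp.match(obj["MESSAGE"]):
            return entry
    return None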
diff --git a/roles/openshift_health_checker/test/zz_failure_summary_test.py b/roles/openshift_health_checker/test/zz_failure_summary_test.py
new file mode 100644
index 000000000..69f27653c
--- /dev/null
+++ b/roles/openshift_health_checker/test/zz_failure_summary_test.py
@@ -0,0 +1,85 @@
+from zz_failure_summary import deduplicate_failures
+
+import pytest
+
+
+@pytest.mark.parametrize('failures,deduplicated', [
+ (
+ [
+ {
+ 'host': 'master1',
+ 'msg': 'One or more checks failed',
+ },
+ ],
+ [
+ {
+ 'host': ('master1',),
+ 'msg': 'One or more checks failed',
+ },
+ ],
+ ),
+ (
+ [
+ {
+ 'host': 'master1',
+ 'msg': 'One or more checks failed',
+ },
+ {
+ 'host': 'node1',
+ 'msg': 'One or more checks failed',
+ },
+ ],
+ [
+ {
+ 'host': ('master1', 'node1'),
+ 'msg': 'One or more checks failed',
+ },
+ ],
+ ),
+ (
+ [
+ {
+ 'host': 'node1',
+ 'msg': 'One or more checks failed',
+ 'checks': (('test_check', 'error message'),),
+ },
+ {
+ 'host': 'master2',
+ 'msg': 'Some error happened',
+ },
+ {
+ 'host': 'master1',
+ 'msg': 'One or more checks failed',
+ 'checks': (('test_check', 'error message'),),
+ },
+ ],
+ [
+ {
+ 'host': ('master1', 'node1'),
+ 'msg': 'One or more checks failed',
+ 'checks': (('test_check', 'error message'),),
+ },
+ {
+ 'host': ('master2',),
+ 'msg': 'Some error happened',
+ },
+ ],
+ ),
+    # if a failure contains an unhashable value, it will not be deduplicated
+ (
+ [
+ {
+ 'host': 'master1',
+ 'msg': {'unhashable': 'value'},
+ },
+ ],
+ [
+ {
+ 'host': 'master1',
+ 'msg': {'unhashable': 'value'},
+ },
+ ],
+ ),
+])
+def test_deduplicate_failures(failures, deduplicated):
+ assert deduplicate_failures(failures) == deduplicated
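
Taken together, these cases describe the deduplication contract: failures that differ only in host are merged with their hosts collected into a sorted tuple, order of first appearance is preserved, and anything containing an unhashable value passes through untouched. A minimal sketch consistent with that (an assumption, not the callback plugin's actual code):

def deduplicate_failures(failures):
    """Group identical failures (ignoring 'host') and collect their hosts."""
    groups = {}  # failure signature -> list of hosts
    order = []   # signatures or raw failures, in order of first appearance
    for failure in failures:
        try:
            signature = tuple(sorted(
                (key, value) for key, value in failure.items() if key != 'host'
            ))
            if signature not in groups:
                groups[signature] = []
                order.append(signature)
            groups[signature].append(failure['host'])
        except TypeError:
            # unhashable values cannot be grouped; keep the failure as-is
            order.append(failure)

    result = []
    for item in order:
        if isinstance(item, dict):
            # a failure kept verbatim because it could not be grouped
            result.append(item)
        else:
            merged = dict(item)
            merged['host'] = tuple(sorted(groups[item]))
            result.append(merged)
    return result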