1 files changed, 202 insertions, 81 deletions
diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
index 208e81048..dcaf87eca 100644
--- a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
+++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
@@ -1,21 +1,23 @@
-# vim: expandtab:tabstop=4:shiftwidth=4
-'''
-Ansible callback plugin.
-'''
+"""Ansible callback plugin to print a nicely formatted summary of failures.
 
-from pprint import pformat
+The file / module name is prefixed with `zz_` to make this plugin be loaded last
+by Ansible, thus making its output the last thing that users see.
+"""
+
+from collections import defaultdict
+import traceback
 
 from ansible.plugins.callback import CallbackBase
 from ansible import constants as C
 from ansible.utils.color import stringc
+from ansible.module_utils.six import string_types
+
+
+FAILED_NO_MSG = u'Failed without returning a message.'
 
 
 class CallbackModule(CallbackBase):
-    '''
-    This callback plugin stores task results and summarizes failures.
-    The file name is prefixed with `zz_` to make this plugin be loaded last by
-    Ansible, thus making its output the last thing that users see.
-    '''
+    """This callback plugin stores task results and summarizes failures."""
 
     CALLBACK_VERSION = 2.0
     CALLBACK_TYPE = 'aggregate'
@@ -25,90 +27,209 @@ class CallbackModule(CallbackBase):
     def __init__(self):
         super(CallbackModule, self).__init__()
         self.__failures = []
+        self.__playbook_file = ''
+
+    def v2_playbook_on_start(self, playbook):
+        super(CallbackModule, self).v2_playbook_on_start(playbook)
+        # pylint: disable=protected-access; Ansible gives us no public API to
+        # get the file name of the current playbook from a callback plugin.
+        self.__playbook_file = playbook._file_name
 
     def v2_runner_on_failed(self, result, ignore_errors=False):
         super(CallbackModule, self).v2_runner_on_failed(result, ignore_errors)
-        self.__failures.append(dict(result=result, ignore_errors=ignore_errors))
+        if not ignore_errors:
+            self.__failures.append(result)
 
     def v2_playbook_on_stats(self, stats):
         super(CallbackModule, self).v2_playbook_on_stats(stats)
-        # TODO: update condition to consider a host var or env var to
-        # enable/disable the summary, so that we can control the output from a
-        # play.
-        if self.__failures:
-            self._print_failure_summary()
-
-    def _print_failure_summary(self):
-        '''Print a summary of failed tasks (including ignored failures).'''
-        self._display.display(u'\nFailure summary:\n')
-
-        # TODO: group failures by host or by task. If grouped by host, it is
-        # easy to see all problems of a given host. If grouped by task, it is
-        # easy to see what hosts needs the same fix.
-
-        width = len(str(len(self.__failures)))
-        initial_indent_format = u'  {{:>{width}}}. '.format(width=width)
-        initial_indent_len = len(initial_indent_format.format(0))
-        subsequent_indent = u' ' * initial_indent_len
-        subsequent_extra_indent = u' ' * (initial_indent_len + 10)
-
-        for i, failure in enumerate(self.__failures, 1):
-            entries = _format_failure(failure)
-            self._display.display(u'\n{}{}'.format(initial_indent_format.format(i), entries[0]))
-            for entry in entries[1:]:
-                entry = entry.replace(u'\n', u'\n' + subsequent_extra_indent)
-                indented = u'{}{}'.format(subsequent_indent, entry)
-                self._display.display(indented)
-
-
-# Reason: disable pylint protected-access because we need to access _*
-#         attributes of a task result to implement this method.
-# Status: permanently disabled unless Ansible's API changes.
-# pylint: disable=protected-access
-def _format_failure(failure):
-    '''Return a list of pretty-formatted text entries describing a failure, including
+        # pylint: disable=broad-except; capturing exceptions broadly is
+        # intentional, to isolate arbitrary failures in this callback plugin.
+        try:
+            if self.__failures:
+                self._display.display(failure_summary(self.__failures, self.__playbook_file))
+        except Exception:
+            msg = stringc(
+                u'An error happened while generating a summary of failures:\n'
+                u'{}'.format(traceback.format_exc()), C.COLOR_WARN)
+            self._display.v(msg)
+
+
+def failure_summary(failures, playbook):
+    """Return a summary of failed tasks, including details on health checks."""
+    if not failures:
+        return u''
+
+    # NOTE: because we don't have access to task_vars from callback plugins, we
+    # store the playbook context in the task result when the
+    # openshift_health_check action plugin is used, and we use this context to
+    # customize the error message.
+    # pylint: disable=protected-access; Ansible gives us no sufficient public
+    # API on TaskResult objects.
+    context = next((
+        context for context in
+        (failure._result.get('playbook_context') for failure in failures)
+        if context
+    ), None)
+
+    failures = [failure_to_dict(failure) for failure in failures]
+    failures = deduplicate_failures(failures)
+
+    summary = [u'', u'', u'Failure summary:', u'']
+
+    width = len(str(len(failures)))
+    initial_indent_format = u'  {{:>{width}}}. '.format(width=width)
+    initial_indent_len = len(initial_indent_format.format(0))
+    subsequent_indent = u' ' * initial_indent_len
+    subsequent_extra_indent = u' ' * (initial_indent_len + 10)
+
+    for i, failure in enumerate(failures, 1):
+        entries = format_failure(failure)
+        summary.append(u'\n{}{}'.format(initial_indent_format.format(i), entries[0]))
+        for entry in entries[1:]:
+            entry = entry.replace(u'\n', u'\n' + subsequent_extra_indent)
+            indented = u'{}{}'.format(subsequent_indent, entry)
+            summary.append(indented)
+
+    failed_checks = set()
+    for failure in failures:
+        failed_checks.update(name for name, message in failure['checks'])
+    if failed_checks:
+        summary.append(check_failure_footer(failed_checks, context, playbook))
+
+    return u'\n'.join(summary)
+
+
+def failure_to_dict(failed_task_result):
+    """Extract information out of a failed TaskResult into a dict.
+
+    The intent is to transform a TaskResult object into something easier to
+    manipulate. TaskResult is ansible.executor.task_result.TaskResult.
+    """
+    # pylint: disable=protected-access; Ansible gives us no sufficient public
+    # API on TaskResult objects.
+    _result = failed_task_result._result
+    return {
+        'host': failed_task_result._host.get_name(),
+        'play': play_name(failed_task_result._task),
+        'task': failed_task_result.task_name,
+        'msg': _result.get('msg', FAILED_NO_MSG),
+        'checks': tuple(
+            (name, result.get('msg', FAILED_NO_MSG))
+            for name, result in sorted(_result.get('checks', {}).items())
+            if result.get('failed')
+        ),
+    }
+
+
+def play_name(obj):
+    """Given a task or block, return the name of its parent play.
+
+    This is loosely inspired by ansible.playbook.base.Base.dump_me.
+    """
+    # pylint: disable=protected-access; Ansible gives us no sufficient public
+    # API to implement this.
+    if not obj:
+        return ''
+    if hasattr(obj, '_play'):
+        return obj._play.get_name()
+    return play_name(getattr(obj, '_parent'))
+
+
+def deduplicate_failures(failures):
+    """Group together similar failures from different hosts.
+
+    Returns a new list of failures such that identical failures from different
+    hosts are grouped together in a single entry. The relative order of failures
+    is preserved.
+
+    If failures is unhashable, the original list of failures is returned.
+    """
+    groups = defaultdict(list)
+    for failure in failures:
+        group_key = tuple(sorted((key, value) for key, value in failure.items() if key != 'host'))
+        try:
+            groups[group_key].append(failure)
+        except TypeError:
+            # abort and return original list of failures when failures has an
+            # unhashable type.
+            return failures
+
+    result = []
+    for failure in failures:
+        group_key = tuple(sorted((key, value) for key, value in failure.items() if key != 'host'))
+        if group_key not in groups:
+            continue
+        failure['host'] = tuple(sorted(g_failure['host'] for g_failure in groups.pop(group_key)))
+        result.append(failure)
+    return result
+
+
+def format_failure(failure):
+    """Return a list of pretty-formatted text entries describing a failure, including
     relevant information about it. Expect that the list of text entries will be joined
-    by a newline separator when output to the user.'''
-    result = failure['result']
-    host = result._host.get_name()
-    play = _get_play(result._task)
-    if play:
-        play = play.get_name()
-    task = result._task.get_name()
-    msg = result._result.get('msg', u'???')
+    by a newline separator when output to the user."""
+    if isinstance(failure['host'], string_types):
+        host = failure['host']
+    else:
+        host = u', '.join(failure['host'])
+    play = failure['play']
+    task = failure['task']
+    msg = failure['msg']
+    checks = failure['checks']
     fields = (
-        (u'Host', host),
+        (u'Hosts', host),
         (u'Play', play),
         (u'Task', task),
         (u'Message', stringc(msg, C.COLOR_ERROR)),
     )
-    if 'checks' in result._result:
-        fields += ((u'Details', _format_failed_checks(result._result['checks'])),)
+    if checks:
+        fields += ((u'Details', format_failed_checks(checks)),)
     row_format = '{:10}{}'
     return [row_format.format(header + u':', body) for header, body in fields]
 
 
-def _format_failed_checks(checks):
-    '''Return pretty-formatted text describing checks that failed.'''
-    failed_check_msgs = []
-    for check, body in checks.items():
-        if body.get('failed', False):   # only show the failed checks
-            msg = body.get('msg', u"Failed without returning a message")
-            failed_check_msgs.append('check "%s":\n%s' % (check, msg))
-    if failed_check_msgs:
-        return stringc("\n\n".join(failed_check_msgs), C.COLOR_ERROR)
-    else:    # something failed but no checks will admit to it, so dump everything
-        return stringc(pformat(checks), C.COLOR_ERROR)
-
-
-# Reason: disable pylint protected-access because we need to access _*
-#         attributes of obj to implement this function.
-#         This is inspired by ansible.playbook.base.Base.dump_me.
-# Status: permanently disabled unless Ansible's API changes.
-# pylint: disable=protected-access
-def _get_play(obj):
-    '''Given a task or block, recursively tries to find its parent play.'''
-    if hasattr(obj, '_play'):
-        return obj._play
-    if getattr(obj, '_parent'):
-        return _get_play(obj._parent)
+def format_failed_checks(checks):
+    """Return pretty-formatted text describing checks that failed."""
+    messages = []
+    for name, message in checks:
+        messages.append(u'check "{}":\n{}'.format(name, message))
+    return stringc(u'\n\n'.join(messages), C.COLOR_ERROR)
+
+
+def check_failure_footer(failed_checks, context, playbook):
+    """Return a textual explanation about checks depending on context.
+
+    The purpose of specifying context is to vary the output depending on what
+    the user was expecting to happen (based on which playbook they ran). The
+    only use currently is to vary the message depending on whether the user was
+    deliberately running checks or was trying to install/upgrade and checks are
+    just included. Other use cases may arise.
+    """
+    checks = ','.join(sorted(failed_checks))
+    summary = [u'']
+    if context in ['pre-install', 'health', 'adhoc']:
+        # User was expecting to run checks, less explanation needed.
+        summary.extend([
+            u'You may configure or disable checks by setting Ansible '
+            u'variables. To disable those above, set:',
+            u'    openshift_disable_check={checks}'.format(checks=checks),
+            u'Consult check documentation for configurable variables.',
+        ])
+    else:
+        # User may not be familiar with the checks, explain what checks are in
+        # the first place.
+        summary.extend([
+            u'The execution of "{playbook}" includes checks designed to fail '
+            u'early if the requirements of the playbook are not met. One or '
+            u'more of these checks failed. To disregard these results,'
+            u'explicitly disable checks by setting an Ansible variable:'.format(playbook=playbook),
+            u'   openshift_disable_check={checks}'.format(checks=checks),
+            u'Failing check names are shown in the failure details above. '
+            u'Some checks may be configurable by variables if your requirements '
+            u'are different from the defaults; consult check documentation.',
+        ])
+    summary.append(
+        u'Variables can be set in the inventory or passed on the command line '
+        u'using the -e flag to ansible-playbook.'
+    )
+    return u'\n'.join(summary)