diff options
Diffstat (limited to 'roles')
4 files changed, 434 insertions, 0 deletions
| diff --git a/roles/openshift_health_checker/library/search_journalctl.py b/roles/openshift_health_checker/library/search_journalctl.py new file mode 100644 index 000000000..3631f71c8 --- /dev/null +++ b/roles/openshift_health_checker/library/search_journalctl.py @@ -0,0 +1,150 @@ +#!/usr/bin/python +"""Interface to journalctl.""" + +from time import time +import json +import re +import subprocess + +from ansible.module_utils.basic import AnsibleModule + + +class InvalidMatcherRegexp(Exception): +    """Exception class for invalid matcher regexp.""" +    pass + + +class InvalidLogEntry(Exception): +    """Exception class for invalid / non-json log entries.""" +    pass + + +class LogInputSubprocessError(Exception): +    """Exception class for errors that occur while executing a subprocess.""" +    pass + + +def main(): +    """Scan a given list of "log_matchers" for journalctl messages containing given patterns. +    "log_matchers" is a list of dicts consisting of three keys that help fine-tune log searching: +    'start_regexp', 'regexp', and 'unit'. + +    Sample "log_matchers" list: + +    [ +      { +        'start_regexp': r'Beginning of systemd unit', +        'regexp': r'the specific log message to find', +        'unit': 'etcd', +      } +    ] +    """ +    module = AnsibleModule( +        argument_spec=dict( +            log_count_limit=dict(type="int", default=500), +            log_matchers=dict(type="list", required=True), +        ), +    ) + +    timestamp_limit_seconds = time() - 60 * 60  # 1 hour + +    log_count_limit = module.params["log_count_limit"] +    log_matchers = module.params["log_matchers"] + +    matched_regexp, errors = get_log_matches(log_matchers, log_count_limit, timestamp_limit_seconds) + +    module.exit_json( +        changed=False, +        failed=bool(errors), +        errors=errors, +        matched=matched_regexp, +    ) + + +def get_log_matches(matchers, log_count_limit, timestamp_limit_seconds): +    """Return a list of up to log_count_limit matches for each matcher. + +    Log entries are only considered if newer than timestamp_limit_seconds. +    """ +    matched_regexp = [] +    errors = [] + +    for matcher in matchers: +        try: +            log_output = get_log_output(matcher) +        except LogInputSubprocessError as err: +            errors.append(str(err)) +            continue + +        try: +            matched = find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds) +            if matched: +                matched_regexp.append(matcher.get("regexp", "")) +        except InvalidMatcherRegexp as err: +            errors.append(str(err)) +        except InvalidLogEntry as err: +            errors.append(str(err)) + +    return matched_regexp, errors + + +def get_log_output(matcher): +    """Return an iterator on the logs of a given matcher.""" +    try: +        cmd_output = subprocess.Popen(list([ +            '/bin/journalctl', +            '-ru', matcher.get("unit", ""), +            '--output', 'json', +        ]), stdout=subprocess.PIPE) + +        return iter(cmd_output.stdout.readline, '') + +    except subprocess.CalledProcessError as exc: +        msg = "Could not obtain journalctl logs for the specified systemd unit: {}: {}" +        raise LogInputSubprocessError(msg.format(matcher.get("unit", "<missing>"), str(exc))) +    except OSError as exc: +        raise LogInputSubprocessError(str(exc)) + + +def find_matches(log_output, matcher, log_count_limit, timestamp_limit_seconds): +    """Return log messages matched in iterable log_output by a given matcher. + +    Ignore any log_output items older than timestamp_limit_seconds. +    """ +    try: +        regexp = re.compile(matcher.get("regexp", "")) +        start_regexp = re.compile(matcher.get("start_regexp", "")) +    except re.error as err: +        msg = "A log matcher object was provided with an invalid regular expression: {}" +        raise InvalidMatcherRegexp(msg.format(str(err))) + +    matched = None + +    for log_count, line in enumerate(log_output): +        if log_count >= log_count_limit: +            break + +        try: +            obj = json.loads(line) + +            # don't need to look past the most recent service restart +            if start_regexp.match(obj["MESSAGE"]): +                break + +            log_timestamp_seconds = float(obj["__REALTIME_TIMESTAMP"]) / 1000000 +            if log_timestamp_seconds < timestamp_limit_seconds: +                break + +            if regexp.match(obj["MESSAGE"]): +                matched = line +                break + +        except ValueError: +            msg = "Log entry for systemd unit {} contained invalid json syntax: {}" +            raise InvalidLogEntry(msg.format(matcher.get("unit"), line)) + +    return matched + + +if __name__ == '__main__': +    main() diff --git a/roles/openshift_health_checker/openshift_checks/etcd_traffic.py b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py new file mode 100644 index 000000000..40c87873d --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/etcd_traffic.py @@ -0,0 +1,47 @@ +"""Check that scans journalctl for messages caused as a symptom of increased etcd traffic.""" + +from openshift_checks import OpenShiftCheck, get_var + + +class EtcdTraffic(OpenShiftCheck): +    """Check if host is being affected by an increase in etcd traffic.""" + +    name = "etcd_traffic" +    tags = ["health", "etcd"] + +    @classmethod +    def is_active(cls, task_vars): +        """Skip hosts that do not have etcd in their group names.""" +        group_names = get_var(task_vars, "group_names", default=[]) +        valid_group_names = "etcd" in group_names + +        version = get_var(task_vars, "openshift", "common", "short_version") +        valid_version = version in ("3.4", "3.5", "1.4", "1.5") + +        return super(EtcdTraffic, cls).is_active(task_vars) and valid_group_names and valid_version + +    def run(self, tmp, task_vars): +        is_containerized = get_var(task_vars, "openshift", "common", "is_containerized") +        unit = "etcd_container" if is_containerized else "etcd" + +        log_matchers = [{ +            "start_regexp": r"Starting Etcd Server", +            "regexp": r"etcd: sync duration of [^,]+, expected less than 1s", +            "unit": unit +        }] + +        match = self.execute_module("search_journalctl", { +            "log_matchers": log_matchers, +        }, task_vars) + +        if match.get("matched"): +            msg = ("Higher than normal etcd traffic detected.\n" +                   "OpenShift 3.4 introduced an increase in etcd traffic.\n" +                   "Upgrading to OpenShift 3.6 is recommended in order to fix this issue.\n" +                   "Please refer to https://access.redhat.com/solutions/2916381 for more information.") +            return {"failed": True, "msg": msg} + +        if match.get("failed"): +            return {"failed": True, "msg": "\n".join(match.get("errors"))} + +        return {} diff --git a/roles/openshift_health_checker/test/etcd_traffic_test.py b/roles/openshift_health_checker/test/etcd_traffic_test.py new file mode 100644 index 000000000..287175e29 --- /dev/null +++ b/roles/openshift_health_checker/test/etcd_traffic_test.py @@ -0,0 +1,80 @@ +import pytest + +from openshift_checks.etcd_traffic import EtcdTraffic + + +@pytest.mark.parametrize('group_names,version,is_active', [ +    (['masters'], "3.5", False), +    (['masters'], "3.6", False), +    (['nodes'], "3.4", False), +    (['etcd'], "3.4", True), +    (['etcd'], "3.5", True), +    (['etcd'], "3.1", False), +    (['masters', 'nodes'], "3.5", False), +    (['masters', 'etcd'], "3.5", True), +    ([], "3.4", False), +]) +def test_is_active(group_names, version, is_active): +    task_vars = dict( +        group_names=group_names, +        openshift=dict( +            common=dict(short_version=version), +        ), +    ) +    assert EtcdTraffic.is_active(task_vars=task_vars) == is_active + + +@pytest.mark.parametrize('group_names,matched,failed,extra_words', [ +    (["masters"], True, True, ["Higher than normal", "traffic"]), +    (["masters", "etcd"], False, False, []), +    (["etcd"], False, False, []), +]) +def test_log_matches_high_traffic_msg(group_names, matched, failed, extra_words): +    def execute_module(module_name, args, task_vars): +        return { +            "matched": matched, +            "failed": failed, +        } + +    task_vars = dict( +        group_names=group_names, +        openshift=dict( +            common=dict(service_type="origin", is_containerized=False), +        ) +    ) + +    check = EtcdTraffic(execute_module=execute_module) +    result = check.run(tmp=None, task_vars=task_vars) + +    for word in extra_words: +        assert word in result.get("msg", "") + +    assert result.get("failed", False) == failed + + +@pytest.mark.parametrize('is_containerized,expected_unit_value', [ +    (False, "etcd"), +    (True, "etcd_container"), +]) +def test_systemd_unit_matches_deployment_type(is_containerized, expected_unit_value): +    task_vars = dict( +        openshift=dict( +            common=dict(is_containerized=is_containerized), +        ) +    ) + +    def execute_module(module_name, args, task_vars): +        assert module_name == "search_journalctl" +        matchers = args["log_matchers"] + +        for matcher in matchers: +            assert matcher["unit"] == expected_unit_value + +        return {"failed": False} + +    check = EtcdTraffic(execute_module=execute_module) +    check.run(tmp=None, task_vars=task_vars) + + +def fake_execute_module(*args): +    raise AssertionError('this function should not be called') diff --git a/roles/openshift_health_checker/test/search_journalctl_test.py b/roles/openshift_health_checker/test/search_journalctl_test.py new file mode 100644 index 000000000..724928aa1 --- /dev/null +++ b/roles/openshift_health_checker/test/search_journalctl_test.py @@ -0,0 +1,157 @@ +import pytest +import search_journalctl + + +def canned_search_journalctl(get_log_output=None): +    """Create a search_journalctl object with canned get_log_output method""" +    module = search_journalctl +    if get_log_output: +        module.get_log_output = get_log_output +    return module + + +DEFAULT_TIMESTAMP = 1496341364 + + +def get_timestamp(modifier=0): +    return DEFAULT_TIMESTAMP + modifier + + +def get_timestamp_microseconds(modifier=0): +    return get_timestamp(modifier) * 1000000 + + +def create_test_log_object(stamp, msg): +    return '{{"__REALTIME_TIMESTAMP": "{}", "MESSAGE": "{}"}}'.format(stamp, msg) + + +@pytest.mark.parametrize('name,matchers,log_input,expected_matches,expected_errors', [ +    ( +        'test with valid params', +        [ +            { +                "start_regexp": r"Sample Logs Beginning", +                "regexp": r"test log message", +                "unit": "test", +            }, +        ], +        [ +            create_test_log_object(get_timestamp_microseconds(), "test log message"), +            create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), +        ], +        ["test log message"], +        [], +    ), +    ( +        'test with invalid json in log input', +        [ +            { +                "start_regexp": r"Sample Logs Beginning", +                "regexp": r"test log message", +                "unit": "test-unit", +            }, +        ], +        [ +            '{__REALTIME_TIMESTAMP: ' + str(get_timestamp_microseconds()) + ', "MESSAGE": "test log message"}', +        ], +        [], +        [ +            ["invalid json", "test-unit", "test log message"], +        ], +    ), +    ( +        'test with invalid regexp', +        [ +            { +                "start_regexp": r"Sample Logs Beginning", +                "regexp": r"test [ log message", +                "unit": "test", +            }, +        ], +        [ +            create_test_log_object(get_timestamp_microseconds(), "test log message"), +            create_test_log_object(get_timestamp_microseconds(), "sample log message"), +            create_test_log_object(get_timestamp_microseconds(), "fake log message"), +            create_test_log_object(get_timestamp_microseconds(), "dummy log message"), +            create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), +        ], +        [], +        [ +            ["invalid regular expression"], +        ], +    ), +], ids=lambda argval: argval[0]) +def test_get_log_matches(name, matchers, log_input, expected_matches, expected_errors): +    def get_log_output(matcher): +        return log_input + +    module = canned_search_journalctl(get_log_output) +    matched_regexp, errors = module.get_log_matches(matchers, 500, 60 * 60) + +    assert set(matched_regexp) == set(expected_matches) +    assert len(expected_errors) == len(errors) + +    for idx, partial_err_set in enumerate(expected_errors): +        for partial_err_msg in partial_err_set: +            assert partial_err_msg in errors[idx] + + +@pytest.mark.parametrize('name,matcher,log_count_lim,stamp_lim_seconds,log_input,expected_match', [ +    ( +        'test with matching log message, but out of bounds of log_count_lim', +        { +            "start_regexp": r"Sample Logs Beginning", +            "regexp": r"dummy log message", +            "unit": "test", +        }, +        3, +        get_timestamp(-100 * 60 * 60), +        [ +            create_test_log_object(get_timestamp_microseconds(), "test log message"), +            create_test_log_object(get_timestamp_microseconds(), "sample log message"), +            create_test_log_object(get_timestamp_microseconds(), "fake log message"), +            create_test_log_object(get_timestamp_microseconds(), "dummy log message"), +            create_test_log_object(get_timestamp_microseconds(), "Sample Logs Beginning"), +        ], +        None, +    ), +    ( +        'test with matching log message, but with timestamp too old', +        { +            "start_regexp": r"Sample Logs Beginning", +            "regexp": r"dummy log message", +            "unit": "test", +        }, +        100, +        get_timestamp(-10), +        [ +            create_test_log_object(get_timestamp_microseconds(), "test log message"), +            create_test_log_object(get_timestamp_microseconds(), "sample log message"), +            create_test_log_object(get_timestamp_microseconds(), "fake log message"), +            create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), +            create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"), +        ], +        None, +    ), +    ( +        'test with matching log message, and timestamp within time limit', +        { +            "start_regexp": r"Sample Logs Beginning", +            "regexp": r"dummy log message", +            "unit": "test", +        }, +        100, +        get_timestamp(-1010), +        [ +            create_test_log_object(get_timestamp_microseconds(), "test log message"), +            create_test_log_object(get_timestamp_microseconds(), "sample log message"), +            create_test_log_object(get_timestamp_microseconds(), "fake log message"), +            create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), +            create_test_log_object(get_timestamp_microseconds(-1000), "Sample Logs Beginning"), +        ], +        create_test_log_object(get_timestamp_microseconds(-1000), "dummy log message"), +    ), +], ids=lambda argval: argval[0]) +def test_find_matches_skips_logs(name, matcher, log_count_lim, stamp_lim_seconds, log_input, expected_match): +    match = search_journalctl.find_matches(log_input, matcher, log_count_lim, stamp_lim_seconds) +    assert match == expected_match | 
