diff options
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
14 files changed, 1293 insertions, 114 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py index be63d864a..5c9949ced 100644 --- a/roles/openshift_health_checker/openshift_checks/__init__.py +++ b/roles/openshift_health_checker/openshift_checks/__init__.py @@ -66,16 +66,26 @@ class OpenShiftCheck(object): LOADER_EXCLUDES = ( "__init__.py", "mixins.py", + "logging.py", ) -def load_checks(): +def load_checks(path=None, subpkg=""): """Dynamically import all check modules for the side effect of registering checks.""" - return [ - import_module(__package__ + "." + name[:-3]) - for name in os.listdir(os.path.dirname(__file__)) - if name.endswith(".py") and name not in LOADER_EXCLUDES - ] + if path is None: + path = os.path.dirname(__file__) + + modules = [] + + for name in os.listdir(path): + if os.path.isdir(os.path.join(path, name)): + modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name) + continue + + if name.endswith(".py") and name not in LOADER_EXCLUDES: + modules.append(import_module(__package__ + subpkg + "." + name[:-3])) + + return modules def get_var(task_vars, *keys, **kwargs): diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py index c2792a0fe..962148cb8 100644 --- a/roles/openshift_health_checker/openshift_checks/disk_availability.py +++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py @@ -27,10 +27,12 @@ class DiskAvailability(NotContainerizedMixin, OpenShiftCheck): def run(self, tmp, task_vars): group_names = get_var(task_vars, "group_names") ansible_mounts = get_var(task_vars, "ansible_mounts") - - min_free_bytes = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names) free_bytes = self.openshift_available_disk(ansible_mounts) + recommended_min = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names) + configured_min = int(get_var(task_vars, "openshift_check_min_host_disk_gb", default=0)) * 10**9 + min_free_bytes = configured_min or recommended_min + if free_bytes < min_free_bytes: return { 'failed': True, diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py index cce289b95..27e6fe383 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py +++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py @@ -1,8 +1,9 @@ # pylint: disable=missing-docstring from openshift_checks import OpenShiftCheck, get_var +from openshift_checks.mixins import DockerHostMixin -class DockerImageAvailability(OpenShiftCheck): +class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): """Check that required Docker images are available. This check attempts to ensure that required docker images are @@ -13,41 +14,47 @@ class DockerImageAvailability(OpenShiftCheck): name = "docker_image_availability" tags = ["preflight"] - skopeo_image = "openshift/openshift-ansible" + dependencies = ["skopeo", "python-docker-py"] - # FIXME(juanvallejo): we should consider other possible values of - # `deployment_type` (the key here). See - # https://github.com/openshift/openshift-ansible/blob/8e26f8c/roles/openshift_repos/vars/main.yml#L7 - docker_image_base = { + deployment_image_info = { "origin": { - "repo": "openshift", - "image": "origin", + "namespace": "openshift", + "name": "origin", }, "openshift-enterprise": { - "repo": "openshift3", - "image": "ose", + "namespace": "openshift3", + "name": "ose", }, } - def run(self, tmp, task_vars): - required_images = self.required_images(task_vars) - missing_images = set(required_images) - set(self.local_images(required_images, task_vars)) - - # exit early if all images were found locally - if not missing_images: - return {"changed": False} + @classmethod + def is_active(cls, task_vars): + """Skip hosts with unsupported deployment types.""" + deployment_type = get_var(task_vars, "openshift_deployment_type") + has_valid_deployment_type = deployment_type in cls.deployment_image_info - msg, failed, changed = self.update_skopeo_image(task_vars) + return super(DockerImageAvailability, cls).is_active(task_vars) and has_valid_deployment_type - # exit early if Skopeo update fails + def run(self, tmp, task_vars): + msg, failed, changed = self.ensure_dependencies(task_vars) if failed: return { "failed": True, "changed": changed, - "msg": "Failed to update Skopeo image ({img_name}). {msg}".format(img_name=self.skopeo_image, msg=msg), + "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg } + required_images = self.required_images(task_vars) + missing_images = set(required_images) - set(self.local_images(required_images, task_vars)) + + # exit early if all images were found locally + if not missing_images: + return {"changed": changed} + registries = self.known_docker_registries(task_vars) + if not registries: + return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed} + available_images = self.available_images(missing_images, registries, task_vars) unavailable_images = set(missing_images) - set(available_images) @@ -55,44 +62,60 @@ class DockerImageAvailability(OpenShiftCheck): return { "failed": True, "msg": ( - "One or more required images are not available: {}.\n" + "One or more required Docker images are not available:\n {}\n" "Configured registries: {}" - ).format(", ".join(sorted(unavailable_images)), ", ".join(registries)), + ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)), "changed": changed, } return {"changed": changed} def required_images(self, task_vars): - deployment_type = get_var(task_vars, "deployment_type") - # FIXME(juanvallejo): we should handle gracefully with a proper error - # message when given an unexpected value for `deployment_type`. - image_base_name = self.docker_image_base[deployment_type] - - openshift_release = get_var(task_vars, "openshift_release") - # FIXME(juanvallejo): this variable is not required when the - # installation is non-containerized. The example inventories have it - # commented out. We should handle gracefully and with a proper error - # message when this variable is required and not set. - openshift_image_tag = get_var(task_vars, "openshift_image_tag") + deployment_type = get_var(task_vars, "openshift_deployment_type") + image_info = self.deployment_image_info[deployment_type] + openshift_release = get_var(task_vars, "openshift_release", default="latest") + openshift_image_tag = get_var(task_vars, "openshift_image_tag") is_containerized = get_var(task_vars, "openshift", "common", "is_containerized") - if is_containerized: - images = set(self.containerized_docker_images(image_base_name, openshift_release)) - else: - images = set(self.rpm_docker_images(image_base_name, openshift_release)) + images = set(self.required_docker_images( + image_info["namespace"], + image_info["name"], + ["registry-console"] if "enterprise" in deployment_type else [], # include enterprise-only image names + openshift_release, + is_containerized, + )) # append images with qualified image tags to our list of required images. # these are images with a (v0.0.0.0) tag, rather than a standard release # format tag (v0.0). We want to check this set in both containerized and # non-containerized installations. images.update( - self.qualified_docker_images(self.image_from_base_name(image_base_name), "v" + openshift_image_tag) + self.required_qualified_docker_images( + image_info["namespace"], + image_info["name"], + openshift_image_tag, + ), ) return images + @staticmethod + def required_docker_images(namespace, name, additional_image_names, version, is_containerized): + if is_containerized: + return ["{}/{}:{}".format(namespace, name, version)] if name else [] + + # include additional non-containerized images specific to the current deployment type + return ["{}/{}:{}".format(namespace, img_name, version) for img_name in additional_image_names] + + @staticmethod + def required_qualified_docker_images(namespace, name, version): + # pylint: disable=invalid-name + return [ + "{}/{}-{}:{}".format(namespace, name, suffix, version) + for suffix in ["haproxy-router", "docker-registry", "deployer", "pod"] + ] + def local_images(self, images, task_vars): """Filter a list of images and return those available locally.""" return [ @@ -107,31 +130,26 @@ class DockerImageAvailability(OpenShiftCheck): return bool(result.get("images", [])) - def known_docker_registries(self, task_vars): - result = self.module_executor("docker_info", {}, task_vars) + @staticmethod + def known_docker_registries(task_vars): + docker_facts = get_var(task_vars, "openshift", "docker") + regs = set(docker_facts["additional_registries"]) - if result.get("failed", False): - return [] + deployment_type = get_var(task_vars, "openshift_deployment_type") + if deployment_type == "origin": + regs.update(["docker.io"]) + elif "enterprise" in deployment_type: + regs.update(["registry.access.redhat.com"]) - # FIXME(juanvallejo): wrong default type, result["info"] is expected to - # contain a dictionary (see how we call `docker_info.get` below). - docker_info = result.get("info", "") - return [registry.get("Name", "") for registry in docker_info.get("Registries", {})] + return list(regs) def available_images(self, images, registries, task_vars): """Inspect existing images using Skopeo and return all images successfully inspected.""" return [ image for image in images - if self.is_image_available(image, registries, task_vars) + if any(self.is_available_skopeo_image(image, registry, task_vars) for registry in registries) ] - def is_image_available(self, image, registries, task_vars): - for registry in registries: - if self.is_available_skopeo_image(image, registry, task_vars): - return True - - return False - def is_available_skopeo_image(self, image, registry, task_vars): """Uses Skopeo to determine if required image exists in a given registry.""" @@ -140,40 +158,6 @@ class DockerImageAvailability(OpenShiftCheck): image=image, ) - args = { - "name": "skopeo_inspect", - "image": self.skopeo_image, - "command": cmd_str, - "detach": False, - "cleanup": True, - } - result = self.module_executor("docker_container", args, task_vars) - return result.get("failed", False) - - def containerized_docker_images(self, base_name, version): - return [ - "{image}:{version}".format(image=self.image_from_base_name(base_name), version=version) - ] - - @staticmethod - def rpm_docker_images(base, version): - return [ - "{image_repo}/registry-console:{version}".format(image_repo=base["repo"], version=version) - ] - - @staticmethod - def qualified_docker_images(image_name, version): - return [ - "{}-{}:{}".format(image_name, component, version) - for component in "haproxy-router docker-registry deployer pod".split() - ] - - @staticmethod - def image_from_base_name(base): - return "".join([base["repo"], "/", base["image"]]) - - # ensures that the skopeo docker image exists, and updates it - # with latest if image was already present locally. - def update_skopeo_image(self, task_vars): - result = self.module_executor("docker_image", {"name": self.skopeo_image}, task_vars) - return result.get("msg", ""), result.get("failed", False), result.get("changed", False) + args = {"_raw_params": cmd_str} + result = self.module_executor("command", args, task_vars) + return not result.get("failed", False) and result.get("rc", 0) == 0 diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py new file mode 100644 index 000000000..7f1751b36 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py @@ -0,0 +1,185 @@ +"""Check Docker storage driver and usage.""" +import json +import re +from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var +from openshift_checks.mixins import DockerHostMixin + + +class DockerStorage(DockerHostMixin, OpenShiftCheck): + """Check Docker storage driver compatibility. + + This check ensures that Docker is using a supported storage driver, + and that loopback is not being used (if using devicemapper). + Also that storage usage is not above threshold. + """ + + name = "docker_storage" + tags = ["pre-install", "health", "preflight"] + + dependencies = ["python-docker-py"] + storage_drivers = ["devicemapper", "overlay2"] + max_thinpool_data_usage_percent = 90.0 + max_thinpool_meta_usage_percent = 90.0 + + # pylint: disable=too-many-return-statements + # Reason: permanent stylistic exception; + # it is clearer to return on failures and there are just many ways to fail here. + def run(self, tmp, task_vars): + msg, failed, changed = self.ensure_dependencies(task_vars) + if failed: + return { + "failed": True, + "changed": changed, + "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg + } + + # attempt to get the docker info hash from the API + info = self.execute_module("docker_info", {}, task_vars) + if info.get("failed"): + return {"failed": True, "changed": changed, + "msg": "Failed to query Docker API. Is docker running on this host?"} + if not info.get("info"): # this would be very strange + return {"failed": True, "changed": changed, + "msg": "Docker API query missing info:\n{}".format(json.dumps(info))} + info = info["info"] + + # check if the storage driver we saw is valid + driver = info.get("Driver", "[NONE]") + if driver not in self.storage_drivers: + msg = ( + "Detected unsupported Docker storage driver '{driver}'.\n" + "Supported storage drivers are: {drivers}" + ).format(driver=driver, drivers=', '.join(self.storage_drivers)) + return {"failed": True, "changed": changed, "msg": msg} + + # driver status info is a list of tuples; convert to dict and validate based on driver + driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])} + if driver == "devicemapper": + if driver_status.get("Data loop file"): + msg = ( + "Use of loopback devices with the Docker devicemapper storage driver\n" + "(the default storage configuration) is unsupported in production.\n" + "Please use docker-storage-setup to configure a backing storage volume.\n" + "See http://red.ht/2rNperO for further information." + ) + return {"failed": True, "changed": changed, "msg": msg} + result = self._check_dm_usage(driver_status, task_vars) + result['changed'] = result.get('changed', False) or changed + return result + + # TODO(lmeyer): determine how to check usage for overlay2 + + return {"changed": changed} + + def _check_dm_usage(self, driver_status, task_vars): + """ + Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool + implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures + devicemapper storage. The LV is "thin" because it does not use all available storage + from its VG, instead expanding as needed; so to determine available space, we gather + current usage as the Docker API reports for the driver as well as space available for + expansion in the pool's VG. + Usage within the LV is divided into pools allocated to data and metadata, either of which + could run out of space first; so we check both. + """ + vals = dict( + vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars), + data_used=driver_status.get("Data Space Used"), + data_total=driver_status.get("Data Space Total"), + metadata_used=driver_status.get("Metadata Space Used"), + metadata_total=driver_status.get("Metadata Space Total"), + ) + + # convert all human-readable strings to bytes + for key, value in vals.copy().items(): + try: + vals[key + "_bytes"] = self._convert_to_bytes(value) + except ValueError as err: # unlikely to hit this from API info, but just to be safe + return { + "failed": True, + "values": vals, + "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err)) + } + + # determine the threshold percentages which usage should not exceed + for name, default in [("data", self.max_thinpool_data_usage_percent), + ("metadata", self.max_thinpool_meta_usage_percent)]: + percent = get_var(task_vars, "max_thinpool_" + name + "_usage_percent", default=default) + try: + vals[name + "_threshold"] = float(percent) + except ValueError: + return { + "failed": True, + "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent) + } + + # test whether the thresholds are exceeded + messages = [] + for name in ["data", "metadata"]: + vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / ( + vals[name + "_total_bytes"] + vals["vg_free_bytes"]) + if vals[name + "_pct_used"] > vals[name + "_threshold"]: + messages.append( + "Docker thinpool {name} usage percentage {pct:.1f} " + "is higher than threshold {thresh:.1f}.".format( + name=name, + pct=vals[name + "_pct_used"], + thresh=vals[name + "_threshold"], + )) + vals["failed"] = True + + vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."]) + return vals + + def _get_vg_free(self, pool, task_vars): + # Determine which VG to examine according to the pool name, the only indicator currently + # available from the Docker API driver info. We assume a name that looks like + # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen. + match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen + if not match: # unlikely, but... be clear if we assumed wrong + raise OpenShiftCheckException( + "This host's Docker reports it is using a storage pool named '{}'.\n" + "However this name does not have the expected format of 'vgname-lvname'\n" + "so the available storage in the VG cannot be determined.".format(pool) + ) + vg_name = match.groups()[0].replace("--", "-") + vgs_cmd = "/sbin/vgs --noheadings -o vg_free --select vg_name=" + vg_name + # should return free space like " 12.00g" if the VG exists; empty if it does not + + ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars) + if ret.get("failed") or ret.get("rc", 0) != 0: + raise OpenShiftCheckException( + "Is LVM installed? Failed to run /sbin/vgs " + "to determine docker storage usage:\n" + ret.get("msg", "") + ) + size = ret.get("stdout", "").strip() + if not size: + raise OpenShiftCheckException( + "This host's Docker reports it is using a storage pool named '{pool}'.\n" + "which we expect to come from local VG '{vg}'.\n" + "However, /sbin/vgs did not find this VG. Is Docker for this host" + "running and using the storage on the host?".format(pool=pool, vg=vg_name) + ) + return size + + @staticmethod + def _convert_to_bytes(string): + units = dict( + b=1, + k=1024, + m=1024**2, + g=1024**3, + t=1024**4, + p=1024**5, + ) + string = string or "" + match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string) # float followed by optional unit + if not match: + raise ValueError("Cannot convert to a byte size: " + string) + + number, unit = match.groups() + multiplier = 1 if not unit else units.get(unit.lower()) + if not multiplier: + raise ValueError("Cannot convert to a byte size: " + string) + + return float(number) * multiplier diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py new file mode 100644 index 000000000..c9fc59896 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py @@ -0,0 +1,61 @@ +""" +Module for performing checks on an Curator logging deployment +""" + +from openshift_checks import get_var +from openshift_checks.logging.logging import LoggingCheck + + +class Curator(LoggingCheck): + """Module that checks an integrated logging Curator deployment""" + + name = "curator" + tags = ["health", "logging"] + + logging_namespace = None + + def run(self, tmp, task_vars): + """Check various things and gather errors. Returns: result as hash""" + + self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") + curator_pods, error = super(Curator, self).get_pods_for_component( + self.module_executor, + self.logging_namespace, + "curator", + task_vars + ) + if error: + return {"failed": True, "changed": False, "msg": error} + check_error = self.check_curator(curator_pods) + + if check_error: + msg = ("The following Curator deployment issue was found:" + "\n-------\n" + "{}".format(check_error)) + return {"failed": True, "changed": False, "msg": msg} + + # TODO(lmeyer): run it all again for the ops cluster + return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'} + + def check_curator(self, pods): + """Check to see if curator is up and working. Returns: error string""" + if not pods: + return ( + "There are no Curator pods for the logging stack,\n" + "so nothing will prune Elasticsearch indexes.\n" + "Is Curator correctly deployed?" + ) + + not_running = super(Curator, self).not_running_pods(pods) + if len(not_running) == len(pods): + return ( + "The Curator pod is not currently in a running state,\n" + "so Elasticsearch indexes may increase without bound." + ) + if len(pods) - len(not_running) > 1: + return ( + "There is more than one Curator pod running. This should not normally happen.\n" + "Although this doesn't cause any problems, you may want to investigate." + ) + + return None diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py new file mode 100644 index 000000000..01cb35b81 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py @@ -0,0 +1,217 @@ +""" +Module for performing checks on an Elasticsearch logging deployment +""" + +import json +import re + +from openshift_checks import get_var +from openshift_checks.logging.logging import LoggingCheck + + +class Elasticsearch(LoggingCheck): + """Module that checks an integrated logging Elasticsearch deployment""" + + name = "elasticsearch" + tags = ["health", "logging"] + + logging_namespace = None + + def run(self, tmp, task_vars): + """Check various things and gather errors. Returns: result as hash""" + + self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") + es_pods, error = super(Elasticsearch, self).get_pods_for_component( + self.execute_module, + self.logging_namespace, + "es", + task_vars, + ) + if error: + return {"failed": True, "changed": False, "msg": error} + check_error = self.check_elasticsearch(es_pods, task_vars) + + if check_error: + msg = ("The following Elasticsearch deployment issue was found:" + "\n-------\n" + "{}".format(check_error)) + return {"failed": True, "changed": False, "msg": msg} + + # TODO(lmeyer): run it all again for the ops cluster + return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'} + + def _not_running_elasticsearch_pods(self, es_pods): + """Returns: list of running pods, list of errors about non-running pods""" + not_running = super(Elasticsearch, self).not_running_pods(es_pods) + if not_running: + return not_running, [( + 'The following Elasticsearch pods are not running:\n' + '{pods}' + 'These pods will not aggregate logs from their nodes.' + ).format(pods=''.join( + " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) + for pod in not_running + ))] + return not_running, [] + + def check_elasticsearch(self, es_pods, task_vars): + """Various checks for elasticsearch. Returns: error string""" + not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods) + running_pods = [pod for pod in es_pods if pod not in not_running_pods] + pods_by_name = { + pod['metadata']['name']: pod for pod in running_pods + # Filter out pods that are not members of a DC + if pod['metadata'].get('labels', {}).get('deploymentconfig') + } + if not pods_by_name: + return 'No logging Elasticsearch pods were found. Is logging deployed?' + error_msgs += self._check_elasticsearch_masters(pods_by_name, task_vars) + error_msgs += self._check_elasticsearch_node_list(pods_by_name, task_vars) + error_msgs += self._check_es_cluster_health(pods_by_name, task_vars) + error_msgs += self._check_elasticsearch_diskspace(pods_by_name, task_vars) + return '\n'.join(error_msgs) + + @staticmethod + def _build_es_curl_cmd(pod_name, url): + base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'" + return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url) + + def _check_elasticsearch_masters(self, pods_by_name, task_vars): + """Check that Elasticsearch masters are sane. Returns: list of error strings""" + es_master_names = set() + error_msgs = [] + for pod_name in pods_by_name.keys(): + # Compare what each ES node reports as master and compare for split brain + get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master") + master_name_str = self._exec_oc(get_master_cmd, [], task_vars) + master_names = (master_name_str or '').split(' ') + if len(master_names) > 1: + es_master_names.add(master_names[1]) + else: + error_msgs.append( + 'No master? Elasticsearch {pod} returned bad string when asked master name:\n' + ' {response}'.format(pod=pod_name, response=master_name_str) + ) + + if not es_master_names: + error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?') + return '\n'.join(error_msgs) + + if len(es_master_names) > 1: + error_msgs.append( + 'Found multiple Elasticsearch masters according to the pods:\n' + '{master_list}\n' + 'This implies that the masters have "split brain" and are not correctly\n' + 'replicating data for the logging cluster. Log loss is likely to occur.' + .format(master_list='\n'.join(' ' + master for master in es_master_names)) + ) + + return error_msgs + + def _check_elasticsearch_node_list(self, pods_by_name, task_vars): + """Check that reported ES masters are accounted for by pods. Returns: list of error strings""" + + if not pods_by_name: + return ['No logging Elasticsearch masters were found. Is logging deployed?'] + + # get ES cluster nodes + node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes') + cluster_node_data = self._exec_oc(node_cmd, [], task_vars) + try: + cluster_nodes = json.loads(cluster_node_data)['nodes'] + except (ValueError, KeyError): + return [ + 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' + + cluster_node_data + ] + + # Try to match all ES-reported node hosts to known pods. + error_msgs = [] + for node in cluster_nodes.values(): + # Note that with 1.4/3.4 the pod IP may be used as the master name + if not any(node['host'] in (pod_name, pod['status'].get('podIP')) + for pod_name, pod in pods_by_name.items()): + error_msgs.append( + 'The Elasticsearch cluster reports a member node "{node}"\n' + 'that does not correspond to any known ES pod.'.format(node=node['host']) + ) + + return error_msgs + + def _check_es_cluster_health(self, pods_by_name, task_vars): + """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors""" + error_msgs = [] + for pod_name in pods_by_name.keys(): + cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true') + cluster_health_data = self._exec_oc(cluster_health_cmd, [], task_vars) + try: + health_res = json.loads(cluster_health_data) + if not health_res or not health_res.get('status'): + raise ValueError() + except ValueError: + error_msgs.append( + 'Could not retrieve cluster health status from logging ES pod "{pod}".\n' + 'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data) + ) + continue + + if health_res['status'] not in ['green', 'yellow']: + error_msgs.append( + 'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name) + ) + + return error_msgs + + def _check_elasticsearch_diskspace(self, pods_by_name, task_vars): + """ + Exec into an ES pod and query the diskspace on the persistent volume. + Returns: list of errors + """ + error_msgs = [] + for pod_name in pods_by_name.keys(): + df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name) + disk_output = self._exec_oc(df_cmd, [], task_vars) + lines = disk_output.splitlines() + # expecting one header looking like 'IUse% Use%' and one body line + body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$' + if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]): + error_msgs.append( + 'Could not retrieve storage usage from logging ES pod "{pod}".\n' + 'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output) + ) + continue + inode_pct, disk_pct = re.match(body_re, lines[1]).groups() + + inode_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_inode_pct', default='90') + if int(inode_pct) >= int(inode_pct_thresh): + error_msgs.append( + 'Inode percent usage on the storage volume for logging ES pod "{pod}"\n' + ' is {pct}, greater than threshold {limit}.\n' + ' Note: threshold can be specified in inventory with {param}'.format( + pod=pod_name, + pct=str(inode_pct), + limit=str(inode_pct_thresh), + param='openshift_check_efk_es_inode_pct', + )) + disk_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_storage_pct', default='80') + if int(disk_pct) >= int(disk_pct_thresh): + error_msgs.append( + 'Disk percent usage on the storage volume for logging ES pod "{pod}"\n' + ' is {pct}, greater than threshold {limit}.\n' + ' Note: threshold can be specified in inventory with {param}'.format( + pod=pod_name, + pct=str(disk_pct), + limit=str(disk_pct_thresh), + param='openshift_check_efk_es_storage_pct', + )) + + return error_msgs + + def _exec_oc(self, cmd_str, extra_args, task_vars): + return super(Elasticsearch, self).exec_oc( + self.execute_module, + self.logging_namespace, + cmd_str, + extra_args, + task_vars, + ) diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py new file mode 100644 index 000000000..627567293 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py @@ -0,0 +1,170 @@ +""" +Module for performing checks on an Fluentd logging deployment +""" + +import json + +from openshift_checks import get_var +from openshift_checks.logging.logging import LoggingCheck + + +class Fluentd(LoggingCheck): + """Module that checks an integrated logging Fluentd deployment""" + name = "fluentd" + tags = ["health", "logging"] + + logging_namespace = None + + def run(self, tmp, task_vars): + """Check various things and gather errors. Returns: result as hash""" + + self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") + fluentd_pods, error = super(Fluentd, self).get_pods_for_component( + self.execute_module, + self.logging_namespace, + "fluentd", + task_vars, + ) + if error: + return {"failed": True, "changed": False, "msg": error} + check_error = self.check_fluentd(fluentd_pods, task_vars) + + if check_error: + msg = ("The following Fluentd deployment issue was found:" + "\n-------\n" + "{}".format(check_error)) + return {"failed": True, "changed": False, "msg": msg} + + # TODO(lmeyer): run it all again for the ops cluster + return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'} + + @staticmethod + def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector): + """Filter to all nodes with fluentd label. Returns dict(name: node), error string""" + label, value = node_selector.split('=', 1) + fluentd_nodes = { + name: node for name, node in nodes_by_name.items() + if node['metadata']['labels'].get(label) == value + } + if not fluentd_nodes: + return None, ( + 'There are no nodes with the fluentd label {label}.\n' + 'This means no logs will be aggregated from the nodes.' + ).format(label=node_selector) + return fluentd_nodes, None + + @staticmethod + def _check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars): + """Note if nodes are not labeled as expected. Returns: error string""" + intended_nodes = get_var(task_vars, 'openshift_logging_fluentd_hosts', default=['--all']) + if not intended_nodes or '--all' in intended_nodes: + intended_nodes = nodes_by_name.keys() + nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys()) + if nodes_missing_labels: + return ( + 'The following nodes are supposed to be labeled with {label} but are not:\n' + ' {nodes}\n' + 'Fluentd will not aggregate logs from these nodes.' + ).format(label=node_selector, nodes=', '.join(nodes_missing_labels)) + return None + + @staticmethod + def _check_nodes_have_fluentd(pods, fluentd_nodes): + """Make sure fluentd is on all the labeled nodes. Returns: error string""" + unmatched_nodes = fluentd_nodes.copy() + node_names_by_label = { + node['metadata']['labels']['kubernetes.io/hostname']: name + for name, node in fluentd_nodes.items() + } + node_names_by_internal_ip = { + address['address']: name + for name, node in fluentd_nodes.items() + for address in node['status']['addresses'] + if address['type'] == "InternalIP" + } + for pod in pods: + for name in [ + pod['spec']['nodeName'], + node_names_by_internal_ip.get(pod['spec']['nodeName']), + node_names_by_label.get(pod.get('spec', {}).get('host')), + ]: + unmatched_nodes.pop(name, None) + if unmatched_nodes: + return ( + 'The following nodes are supposed to have a Fluentd pod but do not:\n' + '{nodes}' + 'These nodes will not have their logs aggregated.' + ).format(nodes=''.join( + " {}\n".format(name) + for name in unmatched_nodes.keys() + )) + return None + + def _check_fluentd_pods_running(self, pods): + """Make sure all fluentd pods are running. Returns: error string""" + not_running = super(Fluentd, self).not_running_pods(pods) + if not_running: + return ( + 'The following Fluentd pods are supposed to be running but are not:\n' + '{pods}' + 'These pods will not aggregate logs from their nodes.' + ).format(pods=''.join( + " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) + for pod in not_running + )) + return None + + def check_fluentd(self, pods, task_vars): + """Verify fluentd is running everywhere. Returns: error string""" + + node_selector = get_var(task_vars, 'openshift_logging_fluentd_nodeselector', + default='logging-infra-fluentd=true') + + nodes_by_name, error = self.get_nodes_by_name(task_vars) + + if error: + return error + fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector) + if error: + return error + + error_msgs = [] + error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars) + if error: + error_msgs.append(error) + error = self._check_nodes_have_fluentd(pods, fluentd_nodes) + if error: + error_msgs.append(error) + error = self._check_fluentd_pods_running(pods) + if error: + error_msgs.append(error) + + # Make sure there are no extra fluentd pods + if len(pods) > len(fluentd_nodes): + error_msgs.append( + 'There are more Fluentd pods running than nodes labeled.\n' + 'This may not cause problems with logging but it likely indicates something wrong.' + ) + + return '\n'.join(error_msgs) + + def get_nodes_by_name(self, task_vars): + """Retrieve all the node definitions. Returns: dict(name: node), error string""" + nodes_json = self._exec_oc("get nodes -o json", [], task_vars) + try: + nodes = json.loads(nodes_json) + except ValueError: # no valid json - should not happen + return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json + if not nodes or not nodes.get('items'): # also should not happen + return None, "No nodes appear to be defined according to the API." + return { + node['metadata']['name']: node + for node in nodes['items'] + }, None + + def _exec_oc(self, cmd_str, extra_args, task_vars): + return super(Fluentd, self).exec_oc(self.execute_module, + self.logging_namespace, + cmd_str, + extra_args, + task_vars) diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py new file mode 100644 index 000000000..442f407b1 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py @@ -0,0 +1,229 @@ +""" +Module for performing checks on a Kibana logging deployment +""" + +import json +import ssl + +try: + from urllib2 import HTTPError, URLError + import urllib2 +except ImportError: + from urllib.error import HTTPError, URLError + import urllib.request as urllib2 + +from openshift_checks import get_var +from openshift_checks.logging.logging import LoggingCheck + + +class Kibana(LoggingCheck): + """Module that checks an integrated logging Kibana deployment""" + + name = "kibana" + tags = ["health", "logging"] + + logging_namespace = None + + def run(self, tmp, task_vars): + """Check various things and gather errors. Returns: result as hash""" + + self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging") + kibana_pods, error = super(Kibana, self).get_pods_for_component( + self.execute_module, + self.logging_namespace, + "kibana", + task_vars, + ) + if error: + return {"failed": True, "changed": False, "msg": error} + check_error = self.check_kibana(kibana_pods) + + if not check_error: + check_error = self._check_kibana_route(task_vars) + + if check_error: + msg = ("The following Kibana deployment issue was found:" + "\n-------\n" + "{}".format(check_error)) + return {"failed": True, "changed": False, "msg": msg} + + # TODO(lmeyer): run it all again for the ops cluster + return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'} + + def _verify_url_internal(self, url, task_vars): + """ + Try to reach a URL from the host. + Returns: success (bool), reason (for failure) + """ + args = dict( + url=url, + follow_redirects='none', + validate_certs='no', # likely to be signed with internal CA + # TODO(lmeyer): give users option to validate certs + status_code=302, + ) + result = self.execute_module('uri', args, task_vars) + if result.get('failed'): + return result['msg'] + return None + + @staticmethod + def _verify_url_external(url): + """ + Try to reach a URL from ansible control host. + Returns: success (bool), reason (for failure) + """ + # This actually checks from the ansible control host, which may or may not + # really be "external" to the cluster. + + # Disable SSL cert validation to work around internally signed certs + ctx = ssl.create_default_context() + ctx.check_hostname = False # or setting CERT_NONE is refused + ctx.verify_mode = ssl.CERT_NONE + + # Verify that the url is returning a valid response + try: + # We only care if the url connects and responds + return_code = urllib2.urlopen(url, context=ctx).getcode() + except HTTPError as httperr: + return httperr.reason + except URLError as urlerr: + return str(urlerr) + + # there appears to be no way to prevent urlopen from following redirects + if return_code != 200: + return 'Expected success (200) but got return code {}'.format(int(return_code)) + + return None + + def check_kibana(self, pods): + """Check to see if Kibana is up and working. Returns: error string.""" + + if not pods: + return "There are no Kibana pods deployed, so no access to the logging UI." + + not_running = self.not_running_pods(pods) + if len(not_running) == len(pods): + return "No Kibana pod is in a running state, so there is no access to the logging UI." + elif not_running: + return ( + "The following Kibana pods are not currently in a running state:\n" + "{pods}" + "However at least one is, so service may not be impacted." + ).format(pods="".join(" " + pod['metadata']['name'] + "\n" for pod in not_running)) + + return None + + def _get_kibana_url(self, task_vars): + """ + Get kibana route or report error. + Returns: url (or empty), reason for failure + """ + + # Get logging url + get_route = self._exec_oc("get route logging-kibana -o json", [], task_vars) + if not get_route: + return None, 'no_route_exists' + + route = json.loads(get_route) + + # check that the route has been accepted by a router + ingress = route["status"]["ingress"] + # ingress can be null if there is no router, or empty if not routed + if not ingress or not ingress[0]: + return None, 'route_not_accepted' + + host = route.get("spec", {}).get("host") + if not host: + return None, 'route_missing_host' + + return 'https://{}/'.format(host), None + + def _check_kibana_route(self, task_vars): + """ + Check to see if kibana route is up and working. + Returns: error string + """ + known_errors = dict( + no_route_exists=( + 'No route is defined for Kibana in the logging namespace,\n' + 'so the logging stack is not accessible. Is logging deployed?\n' + 'Did something remove the logging-kibana route?' + ), + route_not_accepted=( + 'The logging-kibana route is not being routed by any router.\n' + 'Is the router deployed and working?' + ), + route_missing_host=( + 'The logging-kibana route has no hostname defined,\n' + 'which should never happen. Did something alter its definition?' + ), + ) + + kibana_url, error = self._get_kibana_url(task_vars) + if not kibana_url: + return known_errors.get(error, error) + + # first, check that kibana is reachable from the master. + error = self._verify_url_internal(kibana_url, task_vars) + if error: + if 'urlopen error [Errno 111] Connection refused' in error: + error = ( + 'Failed to connect from this master to Kibana URL {url}\n' + 'Is kibana running, and is at least one router routing to it?' + ).format(url=kibana_url) + elif 'urlopen error [Errno -2] Name or service not known' in error: + error = ( + 'Failed to connect from this master to Kibana URL {url}\n' + 'because the hostname does not resolve.\n' + 'Is DNS configured for the Kibana hostname?' + ).format(url=kibana_url) + elif 'Status code was not' in error: + error = ( + 'A request from this master to the Kibana URL {url}\n' + 'did not return the correct status code (302).\n' + 'This could mean that Kibana is malfunctioning, the hostname is\n' + 'resolving incorrectly, or other network issues. The output was:\n' + ' {error}' + ).format(url=kibana_url, error=error) + return 'Error validating the logging Kibana route:\n' + error + + # in production we would like the kibana route to work from outside the + # cluster too; but that may not be the case, so allow disabling just this part. + if not get_var(task_vars, "openshift_check_efk_kibana_external", default=True): + return None + error = self._verify_url_external(kibana_url) + if error: + if 'urlopen error [Errno 111] Connection refused' in error: + error = ( + 'Failed to connect from the Ansible control host to Kibana URL {url}\n' + 'Is the router for the Kibana hostname exposed externally?' + ).format(url=kibana_url) + elif 'urlopen error [Errno -2] Name or service not known' in error: + error = ( + 'Failed to resolve the Kibana hostname in {url}\n' + 'from the Ansible control host.\n' + 'Is DNS configured to resolve this Kibana hostname externally?' + ).format(url=kibana_url) + elif 'Expected success (200)' in error: + error = ( + 'A request to Kibana at {url}\n' + 'returned the wrong error code:\n' + ' {error}\n' + 'This could mean that Kibana is malfunctioning, the hostname is\n' + 'resolving incorrectly, or other network issues.' + ).format(url=kibana_url, error=error) + error = ( + 'Error validating the logging Kibana route:\n{error}\n' + 'To disable external Kibana route validation, set in your inventory:\n' + ' openshift_check_efk_kibana_external=False' + ).format(error=error) + return error + return None + + def _exec_oc(self, cmd_str, extra_args, task_vars): + return super(Kibana, self).exec_oc(self.execute_module, + self.logging_namespace, + cmd_str, + extra_args, + task_vars) diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py new file mode 100644 index 000000000..05b4d300c --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py @@ -0,0 +1,96 @@ +""" +Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack +""" + +import json +import os + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var + + +class LoggingCheck(OpenShiftCheck): + """Base class for logging component checks""" + + name = "logging" + + @classmethod + def is_active(cls, task_vars): + return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars) + + @staticmethod + def is_first_master(task_vars): + """Run only on first master and only when logging is configured. Returns: bool""" + logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True) + # Note: It would be nice to use membership in oo_first_master group, however for now it + # seems best to avoid requiring that setup and just check this is the first master. + hostname = get_var(task_vars, "ansible_ssh_host") or [None] + masters = get_var(task_vars, "groups", "masters", default=None) or [None] + return logging_deployed and masters[0] == hostname + + def run(self, tmp, task_vars): + pass + + def get_pods_for_component(self, execute_module, namespace, logging_component, task_vars): + """Get all pods for a given component. Returns: list of pods for component, error string""" + pod_output = self.exec_oc( + execute_module, + namespace, + "get pods -l component={} -o json".format(logging_component), + [], + task_vars + ) + try: + pods = json.loads(pod_output) + if not pods or not pods.get('items'): + raise ValueError() + except ValueError: + # successful run but non-parsing data generally means there were no pods in the namespace + return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace) + + return pods['items'], None + + @staticmethod + def not_running_pods(pods): + """Returns: list of pods not in a ready and running state""" + return [ + pod for pod in pods + if any( + container['ready'] is False + for container in pod['status']['containerStatuses'] + ) or not any( + condition['type'] == 'Ready' and condition['status'] == 'True' + for condition in pod['status']['conditions'] + ) + ] + + @staticmethod + def exec_oc(execute_module=None, namespace="logging", cmd_str="", extra_args=None, task_vars=None): + """ + Execute an 'oc' command in the remote host. + Returns: output of command and namespace, + or raises OpenShiftCheckException on error + """ + config_base = get_var(task_vars, "openshift", "common", "config_base") + args = { + "namespace": namespace, + "config_file": os.path.join(config_base, "master", "admin.kubeconfig"), + "cmd": cmd_str, + "extra_args": list(extra_args) if extra_args else [], + } + + result = execute_module("ocutil", args, task_vars) + if result.get("failed"): + msg = ( + 'Unexpected error using `oc` to validate the logging stack components.\n' + 'Error executing `oc {cmd}`:\n' + '{error}' + ).format(cmd=args['cmd'], error=result['result']) + + if result['result'] == '[Errno 2] No such file or directory': + msg = ( + "This host is supposed to be a master but does not have the `oc` command where expected.\n" + "Has an installation been run on this host yet?" + ) + raise OpenShiftCheckException(msg) + + return result.get("result", "") diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py index 28805dc37..f4e31065f 100644 --- a/roles/openshift_health_checker/openshift_checks/memory_availability.py +++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py @@ -1,6 +1,9 @@ # pylint: disable=missing-docstring from openshift_checks import OpenShiftCheck, get_var +MIB = 2**20 +GIB = 2**30 + class MemoryAvailability(OpenShiftCheck): """Check that recommended memory is available.""" @@ -11,10 +14,12 @@ class MemoryAvailability(OpenShiftCheck): # Values taken from the official installation documentation: # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements recommended_memory_bytes = { - "masters": 16 * 10**9, - "nodes": 8 * 10**9, - "etcd": 20 * 10**9, + "masters": 16 * GIB, + "nodes": 8 * GIB, + "etcd": 8 * GIB, } + # https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal + memtotal_adjustment = 1 * GIB @classmethod def is_active(cls, task_vars): @@ -25,19 +30,21 @@ class MemoryAvailability(OpenShiftCheck): def run(self, tmp, task_vars): group_names = get_var(task_vars, "group_names") - total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * 10**6 + total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * MIB - min_memory_bytes = max(self.recommended_memory_bytes.get(name, 0) for name in group_names) + recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names) + configured_min = float(get_var(task_vars, "openshift_check_min_host_memory_gb", default=0)) * GIB + min_memory_bytes = configured_min or recommended_min - if total_memory_bytes < min_memory_bytes: + if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes: return { 'failed': True, 'msg': ( - 'Available memory ({available:.1f} GB) ' - 'below recommended value ({recommended:.1f} GB)' + 'Available memory ({available:.1f} GiB) is too far ' + 'below recommended value ({recommended:.1f} GiB)' ).format( - available=float(total_memory_bytes) / 10**9, - recommended=float(min_memory_bytes) / 10**9, + available=float(total_memory_bytes) / GIB, + recommended=float(min_memory_bytes) / GIB, ), } diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py index 20d160eaf..7f3d78cc4 100644 --- a/roles/openshift_health_checker/openshift_checks/mixins.py +++ b/roles/openshift_health_checker/openshift_checks/mixins.py @@ -1,4 +1,3 @@ -# pylint: disable=missing-docstring,too-few-public-methods """ Mixin classes meant to be used with subclasses of OpenShiftCheck. """ @@ -8,8 +7,49 @@ from openshift_checks import get_var class NotContainerizedMixin(object): """Mixin for checks that are only active when not in containerized mode.""" + # permanent # pylint: disable=too-few-public-methods + # Reason: The mixin is not intended to stand on its own as a class. @classmethod def is_active(cls, task_vars): + """Only run on non-containerized hosts.""" is_containerized = get_var(task_vars, "openshift", "common", "is_containerized") return super(NotContainerizedMixin, cls).is_active(task_vars) and not is_containerized + + +class DockerHostMixin(object): + """Mixin for checks that are only active on hosts that require Docker.""" + + dependencies = [] + + @classmethod + def is_active(cls, task_vars): + """Only run on hosts that depend on Docker.""" + is_containerized = get_var(task_vars, "openshift", "common", "is_containerized") + is_node = "nodes" in get_var(task_vars, "group_names", default=[]) + return super(DockerHostMixin, cls).is_active(task_vars) and (is_containerized or is_node) + + def ensure_dependencies(self, task_vars): + """ + Ensure that docker-related packages exist, but not on atomic hosts + (which would not be able to install but should already have them). + Returns: msg, failed, changed + """ + if get_var(task_vars, "openshift", "common", "is_atomic"): + return "", False, False + + # NOTE: we would use the "package" module but it's actually an action plugin + # and it's not clear how to invoke one of those. This is about the same anyway: + pkg_manager = get_var(task_vars, "ansible_pkg_mgr", default="yum") + result = self.module_executor(pkg_manager, {"name": self.dependencies, "state": "present"}, task_vars) + msg = result.get("msg", "") + if result.get("failed"): + if "No package matching" in msg: + msg = "Ensure that all required dependencies can be installed via `yum`.\n" + msg = ( + "Unable to install required packages on this host:\n" + " {deps}\n{msg}" + ).format(deps=',\n '.join(self.dependencies), msg=msg) + failed = result.get("failed", False) or result.get("rc", 0) != 0 + changed = result.get("changed", False) + return msg, failed, changed diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py new file mode 100644 index 000000000..1e45ae3af --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py @@ -0,0 +1,78 @@ +""" +Ansible module for determining if an installed version of Open vSwitch is incompatible with the +currently installed version of OpenShift. +""" + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var +from openshift_checks.mixins import NotContainerizedMixin + + +class OvsVersion(NotContainerizedMixin, OpenShiftCheck): + """Check that packages in a package_list are installed on the host + and are the correct version as determined by an OpenShift installation. + """ + + name = "ovs_version" + tags = ["health"] + + openshift_to_ovs_version = { + "3.6": "2.6", + "3.5": "2.6", + "3.4": "2.4", + } + + # map major release versions across releases + # to a common major version + openshift_major_release_version = { + "1": "3", + } + + @classmethod + def is_active(cls, task_vars): + """Skip hosts that do not have package requirements.""" + group_names = get_var(task_vars, "group_names", default=[]) + master_or_node = 'masters' in group_names or 'nodes' in group_names + return super(OvsVersion, cls).is_active(task_vars) and master_or_node + + def run(self, tmp, task_vars): + args = { + "package_list": [ + { + "name": "openvswitch", + "version": self.get_required_ovs_version(task_vars), + }, + ], + } + return self.execute_module("rpm_version", args, task_vars) + + def get_required_ovs_version(self, task_vars): + """Return the correct Open vSwitch version for the current OpenShift version""" + openshift_version = self._get_openshift_version(task_vars) + + if float(openshift_version) < 3.5: + return self.openshift_to_ovs_version["3.4"] + + ovs_version = self.openshift_to_ovs_version.get(str(openshift_version)) + if ovs_version: + return self.openshift_to_ovs_version[str(openshift_version)] + + msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}" + raise OpenShiftCheckException(msg.format(openshift_version)) + + def _get_openshift_version(self, task_vars): + openshift_version = get_var(task_vars, "openshift_image_tag") + if openshift_version and openshift_version[0] == 'v': + openshift_version = openshift_version[1:] + + return self._parse_version(openshift_version) + + def _parse_version(self, version): + components = version.split(".") + if not components or len(components) < 2: + msg = "An invalid version of OpenShift was found for this host: {}" + raise OpenShiftCheckException(msg.format(version)) + + if components[0] in self.openshift_major_release_version: + components[0] = self.openshift_major_release_version[components[0]] + + return '.'.join(components[:2]) diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py index 682f6bd40..2e737818b 100644 --- a/roles/openshift_health_checker/openshift_checks/package_version.py +++ b/roles/openshift_health_checker/openshift_checks/package_version.py @@ -1,5 +1,5 @@ # pylint: disable=missing-docstring -from openshift_checks import OpenShiftCheck, get_var +from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var from openshift_checks.mixins import NotContainerizedMixin @@ -9,6 +9,25 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck): name = "package_version" tags = ["preflight"] + openshift_to_ovs_version = { + "3.6": "2.6", + "3.5": "2.6", + "3.4": "2.4", + } + + openshift_to_docker_version = { + "3.1": "1.8", + "3.2": "1.10", + "3.3": "1.10", + "3.4": "1.12", + } + + # map major release versions across releases + # to a common major version + openshift_major_release_version = { + "1": "3", + } + @classmethod def is_active(cls, task_vars): """Skip hosts that do not have package requirements.""" @@ -17,9 +36,90 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck): return super(PackageVersion, cls).is_active(task_vars) and master_or_node def run(self, tmp, task_vars): + rpm_prefix = get_var(task_vars, "openshift", "common", "service_type") + openshift_release = get_var(task_vars, "openshift_release", default='') + deployment_type = get_var(task_vars, "openshift_deployment_type") + check_multi_minor_release = deployment_type in ['openshift-enterprise'] + args = { - "requested_openshift_release": get_var(task_vars, "openshift_release", default=''), - "openshift_deployment_type": get_var(task_vars, "openshift_deployment_type"), - "rpm_prefix": get_var(task_vars, "openshift", "common", "service_type"), + "package_list": [ + { + "name": "openvswitch", + "version": self.get_required_ovs_version(task_vars), + "check_multi": False, + }, + { + "name": "docker", + "version": self.get_required_docker_version(task_vars), + "check_multi": False, + }, + { + "name": "{}".format(rpm_prefix), + "version": openshift_release, + "check_multi": check_multi_minor_release, + }, + { + "name": "{}-master".format(rpm_prefix), + "version": openshift_release, + "check_multi": check_multi_minor_release, + }, + { + "name": "{}-node".format(rpm_prefix), + "version": openshift_release, + "check_multi": check_multi_minor_release, + }, + ], } + return self.execute_module("aos_version", args, tmp, task_vars) + + def get_required_ovs_version(self, task_vars): + """Return the correct Open vSwitch version for the current OpenShift version. + If the current OpenShift version is >= 3.5, ensure Open vSwitch version 2.6, + Else ensure Open vSwitch version 2.4""" + openshift_version = self.get_openshift_version(task_vars) + + if float(openshift_version) < 3.5: + return self.openshift_to_ovs_version["3.4"] + + ovs_version = self.openshift_to_ovs_version.get(str(openshift_version)) + if ovs_version: + return ovs_version + + msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}" + raise OpenShiftCheckException(msg.format(openshift_version)) + + def get_required_docker_version(self, task_vars): + """Return the correct Docker version for the current OpenShift version. + If the OpenShift version is 3.1, ensure Docker version 1.8. + If the OpenShift version is 3.2 or 3.3, ensure Docker version 1.10. + If the current OpenShift version is >= 3.4, ensure Docker version 1.12.""" + openshift_version = self.get_openshift_version(task_vars) + + if float(openshift_version) >= 3.4: + return self.openshift_to_docker_version["3.4"] + + docker_version = self.openshift_to_docker_version.get(str(openshift_version)) + if docker_version: + return docker_version + + msg = "There is no recommended version of Docker for the current version of OpenShift: {}" + raise OpenShiftCheckException(msg.format(openshift_version)) + + def get_openshift_version(self, task_vars): + openshift_version = get_var(task_vars, "openshift_image_tag") + if openshift_version and openshift_version[0] == 'v': + openshift_version = openshift_version[1:] + + return self.parse_version(openshift_version) + + def parse_version(self, version): + components = version.split(".") + if not components or len(components) < 2: + msg = "An invalid version of OpenShift was found for this host: {}" + raise OpenShiftCheckException(msg.format(version)) + + if components[0] in self.openshift_major_release_version: + components[0] = self.openshift_major_release_version[components[0]] + + return '.'.join(components[:2]) |