diff options
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
4 files changed, 152 insertions, 90 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py index c2792a0fe..962148cb8 100644 --- a/roles/openshift_health_checker/openshift_checks/disk_availability.py +++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py @@ -27,10 +27,12 @@ class DiskAvailability(NotContainerizedMixin, OpenShiftCheck): def run(self, tmp, task_vars): group_names = get_var(task_vars, "group_names") ansible_mounts = get_var(task_vars, "ansible_mounts") - - min_free_bytes = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names) free_bytes = self.openshift_available_disk(ansible_mounts) + recommended_min = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names) + configured_min = int(get_var(task_vars, "openshift_check_min_host_disk_gb", default=0)) * 10**9 + min_free_bytes = configured_min or recommended_min + if free_bytes < min_free_bytes: return { 'failed': True, diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py index cce289b95..4588ed634 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py +++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py @@ -13,41 +13,55 @@ class DockerImageAvailability(OpenShiftCheck): name = "docker_image_availability" tags = ["preflight"] - skopeo_image = "openshift/openshift-ansible" + dependencies = ["skopeo", "python-docker-py"] - # FIXME(juanvallejo): we should consider other possible values of - # `deployment_type` (the key here). See - # https://github.com/openshift/openshift-ansible/blob/8e26f8c/roles/openshift_repos/vars/main.yml#L7 - docker_image_base = { + deployment_image_info = { "origin": { - "repo": "openshift", - "image": "origin", + "namespace": "openshift", + "name": "origin", }, "openshift-enterprise": { - "repo": "openshift3", - "image": "ose", + "namespace": "openshift3", + "name": "ose", }, } - def run(self, tmp, task_vars): - required_images = self.required_images(task_vars) - missing_images = set(required_images) - set(self.local_images(required_images, task_vars)) + @classmethod + def is_active(cls, task_vars): + """Skip hosts with unsupported deployment types.""" + deployment_type = get_var(task_vars, "openshift_deployment_type") + has_valid_deployment_type = deployment_type in cls.deployment_image_info - # exit early if all images were found locally - if not missing_images: - return {"changed": False} + return super(DockerImageAvailability, cls).is_active(task_vars) and has_valid_deployment_type - msg, failed, changed = self.update_skopeo_image(task_vars) + def run(self, tmp, task_vars): + msg, failed, changed = self.ensure_dependencies(task_vars) # exit early if Skopeo update fails if failed: + if "No package matching" in msg: + msg = "Ensure that all required dependencies can be installed via `yum`.\n" return { "failed": True, "changed": changed, - "msg": "Failed to update Skopeo image ({img_name}). {msg}".format(img_name=self.skopeo_image, msg=msg), + "msg": ( + "Unable to update or install required dependency packages on this host;\n" + "These are required in order to check Docker image availability:" + "\n {deps}\n{msg}" + ).format(deps=',\n '.join(self.dependencies), msg=msg), } + required_images = self.required_images(task_vars) + missing_images = set(required_images) - set(self.local_images(required_images, task_vars)) + + # exit early if all images were found locally + if not missing_images: + return {"changed": changed} + registries = self.known_docker_registries(task_vars) + if not registries: + return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed} + available_images = self.available_images(missing_images, registries, task_vars) unavailable_images = set(missing_images) - set(available_images) @@ -55,44 +69,60 @@ class DockerImageAvailability(OpenShiftCheck): return { "failed": True, "msg": ( - "One or more required images are not available: {}.\n" + "One or more required Docker images are not available:\n {}\n" "Configured registries: {}" - ).format(", ".join(sorted(unavailable_images)), ", ".join(registries)), + ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries)), "changed": changed, } return {"changed": changed} def required_images(self, task_vars): - deployment_type = get_var(task_vars, "deployment_type") - # FIXME(juanvallejo): we should handle gracefully with a proper error - # message when given an unexpected value for `deployment_type`. - image_base_name = self.docker_image_base[deployment_type] - - openshift_release = get_var(task_vars, "openshift_release") - # FIXME(juanvallejo): this variable is not required when the - # installation is non-containerized. The example inventories have it - # commented out. We should handle gracefully and with a proper error - # message when this variable is required and not set. - openshift_image_tag = get_var(task_vars, "openshift_image_tag") + deployment_type = get_var(task_vars, "openshift_deployment_type") + image_info = self.deployment_image_info[deployment_type] + openshift_release = get_var(task_vars, "openshift_release", default="latest") + openshift_image_tag = get_var(task_vars, "openshift_image_tag") is_containerized = get_var(task_vars, "openshift", "common", "is_containerized") - if is_containerized: - images = set(self.containerized_docker_images(image_base_name, openshift_release)) - else: - images = set(self.rpm_docker_images(image_base_name, openshift_release)) + images = set(self.required_docker_images( + image_info["namespace"], + image_info["name"], + ["registry-console"] if "enterprise" in deployment_type else [], # include enterprise-only image names + openshift_release, + is_containerized, + )) # append images with qualified image tags to our list of required images. # these are images with a (v0.0.0.0) tag, rather than a standard release # format tag (v0.0). We want to check this set in both containerized and # non-containerized installations. images.update( - self.qualified_docker_images(self.image_from_base_name(image_base_name), "v" + openshift_image_tag) + self.required_qualified_docker_images( + image_info["namespace"], + image_info["name"], + openshift_image_tag, + ), ) return images + @staticmethod + def required_docker_images(namespace, name, additional_image_names, version, is_containerized): + if is_containerized: + return ["{}/{}:{}".format(namespace, name, version)] if name else [] + + # include additional non-containerized images specific to the current deployment type + return ["{}/{}:{}".format(namespace, img_name, version) for img_name in additional_image_names] + + @staticmethod + def required_qualified_docker_images(namespace, name, version): + # pylint: disable=invalid-name + return [ + "{}/{}-{}:{}".format(namespace, name, suffix, version) + for suffix in ["haproxy-router", "docker-registry", "deployer", "pod"] + ] + def local_images(self, images, task_vars): """Filter a list of images and return those available locally.""" return [ @@ -107,31 +137,26 @@ class DockerImageAvailability(OpenShiftCheck): return bool(result.get("images", [])) - def known_docker_registries(self, task_vars): - result = self.module_executor("docker_info", {}, task_vars) + @staticmethod + def known_docker_registries(task_vars): + docker_facts = get_var(task_vars, "openshift", "docker") + regs = set(docker_facts["additional_registries"]) - if result.get("failed", False): - return [] + deployment_type = get_var(task_vars, "openshift_deployment_type") + if deployment_type == "origin": + regs.update(["docker.io"]) + elif "enterprise" in deployment_type: + regs.update(["registry.access.redhat.com"]) - # FIXME(juanvallejo): wrong default type, result["info"] is expected to - # contain a dictionary (see how we call `docker_info.get` below). - docker_info = result.get("info", "") - return [registry.get("Name", "") for registry in docker_info.get("Registries", {})] + return list(regs) def available_images(self, images, registries, task_vars): """Inspect existing images using Skopeo and return all images successfully inspected.""" return [ image for image in images - if self.is_image_available(image, registries, task_vars) + if any(self.is_available_skopeo_image(image, registry, task_vars) for registry in registries) ] - def is_image_available(self, image, registries, task_vars): - for registry in registries: - if self.is_available_skopeo_image(image, registry, task_vars): - return True - - return False - def is_available_skopeo_image(self, image, registry, task_vars): """Uses Skopeo to determine if required image exists in a given registry.""" @@ -140,40 +165,15 @@ class DockerImageAvailability(OpenShiftCheck): image=image, ) - args = { - "name": "skopeo_inspect", - "image": self.skopeo_image, - "command": cmd_str, - "detach": False, - "cleanup": True, - } - result = self.module_executor("docker_container", args, task_vars) - return result.get("failed", False) - - def containerized_docker_images(self, base_name, version): - return [ - "{image}:{version}".format(image=self.image_from_base_name(base_name), version=version) - ] + args = {"_raw_params": cmd_str} + result = self.module_executor("command", args, task_vars) + return not result.get("failed", False) and result.get("rc", 0) == 0 - @staticmethod - def rpm_docker_images(base, version): - return [ - "{image_repo}/registry-console:{version}".format(image_repo=base["repo"], version=version) - ] + # ensures that the skopeo and python-docker-py packages exist + # check is skipped on atomic installations + def ensure_dependencies(self, task_vars): + if get_var(task_vars, "openshift", "common", "is_atomic"): + return "", False, False - @staticmethod - def qualified_docker_images(image_name, version): - return [ - "{}-{}:{}".format(image_name, component, version) - for component in "haproxy-router docker-registry deployer pod".split() - ] - - @staticmethod - def image_from_base_name(base): - return "".join([base["repo"], "/", base["image"]]) - - # ensures that the skopeo docker image exists, and updates it - # with latest if image was already present locally. - def update_skopeo_image(self, task_vars): - result = self.module_executor("docker_image", {"name": self.skopeo_image}, task_vars) - return result.get("msg", ""), result.get("failed", False), result.get("changed", False) + result = self.module_executor("yum", {"name": self.dependencies, "state": "latest"}, task_vars) + return result.get("msg", ""), result.get("failed", False) or result.get("rc", 0) != 0, result.get("changed") diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py new file mode 100644 index 000000000..7452c9cc1 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -0,0 +1,58 @@ +"""A health check for OpenShift clusters.""" + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var + + +class EtcdVolume(OpenShiftCheck): + """Ensures etcd storage usage does not exceed a given threshold.""" + + name = "etcd_volume" + tags = ["etcd", "health"] + + # Default device usage threshold. Value should be in the range [0, 100]. + default_threshold_percent = 90 + # Where to find ectd data, higher priority first. + supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"] + + @classmethod + def is_active(cls, task_vars): + etcd_hosts = get_var(task_vars, "groups", "etcd", default=[]) or get_var(task_vars, "groups", "masters", + default=[]) or [] + is_etcd_host = get_var(task_vars, "ansible_ssh_host") in etcd_hosts + return super(EtcdVolume, cls).is_active(task_vars) and is_etcd_host + + def run(self, tmp, task_vars): + mount_info = self._etcd_mount_info(task_vars) + available = mount_info["size_available"] + total = mount_info["size_total"] + used = total - available + + threshold = get_var( + task_vars, + "etcd_device_usage_threshold_percent", + default=self.default_threshold_percent + ) + + used_percent = 100.0 * used / total + + if used_percent > threshold: + device = mount_info.get("device", "unknown") + mount = mount_info.get("mount", "unknown") + msg = "etcd storage usage ({:.1f}%) is above threshold ({:.1f}%). Device: {}, mount: {}.".format( + used_percent, threshold, device, mount + ) + return {"failed": True, "msg": msg} + + return {"changed": False} + + def _etcd_mount_info(self, task_vars): + ansible_mounts = get_var(task_vars, "ansible_mounts") + mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts} + + for path in self.supported_mount_paths: + if path in mounts: + return mounts[path] + + paths = ', '.join(sorted(mounts)) or 'none' + msg = "Unable to find etcd storage mount point. Paths mounted: {}.".format(paths) + raise OpenShiftCheckException(msg) diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py index 28805dc37..8b1a58ef4 100644 --- a/roles/openshift_health_checker/openshift_checks/memory_availability.py +++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py @@ -13,7 +13,7 @@ class MemoryAvailability(OpenShiftCheck): recommended_memory_bytes = { "masters": 16 * 10**9, "nodes": 8 * 10**9, - "etcd": 20 * 10**9, + "etcd": 8 * 10**9, } @classmethod @@ -27,7 +27,9 @@ class MemoryAvailability(OpenShiftCheck): group_names = get_var(task_vars, "group_names") total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * 10**6 - min_memory_bytes = max(self.recommended_memory_bytes.get(name, 0) for name in group_names) + recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names) + configured_min = int(get_var(task_vars, "openshift_check_min_host_memory_gb", default=0)) * 10**9 + min_memory_bytes = configured_min or recommended_min if total_memory_bytes < min_memory_bytes: return { |