diff options
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks')
6 files changed, 80 insertions, 34 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py index 02ee1d0f9..987c955b6 100644 --- a/roles/openshift_health_checker/openshift_checks/__init__.py +++ b/roles/openshift_health_checker/openshift_checks/__init__.py @@ -4,6 +4,7 @@ Health checks for OpenShift clusters. import operator import os +import time from abc import ABCMeta, abstractmethod, abstractproperty from importlib import import_module @@ -57,6 +58,9 @@ class OpenShiftCheck(object): self._execute_module = execute_module self.task_vars = task_vars or {} self.tmp = tmp + # mainly for testing purposes; see execute_module_with_retries + self._module_retries = 3 + self._module_retry_interval = 5 # seconds # set to True when the check changes the host, for accurate total "changed" count self.changed = False @@ -115,6 +119,19 @@ class OpenShiftCheck(object): ) return self._execute_module(module_name, module_args, self.tmp, self.task_vars) + def execute_module_with_retries(self, module_name, module_args): + """Run execute_module and retry on failure.""" + result = {} + tries = 0 + while True: + res = self.execute_module(module_name, module_args) + if tries > self._module_retries or not res.get("failed"): + result.update(res) + return result + result["last_failed"] = res + tries += 1 + time.sleep(self._module_retry_interval) + def get_var(self, *keys, **kwargs): """Get deeply nested values from task_vars. diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py index 866c74d7c..9c35f0f92 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py +++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py @@ -32,7 +32,12 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): # we use python-docker-py to check local docker for images, and skopeo # to look for images available remotely without waiting to pull them. dependencies = ["python-docker-py", "skopeo"] - skopeo_img_check_command = "timeout 10 skopeo inspect --tls-verify=false" + skopeo_img_check_command = "timeout 10 skopeo inspect --tls-verify=false docker://{registry}/{image}" + + def __init__(self, *args, **kwargs): + super(DockerImageAvailability, self).__init__(*args, **kwargs) + # record whether we could reach a registry or not (and remember results) + self.reachable_registries = {} def is_active(self): """Skip hosts with unsupported deployment types.""" @@ -64,15 +69,21 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): unavailable_images = set(missing_images) - set(available_images) if unavailable_images: - return { - "failed": True, - "msg": ( - "One or more required Docker images are not available:\n {}\n" - "Configured registries: {}\n" - "Checked by: {}" - ).format(",\n ".join(sorted(unavailable_images)), ", ".join(registries), - self.skopeo_img_check_command), - } + registries = [ + reg if self.reachable_registries.get(reg, True) else reg + " (unreachable)" + for reg in registries + ] + msg = ( + "One or more required Docker images are not available:\n {}\n" + "Configured registries: {}\n" + "Checked by: {}" + ).format( + ",\n ".join(sorted(unavailable_images)), + ", ".join(registries), + self.skopeo_img_check_command + ) + + return dict(failed=True, msg=msg) return {} @@ -128,31 +139,31 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): def local_images(self, images): """Filter a list of images and return those available locally.""" - return [ - image for image in images - if self.is_image_local(image) - ] + registries = self.known_docker_registries() + found_images = [] + for image in images: + # docker could have the image name as-is or prefixed with any registry + imglist = [image] + [reg + "/" + image for reg in registries] + if self.is_image_local(imglist): + found_images.append(image) + return found_images def is_image_local(self, image): """Check if image is already in local docker index.""" result = self.execute_module("docker_image_facts", {"name": image}) - if result.get("failed", False): - return False - - return bool(result.get("images", [])) + return bool(result.get("images")) and not result.get("failed") def known_docker_registries(self): """Build a list of docker registries available according to inventory vars.""" - docker_facts = self.get_var("openshift", "docker") - regs = set(docker_facts["additional_registries"]) + regs = list(self.get_var("openshift.docker.additional_registries", default=[])) deployment_type = self.get_var("openshift_deployment_type") - if deployment_type == "origin": - regs.update(["docker.io"]) - elif "enterprise" in deployment_type: - regs.update(["registry.access.redhat.com"]) + if deployment_type == "origin" and "docker.io" not in regs: + regs.append("docker.io") + elif "enterprise" in deployment_type and "registry.access.redhat.com" not in regs: + regs.append("registry.access.redhat.com") - return list(regs) + return regs def available_images(self, images, default_registries): """Search remotely for images. Returns: list of images found.""" @@ -165,17 +176,35 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck): """Use Skopeo to determine if required image exists in known registry(s).""" registries = default_registries - # if image already includes a registry, only use that + # If image already includes a registry, only use that. + # NOTE: This logic would incorrectly identify images that do not use a namespace, e.g. + # registry.access.redhat.com/rhel7 as if the registry were a namespace. + # It's not clear that there's any way to distinguish them, but fortunately + # the current set of images all look like [registry/]namespace/name[:version]. if image.count("/") > 1: registry, image = image.split("/", 1) registries = [registry] for registry in registries: - args = { - "_raw_params": self.skopeo_img_check_command + " docker://{}/{}".format(registry, image) - } - result = self.execute_module("command", args) + if registry not in self.reachable_registries: + self.reachable_registries[registry] = self.connect_to_registry(registry) + if not self.reachable_registries[registry]: + continue + + args = {"_raw_params": self.skopeo_img_check_command.format(registry=registry, image=image)} + result = self.execute_module_with_retries("command", args) if result.get("rc", 0) == 0 and not result.get("failed"): return True + if result.get("rc") == 124: # RC 124 == timed out; mark unreachable + self.reachable_registries[registry] = False return False + + def connect_to_registry(self, registry): + """Use ansible wait_for module to test connectivity from host to registry. Returns bool.""" + # test a simple TCP connection + host, _, port = registry.partition(":") + port = port or 443 + args = dict(host=host, port=port, state="started", timeout=30) + result = self.execute_module("wait_for", args) + return result.get("rc", 0) == 0 and not result.get("failed") diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py index e9bae60a3..24f1d938a 100644 --- a/roles/openshift_health_checker/openshift_checks/mixins.py +++ b/roles/openshift_health_checker/openshift_checks/mixins.py @@ -36,7 +36,7 @@ class DockerHostMixin(object): # NOTE: we would use the "package" module but it's actually an action plugin # and it's not clear how to invoke one of those. This is about the same anyway: - result = self.execute_module( + result = self.execute_module_with_retries( self.get_var("ansible_pkg_mgr", default="yum"), {"name": self.dependencies, "state": "present"}, ) diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py index a86180b00..21355c2f0 100644 --- a/roles/openshift_health_checker/openshift_checks/package_availability.py +++ b/roles/openshift_health_checker/openshift_checks/package_availability.py @@ -26,7 +26,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck): packages.update(self.node_packages(rpm_prefix)) args = {"packages": sorted(set(packages))} - return self.execute_module("check_yum_update", args) + return self.execute_module_with_retries("check_yum_update", args) @staticmethod def master_packages(rpm_prefix): diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py index 1e9aecbe0..8464e8a5e 100644 --- a/roles/openshift_health_checker/openshift_checks/package_update.py +++ b/roles/openshift_health_checker/openshift_checks/package_update.py @@ -11,4 +11,4 @@ class PackageUpdate(NotContainerizedMixin, OpenShiftCheck): def run(self): args = {"packages": []} - return self.execute_module("check_yum_update", args) + return self.execute_module_with_retries("check_yum_update", args) diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py index 0b795b6c4..d4aec3ed8 100644 --- a/roles/openshift_health_checker/openshift_checks/package_version.py +++ b/roles/openshift_health_checker/openshift_checks/package_version.py @@ -76,7 +76,7 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck): ], } - return self.execute_module("aos_version", args) + return self.execute_module_with_retries("aos_version", args) def get_required_ovs_version(self): """Return the correct Open vSwitch version(s) for the current OpenShift version.""" |