diff options
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks/docker_storage.py')
-rw-r--r-- | roles/openshift_health_checker/openshift_checks/docker_storage.py | 191 |
1 files changed, 152 insertions, 39 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py index 2bd615457..dea15a56e 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_storage.py +++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py @@ -1,7 +1,8 @@ """Check Docker storage driver and usage.""" import json +import os.path import re -from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var +from openshift_checks import OpenShiftCheck, OpenShiftCheckException from openshift_checks.mixins import DockerHostMixin @@ -17,15 +18,32 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): tags = ["pre-install", "health", "preflight"] dependencies = ["python-docker-py"] - storage_drivers = ["devicemapper", "overlay2"] + storage_drivers = ["devicemapper", "overlay", "overlay2"] max_thinpool_data_usage_percent = 90.0 max_thinpool_meta_usage_percent = 90.0 + max_overlay_usage_percent = 90.0 - # pylint: disable=too-many-return-statements - # Reason: permanent stylistic exception; - # it is clearer to return on failures and there are just many ways to fail here. - def run(self, tmp, task_vars): - msg, failed, changed = self.ensure_dependencies(task_vars) + # TODO(lmeyer): mention these in the output when check fails + configuration_variables = [ + ( + "max_thinpool_data_usage_percent", + "For 'devicemapper' storage driver, usage threshold percentage for data. " + "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent), + ), + ( + "max_thinpool_meta_usage_percent", + "For 'devicemapper' storage driver, usage threshold percentage for metadata. " + "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent), + ), + ( + "max_overlay_usage_percent", + "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. " + "Format: float. Default: {:.1f}".format(max_overlay_usage_percent), + ), + ] + + def run(self): + msg, failed, changed = self.ensure_dependencies() if failed: return { "failed": True, @@ -34,17 +52,17 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): } # attempt to get the docker info hash from the API - info = self.execute_module("docker_info", {}, task_vars=task_vars) - if info.get("failed"): + docker_info = self.execute_module("docker_info", {}) + if docker_info.get("failed"): return {"failed": True, "changed": changed, "msg": "Failed to query Docker API. Is docker running on this host?"} - if not info.get("info"): # this would be very strange + if not docker_info.get("info"): # this would be very strange return {"failed": True, "changed": changed, - "msg": "Docker API query missing info:\n{}".format(json.dumps(info))} - info = info["info"] + "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))} + docker_info = docker_info["info"] # check if the storage driver we saw is valid - driver = info.get("Driver", "[NONE]") + driver = docker_info.get("Driver", "[NONE]") if driver not in self.storage_drivers: msg = ( "Detected unsupported Docker storage driver '{driver}'.\n" @@ -53,26 +71,34 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): return {"failed": True, "changed": changed, "msg": msg} # driver status info is a list of tuples; convert to dict and validate based on driver - driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])} + driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])} + + result = {} + if driver == "devicemapper": - if driver_status.get("Data loop file"): - msg = ( - "Use of loopback devices with the Docker devicemapper storage driver\n" - "(the default storage configuration) is unsupported in production.\n" - "Please use docker-storage-setup to configure a backing storage volume.\n" - "See http://red.ht/2rNperO for further information." - ) - return {"failed": True, "changed": changed, "msg": msg} - result = self._check_dm_usage(driver_status, task_vars) - result['changed'] = result.get('changed', False) or changed - return result + result = self.check_devicemapper_support(driver_status) - # TODO(lmeyer): determine how to check usage for overlay2 + if driver in ['overlay', 'overlay2']: + result = self.check_overlay_support(docker_info, driver_status) - return {"changed": changed} + result['changed'] = result.get('changed', False) or changed + return result - def _check_dm_usage(self, driver_status, task_vars): - """ + def check_devicemapper_support(self, driver_status): + """Check if dm storage driver is supported as configured. Return: result dict.""" + if driver_status.get("Data loop file"): + msg = ( + "Use of loopback devices with the Docker devicemapper storage driver\n" + "(the default storage configuration) is unsupported in production.\n" + "Please use docker-storage-setup to configure a backing storage volume.\n" + "See http://red.ht/2rNperO for further information." + ) + return {"failed": True, "msg": msg} + result = self.check_dm_usage(driver_status) + return result + + def check_dm_usage(self, driver_status): + """Check usage thresholds for Docker dm storage driver. Return: result dict. Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures devicemapper storage. The LV is "thin" because it does not use all available storage @@ -83,7 +109,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): could run out of space first; so we check both. """ vals = dict( - vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars), + vg_free=self.get_vg_free(driver_status.get("Pool Name")), data_used=driver_status.get("Data Space Used"), data_total=driver_status.get("Data Space Total"), metadata_used=driver_status.get("Metadata Space Used"), @@ -93,7 +119,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): # convert all human-readable strings to bytes for key, value in vals.copy().items(): try: - vals[key + "_bytes"] = self._convert_to_bytes(value) + vals[key + "_bytes"] = self.convert_to_bytes(value) except ValueError as err: # unlikely to hit this from API info, but just to be safe return { "failed": True, @@ -104,7 +130,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): # determine the threshold percentages which usage should not exceed for name, default in [("data", self.max_thinpool_data_usage_percent), ("metadata", self.max_thinpool_meta_usage_percent)]: - percent = get_var(task_vars, "max_thinpool_" + name + "_usage_percent", default=default) + percent = self.get_var("max_thinpool_" + name + "_usage_percent", default=default) try: vals[name + "_threshold"] = float(percent) except ValueError: @@ -131,10 +157,12 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."]) return vals - def _get_vg_free(self, pool, task_vars): - # Determine which VG to examine according to the pool name, the only indicator currently - # available from the Docker API driver info. We assume a name that looks like - # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen. + def get_vg_free(self, pool): + """Determine which VG to examine according to the pool name. Return: size vgs reports. + Pool name is the only indicator currently available from the Docker API driver info. + We assume a name that looks like "vg--name-docker--pool"; + vg and lv names with inner hyphens doubled, joined by a hyphen. + """ match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen if not match: # unlikely, but... be clear if we assumed wrong raise OpenShiftCheckException( @@ -143,10 +171,10 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): "so the available storage in the VG cannot be determined.".format(pool) ) vg_name = match.groups()[0].replace("--", "-") - vgs_cmd = "/sbin/vgs --noheadings -o vg_free --select vg_name=" + vg_name + vgs_cmd = "/sbin/vgs --noheadings -o vg_free --units g --select vg_name=" + vg_name # should return free space like " 12.00g" if the VG exists; empty if it does not - ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars=task_vars) + ret = self.execute_module("command", {"_raw_params": vgs_cmd}) if ret.get("failed") or ret.get("rc", 0) != 0: raise OpenShiftCheckException( "Is LVM installed? Failed to run /sbin/vgs " @@ -163,7 +191,8 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): return size @staticmethod - def _convert_to_bytes(string): + def convert_to_bytes(string): + """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes.""" units = dict( b=1, k=1024, @@ -183,3 +212,87 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck): raise ValueError("Cannot convert to a byte size: " + string) return float(number) * multiplier + + def check_overlay_support(self, docker_info, driver_status): + """Check if overlay storage driver is supported for this host. Return: result dict.""" + # check for xfs as backing store + backing_fs = driver_status.get("Backing Filesystem", "[NONE]") + if backing_fs != "xfs": + msg = ( + "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n" + "'xfs' as the backing storage, but this host's storage is type '{fs}'." + ).format(fs=backing_fs) + return {"failed": True, "msg": msg} + + # check support for OS and kernel version + o_s = docker_info.get("OperatingSystem", "[NONE]") + if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s: + # keep it simple, only check enterprise kernel versions; assume everyone else is good + kernel = docker_info.get("KernelVersion", "[NONE]") + kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)] + if kernel_arr < [3, 10, 0, 514]: # rhel < 7.3 + msg = ( + "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n" + "kernel version 3.10.0-514; but Docker reports kernel version {version}." + ).format(version=kernel) + return {"failed": True, "msg": msg} + # NOTE: we could check for --selinux-enabled here but docker won't even start with + # that option until it's supported in the kernel so we don't need to. + + return self.check_overlay_usage(docker_info) + + def check_overlay_usage(self, docker_info): + """Check disk usage on OverlayFS backing store volume. Return: result dict.""" + path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"] + + threshold = self.get_var("max_overlay_usage_percent", default=self.max_overlay_usage_percent) + try: + threshold = float(threshold) + except ValueError: + return { + "failed": True, + "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold), + } + + mount = self.find_ansible_mount(path, self.get_var("ansible_mounts")) + try: + free_bytes = mount['size_available'] + total_bytes = mount['size_total'] + usage = 100.0 * (total_bytes - free_bytes) / total_bytes + except (KeyError, ZeroDivisionError): + return { + "failed": True, + "msg": "The ansible_mount found for path {} is invalid.\n" + "This is likely to be an Ansible bug. The record was:\n" + "{}".format(path, json.dumps(mount, indent=2)), + } + + if usage > threshold: + return { + "failed": True, + "msg": ( + "For Docker OverlayFS mount point {path},\n" + "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}." + ).format(path=mount["mount"], pct=usage, thresh=threshold) + } + + return {} + + # TODO(lmeyer): migrate to base class + @staticmethod + def find_ansible_mount(path, ansible_mounts): + """Return the mount point for path from ansible_mounts.""" + + mount_for_path = {mount['mount']: mount for mount in ansible_mounts} + mount_point = path + while mount_point not in mount_for_path: + if mount_point in ["/", ""]: # "/" not in ansible_mounts??? + break + mount_point = os.path.dirname(mount_point) + + try: + return mount_for_path[mount_point] + except KeyError: + known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) or 'none' + msg = 'Unable to determine mount point for path "{}". Known mount points: {}.' + raise OpenShiftCheckException(msg.format(path, known_mounts)) |