diff options
Diffstat (limited to 'roles/openshift_health_checker/openshift_checks/docker_storage.py')
-rw-r--r-- | roles/openshift_health_checker/openshift_checks/docker_storage.py | 185 |
1 files changed, 185 insertions, 0 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py new file mode 100644 index 000000000..7f1751b36 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py @@ -0,0 +1,185 @@ +"""Check Docker storage driver and usage.""" +import json +import re +from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var +from openshift_checks.mixins import DockerHostMixin + + +class DockerStorage(DockerHostMixin, OpenShiftCheck): + """Check Docker storage driver compatibility. + + This check ensures that Docker is using a supported storage driver, + and that loopback is not being used (if using devicemapper). + Also that storage usage is not above threshold. + """ + + name = "docker_storage" + tags = ["pre-install", "health", "preflight"] + + dependencies = ["python-docker-py"] + storage_drivers = ["devicemapper", "overlay2"] + max_thinpool_data_usage_percent = 90.0 + max_thinpool_meta_usage_percent = 90.0 + + # pylint: disable=too-many-return-statements + # Reason: permanent stylistic exception; + # it is clearer to return on failures and there are just many ways to fail here. + def run(self, tmp, task_vars): + msg, failed, changed = self.ensure_dependencies(task_vars) + if failed: + return { + "failed": True, + "changed": changed, + "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg + } + + # attempt to get the docker info hash from the API + info = self.execute_module("docker_info", {}, task_vars) + if info.get("failed"): + return {"failed": True, "changed": changed, + "msg": "Failed to query Docker API. Is docker running on this host?"} + if not info.get("info"): # this would be very strange + return {"failed": True, "changed": changed, + "msg": "Docker API query missing info:\n{}".format(json.dumps(info))} + info = info["info"] + + # check if the storage driver we saw is valid + driver = info.get("Driver", "[NONE]") + if driver not in self.storage_drivers: + msg = ( + "Detected unsupported Docker storage driver '{driver}'.\n" + "Supported storage drivers are: {drivers}" + ).format(driver=driver, drivers=', '.join(self.storage_drivers)) + return {"failed": True, "changed": changed, "msg": msg} + + # driver status info is a list of tuples; convert to dict and validate based on driver + driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])} + if driver == "devicemapper": + if driver_status.get("Data loop file"): + msg = ( + "Use of loopback devices with the Docker devicemapper storage driver\n" + "(the default storage configuration) is unsupported in production.\n" + "Please use docker-storage-setup to configure a backing storage volume.\n" + "See http://red.ht/2rNperO for further information." + ) + return {"failed": True, "changed": changed, "msg": msg} + result = self._check_dm_usage(driver_status, task_vars) + result['changed'] = result.get('changed', False) or changed + return result + + # TODO(lmeyer): determine how to check usage for overlay2 + + return {"changed": changed} + + def _check_dm_usage(self, driver_status, task_vars): + """ + Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool + implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures + devicemapper storage. The LV is "thin" because it does not use all available storage + from its VG, instead expanding as needed; so to determine available space, we gather + current usage as the Docker API reports for the driver as well as space available for + expansion in the pool's VG. + Usage within the LV is divided into pools allocated to data and metadata, either of which + could run out of space first; so we check both. + """ + vals = dict( + vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars), + data_used=driver_status.get("Data Space Used"), + data_total=driver_status.get("Data Space Total"), + metadata_used=driver_status.get("Metadata Space Used"), + metadata_total=driver_status.get("Metadata Space Total"), + ) + + # convert all human-readable strings to bytes + for key, value in vals.copy().items(): + try: + vals[key + "_bytes"] = self._convert_to_bytes(value) + except ValueError as err: # unlikely to hit this from API info, but just to be safe + return { + "failed": True, + "values": vals, + "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err)) + } + + # determine the threshold percentages which usage should not exceed + for name, default in [("data", self.max_thinpool_data_usage_percent), + ("metadata", self.max_thinpool_meta_usage_percent)]: + percent = get_var(task_vars, "max_thinpool_" + name + "_usage_percent", default=default) + try: + vals[name + "_threshold"] = float(percent) + except ValueError: + return { + "failed": True, + "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent) + } + + # test whether the thresholds are exceeded + messages = [] + for name in ["data", "metadata"]: + vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / ( + vals[name + "_total_bytes"] + vals["vg_free_bytes"]) + if vals[name + "_pct_used"] > vals[name + "_threshold"]: + messages.append( + "Docker thinpool {name} usage percentage {pct:.1f} " + "is higher than threshold {thresh:.1f}.".format( + name=name, + pct=vals[name + "_pct_used"], + thresh=vals[name + "_threshold"], + )) + vals["failed"] = True + + vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."]) + return vals + + def _get_vg_free(self, pool, task_vars): + # Determine which VG to examine according to the pool name, the only indicator currently + # available from the Docker API driver info. We assume a name that looks like + # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen. + match = re.match(r'((?:[^-]|--)+)-(?!-)', pool) # matches up to the first single hyphen + if not match: # unlikely, but... be clear if we assumed wrong + raise OpenShiftCheckException( + "This host's Docker reports it is using a storage pool named '{}'.\n" + "However this name does not have the expected format of 'vgname-lvname'\n" + "so the available storage in the VG cannot be determined.".format(pool) + ) + vg_name = match.groups()[0].replace("--", "-") + vgs_cmd = "/sbin/vgs --noheadings -o vg_free --select vg_name=" + vg_name + # should return free space like " 12.00g" if the VG exists; empty if it does not + + ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars) + if ret.get("failed") or ret.get("rc", 0) != 0: + raise OpenShiftCheckException( + "Is LVM installed? Failed to run /sbin/vgs " + "to determine docker storage usage:\n" + ret.get("msg", "") + ) + size = ret.get("stdout", "").strip() + if not size: + raise OpenShiftCheckException( + "This host's Docker reports it is using a storage pool named '{pool}'.\n" + "which we expect to come from local VG '{vg}'.\n" + "However, /sbin/vgs did not find this VG. Is Docker for this host" + "running and using the storage on the host?".format(pool=pool, vg=vg_name) + ) + return size + + @staticmethod + def _convert_to_bytes(string): + units = dict( + b=1, + k=1024, + m=1024**2, + g=1024**3, + t=1024**4, + p=1024**5, + ) + string = string or "" + match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string) # float followed by optional unit + if not match: + raise ValueError("Cannot convert to a byte size: " + string) + + number, unit = match.groups() + multiplier = 1 if not unit else units.get(unit.lower()) + if not multiplier: + raise ValueError("Cannot convert to a byte size: " + string) + + return float(number) * multiplier |