14 files changed, 1293 insertions, 114 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index be63d864a..5c9949ced 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -66,16 +66,26 @@ class OpenShiftCheck(object):
 LOADER_EXCLUDES = (
     "__init__.py",
     "mixins.py",
+    "logging.py",
 )
 
 
-def load_checks():
+def load_checks(path=None, subpkg=""):
     """Dynamically import all check modules for the side effect of registering checks."""
-    return [
-        import_module(__package__ + "." + name[:-3])
-        for name in os.listdir(os.path.dirname(__file__))
-        if name.endswith(".py") and name not in LOADER_EXCLUDES
-    ]
+    if path is None:
+        path = os.path.dirname(__file__)
+
+    modules = []
+
+    for name in os.listdir(path):
+        if os.path.isdir(os.path.join(path, name)):
+            modules = modules + load_checks(os.path.join(path, name), subpkg + "." + name)
+            continue
+
+        if name.endswith(".py") and name not in LOADER_EXCLUDES:
+            modules.append(import_module(__package__ + subpkg + "." + name[:-3]))
+
+    return modules
 
 
 def get_var(task_vars, *keys, **kwargs):
diff --git a/roles/openshift_health_checker/openshift_checks/disk_availability.py b/roles/openshift_health_checker/openshift_checks/disk_availability.py
index c2792a0fe..962148cb8 100644
--- a/roles/openshift_health_checker/openshift_checks/disk_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/disk_availability.py
@@ -27,10 +27,12 @@ class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
     def run(self, tmp, task_vars):
         group_names = get_var(task_vars, "group_names")
         ansible_mounts = get_var(task_vars, "ansible_mounts")
-
-        min_free_bytes = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
         free_bytes = self.openshift_available_disk(ansible_mounts)
 
+        recommended_min = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
+        configured_min = int(get_var(task_vars, "openshift_check_min_host_disk_gb", default=0)) * 10**9
+        min_free_bytes = configured_min or recommended_min
+
         if free_bytes < min_free_bytes:
             return {
                 'failed': True,
diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
index cce289b95..27e6fe383 100644
--- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py
@@ -1,8 +1,9 @@
 # pylint: disable=missing-docstring
 from openshift_checks import OpenShiftCheck, get_var
+from openshift_checks.mixins import DockerHostMixin
 
 
-class DockerImageAvailability(OpenShiftCheck):
+class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
     """Check that required Docker images are available.
 
     This check attempts to ensure that required docker images are
@@ -13,41 +14,47 @@ class DockerImageAvailability(OpenShiftCheck):
     name = "docker_image_availability"
     tags = ["preflight"]
 
-    skopeo_image = "openshift/openshift-ansible"
+    dependencies = ["skopeo", "python-docker-py"]
 
-    # FIXME(juanvallejo): we should consider other possible values of
-    # `deployment_type` (the key here). See
-    # https://github.com/openshift/openshift-ansible/blob/8e26f8c/roles/openshift_repos/vars/main.yml#L7
-    docker_image_base = {
+    deployment_image_info = {
         "origin": {
-            "repo": "openshift",
-            "image": "origin",
+            "namespace": "openshift",
+            "name": "origin",
         },
         "openshift-enterprise": {
-            "repo": "openshift3",
-            "image": "ose",
+            "namespace": "openshift3",
+            "name": "ose",
         },
     }
 
-    def run(self, tmp, task_vars):
-        required_images = self.required_images(task_vars)
-        missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
-
-        # exit early if all images were found locally
-        if not missing_images:
-            return {"changed": False}
+    @classmethod
+    def is_active(cls, task_vars):
+        """Skip hosts with unsupported deployment types."""
+        deployment_type = get_var(task_vars, "openshift_deployment_type")
+        has_valid_deployment_type = deployment_type in cls.deployment_image_info
 
-        msg, failed, changed = self.update_skopeo_image(task_vars)
+        return super(DockerImageAvailability, cls).is_active(task_vars) and has_valid_deployment_type
 
-        # exit early if Skopeo update fails
+    def run(self, tmp, task_vars):
+        msg, failed, changed = self.ensure_dependencies(task_vars)
         if failed:
             return {
                 "failed": True,
                 "changed": changed,
-                "msg": "Failed to update Skopeo image ({img_name}). {msg}".format(img_name=self.skopeo_image, msg=msg),
+                "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
             }
 
+        required_images = self.required_images(task_vars)
+        missing_images = set(required_images) - set(self.local_images(required_images, task_vars))
+
+        # exit early if all images were found locally
+        if not missing_images:
+            return {"changed": changed}
+
         registries = self.known_docker_registries(task_vars)
+        if not registries:
+            return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed}
+
         available_images = self.available_images(missing_images, registries, task_vars)
         unavailable_images = set(missing_images) - set(available_images)
 
@@ -55,44 +62,60 @@ class DockerImageAvailability(OpenShiftCheck):
             return {
                 "failed": True,
                 "msg": (
-                    "One or more required images are not available: {}.\n"
+                    "One or more required Docker images are not available:\n    {}\n"
                     "Configured registries: {}"
-                ).format(", ".join(sorted(unavailable_images)), ", ".join(registries)),
+                ).format(",\n    ".join(sorted(unavailable_images)), ", ".join(registries)),
                 "changed": changed,
             }
 
         return {"changed": changed}
 
     def required_images(self, task_vars):
-        deployment_type = get_var(task_vars, "deployment_type")
-        # FIXME(juanvallejo): we should handle gracefully with a proper error
-        # message when given an unexpected value for `deployment_type`.
-        image_base_name = self.docker_image_base[deployment_type]
-
-        openshift_release = get_var(task_vars, "openshift_release")
-        # FIXME(juanvallejo): this variable is not required when the
-        # installation is non-containerized. The example inventories have it
-        # commented out. We should handle gracefully and with a proper error
-        # message when this variable is required and not set.
-        openshift_image_tag = get_var(task_vars, "openshift_image_tag")
+        deployment_type = get_var(task_vars, "openshift_deployment_type")
+        image_info = self.deployment_image_info[deployment_type]
 
+        openshift_release = get_var(task_vars, "openshift_release", default="latest")
+        openshift_image_tag = get_var(task_vars, "openshift_image_tag")
         is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
 
-        if is_containerized:
-            images = set(self.containerized_docker_images(image_base_name, openshift_release))
-        else:
-            images = set(self.rpm_docker_images(image_base_name, openshift_release))
+        images = set(self.required_docker_images(
+            image_info["namespace"],
+            image_info["name"],
+            ["registry-console"] if "enterprise" in deployment_type else [],  # include enterprise-only image names
+            openshift_release,
+            is_containerized,
+        ))
 
         # append images with qualified image tags to our list of required images.
         # these are images with a (v0.0.0.0) tag, rather than a standard release
         # format tag (v0.0). We want to check this set in both containerized and
         # non-containerized installations.
         images.update(
-            self.qualified_docker_images(self.image_from_base_name(image_base_name), "v" + openshift_image_tag)
+            self.required_qualified_docker_images(
+                image_info["namespace"],
+                image_info["name"],
+                openshift_image_tag,
+            ),
         )
 
         return images
 
+    @staticmethod
+    def required_docker_images(namespace, name, additional_image_names, version, is_containerized):
+        if is_containerized:
+            return ["{}/{}:{}".format(namespace, name, version)] if name else []
+
+        # include additional non-containerized images specific to the current deployment type
+        return ["{}/{}:{}".format(namespace, img_name, version) for img_name in additional_image_names]
+
+    @staticmethod
+    def required_qualified_docker_images(namespace, name, version):
+        # pylint: disable=invalid-name
+        return [
+            "{}/{}-{}:{}".format(namespace, name, suffix, version)
+            for suffix in ["haproxy-router", "docker-registry", "deployer", "pod"]
+        ]
+
     def local_images(self, images, task_vars):
         """Filter a list of images and return those available locally."""
         return [
@@ -107,31 +130,26 @@ class DockerImageAvailability(OpenShiftCheck):
 
         return bool(result.get("images", []))
 
-    def known_docker_registries(self, task_vars):
-        result = self.module_executor("docker_info", {}, task_vars)
+    @staticmethod
+    def known_docker_registries(task_vars):
+        docker_facts = get_var(task_vars, "openshift", "docker")
+        regs = set(docker_facts["additional_registries"])
 
-        if result.get("failed", False):
-            return []
+        deployment_type = get_var(task_vars, "openshift_deployment_type")
+        if deployment_type == "origin":
+            regs.update(["docker.io"])
+        elif "enterprise" in deployment_type:
+            regs.update(["registry.access.redhat.com"])
 
-        # FIXME(juanvallejo): wrong default type, result["info"] is expected to
-        # contain a dictionary (see how we call `docker_info.get` below).
-        docker_info = result.get("info", "")
-        return [registry.get("Name", "") for registry in docker_info.get("Registries", {})]
+        return list(regs)
 
     def available_images(self, images, registries, task_vars):
         """Inspect existing images using Skopeo and return all images successfully inspected."""
         return [
             image for image in images
-            if self.is_image_available(image, registries, task_vars)
+            if any(self.is_available_skopeo_image(image, registry, task_vars) for registry in registries)
         ]
 
-    def is_image_available(self, image, registries, task_vars):
-        for registry in registries:
-            if self.is_available_skopeo_image(image, registry, task_vars):
-                return True
-
-        return False
-
     def is_available_skopeo_image(self, image, registry, task_vars):
         """Uses Skopeo to determine if required image exists in a given registry."""
 
@@ -140,40 +158,6 @@ class DockerImageAvailability(OpenShiftCheck):
             image=image,
         )
 
-        args = {
-            "name": "skopeo_inspect",
-            "image": self.skopeo_image,
-            "command": cmd_str,
-            "detach": False,
-            "cleanup": True,
-        }
-        result = self.module_executor("docker_container", args, task_vars)
-        return result.get("failed", False)
-
-    def containerized_docker_images(self, base_name, version):
-        return [
-            "{image}:{version}".format(image=self.image_from_base_name(base_name), version=version)
-        ]
-
-    @staticmethod
-    def rpm_docker_images(base, version):
-        return [
-            "{image_repo}/registry-console:{version}".format(image_repo=base["repo"], version=version)
-        ]
-
-    @staticmethod
-    def qualified_docker_images(image_name, version):
-        return [
-            "{}-{}:{}".format(image_name, component, version)
-            for component in "haproxy-router docker-registry deployer pod".split()
-        ]
-
-    @staticmethod
-    def image_from_base_name(base):
-        return "".join([base["repo"], "/", base["image"]])
-
-    # ensures that the skopeo docker image exists, and updates it
-    # with latest if image was already present locally.
-    def update_skopeo_image(self, task_vars):
-        result = self.module_executor("docker_image", {"name": self.skopeo_image}, task_vars)
-        return result.get("msg", ""), result.get("failed", False), result.get("changed", False)
+        args = {"_raw_params": cmd_str}
+        result = self.module_executor("command", args, task_vars)
+        return not result.get("failed", False) and result.get("rc", 0) == 0
diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py
new file mode 100644
index 000000000..7f1751b36
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py
@@ -0,0 +1,185 @@
+"""Check Docker storage driver and usage."""
+import json
+import re
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks.mixins import DockerHostMixin
+
+
+class DockerStorage(DockerHostMixin, OpenShiftCheck):
+    """Check Docker storage driver compatibility.
+
+    This check ensures that Docker is using a supported storage driver,
+    and that loopback is not being used (if using devicemapper).
+    Also that storage usage is not above threshold.
+    """
+
+    name = "docker_storage"
+    tags = ["pre-install", "health", "preflight"]
+
+    dependencies = ["python-docker-py"]
+    storage_drivers = ["devicemapper", "overlay2"]
+    max_thinpool_data_usage_percent = 90.0
+    max_thinpool_meta_usage_percent = 90.0
+
+    # pylint: disable=too-many-return-statements
+    # Reason: permanent stylistic exception;
+    #         it is clearer to return on failures and there are just many ways to fail here.
+    def run(self, tmp, task_vars):
+        msg, failed, changed = self.ensure_dependencies(task_vars)
+        if failed:
+            return {
+                "failed": True,
+                "changed": changed,
+                "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg
+            }
+
+        # attempt to get the docker info hash from the API
+        info = self.execute_module("docker_info", {}, task_vars)
+        if info.get("failed"):
+            return {"failed": True, "changed": changed,
+                    "msg": "Failed to query Docker API. Is docker running on this host?"}
+        if not info.get("info"):  # this would be very strange
+            return {"failed": True, "changed": changed,
+                    "msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
+        info = info["info"]
+
+        # check if the storage driver we saw is valid
+        driver = info.get("Driver", "[NONE]")
+        if driver not in self.storage_drivers:
+            msg = (
+                "Detected unsupported Docker storage driver '{driver}'.\n"
+                "Supported storage drivers are: {drivers}"
+            ).format(driver=driver, drivers=', '.join(self.storage_drivers))
+            return {"failed": True, "changed": changed, "msg": msg}
+
+        # driver status info is a list of tuples; convert to dict and validate based on driver
+        driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])}
+        if driver == "devicemapper":
+            if driver_status.get("Data loop file"):
+                msg = (
+                    "Use of loopback devices with the Docker devicemapper storage driver\n"
+                    "(the default storage configuration) is unsupported in production.\n"
+                    "Please use docker-storage-setup to configure a backing storage volume.\n"
+                    "See http://red.ht/2rNperO for further information."
+                )
+                return {"failed": True, "changed": changed, "msg": msg}
+            result = self._check_dm_usage(driver_status, task_vars)
+            result['changed'] = result.get('changed', False) or changed
+            return result
+
+        # TODO(lmeyer): determine how to check usage for overlay2
+
+        return {"changed": changed}
+
+    def _check_dm_usage(self, driver_status, task_vars):
+        """
+        Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
+        implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
+        devicemapper storage. The LV is "thin" because it does not use all available storage
+        from its VG, instead expanding as needed; so to determine available space, we gather
+        current usage as the Docker API reports for the driver as well as space available for
+        expansion in the pool's VG.
+        Usage within the LV is divided into pools allocated to data and metadata, either of which
+        could run out of space first; so we check both.
+        """
+        vals = dict(
+            vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
+            data_used=driver_status.get("Data Space Used"),
+            data_total=driver_status.get("Data Space Total"),
+            metadata_used=driver_status.get("Metadata Space Used"),
+            metadata_total=driver_status.get("Metadata Space Total"),
+        )
+
+        # convert all human-readable strings to bytes
+        for key, value in vals.copy().items():
+            try:
+                vals[key + "_bytes"] = self._convert_to_bytes(value)
+            except ValueError as err:  # unlikely to hit this from API info, but just to be safe
+                return {
+                    "failed": True,
+                    "values": vals,
+                    "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err))
+                }
+
+        # determine the threshold percentages which usage should not exceed
+        for name, default in [("data", self.max_thinpool_data_usage_percent),
+                              ("metadata", self.max_thinpool_meta_usage_percent)]:
+            percent = get_var(task_vars, "max_thinpool_" + name + "_usage_percent", default=default)
+            try:
+                vals[name + "_threshold"] = float(percent)
+            except ValueError:
+                return {
+                    "failed": True,
+                    "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent)
+                }
+
+        # test whether the thresholds are exceeded
+        messages = []
+        for name in ["data", "metadata"]:
+            vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / (
+                vals[name + "_total_bytes"] + vals["vg_free_bytes"])
+            if vals[name + "_pct_used"] > vals[name + "_threshold"]:
+                messages.append(
+                    "Docker thinpool {name} usage percentage {pct:.1f} "
+                    "is higher than threshold {thresh:.1f}.".format(
+                        name=name,
+                        pct=vals[name + "_pct_used"],
+                        thresh=vals[name + "_threshold"],
+                    ))
+                vals["failed"] = True
+
+        vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
+        return vals
+
+    def _get_vg_free(self, pool, task_vars):
+        # Determine which VG to examine according to the pool name, the only indicator currently
+        # available from the Docker API driver info. We assume a name that looks like
+        # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
+        match = re.match(r'((?:[^-]|--)+)-(?!-)', pool)  # matches up to the first single hyphen
+        if not match:  # unlikely, but... be clear if we assumed wrong
+            raise OpenShiftCheckException(
+                "This host's Docker reports it is using a storage pool named '{}'.\n"
+                "However this name does not have the expected format of 'vgname-lvname'\n"
+                "so the available storage in the VG cannot be determined.".format(pool)
+            )
+        vg_name = match.groups()[0].replace("--", "-")
+        vgs_cmd = "/sbin/vgs --noheadings -o vg_free --select vg_name=" + vg_name
+        # should return free space like "  12.00g" if the VG exists; empty if it does not
+
+        ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars)
+        if ret.get("failed") or ret.get("rc", 0) != 0:
+            raise OpenShiftCheckException(
+                "Is LVM installed? Failed to run /sbin/vgs "
+                "to determine docker storage usage:\n" + ret.get("msg", "")
+            )
+        size = ret.get("stdout", "").strip()
+        if not size:
+            raise OpenShiftCheckException(
+                "This host's Docker reports it is using a storage pool named '{pool}'.\n"
+                "which we expect to come from local VG '{vg}'.\n"
+                "However, /sbin/vgs did not find this VG. Is Docker for this host"
+                "running and using the storage on the host?".format(pool=pool, vg=vg_name)
+            )
+        return size
+
+    @staticmethod
+    def _convert_to_bytes(string):
+        units = dict(
+            b=1,
+            k=1024,
+            m=1024**2,
+            g=1024**3,
+            t=1024**4,
+            p=1024**5,
+        )
+        string = string or ""
+        match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string)  # float followed by optional unit
+        if not match:
+            raise ValueError("Cannot convert to a byte size: " + string)
+
+        number, unit = match.groups()
+        multiplier = 1 if not unit else units.get(unit.lower())
+        if not multiplier:
+            raise ValueError("Cannot convert to a byte size: " + string)
+
+        return float(number) * multiplier
diff --git a/roles/openshift_health_checker/openshift_checks/logging/__init__.py b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/__init__.py
diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
new file mode 100644
index 000000000..c9fc59896
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -0,0 +1,61 @@
+"""
+Module for performing checks on an Curator logging deployment
+"""
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Curator(LoggingCheck):
+    """Module that checks an integrated logging Curator deployment"""
+
+    name = "curator"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        curator_pods, error = super(Curator, self).get_pods_for_component(
+            self.module_executor,
+            self.logging_namespace,
+            "curator",
+            task_vars
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_curator(curator_pods)
+
+        if check_error:
+            msg = ("The following Curator deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'}
+
+    def check_curator(self, pods):
+        """Check to see if curator is up and working. Returns: error string"""
+        if not pods:
+            return (
+                "There are no Curator pods for the logging stack,\n"
+                "so nothing will prune Elasticsearch indexes.\n"
+                "Is Curator correctly deployed?"
+            )
+
+        not_running = super(Curator, self).not_running_pods(pods)
+        if len(not_running) == len(pods):
+            return (
+                "The Curator pod is not currently in a running state,\n"
+                "so Elasticsearch indexes may increase without bound."
+            )
+        if len(pods) - len(not_running) > 1:
+            return (
+                "There is more than one Curator pod running. This should not normally happen.\n"
+                "Although this doesn't cause any problems, you may want to investigate."
+            )
+
+        return None
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
new file mode 100644
index 000000000..01cb35b81
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -0,0 +1,217 @@
+"""
+Module for performing checks on an Elasticsearch logging deployment
+"""
+
+import json
+import re
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Elasticsearch(LoggingCheck):
+    """Module that checks an integrated logging Elasticsearch deployment"""
+
+    name = "elasticsearch"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        es_pods, error = super(Elasticsearch, self).get_pods_for_component(
+            self.execute_module,
+            self.logging_namespace,
+            "es",
+            task_vars,
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_elasticsearch(es_pods, task_vars)
+
+        if check_error:
+            msg = ("The following Elasticsearch deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'}
+
+    def _not_running_elasticsearch_pods(self, es_pods):
+        """Returns: list of running pods, list of errors about non-running pods"""
+        not_running = super(Elasticsearch, self).not_running_pods(es_pods)
+        if not_running:
+            return not_running, [(
+                'The following Elasticsearch pods are not running:\n'
+                '{pods}'
+                'These pods will not aggregate logs from their nodes.'
+            ).format(pods=''.join(
+                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+                for pod in not_running
+            ))]
+        return not_running, []
+
+    def check_elasticsearch(self, es_pods, task_vars):
+        """Various checks for elasticsearch. Returns: error string"""
+        not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
+        running_pods = [pod for pod in es_pods if pod not in not_running_pods]
+        pods_by_name = {
+            pod['metadata']['name']: pod for pod in running_pods
+            # Filter out pods that are not members of a DC
+            if pod['metadata'].get('labels', {}).get('deploymentconfig')
+        }
+        if not pods_by_name:
+            return 'No logging Elasticsearch pods were found. Is logging deployed?'
+        error_msgs += self._check_elasticsearch_masters(pods_by_name, task_vars)
+        error_msgs += self._check_elasticsearch_node_list(pods_by_name, task_vars)
+        error_msgs += self._check_es_cluster_health(pods_by_name, task_vars)
+        error_msgs += self._check_elasticsearch_diskspace(pods_by_name, task_vars)
+        return '\n'.join(error_msgs)
+
+    @staticmethod
+    def _build_es_curl_cmd(pod_name, url):
+        base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
+        return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
+
+    def _check_elasticsearch_masters(self, pods_by_name, task_vars):
+        """Check that Elasticsearch masters are sane. Returns: list of error strings"""
+        es_master_names = set()
+        error_msgs = []
+        for pod_name in pods_by_name.keys():
+            # Compare what each ES node reports as master and compare for split brain
+            get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
+            master_name_str = self._exec_oc(get_master_cmd, [], task_vars)
+            master_names = (master_name_str or '').split(' ')
+            if len(master_names) > 1:
+                es_master_names.add(master_names[1])
+            else:
+                error_msgs.append(
+                    'No master? Elasticsearch {pod} returned bad string when asked master name:\n'
+                    '  {response}'.format(pod=pod_name, response=master_name_str)
+                )
+
+        if not es_master_names:
+            error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?')
+            return '\n'.join(error_msgs)
+
+        if len(es_master_names) > 1:
+            error_msgs.append(
+                'Found multiple Elasticsearch masters according to the pods:\n'
+                '{master_list}\n'
+                'This implies that the masters have "split brain" and are not correctly\n'
+                'replicating data for the logging cluster. Log loss is likely to occur.'
+                .format(master_list='\n'.join('  ' + master for master in es_master_names))
+            )
+
+        return error_msgs
+
+    def _check_elasticsearch_node_list(self, pods_by_name, task_vars):
+        """Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
+
+        if not pods_by_name:
+            return ['No logging Elasticsearch masters were found. Is logging deployed?']
+
+        # get ES cluster nodes
+        node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
+        cluster_node_data = self._exec_oc(node_cmd, [], task_vars)
+        try:
+            cluster_nodes = json.loads(cluster_node_data)['nodes']
+        except (ValueError, KeyError):
+            return [
+                'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
+                cluster_node_data
+            ]
+
+        # Try to match all ES-reported node hosts to known pods.
+        error_msgs = []
+        for node in cluster_nodes.values():
+            # Note that with 1.4/3.4 the pod IP may be used as the master name
+            if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
+                       for pod_name, pod in pods_by_name.items()):
+                error_msgs.append(
+                    'The Elasticsearch cluster reports a member node "{node}"\n'
+                    'that does not correspond to any known ES pod.'.format(node=node['host'])
+                )
+
+        return error_msgs
+
+    def _check_es_cluster_health(self, pods_by_name, task_vars):
+        """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
+        error_msgs = []
+        for pod_name in pods_by_name.keys():
+            cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
+            cluster_health_data = self._exec_oc(cluster_health_cmd, [], task_vars)
+            try:
+                health_res = json.loads(cluster_health_data)
+                if not health_res or not health_res.get('status'):
+                    raise ValueError()
+            except ValueError:
+                error_msgs.append(
+                    'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
+                    'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
+                )
+                continue
+
+            if health_res['status'] not in ['green', 'yellow']:
+                error_msgs.append(
+                    'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
+                )
+
+        return error_msgs
+
+    def _check_elasticsearch_diskspace(self, pods_by_name, task_vars):
+        """
+        Exec into an ES pod and query the diskspace on the persistent volume.
+        Returns: list of errors
+        """
+        error_msgs = []
+        for pod_name in pods_by_name.keys():
+            df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
+            disk_output = self._exec_oc(df_cmd, [], task_vars)
+            lines = disk_output.splitlines()
+            # expecting one header looking like 'IUse% Use%' and one body line
+            body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
+            if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
+                error_msgs.append(
+                    'Could not retrieve storage usage from logging ES pod "{pod}".\n'
+                    'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
+                )
+                continue
+            inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
+
+            inode_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_inode_pct', default='90')
+            if int(inode_pct) >= int(inode_pct_thresh):
+                error_msgs.append(
+                    'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
+                    '  is {pct}, greater than threshold {limit}.\n'
+                    '  Note: threshold can be specified in inventory with {param}'.format(
+                        pod=pod_name,
+                        pct=str(inode_pct),
+                        limit=str(inode_pct_thresh),
+                        param='openshift_check_efk_es_inode_pct',
+                    ))
+            disk_pct_thresh = get_var(task_vars, 'openshift_check_efk_es_storage_pct', default='80')
+            if int(disk_pct) >= int(disk_pct_thresh):
+                error_msgs.append(
+                    'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
+                    '  is {pct}, greater than threshold {limit}.\n'
+                    '  Note: threshold can be specified in inventory with {param}'.format(
+                        pod=pod_name,
+                        pct=str(disk_pct),
+                        limit=str(disk_pct_thresh),
+                        param='openshift_check_efk_es_storage_pct',
+                    ))
+
+        return error_msgs
+
+    def _exec_oc(self, cmd_str, extra_args, task_vars):
+        return super(Elasticsearch, self).exec_oc(
+            self.execute_module,
+            self.logging_namespace,
+            cmd_str,
+            extra_args,
+            task_vars,
+        )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
new file mode 100644
index 000000000..627567293
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -0,0 +1,170 @@
+"""
+Module for performing checks on an Fluentd logging deployment
+"""
+
+import json
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Fluentd(LoggingCheck):
+    """Module that checks an integrated logging Fluentd deployment"""
+    name = "fluentd"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
+            self.execute_module,
+            self.logging_namespace,
+            "fluentd",
+            task_vars,
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_fluentd(fluentd_pods, task_vars)
+
+        if check_error:
+            msg = ("The following Fluentd deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'}
+
+    @staticmethod
+    def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+        """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+        label, value = node_selector.split('=', 1)
+        fluentd_nodes = {
+            name: node for name, node in nodes_by_name.items()
+            if node['metadata']['labels'].get(label) == value
+        }
+        if not fluentd_nodes:
+            return None, (
+                'There are no nodes with the fluentd label {label}.\n'
+                'This means no logs will be aggregated from the nodes.'
+            ).format(label=node_selector)
+        return fluentd_nodes, None
+
+    @staticmethod
+    def _check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars):
+        """Note if nodes are not labeled as expected. Returns: error string"""
+        intended_nodes = get_var(task_vars, 'openshift_logging_fluentd_hosts', default=['--all'])
+        if not intended_nodes or '--all' in intended_nodes:
+            intended_nodes = nodes_by_name.keys()
+        nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
+        if nodes_missing_labels:
+            return (
+                'The following nodes are supposed to be labeled with {label} but are not:\n'
+                '  {nodes}\n'
+                'Fluentd will not aggregate logs from these nodes.'
+            ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
+        return None
+
+    @staticmethod
+    def _check_nodes_have_fluentd(pods, fluentd_nodes):
+        """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+        unmatched_nodes = fluentd_nodes.copy()
+        node_names_by_label = {
+            node['metadata']['labels']['kubernetes.io/hostname']: name
+            for name, node in fluentd_nodes.items()
+        }
+        node_names_by_internal_ip = {
+            address['address']: name
+            for name, node in fluentd_nodes.items()
+            for address in node['status']['addresses']
+            if address['type'] == "InternalIP"
+        }
+        for pod in pods:
+            for name in [
+                    pod['spec']['nodeName'],
+                    node_names_by_internal_ip.get(pod['spec']['nodeName']),
+                    node_names_by_label.get(pod.get('spec', {}).get('host')),
+            ]:
+                unmatched_nodes.pop(name, None)
+        if unmatched_nodes:
+            return (
+                'The following nodes are supposed to have a Fluentd pod but do not:\n'
+                '{nodes}'
+                'These nodes will not have their logs aggregated.'
+            ).format(nodes=''.join(
+                "  {}\n".format(name)
+                for name in unmatched_nodes.keys()
+            ))
+        return None
+
+    def _check_fluentd_pods_running(self, pods):
+        """Make sure all fluentd pods are running. Returns: error string"""
+        not_running = super(Fluentd, self).not_running_pods(pods)
+        if not_running:
+            return (
+                'The following Fluentd pods are supposed to be running but are not:\n'
+                '{pods}'
+                'These pods will not aggregate logs from their nodes.'
+            ).format(pods=''.join(
+                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+                for pod in not_running
+            ))
+        return None
+
+    def check_fluentd(self, pods, task_vars):
+        """Verify fluentd is running everywhere. Returns: error string"""
+
+        node_selector = get_var(task_vars, 'openshift_logging_fluentd_nodeselector',
+                                default='logging-infra-fluentd=true')
+
+        nodes_by_name, error = self.get_nodes_by_name(task_vars)
+
+        if error:
+            return error
+        fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
+        if error:
+            return error
+
+        error_msgs = []
+        error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector, task_vars)
+        if error:
+            error_msgs.append(error)
+        error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
+        if error:
+            error_msgs.append(error)
+        error = self._check_fluentd_pods_running(pods)
+        if error:
+            error_msgs.append(error)
+
+        # Make sure there are no extra fluentd pods
+        if len(pods) > len(fluentd_nodes):
+            error_msgs.append(
+                'There are more Fluentd pods running than nodes labeled.\n'
+                'This may not cause problems with logging but it likely indicates something wrong.'
+            )
+
+        return '\n'.join(error_msgs)
+
+    def get_nodes_by_name(self, task_vars):
+        """Retrieve all the node definitions. Returns: dict(name: node), error string"""
+        nodes_json = self._exec_oc("get nodes -o json", [], task_vars)
+        try:
+            nodes = json.loads(nodes_json)
+        except ValueError:  # no valid json - should not happen
+            return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
+        if not nodes or not nodes.get('items'):  # also should not happen
+            return None, "No nodes appear to be defined according to the API."
+        return {
+            node['metadata']['name']: node
+            for node in nodes['items']
+        }, None
+
+    def _exec_oc(self, cmd_str, extra_args, task_vars):
+        return super(Fluentd, self).exec_oc(self.execute_module,
+                                            self.logging_namespace,
+                                            cmd_str,
+                                            extra_args,
+                                            task_vars)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
new file mode 100644
index 000000000..442f407b1
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -0,0 +1,229 @@
+"""
+Module for performing checks on a Kibana logging deployment
+"""
+
+import json
+import ssl
+
+try:
+    from urllib2 import HTTPError, URLError
+    import urllib2
+except ImportError:
+    from urllib.error import HTTPError, URLError
+    import urllib.request as urllib2
+
+from openshift_checks import get_var
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class Kibana(LoggingCheck):
+    """Module that checks an integrated logging Kibana deployment"""
+
+    name = "kibana"
+    tags = ["health", "logging"]
+
+    logging_namespace = None
+
+    def run(self, tmp, task_vars):
+        """Check various things and gather errors. Returns: result as hash"""
+
+        self.logging_namespace = get_var(task_vars, "openshift_logging_namespace", default="logging")
+        kibana_pods, error = super(Kibana, self).get_pods_for_component(
+            self.execute_module,
+            self.logging_namespace,
+            "kibana",
+            task_vars,
+        )
+        if error:
+            return {"failed": True, "changed": False, "msg": error}
+        check_error = self.check_kibana(kibana_pods)
+
+        if not check_error:
+            check_error = self._check_kibana_route(task_vars)
+
+        if check_error:
+            msg = ("The following Kibana deployment issue was found:"
+                   "\n-------\n"
+                   "{}".format(check_error))
+            return {"failed": True, "changed": False, "msg": msg}
+
+        # TODO(lmeyer): run it all again for the ops cluster
+        return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'}
+
+    def _verify_url_internal(self, url, task_vars):
+        """
+        Try to reach a URL from the host.
+        Returns: success (bool), reason (for failure)
+        """
+        args = dict(
+            url=url,
+            follow_redirects='none',
+            validate_certs='no',  # likely to be signed with internal CA
+            # TODO(lmeyer): give users option to validate certs
+            status_code=302,
+        )
+        result = self.execute_module('uri', args, task_vars)
+        if result.get('failed'):
+            return result['msg']
+        return None
+
+    @staticmethod
+    def _verify_url_external(url):
+        """
+        Try to reach a URL from ansible control host.
+        Returns: success (bool), reason (for failure)
+        """
+        # This actually checks from the ansible control host, which may or may not
+        # really be "external" to the cluster.
+
+        # Disable SSL cert validation to work around internally signed certs
+        ctx = ssl.create_default_context()
+        ctx.check_hostname = False  # or setting CERT_NONE is refused
+        ctx.verify_mode = ssl.CERT_NONE
+
+        # Verify that the url is returning a valid response
+        try:
+            # We only care if the url connects and responds
+            return_code = urllib2.urlopen(url, context=ctx).getcode()
+        except HTTPError as httperr:
+            return httperr.reason
+        except URLError as urlerr:
+            return str(urlerr)
+
+        # there appears to be no way to prevent urlopen from following redirects
+        if return_code != 200:
+            return 'Expected success (200) but got return code {}'.format(int(return_code))
+
+        return None
+
+    def check_kibana(self, pods):
+        """Check to see if Kibana is up and working. Returns: error string."""
+
+        if not pods:
+            return "There are no Kibana pods deployed, so no access to the logging UI."
+
+        not_running = self.not_running_pods(pods)
+        if len(not_running) == len(pods):
+            return "No Kibana pod is in a running state, so there is no access to the logging UI."
+        elif not_running:
+            return (
+                "The following Kibana pods are not currently in a running state:\n"
+                "{pods}"
+                "However at least one is, so service may not be impacted."
+            ).format(pods="".join("  " + pod['metadata']['name'] + "\n" for pod in not_running))
+
+        return None
+
+    def _get_kibana_url(self, task_vars):
+        """
+        Get kibana route or report error.
+        Returns: url (or empty), reason for failure
+        """
+
+        # Get logging url
+        get_route = self._exec_oc("get route logging-kibana -o json", [], task_vars)
+        if not get_route:
+            return None, 'no_route_exists'
+
+        route = json.loads(get_route)
+
+        # check that the route has been accepted by a router
+        ingress = route["status"]["ingress"]
+        # ingress can be null if there is no router, or empty if not routed
+        if not ingress or not ingress[0]:
+            return None, 'route_not_accepted'
+
+        host = route.get("spec", {}).get("host")
+        if not host:
+            return None, 'route_missing_host'
+
+        return 'https://{}/'.format(host), None
+
+    def _check_kibana_route(self, task_vars):
+        """
+        Check to see if kibana route is up and working.
+        Returns: error string
+        """
+        known_errors = dict(
+            no_route_exists=(
+                'No route is defined for Kibana in the logging namespace,\n'
+                'so the logging stack is not accessible. Is logging deployed?\n'
+                'Did something remove the logging-kibana route?'
+            ),
+            route_not_accepted=(
+                'The logging-kibana route is not being routed by any router.\n'
+                'Is the router deployed and working?'
+            ),
+            route_missing_host=(
+                'The logging-kibana route has no hostname defined,\n'
+                'which should never happen. Did something alter its definition?'
+            ),
+        )
+
+        kibana_url, error = self._get_kibana_url(task_vars)
+        if not kibana_url:
+            return known_errors.get(error, error)
+
+        # first, check that kibana is reachable from the master.
+        error = self._verify_url_internal(kibana_url, task_vars)
+        if error:
+            if 'urlopen error [Errno 111] Connection refused' in error:
+                error = (
+                    'Failed to connect from this master to Kibana URL {url}\n'
+                    'Is kibana running, and is at least one router routing to it?'
+                ).format(url=kibana_url)
+            elif 'urlopen error [Errno -2] Name or service not known' in error:
+                error = (
+                    'Failed to connect from this master to Kibana URL {url}\n'
+                    'because the hostname does not resolve.\n'
+                    'Is DNS configured for the Kibana hostname?'
+                ).format(url=kibana_url)
+            elif 'Status code was not' in error:
+                error = (
+                    'A request from this master to the Kibana URL {url}\n'
+                    'did not return the correct status code (302).\n'
+                    'This could mean that Kibana is malfunctioning, the hostname is\n'
+                    'resolving incorrectly, or other network issues. The output was:\n'
+                    '  {error}'
+                ).format(url=kibana_url, error=error)
+            return 'Error validating the logging Kibana route:\n' + error
+
+        # in production we would like the kibana route to work from outside the
+        # cluster too; but that may not be the case, so allow disabling just this part.
+        if not get_var(task_vars, "openshift_check_efk_kibana_external", default=True):
+            return None
+        error = self._verify_url_external(kibana_url)
+        if error:
+            if 'urlopen error [Errno 111] Connection refused' in error:
+                error = (
+                    'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+                    'Is the router for the Kibana hostname exposed externally?'
+                ).format(url=kibana_url)
+            elif 'urlopen error [Errno -2] Name or service not known' in error:
+                error = (
+                    'Failed to resolve the Kibana hostname in {url}\n'
+                    'from the Ansible control host.\n'
+                    'Is DNS configured to resolve this Kibana hostname externally?'
+                ).format(url=kibana_url)
+            elif 'Expected success (200)' in error:
+                error = (
+                    'A request to Kibana at {url}\n'
+                    'returned the wrong error code:\n'
+                    '  {error}\n'
+                    'This could mean that Kibana is malfunctioning, the hostname is\n'
+                    'resolving incorrectly, or other network issues.'
+                ).format(url=kibana_url, error=error)
+            error = (
+                'Error validating the logging Kibana route:\n{error}\n'
+                'To disable external Kibana route validation, set in your inventory:\n'
+                '  openshift_check_efk_kibana_external=False'
+            ).format(error=error)
+            return error
+        return None
+
+    def _exec_oc(self, cmd_str, extra_args, task_vars):
+        return super(Kibana, self).exec_oc(self.execute_module,
+                                           self.logging_namespace,
+                                           cmd_str,
+                                           extra_args,
+                                           task_vars)
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
new file mode 100644
index 000000000..05b4d300c
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -0,0 +1,96 @@
+"""
+Util functions for performing checks on an Elasticsearch, Fluentd, and Kibana stack
+"""
+
+import json
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class LoggingCheck(OpenShiftCheck):
+    """Base class for logging component checks"""
+
+    name = "logging"
+
+    @classmethod
+    def is_active(cls, task_vars):
+        return super(LoggingCheck, cls).is_active(task_vars) and cls.is_first_master(task_vars)
+
+    @staticmethod
+    def is_first_master(task_vars):
+        """Run only on first master and only when logging is configured. Returns: bool"""
+        logging_deployed = get_var(task_vars, "openshift_hosted_logging_deploy", default=True)
+        # Note: It would be nice to use membership in oo_first_master group, however for now it
+        # seems best to avoid requiring that setup and just check this is the first master.
+        hostname = get_var(task_vars, "ansible_ssh_host") or [None]
+        masters = get_var(task_vars, "groups", "masters", default=None) or [None]
+        return logging_deployed and masters[0] == hostname
+
+    def run(self, tmp, task_vars):
+        pass
+
+    def get_pods_for_component(self, execute_module, namespace, logging_component, task_vars):
+        """Get all pods for a given component. Returns: list of pods for component, error string"""
+        pod_output = self.exec_oc(
+            execute_module,
+            namespace,
+            "get pods -l component={} -o json".format(logging_component),
+            [],
+            task_vars
+        )
+        try:
+            pods = json.loads(pod_output)
+            if not pods or not pods.get('items'):
+                raise ValueError()
+        except ValueError:
+            # successful run but non-parsing data generally means there were no pods in the namespace
+            return None, 'There are no pods in the {} namespace. Is logging deployed?'.format(namespace)
+
+        return pods['items'], None
+
+    @staticmethod
+    def not_running_pods(pods):
+        """Returns: list of pods not in a ready and running state"""
+        return [
+            pod for pod in pods
+            if any(
+                container['ready'] is False
+                for container in pod['status']['containerStatuses']
+            ) or not any(
+                condition['type'] == 'Ready' and condition['status'] == 'True'
+                for condition in pod['status']['conditions']
+            )
+        ]
+
+    @staticmethod
+    def exec_oc(execute_module=None, namespace="logging", cmd_str="", extra_args=None, task_vars=None):
+        """
+        Execute an 'oc' command in the remote host.
+        Returns: output of command and namespace,
+        or raises OpenShiftCheckException on error
+        """
+        config_base = get_var(task_vars, "openshift", "common", "config_base")
+        args = {
+            "namespace": namespace,
+            "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+            "cmd": cmd_str,
+            "extra_args": list(extra_args) if extra_args else [],
+        }
+
+        result = execute_module("ocutil", args, task_vars)
+        if result.get("failed"):
+            msg = (
+                'Unexpected error using `oc` to validate the logging stack components.\n'
+                'Error executing `oc {cmd}`:\n'
+                '{error}'
+            ).format(cmd=args['cmd'], error=result['result'])
+
+            if result['result'] == '[Errno 2] No such file or directory':
+                msg = (
+                    "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+                    "Has an installation been run on this host yet?"
+                )
+            raise OpenShiftCheckException(msg)
+
+        return result.get("result", "")
diff --git a/roles/openshift_health_checker/openshift_checks/memory_availability.py b/roles/openshift_health_checker/openshift_checks/memory_availability.py
index 28805dc37..f4e31065f 100644
--- a/roles/openshift_health_checker/openshift_checks/memory_availability.py
+++ b/roles/openshift_health_checker/openshift_checks/memory_availability.py
@@ -1,6 +1,9 @@
 # pylint: disable=missing-docstring
 from openshift_checks import OpenShiftCheck, get_var
 
+MIB = 2**20
+GIB = 2**30
+
 
 class MemoryAvailability(OpenShiftCheck):
     """Check that recommended memory is available."""
@@ -11,10 +14,12 @@ class MemoryAvailability(OpenShiftCheck):
     # Values taken from the official installation documentation:
     # https://docs.openshift.org/latest/install_config/install/prerequisites.html#system-requirements
     recommended_memory_bytes = {
-        "masters": 16 * 10**9,
-        "nodes": 8 * 10**9,
-        "etcd": 20 * 10**9,
+        "masters": 16 * GIB,
+        "nodes": 8 * GIB,
+        "etcd": 8 * GIB,
     }
+    # https://access.redhat.com/solutions/3006511 physical RAM is partly reserved from memtotal
+    memtotal_adjustment = 1 * GIB
 
     @classmethod
     def is_active(cls, task_vars):
@@ -25,19 +30,21 @@ class MemoryAvailability(OpenShiftCheck):
 
     def run(self, tmp, task_vars):
         group_names = get_var(task_vars, "group_names")
-        total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * 10**6
+        total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * MIB
 
-        min_memory_bytes = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+        recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+        configured_min = float(get_var(task_vars, "openshift_check_min_host_memory_gb", default=0)) * GIB
+        min_memory_bytes = configured_min or recommended_min
 
-        if total_memory_bytes < min_memory_bytes:
+        if total_memory_bytes + self.memtotal_adjustment < min_memory_bytes:
             return {
                 'failed': True,
                 'msg': (
-                    'Available memory ({available:.1f} GB) '
-                    'below recommended value ({recommended:.1f} GB)'
+                    'Available memory ({available:.1f} GiB) is too far '
+                    'below recommended value ({recommended:.1f} GiB)'
                 ).format(
-                    available=float(total_memory_bytes) / 10**9,
-                    recommended=float(min_memory_bytes) / 10**9,
+                    available=float(total_memory_bytes) / GIB,
+                    recommended=float(min_memory_bytes) / GIB,
                 ),
             }
 
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
index 20d160eaf..7f3d78cc4 100644
--- a/roles/openshift_health_checker/openshift_checks/mixins.py
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -1,4 +1,3 @@
-# pylint: disable=missing-docstring,too-few-public-methods
 """
 Mixin classes meant to be used with subclasses of OpenShiftCheck.
 """
@@ -8,8 +7,49 @@ from openshift_checks import get_var
 
 class NotContainerizedMixin(object):
     """Mixin for checks that are only active when not in containerized mode."""
+    # permanent # pylint: disable=too-few-public-methods
+    # Reason: The mixin is not intended to stand on its own as a class.
 
     @classmethod
     def is_active(cls, task_vars):
+        """Only run on non-containerized hosts."""
         is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
         return super(NotContainerizedMixin, cls).is_active(task_vars) and not is_containerized
+
+
+class DockerHostMixin(object):
+    """Mixin for checks that are only active on hosts that require Docker."""
+
+    dependencies = []
+
+    @classmethod
+    def is_active(cls, task_vars):
+        """Only run on hosts that depend on Docker."""
+        is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
+        is_node = "nodes" in get_var(task_vars, "group_names", default=[])
+        return super(DockerHostMixin, cls).is_active(task_vars) and (is_containerized or is_node)
+
+    def ensure_dependencies(self, task_vars):
+        """
+        Ensure that docker-related packages exist, but not on atomic hosts
+        (which would not be able to install but should already have them).
+        Returns: msg, failed, changed
+        """
+        if get_var(task_vars, "openshift", "common", "is_atomic"):
+            return "", False, False
+
+        # NOTE: we would use the "package" module but it's actually an action plugin
+        # and it's not clear how to invoke one of those. This is about the same anyway:
+        pkg_manager = get_var(task_vars, "ansible_pkg_mgr", default="yum")
+        result = self.module_executor(pkg_manager, {"name": self.dependencies, "state": "present"}, task_vars)
+        msg = result.get("msg", "")
+        if result.get("failed"):
+            if "No package matching" in msg:
+                msg = "Ensure that all required dependencies can be installed via `yum`.\n"
+            msg = (
+                "Unable to install required packages on this host:\n"
+                "    {deps}\n{msg}"
+            ).format(deps=',\n    '.join(self.dependencies), msg=msg)
+        failed = result.get("failed", False) or result.get("rc", 0) != 0
+        changed = result.get("changed", False)
+        return msg, failed, changed
diff --git a/roles/openshift_health_checker/openshift_checks/ovs_version.py b/roles/openshift_health_checker/openshift_checks/ovs_version.py
new file mode 100644
index 000000000..1e45ae3af
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/ovs_version.py
@@ -0,0 +1,78 @@
+"""
+Ansible module for determining if an installed version of Open vSwitch is incompatible with the
+currently installed version of OpenShift.
+"""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks.mixins import NotContainerizedMixin
+
+
+class OvsVersion(NotContainerizedMixin, OpenShiftCheck):
+    """Check that packages in a package_list are installed on the host
+    and are the correct version as determined by an OpenShift installation.
+    """
+
+    name = "ovs_version"
+    tags = ["health"]
+
+    openshift_to_ovs_version = {
+        "3.6": "2.6",
+        "3.5": "2.6",
+        "3.4": "2.4",
+    }
+
+    # map major release versions across releases
+    # to a common major version
+    openshift_major_release_version = {
+        "1": "3",
+    }
+
+    @classmethod
+    def is_active(cls, task_vars):
+        """Skip hosts that do not have package requirements."""
+        group_names = get_var(task_vars, "group_names", default=[])
+        master_or_node = 'masters' in group_names or 'nodes' in group_names
+        return super(OvsVersion, cls).is_active(task_vars) and master_or_node
+
+    def run(self, tmp, task_vars):
+        args = {
+            "package_list": [
+                {
+                    "name": "openvswitch",
+                    "version": self.get_required_ovs_version(task_vars),
+                },
+            ],
+        }
+        return self.execute_module("rpm_version", args, task_vars)
+
+    def get_required_ovs_version(self, task_vars):
+        """Return the correct Open vSwitch version for the current OpenShift version"""
+        openshift_version = self._get_openshift_version(task_vars)
+
+        if float(openshift_version) < 3.5:
+            return self.openshift_to_ovs_version["3.4"]
+
+        ovs_version = self.openshift_to_ovs_version.get(str(openshift_version))
+        if ovs_version:
+            return self.openshift_to_ovs_version[str(openshift_version)]
+
+        msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+        raise OpenShiftCheckException(msg.format(openshift_version))
+
+    def _get_openshift_version(self, task_vars):
+        openshift_version = get_var(task_vars, "openshift_image_tag")
+        if openshift_version and openshift_version[0] == 'v':
+            openshift_version = openshift_version[1:]
+
+        return self._parse_version(openshift_version)
+
+    def _parse_version(self, version):
+        components = version.split(".")
+        if not components or len(components) < 2:
+            msg = "An invalid version of OpenShift was found for this host: {}"
+            raise OpenShiftCheckException(msg.format(version))
+
+        if components[0] in self.openshift_major_release_version:
+            components[0] = self.openshift_major_release_version[components[0]]
+
+        return '.'.join(components[:2])
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
index 682f6bd40..2e737818b 100644
--- a/roles/openshift_health_checker/openshift_checks/package_version.py
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -1,5 +1,5 @@
 # pylint: disable=missing-docstring
-from openshift_checks import OpenShiftCheck, get_var
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
 from openshift_checks.mixins import NotContainerizedMixin
 
 
@@ -9,6 +9,25 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
     name = "package_version"
     tags = ["preflight"]
 
+    openshift_to_ovs_version = {
+        "3.6": "2.6",
+        "3.5": "2.6",
+        "3.4": "2.4",
+    }
+
+    openshift_to_docker_version = {
+        "3.1": "1.8",
+        "3.2": "1.10",
+        "3.3": "1.10",
+        "3.4": "1.12",
+    }
+
+    # map major release versions across releases
+    # to a common major version
+    openshift_major_release_version = {
+        "1": "3",
+    }
+
     @classmethod
     def is_active(cls, task_vars):
         """Skip hosts that do not have package requirements."""
@@ -17,9 +36,90 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
         return super(PackageVersion, cls).is_active(task_vars) and master_or_node
 
     def run(self, tmp, task_vars):
+        rpm_prefix = get_var(task_vars, "openshift", "common", "service_type")
+        openshift_release = get_var(task_vars, "openshift_release", default='')
+        deployment_type = get_var(task_vars, "openshift_deployment_type")
+        check_multi_minor_release = deployment_type in ['openshift-enterprise']
+
         args = {
-            "requested_openshift_release": get_var(task_vars, "openshift_release", default=''),
-            "openshift_deployment_type": get_var(task_vars, "openshift_deployment_type"),
-            "rpm_prefix": get_var(task_vars, "openshift", "common", "service_type"),
+            "package_list": [
+                {
+                    "name": "openvswitch",
+                    "version": self.get_required_ovs_version(task_vars),
+                    "check_multi": False,
+                },
+                {
+                    "name": "docker",
+                    "version": self.get_required_docker_version(task_vars),
+                    "check_multi": False,
+                },
+                {
+                    "name": "{}".format(rpm_prefix),
+                    "version": openshift_release,
+                    "check_multi": check_multi_minor_release,
+                },
+                {
+                    "name": "{}-master".format(rpm_prefix),
+                    "version": openshift_release,
+                    "check_multi": check_multi_minor_release,
+                },
+                {
+                    "name": "{}-node".format(rpm_prefix),
+                    "version": openshift_release,
+                    "check_multi": check_multi_minor_release,
+                },
+            ],
         }
+
         return self.execute_module("aos_version", args, tmp, task_vars)
+
+    def get_required_ovs_version(self, task_vars):
+        """Return the correct Open vSwitch version for the current OpenShift version.
+        If the current OpenShift version is >= 3.5, ensure Open vSwitch version 2.6,
+        Else ensure Open vSwitch version 2.4"""
+        openshift_version = self.get_openshift_version(task_vars)
+
+        if float(openshift_version) < 3.5:
+            return self.openshift_to_ovs_version["3.4"]
+
+        ovs_version = self.openshift_to_ovs_version.get(str(openshift_version))
+        if ovs_version:
+            return ovs_version
+
+        msg = "There is no recommended version of Open vSwitch for the current version of OpenShift: {}"
+        raise OpenShiftCheckException(msg.format(openshift_version))
+
+    def get_required_docker_version(self, task_vars):
+        """Return the correct Docker version for the current OpenShift version.
+        If the OpenShift version is 3.1, ensure Docker version 1.8.
+        If the OpenShift version is 3.2 or 3.3, ensure Docker version 1.10.
+        If the current OpenShift version is >= 3.4, ensure Docker version 1.12."""
+        openshift_version = self.get_openshift_version(task_vars)
+
+        if float(openshift_version) >= 3.4:
+            return self.openshift_to_docker_version["3.4"]
+
+        docker_version = self.openshift_to_docker_version.get(str(openshift_version))
+        if docker_version:
+            return docker_version
+
+        msg = "There is no recommended version of Docker for the current version of OpenShift: {}"
+        raise OpenShiftCheckException(msg.format(openshift_version))
+
+    def get_openshift_version(self, task_vars):
+        openshift_version = get_var(task_vars, "openshift_image_tag")
+        if openshift_version and openshift_version[0] == 'v':
+            openshift_version = openshift_version[1:]
+
+        return self.parse_version(openshift_version)
+
+    def parse_version(self, version):
+        components = version.split(".")
+        if not components or len(components) < 2:
+            msg = "An invalid version of OpenShift was found for this host: {}"
+            raise OpenShiftCheckException(msg.format(version))
+
+        if components[0] in self.openshift_major_release_version:
+            components[0] = self.openshift_major_release_version[components[0]]
+
+        return '.'.join(components[:2])