1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
"""
Check for ensuring logs from pods can be queried in a reasonable amount of time.
"""
import json
import time
from uuid import uuid4
from openshift_checks import OpenShiftCheckException
from openshift_checks.logging.logging import LoggingCheck
# Default number of seconds to wait for the generated log entry to be found in
# Elasticsearch; passed as the default when reading the
# "openshift_check_logging_index_timeout_seconds" variable.
ES_CMD_TIMEOUT_SECONDS = 30
class LoggingIndexTime(LoggingCheck):
    """Check that pod logs are aggregated and indexed in ElasticSearch within a reasonable amount of time.

    The check requests a unique, non-existent URL from a running Kibana pod and
    then polls a running Elasticsearch pod until a message containing that
    unique id can be counted, or until the configured timeout expires.
    """

    # Identifier and tags of this check.
    name = "logging_index_time"
    tags = ["health", "logging"]

    def run(self):
        """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs.

        Returns:
            An empty dict when the entry was found within the timeout.

        Raises:
            OpenShiftCheckException: if the timeout variable is not an integer,
                if no running Kibana or Elasticsearch pod exists, or if any of
                the steps performed by the helper methods fails.
        """
        try:
            log_index_timeout = int(
                self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
            )
        except (TypeError, ValueError):
            # int() raises TypeError for None and other non-scalar values and
            # ValueError for non-numeric strings; both are invalid settings.
            raise OpenShiftCheckException(
                'InvalidTimeout',
                'Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
                'Value must be an integer representing an amount in seconds.'
            )

        running_component_pods = dict()

        # get all component pods
        for component, name in (('kibana', 'Kibana'), ('es', 'Elasticsearch')):
            pods = self.get_pods_for_component(component)
            running_pods = self.running_pods(pods)

            if not running_pods:
                raise OpenShiftCheckException(
                    component + 'NoRunningPods',
                    'No {} pods in the "Running" state were found. '
                    'At least one pod is required in order to perform this check.'.format(name)
                )

            running_component_pods[component] = running_pods

        # Only the first running pod of each component is used.
        uuid = self.curl_kibana_with_uuid(running_component_pods["kibana"][0])
        self.wait_until_cmd_or_err(running_component_pods["es"][0], uuid, log_index_timeout)
        return {}

    def wait_until_cmd_or_err(self, es_pod, uuid, timeout_secs):
        """Retry an Elasticsearch query every second until query success, or a defined
        length of time has passed.

        Args:
            es_pod: pod definition of the Elasticsearch pod to query.
            uuid: unique id expected to appear in an indexed message.
            timeout_secs: maximum number of seconds to keep retrying.

        Raises:
            OpenShiftCheckException: "NoMatchFound" when no match was found
                before the deadline.
        """
        deadline = time.time() + timeout_secs
        interval = 1
        while not self.query_es_from_es(es_pod, uuid):
            # Give up as soon as sleeping once more would go past the deadline.
            if time.time() + interval > deadline:
                raise OpenShiftCheckException(
                    "NoMatchFound",
                    "expecting match in Elasticsearch for message with uuid {}, "
                    "but no matches were found after {}s.".format(uuid, timeout_secs)
                )
            time.sleep(interval)

    def curl_kibana_with_uuid(self, kibana_pod):
        """curl Kibana with a unique uuid.

        Args:
            kibana_pod: pod definition of the Kibana pod to run curl in.

        Returns:
            The unique id that was requested from Kibana.

        Raises:
            OpenShiftCheckException: "kibanaInvalidResponse" when the response
                is not a JSON object with a "statusCode" key, or
                "kibanaInvalidReturnCode" when that status code is not 404.
        """
        uuid = self.generate_uuid()
        pod_name = kibana_pod["metadata"]["name"]
        exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"
        exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid)

        error_str = self.exec_oc(exec_cmd, [])

        try:
            error_code = json.loads(error_str)["statusCode"]
        except (KeyError, TypeError, ValueError):
            # TypeError covers a non-string response and valid JSON that is
            # not an object (e.g. a list or a number).
            raise OpenShiftCheckException(
                'kibanaInvalidResponse',
                'invalid response returned from Kibana request:\n'
                'Command: {}\nResponse: {}'.format(exec_cmd, error_str)
            )

        # The requested path is a freshly generated uuid, so a 404 is the
        # expected answer.
        if error_code != 404:
            raise OpenShiftCheckException(
                'kibanaInvalidReturnCode',
                'invalid error code returned from Kibana request.\n'
                'Expecting error code "404", but got "{}" instead.'.format(error_code)
            )

        return uuid

    def query_es_from_es(self, es_pod, uuid):
        """curl the Elasticsearch pod and look for a unique uuid in its logs.

        Args:
            es_pod: pod definition of the Elasticsearch pod to run curl in.
            uuid: unique id to search for in the "message" field.

        Returns:
            The "count" value reported by the Elasticsearch _count query.

        Raises:
            OpenShiftCheckException: "esInvalidResponse" when the response is
                not a JSON object with a "count" key.
        """
        pod_name = es_pod["metadata"]["name"]
        exec_cmd = (
            "exec {pod_name} -- curl --max-time 30 -s -f "
            "--cacert /etc/elasticsearch/secret/admin-ca "
            "--cert /etc/elasticsearch/secret/admin-cert "
            "--key /etc/elasticsearch/secret/admin-key "
            "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
        )
        exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid)
        result = self.exec_oc(exec_cmd, [], save_as_name="query_for_uuid.json")

        try:
            count = json.loads(result)["count"]
        except (KeyError, TypeError, ValueError):
            # TypeError covers a non-string response and valid JSON that is
            # not an object (e.g. a list or a number).
            raise OpenShiftCheckException(
                'esInvalidResponse',
                'Invalid response from Elasticsearch query:\n'
                '  {}\n'
                'Response was:\n{}'.format(exec_cmd, result)
            )

        return count

    @staticmethod
    def running_pods(pods):
        """Filter pods that are running.

        Args:
            pods: iterable of pod definitions, each with a ['status']['phase'] entry.

        Returns:
            A list with the pods whose phase is 'Running'.
        """
        return [pod for pod in pods if pod['status']['phase'] == 'Running']

    @staticmethod
    def generate_uuid():
        """Wrap uuid generator. Allows for testing with expected values."""
        return str(uuid4())
|