#! /bin/bash
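# Health probe for an OpenShift cluster: checks API reachability, etcd members,
# node readiness and pods stuck in bad states, then prints a one-line status
# summary. The \${color ...} sequences below are presumably conky color markup;
# opts.sh (sourced next) is expected to provide $host.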
cd "$(dirname "$0")"
. opts.sh
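# Thresholds: e_nodes - expected number of Ready nodes (2nd command-line
# argument), e_pods - number of misbehaving pods that triggers a warning,
# e_restarts - restart count considered excessive, p_pods - maximum number of
# pods listed per reported problem.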
e_nodes=$2
e_pods=2
e_restarts=10
p_pods=10
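# Reachability and initial status: ping.pl presumably prints 1 when $host
# answers and 0 otherwise. Status codes used below: 0 = down, 2 = degraded,
# the ping result (normally 1) otherwise.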
online=$(../scripts/ping.pl "$host")
healthy=$online
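# 'oc version' printing nothing means the OpenShift API is unreachable; report the cluster as down.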
version=$(oc version | head -n 1 | awk '{ print $2 }')
if [ -z "$version" ]; then
healthy=0
else
version="OpenShift $version"
fi
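# etcd: expect at least 3 members reporting Healthy in 'oc get cs'; otherwise
# degrade the status and list the members that still report Healthy.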
etcd=$(oc get cs | grep etcd | grep "Healthy" | wc -l)
if [ -z "$etcd" -o "$etcd" -lt 3 ]; then
healthy=2
oc get cs | grep etcd | grep "Healthy" | sed -r -e 's/\s+/ /g' | awk '{ print $1, $2 }' | sed 's/^/* /'
fi
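# Nodes: compare the number of Ready nodes against the expected count and
# build the node summary appended to the status line.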
if [ $healthy -ne 0 ]; then
    nodes=$(oc get nodes)
    ready=$(echo "$nodes" | grep -w Ready | wc -l)
    active=$(echo "$nodes" | grep -w Ready | grep -vi SchedulingDisabled | wc -l)
    if [ "$ready" -ge "$e_nodes" ]; then
        nodes=" \${color gray}/ $etcd etcd, $ready nodes"
        if [ "$active" -ne "$ready" ]; then
            nodes="$nodes ($active active)"
        fi
    else
        echo "$nodes" | grep -v "STATUS" | grep -wv "Ready" | awk '{ print $1, $2 }' | sed 's/^/* /'
        offline=$(echo "$nodes" | grep -v "STATUS" | grep -wv "Ready" | wc -l)
        nodes=" \${color gray}/ $etcd etcd, $ready ready, $offline offline"
        healthy=2
    fi
fi
# Find pods in unexpected states
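# (pods whose AGE is still in seconds or minutes are skipped, so only pods
# stuck in Terminating/Pending for an hour or more are counted)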
if [ $healthy -ne 0 ]; then
    pods=$(oc get pods --all-namespaces | awk '$6~/s|m/ { next } { print }' | grep -P "Terminating|Pending" | wc -l)
    if [ "$pods" -ge "$e_pods" ]; then
        healthy=2
        echo "Pods stalled in wrong states:"
        oc get pods --all-namespaces -o wide | awk '$6~/s|m/ { next } { print }' | grep -P "(Terminating|Pending)" | head -n "$p_pods" | sed -e 's/[[:space:]]\+/ /g'
    fi
fi
# Find not-ready running pods with large restart number
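# (READY column starting with 0 and RESTARTS at or above e_restarts, again
# ignoring pods younger than an hour)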
if [ $healthy -ne 0 ]; then
    pods=$(oc get pods --all-namespaces | awk -v r="$e_restarts" '$6~/s|m/ { next } $5 < r { next } $3~/^0/ { print }' | grep Running | wc -l)
    if [ "$pods" -ge "$e_pods" ]; then
        healthy=2
        echo "Pods restarting:"
        oc get pods --all-namespaces -o wide | awk -v r="$e_restarts" '$6~/s|m/ { next } $5 < r { next } $3~/^0/ { print }' | grep Running | head -n "$p_pods" | sed -e 's/[[:space:]]\+/ /g'
    fi
fi
# Find own pods in error states
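# ("own" meaning pods whose namespace or name matches adei, adai or bora)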
if [ $healthy -ne 0 ]; then
    pods=$(oc get pods --all-namespaces | grep -P "adei|adai|bora" | awk '$6~/s|m/ { next } { print }' | grep -P "CrashLoopBackOff|Error" | wc -l)
    if [ "$pods" -ge "$e_pods" ]; then
        healthy=2
        echo "Pods with errors:"
        oc get pods --all-namespaces -o wide | grep -P "adei|adai|bora" | awk '$6~/s|m/ { next } { print }' | grep -P "(CrashLoopBackOff|Error)" | head -n "$p_pods" | sed -e 's/[[:space:]]\+/ /g'
    fi
fi
# Find if scheduling takes too long
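# (pods in ContainerCreating for more than ~3 minutes, or with age in hours/days)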
if [ $healthy -ne 0 ]; then
    pods=$(oc get pods --all-namespaces | grep "ContainerCreating" | awk '$6~/d|h/ { print } $6~/s/ { next } int(substr($6, 1, length($6) - 1)) > 3 { print }' | wc -l)
    if [ "$pods" -gt 0 ]; then
        healthy=2
        echo "Scheduling problems for the following pods:"
        oc get pods --all-namespaces | grep "ContainerCreating" | awk '$6~/d|h/ { print } $6~/s/ { next } int(substr($6, 1, length($6) - 1)) > 3 { print }'
    fi
fi
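# Final status line: "<online> <health code> <version> <node summary>"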
echo "$online $healthy $version $nodes"