#! /bin/bash
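# Health probe for an OpenShift cluster: checks API reachability, etcd members,
# node readiness and pods stuck in bad states, then prints a one-line status
# summary. The \${color ...} sequences below are presumably conky color markup;
# opts.sh (sourced next) is expected to provide $host.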
cd "$(dirname "$0")"
. opts.sh
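# Thresholds: e_nodes - expected number of Ready nodes (2nd command-line
# argument), e_pods - number of misbehaving pods that triggers a warning,
# e_restarts - restart count considered excessive, p_pods - maximum number of
# pods listed per reported problem.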
e_nodes=$2
e_pods=2
e_restarts=10
p_pods=10
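# Reachability and initial status: ping.pl presumably prints 1 when $host
# answers and 0 otherwise. Status codes used below: 0 = down, 2 = degraded,
# the ping result (normally 1) otherwise.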
online=$(../scripts/ping.pl "$host")
healthy=$online
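# 'oc version' printing nothing means the OpenShift API is unreachable; report the cluster as down.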
version=$(oc version | head -n 1 | awk '{ print $2 }')
if [ -z "$version" ]; then
healthy=0
else
version="OpenShift $version"
fi
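# etcd: expect at least 3 members reporting Healthy in 'oc get cs'; otherwise
# degrade the status and list the members that still report Healthy.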
etcd=$(oc get cs | grep etcd | grep "Healthy" | wc -l)
if [ -z "$etcd" -o "$etcd" -lt 3 ]; then
healthy=2
oc get cs | grep etcd | grep "Healthy" | sed -r -e 's/\s+/ /g' | awk '{ print $1, $2 }' | sed 's/^/* /'
fi
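# Nodes: compare the number of Ready nodes against the expected count and
# build the node summary appended to the status line.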
if [ $healthy -ne 0 ]; then
    nodes=$(oc get nodes)
    ready=$(echo "$nodes" | grep -w Ready | wc -l)
    active=$(echo "$nodes" | grep -w Ready | grep -vi SchedulingDisabled | wc -l)
    if [ "$ready" -ge "$e_nodes" ]; then
        nodes=" \${color gray}/ $etcd etcd, $ready nodes"
        if [ "$active" -ne "$ready" ]; then
            nodes="$nodes ($active active)"
        fi
    else
        echo "$nodes" | grep -v "STATUS" | grep -wv "Ready" | awk '{ print $1, $2 }' | sed 's/^/* /'
        offline=$(echo "$nodes" | grep -v "STATUS" | grep -wv "Ready" | wc -l)
        nodes=" \${color gray}/ $etcd etcd, $ready ready, $offline offline"
        healthy=2
    fi
fi
# Find pods in unexpected states
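# (pods whose AGE is still in seconds or minutes are skipped, so only pods
# stuck in Terminating/Pending for an hour or more are counted)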
if [ $healthy -ne 0 ]; then
    pods=$(oc get pods --all-namespaces | awk '$6~/s|m/ { next } { print }' | grep -P "Terminating|Pending" | wc -l)
    if [ "$pods" -ge "$e_pods" ]; then
        healthy=2
        echo "Pods stalled in wrong states:"
        oc get pods --all-namespaces -o wide | awk '$6~/s|m/ { next } { print }' | grep -P "(Terminating|Pending)" | head -n "$p_pods" | sed -e 's/[[:space:]]\+/ /g'
    fi
fi
# Find not-ready running pods with large restart number
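# (READY column starting with 0 and RESTARTS at or above e_restarts, again
# ignoring pods younger than an hour)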
if [ $healthy -ne 0 ]; then
    pods=$(oc get pods --all-namespaces | awk -v r="$e_restarts" '$6~/s|m/ { next } $5 < r { next } $3~/^0/ { print }' | grep Running | wc -l)
    if [ "$pods" -ge "$e_pods" ]; then
        healthy=2
        echo "Pods restarting:"
        oc get pods --all-namespaces -o wide | awk -v r="$e_restarts" '$6~/s|m/ { next } $5 < r { next } $3~/^0/ { print }' | grep Running | head -n "$p_pods" | sed -e 's/[[:space:]]\+/ /g'
    fi
fi
# Find own pods in error states
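# ("own" meaning pods whose namespace or name matches adei, adai or bora)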
if [ $healthy -ne 0 ]; then
    pods=$(oc get pods --all-namespaces | grep -P "adei|adai|bora" | awk '$6~/s|m/ { next } { print }' | grep -P "CrashLoopBackOff|Error" | wc -l)
    if [ "$pods" -ge "$e_pods" ]; then
        healthy=2
        echo "Pods with errors:"
        oc get pods --all-namespaces -o wide | grep -P "adei|adai|bora" | awk '$6~/s|m/ { next } { print }' | grep -P "(CrashLoopBackOff|Error)" | head -n "$p_pods" | sed -e 's/[[:space:]]\+/ /g'
    fi
fi
# Find if scheduling takes too long
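# (pods in ContainerCreating for more than ~3 minutes, or with age in hours/days)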
if [ $healthy -ne 0 ]; then
    pods=$(oc get pods --all-namespaces | grep "ContainerCreating" | awk '$6~/d|h/ { print } $6~/s/ { next } int(substr($6, 1, length($6) - 1)) > 3 { print }' | wc -l)
    if [ "$pods" -gt 0 ]; then
        healthy=2
        echo "Scheduling problems for the following pods:"
        oc get pods --all-namespaces | grep "ContainerCreating" | awk '$6~/d|h/ { print } $6~/s/ { next } int(substr($6, 1, length($6) - 1)) > 3 { print }'
    fi
fi
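# Final status line: "<online> <health code> <version> <node summary>"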
echo "$online $healthy $version $nodes"