#!/bin/bash
#
# Host health check. Every probe below prints a one-line warning on stdout
# when its threshold is crossed and prints nothing otherwise.
#
# "set -e" is deliberately not enabled: a failing probe command must not
# prevent the remaining checks from running. A probe that yields no usable
# number is skipped; the failing tool's own stderr output is left visible.

storcli=/opt/MegaRAID/storcli/storcli64

# Succeeds when $1 is a non-negative decimal integer.
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Prints the "Available" column, in MB, of the local file system holding
# path $1. -P forces one output line per file system so that a long device
# name cannot wrap the line and shift the column.
free_mb() {
  df -lPm "$1" | awk 'NR > 1 { print $4; exit }'
}

# Prints the "<number> <unit>" value of the "$1 Space Available" field found
# in the docker-info text $2. \b keeps "Data" from matching inside "Metadata".
space_available() {
  grep -i "\b$1 Space Available" <<<"$2" | grep -Po "[\d.]+\s+\w+" | head -n 1
}

# Converts a "<number> <unit>" value $1 to GB on stdout: TB is scaled by
# 1024, GB is passed through, any other unit or an empty value yields 0.
space_gb() {
  local value=$1 number
  number=$(grep -Po "[\d.]+" <<<"$value" | head -n 1)
  if [[ -z "$number" ]]; then
    echo 0
    return
  fi
  case "$value" in
    *TB*) echo "$number * 1024" | bc ;;
    *GB*) echo "$number" ;;
    *) echo 0 ;;
  esac
}

# Succeeds when bc reports that $1 is not strictly greater than $2.
not_above() {
  [[ "$(echo "$1 > $2" | bc)" == 0 ]]
}

# Counts local images, intermediate ones included; -q drops the header row
# so it is not counted as an image.
count_images() {
  docker images -a -q | wc -l
}

# --- Disk space (values in MB) ----------------------------------------------
fs=$(free_mb /)
if is_uint "$fs" && ((fs <= 8192)); then
  echo "Only $((fs / 1024)) GB left in the root file system"
fi

datafs=$(free_mb /mnt/ands)
if is_uint "$datafs" && ((datafs <= 1048576)); then
  echo "Only $((datafs / 1024)) GB left in the data file system"
fi

# --- Memory (7th column of the "Mem" line of free -g, in GB) ----------------
mem=$(free -g | awk '/Mem/ { print $7; exit }')
if is_uint "$mem" && ((mem <= 16)); then
  echo "The system is starving on memory, $mem GB left free"
fi

# --- CPU load ---------------------------------------------------------------
# Last field of uptime is the 15-minute load average; a decimal comma is
# turned into a dot so bc can parse it.
cpu=$(uptime | awk '{ gsub(/,/, ".", $NF); print $NF }')
# Highest processor index listed in /proc/cpuinfo (CPU count minus one);
# defaults to 0 when no processor line is found.
max_cpu=$(awk -F: '/^processor/ { n = $2 } END { print n + 0 }' /proc/cpuinfo)
# Load as an integer percentage of the CPU count (bc truncates at scale 0).
cpu_usage=$(echo "100 * $cpu / ( $max_cpu + 1)" | bc)
if is_uint "$cpu_usage" && ((cpu_usage >= 80)); then
  echo "The system is starving on cpu, $cpu ($cpu_usage%) is load average for the last 15 min"
fi

# --- RAID -------------------------------------------------------------------
# storcli is queried once per check so the lines that are reported are the
# same lines that were tested.
vol_state=$("$storcli" /c0/v0 show | grep -P "^0/0")
if ! grep -q "Optl" <<<"$vol_state"; then
  echo "Raid volume is not optimal:"
  if [[ -n "$vol_state" ]]; then
    printf '%s\n' "$vol_state"
  fi
fi

offline_disks=$("$storcli" /c0 show | grep -P "(HDD|SSD)" | grep -v "Onln")
if [[ -n "$offline_disks" ]]; then
  echo "Not all disks are online:"
  printf '%s\n' "$offline_disks"
fi

# --- OpenVSwitch ------------------------------------------------------------
ifaces=$(ovs-vsctl show | grep -oP "could not open network device\s*\Kveth[a-f0-9]+" | wc -l)
if ((ifaces > 50)); then
  echo "Too many rogue interfaces ($ifaces) is registered on OpenVSwitch bridge. It could introduce large delays in pod scheduling..."
fi

# Check various known problems
vssize=$(du -sm /var/log/openvswitch/ovs-vswitchd.log | cut -f 1)
if is_uint "$vssize" && ((vssize > 128)); then
  echo "Current OpenVSwitch log is over $vssize MB. It could indicate some severe problems in pod networking..."
fi

# --- Network ----------------------------------------------------------------
if ! host google.com &>/dev/null; then
  echo "DNS problems, can't resolve google.com"
fi

if ! ping -c 1 -W 2 8.8.8.8 &>/dev/null; then
  echo "Networkign problems, can't ping Google's public DNS server"
fi

# --- Docker -----------------------------------------------------------------
info=$(LC_ALL=C docker info)
if [[ -n "$info" ]]; then
  # Prefer the image count reported by docker info; list the images only
  # when that count could not be parsed.
  images=$(grep -i images <<<"$info" | grep -Po "\d+" | head -n 1)
  if ! is_uint "$images"; then
    images=$(count_images)
  fi

  # Container counters are collected here but not evaluated by the checks
  # below.
  containers=$(grep -i containers -A 5 <<<"$info")
  c=$(grep -i containers <<<"$info" | grep -Po "\d+")
  c_running=$(grep -i running <<<"$containers" | grep -Po "\d+")
  c_paused=$(grep -i paused <<<"$containers" | grep -Po "\d+")
  c_stopped=$(grep -i stopped <<<"$containers" | grep -Po "\d+")

  data_space=$(space_available "Data" "$info")
  data_size=$(space_gb "$data_space")
  metadata_space=$(space_available "Metadata" "$info")
  metadata_size=$(space_gb "$metadata_space")

  # Thresholds are in GB.
  if not_above "$data_size" 300; then
    echo "Docker Data Space is critically low ($data_space)"
  fi
  if not_above "$metadata_size" 5; then
    echo "Docker Metadata Space is critically low ($metadata_space)"
  fi
else
  images=$(count_images)
  echo "docker info has timed out"
fi

# An if statement (rather than "[ ... ] && echo") keeps the script's exit
# status at 0 when the image count is healthy.
if is_uint "$images" && ((images > 1000)); then
  echo "Too many docker images ($images) will cause severe scheduling penalties"
fi