From 2c3f1522274c09f7cfdb6309adc0719f05c188e9 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Thu, 5 Jul 2018 06:29:09 +0200 Subject: Update monitoring scripts to track leftover OpenVSwitch 'veth' interfaces and clean them up periodically to avoid performance degradation, split kickstart --- roles/ands_monitor/tasks/main.yml | 13 +++++++++++++ roles/ands_monitor/templates/cron/maintain.j2 | 4 ++++ .../templates/scripts/check_server_status.sh.j2 | 11 +++++++++++ .../templates/scripts/check_uptime_status.sh.j2 | 9 ++++++++- .../templates/scripts/clean_rogue_interfaces.sh.j2 | 18 ++++++++++++++++++ .../templates/scripts/list_containers.sh.j2 | 3 +++ roles/ands_monitor/templates/scripts/maintain.sh.j2 | 8 ++++++++ 7 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 roles/ands_monitor/templates/cron/maintain.j2 create mode 100755 roles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j2 create mode 100755 roles/ands_monitor/templates/scripts/list_containers.sh.j2 create mode 100755 roles/ands_monitor/templates/scripts/maintain.sh.j2 (limited to 'roles/ands_monitor') diff --git a/roles/ands_monitor/tasks/main.yml b/roles/ands_monitor/tasks/main.yml index ac70d28..8cac4ea 100644 --- a/roles/ands_monitor/tasks/main.yml +++ b/roles/ands_monitor/tasks/main.yml @@ -1,3 +1,8 @@ +- name: Install monitoring applications + package: name={{item}} state=present + with_items: + - sysstat + - name: Create scripts directory file: path="{{ ands_script_path }}" state=directory @@ -7,3 +12,11 @@ script_name: "{{ item | basename | regex_replace('\\.j2','') }}" with_fileglob: - "{{ role_path }}/templates/scripts/*.j2" + + +- name: "Deploy cron jobs" + template: src="{{ item | quote }}" dest="/etc/cron.d/{{ cron_name }}" owner=root group=root mode=0644 + vars: + cron_name: "{{ item | basename | regex_replace('\\.j2','') }}" + with_fileglob: + - "{{ role_path }}/templates/cron/*.j2" diff --git a/roles/ands_monitor/templates/cron/maintain.j2 
b/roles/ands_monitor/templates/cron/maintain.j2 new file mode 100644 index 0000000..2c3ce9c --- /dev/null +++ b/roles/ands_monitor/templates/cron/maintain.j2 @@ -0,0 +1,4 @@ +SHELL=/bin/bash +PATH=/sbin:/bin:/usr/sbin:/usr/bin +MAILTO=csa-darkserv@suren.me +33 */4 * * * root {{ ands_script_path }}/maintain.sh diff --git a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 index caa63ce..b02f031 100755 --- a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 +++ b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 @@ -32,3 +32,14 @@ if [ -z "$disks" -o "$disks" -ne 0 ]; then echo "Not all disks are online:" /opt/MegaRAID/storcli/storcli64 /c0 show | grep -P "(HDD|SSD)" | grep -v "Onln" fi + +ifaces=$(ovs-vsctl show | grep -oP "could not open network device\s*\Kveth[a-f0-9]+" | wc -l) +if [ "$ifaces" -gt 50 ]; then + echo "Too many rogue interfaces ($ifaces) is registered on OpenVSwitch bridge. It could introduce large delays in pod scheduling..." +fi + +#Check various known problems +vssize=$(du -sm /var/log/openvswitch/ovs-vswitchd.log | cut -f 1) +if [ "$vssize" -gt 128 ]; then + echo "Current OpenVSwitch log is over $vssize MB. It could indicate some severe problems in pod networking..." 
+fi diff --git a/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 index 0602fcb..7acac5f 100755 --- a/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 +++ b/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 @@ -2,7 +2,14 @@ up=$(uptime | cut -d ' ' -f 4- | cut -d ',' -f 1 | sed -re 's/^\s*//') load=$(uptime | cut -d ' ' -f 4- | cut -d ',' -f 4- | cut -d ':' -f 2 | cut -d ',' -f 3 | sed -re 's/^\s*//') +#pods=$(oc get pods --all-namespaces -o wide | grep `hostname` | wc -l) +containers=$(docker ps -q | wc -l) +#processes=$(ps xa --no-headers | wc -l) +mem=$(free -t -g | grep "Mem:" | sed -re 's/\s+/ /g' | cut -d ' ' -f 3) +iops=$(iostat -d | grep -E "^sd" | awk '{s+=$2} END {print s}' | cut -d '.' -f 1) +net=$(ifstat -n; sleep 0.1; ifstat | grep -E "^(enp|ib)" | awk '{s+=$4+$5} END {print s}'); net=$((net / 100)) disks=$(/opt/MegaRAID/storcli/storcli64 /c0 show | grep -P "(HDD|SSD)" | grep "Onln" | wc -l) data=`df -lh /mnt/ands | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4` -echo -n "1 Up $up \${color gray}/ $disks disks, $data free, load: $load" +#echo -n "1 Up $up \${color gray}/ $disks disks, $data free, load: $load, pods: $pods" +echo -en "1 $up\${color gray}, ${disks}/${data}, $(printf %3u ${containers}) c - $(printf %4.1f ${load}), $(printf %3u ${mem}) GB, $(printf %4u ${iops}) IOPS, $(printf %3u ${net}) MB/s" diff --git a/roles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j2 b/roles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j2 new file mode 100755 index 0000000..c04ce60 --- /dev/null +++ b/roles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j2 @@ -0,0 +1,18 @@ +#!/bin/bash + +ifaces=$(ovs-vsctl show | grep -oP "could not open network device\s*\Kveth[a-f0-9]+") +[ $? 
-eq 0 ] || exit + +#Find bridge +#ovs-vsctl list-br + +for iface in $ifaces; do +# echo "$iface" + +# Verify that interface is not active + ip link show | grep $iface &> /dev/null + [ $? -eq 0 ] && continue + + echo "Removing: $iface" + ovs-vsctl del-port br0 $iface +done diff --git a/roles/ands_monitor/templates/scripts/list_containers.sh.j2 b/roles/ands_monitor/templates/scripts/list_containers.sh.j2 new file mode 100755 index 0000000..bac2884 --- /dev/null +++ b/roles/ands_monitor/templates/scripts/list_containers.sh.j2 @@ -0,0 +1,3 @@ +#! /bin/bash + +kubectl get pods --all-namespaces -o jsonpath='{range .items[*]}{@.metadata.name}{" "}{@.spec.containers[*].image}{" - "}{@.status.containerStatuses[*].containerID}{"\n"}{end}' diff --git a/roles/ands_monitor/templates/scripts/maintain.sh.j2 b/roles/ands_monitor/templates/scripts/maintain.sh.j2 new file mode 100755 index 0000000..45c9513 --- /dev/null +++ b/roles/ands_monitor/templates/scripts/maintain.sh.j2 @@ -0,0 +1,8 @@ +#! /bin/bash + +# Left-over network interfaces on the OpenVSwitch bridge after pod termination +ifaces=$(ovs-vsctl show | grep -oP "could not open network device\s*\Kveth[a-f0-9]+" | wc -l) +if [ $ifaces -gt 25 ]; then + echo "Cleaning rogue interfaces ($ifaces) on $(hostname)" + {{ ands_script_path }}/clean_rogue_interfaces.sh > /dev/null +fi -- cgit v1.2.3