Graphite is an enterprise-scale monitoring tool that stores numeric time-series data and can render graphs of this data on demand.
Graphite consists of 3 software components:
- carbon - a Twisted daemon that listens for time-series data
- whisper - a simple database library for storing time-series data (similar in design to RRD)
- graphite webapp - a Django webapp that renders graphs on demand using Cairo
# carbon / graphite */5 * * * * ~/Bin/ &>/dev/null
#!/bin/sh
# Walks a device over SNMP (v2c, community "public") and forwards selected
# values to carbon, one "<prefix>.<name> <value> <timestamp>" line per stat.
#
# Settings are taken from the environment; the value shown in the assignments
# below is used when a variable is unset or empty:
#   METRIC_PREFIX - At least the hostname to record these stats under in
#                   graphite - collectd.someserver
#   SNMP_HOST     - the host to pull snmp stats from
#   CARBON_HOST   - The carbon host to send the stats
#   CARBON_PORT   - The carbon port to send the stats
#
# NOTE(review): in this copy the OID arguments passed to snmpwalk are empty
# ("") and the OID patterns in the case statements are just "." - presumably
# the numeric OIDs were stripped when the script was pasted. They are kept
# exactly as found; confirm them against the original script before use.

METRIC_PREFIX="${METRIC_PREFIX:-ubnt}"
SNMP_HOST="${SNMP_HOST:-}"
CARBON_HOST="${CARBON_HOST:-123.456.789.0}"
CARBON_PORT="${CARBON_PORT:-2003}"

# A single timestamp is shared by every stat sent during this run.
NOW="$(date +%s)"

# Prints the first whitespace-separated field of an snmpwalk output line
# (the OID).
#   $1 - one line of snmpwalk output
snmpId() {
  printf '%s\n' "$1" | awk '{print $1}'
}

# Prints the third whitespace-separated field of an snmpwalk output line
# (the first word after "="), with double quotes removed.
#   $1 - one line of snmpwalk output
snmpVal() {
  printf '%s\n' "$1" | awk '{print $3}' | tr -d '"'
}

# Sends one stat line to carbon through nc.
#   $1 - metric name, appended to $METRIC_PREFIX
#   $2 - value
sendStat() {
  printf '%s.%s %s %s\n' "$METRIC_PREFIX" "$1" "$2" "$NOW" |
    nc "$CARBON_HOST" "$CARBON_PORT"
  # echo "$METRIC_PREFIX.$1 $2 $NOW"
}

# Runs snmpwalk against $SNMP_HOST with the output options used throughout
# this script.
#   $1 - OID to walk
snmpWalk() {
  snmpwalk -OQtn -v 2c -c "public" "$SNMP_HOST" "$1"
}

# Interface name table, looked up by findName.
IFACE_NAMES="$(snmpWalk "")"

# Prints the interface name whose OID in $IFACE_NAMES ends with the same
# last component as the OID on the given line.
#   $1 - one line of snmpwalk output
# The comparison is against the whole trailing ".<index>" component, so
# index 1 does not also pick up the entries for 11, 21, ...
findName() {
  _idx="$(snmpId "$1")"
  _idx="${_idx##*.}"
  printf '%s\n' "$IFACE_NAMES" |
    awk -v suffix=".$_idx" '
      {
        pos = length($1) - length(suffix) + 1
        if (pos >= 1 && substr($1, pos) == suffix) print $3
      }' |
    tr -d '"'
}

# Memory info
snmpWalk "" | while IFS= read -r line; do
  field=""
  id="$(snmpId "$line")"
  val="$(snmpVal "$line")"
  case "$id" in
    ".") field="";;
    ".") field="";;
    ".") field="memory.memory.shared";;
    ".") field="memory.memory.buffered";;
    ".") field="memory.memory.cached";;
  esac
  if [ "$field" != "" ]; then
    sendStat "$field" "$val"
  fi
done

# System info
snmpWalk "" | while IFS= read -r line; do
  field=""
  id="$(snmpId "$line")"
  val="$(snmpVal "$line")"
  case "$id" in
    # Raw value is divided by 100 - presumably timeticks (1/100 s) to
    # seconds; confirm against the OID this arm is meant to match.
    ".") field="uptime.uptime"; val="$(expr "$val" / 100)";;
    ".") field="users.users";;
    ".") field="processes.processes";;
  esac
  if [ "$field" != "" ]; then
    sendStat "$field" "$val"
  fi
done

# load
snmpWalk "" | while IFS= read -r line; do
  field=""
  id="$(snmpId "$line")"
  val="$(snmpVal "$line")"
  case "$id" in
    ".") field="load.load.shortterm";;
    ".") field="load.load.midterm";;
    ".") field="load.load.longterm";;
  esac
  if [ "$field" != "" ]; then
    sendStat "$field" "$val"
  fi
done

# cpu info
snmpWalk "" | while IFS= read -r line; do
  field=""
  id="$(snmpId "$line")"
  val="$(snmpVal "$line")"
  case "$id" in
    ".") field="cpu.0.percent.user";;
    ".") field="cpu.0.percent.system";;
    ".") field="cpu.0.percent.idle";;
  esac
  if [ "$field" != "" ]; then
    sendStat "$field" "$val"
  fi
done

# 32 bit interface stats
snmpWalk "" | while IFS= read -r line; do
  field=""
  id="$(snmpId "$line")"
  val="$(snmpVal "$line")"
  name="$(findName "$line")"
  case "$id" in
    "."*) field="interface.$name.if_discards.rx";;
    "."*) field="interface.$name.if_errors.rx";;
    "."*) field="interface.$name.if_unknown_protocols.rx";;
    "."*) field="interface.$name.if_discards.tx";;
    "."*) field="interface.$name.if_errors.tx";;
    "."*) field="interface.$name.if_queue.tx";;
  esac
  if [ "$field" != "" ]; then
    sendStat "$field" "$val"
  fi
done

# 64 bit interface stats
snmpWalk "" | while IFS= read -r line; do
  field=""
  id="$(snmpId "$line")"
  val="$(snmpVal "$line")"
  name="$(findName "$line")"
  case "$id" in
    "."*) field="interface.$name.if_octets.rx";;
    "."*) field="interface.$name.if_unicast_packets.rx";;
    "."*) field="interface.$name.if_multicast_packets.rx";;
    "."*) field="interface.$name.if_broadcast_packets.rx";;
    "."*) field="interface.$name.if_octets.tx";;
    "."*) field="interface.$name.if_unicast_packets.tx";;
    "."*) field="interface.$name.if_multicast_packets.tx";;
    "."*) field="interface.$name.if_broadcast_packets.tx";;
  esac
  if [ "$field" != "" ]; then
    sendStat "$field" "$val"
  fi
done
# graphite carbon * * * * * /root/bin/
#!/bin/bash
# Sends the first field of /proc/loadavg (the 1-minute load average) to
# carbon as the metric "nas.cpu.cpu.load", stamped with the current time.

# awk's error output is discarded, as in the original one-liner.
load="$(awk '{ print $1 }' /proc/loadavg 2>/dev/null)"

# With no value the line would contain only the metric name and the
# timestamp, so report the problem instead of sending it.
if [[ -z "$load" ]]; then
  printf 'could not read a load value from /proc/loadavg\n' >&2
  exit 1
fi

printf 'nas.cpu.cpu.load %s %s\n' "$load" "$(date +%s)" | nc 123.456.789.0 2003
exit
Properly naming your metrics is critical to avoid conflicts, confusing data and potentially wrong interpretation later on. I like to organize metrics using the following schema:
<namespace>.<instrumented section>.<target (noun)>.<action (past tense verb)>
For example...
accounts.authentication.password.attempted
accounts.authentication.password.succeeded
accounts.authentication.password.failed
I use nouns to define the target and past tense verbs to define the action. This becomes a useful convention when you need to nest metrics. In the above example, let’s say I want to monitor the reasons for the failed password authentications. Here is how I would organize the extra stats...
accounts.authentication.password.failure.no_email_found
accounts.authentication.password.failure.password_check_failed
accounts.authentication.password.failure.password_reset_required