Inhaltsverzeichnis

Heartbeat

Installation

# aptitude update && aptitude install heartbeat
# mkdir -p /var/run/heartbeat/rsctmp/send_arp

Die Konfigurationsdateien in der Übersicht

# tar cvf /var/lib/dokuwiki/data/media/openqrm_had.tar /etc/ha.d/authkeys /etc/ha.d/ha.cf /etc/ha.d/haresources /etc/ha.d/resource.d/drbddisk /etc/ha.d/resource.d/fsck_jfs /etc/ha.d/resource.d/myopenqrm /etc/ha.d/resource.d/mydrbd /etc/ha.d/resource.d/killpid

authkeys

# Create /etc/ha.d/authkeys with one random SHA1 key (auth method 1).
# - umask 077 inside the subshell: the file is never readable by others,
#   not even for the moment between creation and chmod.
# - awk '{print $NF}': depending on the version, openssl prints either the
#   bare digest or a prefix such as "(stdin)= <digest>"; only the digest
#   itself may end up in the key field.
(
        umask 077
        printf 'auth 1\n1 sha1 %s\n' \
                "$(dd if=/dev/urandom bs=512 count=1 2>/dev/null | openssl sha1 | awk '{print $NF}')" \
                > /etc/ha.d/authkeys
)
chmod 0600 /etc/ha.d/authkeys

Die so erstellte Datei muss auf beide Knoten kopiert werden.

FIXME

/etc/ha.d/haresources

rbopenqrm01 \
        mac::bond0::00:19:19:19:19:19 \
        IPaddr2::192.168.1.100/24/bond0:0 \
        mydrbd \
        drbddisk::data \
        LVM::data \
        mount_all_dev \
        check_lsof \
        mysql \
        ntp \
        squid \
        apache2 \
        nagios-nrpe-server

/opt/pidfile.cfg

# Derive the default PID file from the name of the calling script, unless the
# caller has already set PIDDATEI before sourcing this file.
#   SCRIPT   - file name of the calling script (without directory)
#   PIDNAME  - SCRIPT cut at its first dot (e.g. "mac.sh" -> "mac")
#   PIDDATEI - /var/run/<PIDNAME>.pid
if [ -z "${PIDDATEI}" ] ; then
        # quoted and with "--" so that paths containing blanks or a leading
        # dash are handled correctly
        SCRIPT="$(basename -- "${0}")"
        PIDNAME="${SCRIPT%%.*}"
        PIDDATEI="/var/run/${PIDNAME}.pid"
fi

##------------------------------------------------------------------
# Create the PID file ${PIDDATEI} containing the PID of the current shell.
#
# Globals:  PIDDATEI (read), SCRIPT and HOSTNAME (read, for the error message)
# Outputs:  one status line on stdout
# Note:     if the PID file already exists, the whole calling script is
#           terminated with exit status 0 (the function does not return).
pid_beginn()
{
        if [ -e "${PIDDATEI}" ] ; then
                exit 0
        fi

        # explicit if/else instead of "a && b || c": the error branch must
        # only run when writing the PID file failed
        if echo "$$" > "${PIDDATEI}" ; then
                echo "${PIDDATEI} wurde angelegt"
        else
                echo "${HOSTNAME}:${SCRIPT} konnte ${PIDDATEI} nicht anlegen..."
        fi
}
##------------------------------------------------------------------
# Report on stdout whether the PID file ${PIDDATEI} exists.
#
# Globals:  PIDDATEI (read), SCRIPT (read)
# Outputs:  "... läuft schon ..." including the verbatim content of the PID
#           file if it exists, otherwise "... ist gestoppt ..."
pid_status()
{
        if [ -e "${PIDDATEI}" ] ; then
                # path is quoted so that PID files with blanks in their name can be read
                echo "${SCRIPT} läuft schon mit der PID $(cat "${PIDDATEI}") ..."
        else
                echo "${SCRIPT} ist gestoppt ..."
        fi
}
##------------------------------------------------------------------
# Remove the PID file ${PIDDATEI} if it exists.
#
# Globals:  PIDDATEI (read), SCRIPT and HOSTNAME (read, for the error message)
# Outputs:  the verbose output of rm, or an error line if rm failed;
#           nothing if there is no PID file
pid_ende()
{
        if [ -e "${PIDDATEI}" ] ; then
                # quoted and with "--": otherwise a path with blanks is split
                # into several names and the real file is silently left behind
                rm -fv -- "${PIDDATEI}" || echo "${HOSTNAME}:${SCRIPT} PID-File (${PIDDATEI}) kann nicht geloescht werden..."
        fi
}
##------------------------------------------------------------------

/etc/ha.d/resource.d/mac

#!/bin/bash
#
# Heartbeat resource script "mac": sets a given MAC address on a network
# interface while the resource is active and restores the previous one
# on stop.
#
# Usage:  mac <interface> <mac-address> {start|status|stop}
#         (haresources entry: mac::<interface>::<mac-address>)
#
# The original MAC address is kept in ${PIDDATEI}; that file doubles as the
# "resource is active" marker used by pid_beginn/pid_status/pid_ende.

#set -x

# provides PIDDATEI, SCRIPT and the helpers pid_beginn, pid_status, pid_ende
. /opt/pidfile.cfg

# the action is always the last argument
KOMMANDO="$(echo "${@}" | awk '{print $NF}')"

case "${KOMMANDO}" in
        start)
                # leaves the script with status 0 if the marker file already exists
                pid_beginn

                # Read the current address from the interface's own sysfs entry.
                # Searching all of /sys for "address" files could return several
                # matches, or none at all - and a bare "cat" would then block on stdin.
                ADRESSDATEI="/sys/class/net/${1}/address"
                if [ ! -r "${ADRESSDATEI}" ] ; then
                        echo "${HOSTNAME}:${SCRIPT} Schnittstelle '${1}' nicht gefunden (${ADRESSDATEI} fehlt)" >&2
                        pid_ende
                        exit 1
                fi

                # remember the original MAC address for "stop"
                cat "${ADRESSDATEI}" > "${PIDDATEI}"
                ifdown "${1}"
                ifconfig "${1}" hw ether "${2}"
                ifup "${1}"
                ;;

        status)
                pid_status
                ;;

        stop)
                # only restore if "start" has stored an address before
                if [ -e "${PIDDATEI}" ] ; then
                        ifdown "${1}"
                        ifconfig "${1}" hw ether "$(cat "${PIDDATEI}")"
                        ifup "${1}"
                fi
                pid_ende
                ;;

esac

exit 0

/etc/ha.d/resource.d/drbddisk

#!/bin/bash
#
# Resource script for heartbeat: makes a DRBD resource Primary on "start",
# Secondary on "stop" and reports its role on "status".
#
# Usage: drbddisk [resource] {start|stop|status}
#
# Copyright 2003-2008 LINBIT Information Technologies
# Philipp Reisner, Lars Ellenberg
#
###

DEFAULTFILE="/etc/default/drbd"
DRBDADM="/sbin/drbdadm"

# local settings, if present, are read after the defaults above
[ -f $DEFAULTFILE ] && . $DEFAULTFILE

# With two arguments the first one names the resource; otherwise the
# command applies to all resources.
case "$#" in
    2)
      RES="$1"
      CMD="$2"
      ;;
    *)
      RES="all"
      CMD="$1"
      ;;
esac

## EXIT CODES
# This is a "legacy heartbeat R1 resource agent" script, so exit codes do
# not matter much as long as we conform to
#  http://wiki.linux-ha.org/HeartbeatResourceAgent
# Where possible the LSB init-script exit codes are used as well:
#  http://refspecs.linux-foundation.org/LSB_3.1.0/
#     LSB-Core-generic/LSB-Core-generic/iniscrptact.html
####

case "$CMD" in
    start)
      # Up to six attempts, one second apart, in case heartbeat's deadtime
      # was smaller than the drbd ping time.
      attempts=6
      until $DRBDADM primary $RES; do
              (( --attempts )) || exit 1 # LSB generic error
              sleep 1
      done
      ;;
    stop)
      $DRBDADM secondary $RES
      rc=$?
      if [ "$rc" -eq 0 ]; then
              exit 0
      elif [ "$rc" -eq 11 ]; then
              # see drbdadm_main.c adm_generic and m_system
              # as well as drbdsetup.c:
              # a role change was attempted, but failed.
              echo >&2 "$DRBDADM secondary $RES: exit code $rc, mapping to 1"
              exit 1 # LSB generic error
      else
              # Any other error (maybe a syntax error in the config file):
              # to not confuse heartbeat further, and to avoid a reboot due
              # to "failed stop recovery", pretend the stop succeeded.
              echo >&2 "$DRBDADM secondary $RES: exit code $rc, mapping to 0"
              exit 0
      fi
      ;;
    status)
      [ "$RES" != "all" ] || {
          echo "A resource name is required for status inquiries."
          exit 10
      }
      # drbdadm prints "<local role>/<peer role>"; keep the local part
      roles=$( $DRBDADM role $RES )
      local_role=${roles%/*}
      if [ "$local_role" = "Primary" ]; then
              echo "running (Primary)"
              exit 0 # LSB status "service is OK"
      fi
      case $local_role in
              Secondary|Unconfigured)
                      echo "stopped ($local_role)" ;;
              "")
                      echo "stopped" ;;
              *)
                      # unexpected role string: report it verbatim
                      echo "stopped ($roles)" ;;
      esac
      exit 3 # LSB status "service is not running"
      ;;
    *)
      echo "Usage: drbddisk [resource] {start|stop|status}"
      exit 1
      ;;
esac

exit 0

/etc/ha.d/resource.d/mount_all_dev

#!/bin/bash
#
# Heartbeat resource script: checks and mounts all "noauto" file systems of
# the HA volume group on start and unmounts them again on stop.
#
# Usage: mount_all_dev {start|status|stop}

VOLGRUPPE="data"

# provides PIDDATEI, SCRIPT and the helpers pid_beginn, pid_status, pid_ende
. /opt/pidfile.cfg

# Print, one per line, the devices of all fstab entries marked "noauto" that
# are either logical volumes of ${VOLGRUPPE} (/dev/mapper/<vg>-* or
# /dev/<vg>/*) or given as LABEL=<label>; labels are resolved to their
# device with blkid.
ha_geraete()
{
        local geraet rest
        grep -F noauto /etc/fstab | while read -r geraet rest ; do
                geraet="${geraet//\"/}"         # strip quotes, e.g. LABEL="x"
                case "${geraet}" in
                        "/dev/mapper/${VOLGRUPPE}-"*|"/dev/${VOLGRUPPE}/"*)
                                echo "${geraet}"
                                ;;
                        LABEL=*)
                                blkid -L "${geraet#LABEL=}"
                                ;;
                esac
        done
}

case "${1}" in
        start)
                # leaves the script with status 0 if the PID file already exists
                pid_beginn
                # the device list is read from fd 3 so that fsck/mount keep stdin
                while IFS= read -r mountdev <&3 ; do
                        # determined anew for every device; a value left over
                        # from the previous device must never be reused
                        ID_FS_TYPE="$(blkid -o value -s TYPE "${mountdev}")"
                        if [ -n "${ID_FS_TYPE}" ] ; then
                                fsck -t "${ID_FS_TYPE}" -y "${mountdev}"
                                FSCK_RC=$?
                                # fsck: 0 = no errors, 1 = errors corrected.
                                # Both leave a mountable file system.
                                if [ "${FSCK_RC}" -le 1 ] ; then
                                        mount -v "${mountdev}"
                                else
                                        echo "${HOSTNAME}:${SCRIPT} fsck fuer ${mountdev} fehlgeschlagen (Exit-Code ${FSCK_RC}), wird nicht eingehaengt" >&2
                                fi
                        fi
                done 3< <(ha_geraete)
                ;;

        status)
                pid_status
                ;;

        stop)
                # mtab is read backwards so that nested mounts are released first
                for mountpoint in $(tac /etc/mtab | grep -E "^/dev/mapper/${VOLGRUPPE}-" | awk '{print $2}')
                do
                        umount -v "${mountpoint}"
                done
                pid_ende
                ;;

esac

exit 0

/etc/ha.d/resource.d/check_lsof

#!/bin/sh
#
# Heartbeat resource script: on stop, waits until no file below the HA mount
# points is open any more, so that the file systems can be unmounted.
#
# Usage: check_lsof {start|status|stop}

# HA volume group; set to the group that haresources (LVM::data) and
# mount_all_dev use - adjust if your cluster uses a different one
HAVOLGR="data"

# Alternation of all HA mount points, each preceded by a blank
# (" /mnt/a| /mnt/b"); built from the "noauto" fstab entries that are given
# by LABEL= or belong to ${HAVOLGR}. Empty if there is no such entry.
MPOINTS="$(grep -F 'noauto' /etc/fstab \
        | grep -E "^LABEL=|^/dev/mapper/${HAVOLGR}-|^/dev/${HAVOLGR}/" \
        | awk '{printf "%s %s", sep, $2; sep="|"}')"

# Print the lsof lines that refer to one of the HA mount points.
# With an empty MPOINTS nothing is printed: an empty pattern would match
# every lsof line and "stop" would wait forever.
offene_dateien()
{
        [ -n "${MPOINTS}" ] || return 0
        lsof | grep -E "${MPOINTS}"
}

case "$1" in
        start)
                ;;

        status)
                offene_dateien;echo
                ;;

        stop)
                while [ -n "$(offene_dateien)" ]
                do
                        echo "Warte bis die noch offenen Datei-Haendler geschlossen sind..."
                        sleep 4
                        # processes with open files below a /collectd/ path are terminated
                        RPIDS="$(lsof | grep -F '/collectd/' | awk '{print $2}' | sort | uniq)"
                        if [ -n "${RPIDS}" ] ; then
                                # unquoted on purpose: one argument per PID
                                kill ${RPIDS}
                        fi
                        sleep 4
                done
                ;;

esac

exit 0

Cluster-Schwenk

# /etc/init.d/heartbeat standby && tail -f /var/log/syslog

May 21 11:40:58 rbopenqrm01 heartbeat: [1893]: info: rbopenqrm02 wants to go standby [all]
May 21 11:40:58 rbopenqrm01 ipfail: [2172]: debug: Other side is unstable.
May 21 11:40:58 rbopenqrm01 kernel: [ 5357.069101] block drbd1: peer( Primary -> Secondary ) 
May 21 11:40:59 rbopenqrm01 heartbeat: [1893]: info: standby: acquire [all] resources from rbopenqrm02
May 21 11:40:59 rbopenqrm01 heartbeat: [3830]: info: acquire all HA resources (standby).
May 21 11:40:59 rbopenqrm01 ResourceManager[3844]: info: Acquiring resource group: rbopenqrm01 IPaddr::10.10.5.80/24/br0 mydrbd drbddisk::data LVM::data Filesystem::/dev/data/mysql::/var/lib/mysql::jfs Filesystem::/dev/data/etcmysql::/etc/mysql::jfs mysql
May 21 11:40:59 rbopenqrm01 IPaddr[3871]: INFO:  Resource is stopped
May 21 11:40:59 rbopenqrm01 ResourceManager[3844]: info: Running /etc/ha.d/resource.d/IPaddr 10.10.5.80/24/br0 start
May 21 11:40:59 rbopenqrm01 IPaddr[3951]: INFO: Using calculated netmask for 10.10.5.80: 255.255.255.0
May 21 11:40:59 rbopenqrm01 IPaddr[3951]: INFO: eval ifconfig br0:0 10.10.5.80 netmask 255.255.255.0 broadcast 10.10.5.255
May 21 11:40:59 rbopenqrm01 IPaddr[3927]: INFO:  Success
May 21 11:40:59 rbopenqrm01 ResourceManager[3844]: info: Running /etc/ha.d/resource.d/mydrbd  start
May 21 11:40:59 rbopenqrm01 mountd[3767]: Caught signal 15, un-registering and exiting.
May 21 11:40:59 rbopenqrm01 kernel: [ 5357.466126] nfsd: last server has exited, flushing export cache
May 21 11:40:59 rbopenqrm01 ResourceManager[3844]: info: Running /etc/ha.d/resource.d/drbddisk data start
May 21 11:40:59 rbopenqrm01 kernel: [ 5357.492064] block drbd1: role( Secondary -> Primary ) 
May 21 11:40:59 rbopenqrm01 LVM[4119]: INFO: LVM Volume data is offline
May 21 11:40:59 rbopenqrm01 LVM[4113]: INFO:  Resource is stopped
May 21 11:40:59 rbopenqrm01 ResourceManager[3844]: info: Running /etc/ha.d/resource.d/LVM data start
May 21 11:40:59 rbopenqrm01 LVM[4171]: INFO: Activating volume group data
May 21 11:40:59 rbopenqrm01 LVM[4171]: INFO: File descriptor 4 (socket:[5467]) leaked on vgscan invocation. Parent PID 4171: /bin/sh File descriptor 5 (socket:[5468]) leaked on vgscan invocation. Parent PID 4171: /bin/sh File descriptor 6 (/proc/loadavg) leaked on vgscan invocation. Parent PID 4171: /bin/sh File descriptor 7 (socket:[5861]) leaked on vgscan invocation. Parent PID 4171: /bin/sh File descriptor 8 (socket:[5479]) leaked on vgscan invocation. Parent PID 4171: /bin/sh File descriptor 9 (socket:[5474]) leaked on vgscan invocation. Parent PID 4171: /bin/sh File descriptor 10 (socket:[5867]) leaked on vgscan invocation. Parent PID 4171: /bin/sh File descriptor 11 (socket:[5476]) leaked on vgscan invocation. Parent PID 4171: /bin/sh File descriptor 12 (socket:[5481]) leaked on vgscan invocation. Parent PID 4171: /bin/sh File des criptor 13 (pipe:[9664]) leaked on vgscan invocation. Parent PID 4171: /bin/sh Reading all physical volumes. This may take a while... Found volume group "data" using metadata type lvm2
May 21 11:40:59 rbopenqrm01 LVM[4171]: INFO: File descriptor 4 (socket:[5467]) leaked on vgchange invocation. Parent PID 4171: /bin/sh File descriptor 5 (socket:[5468]) leaked on vgchange invocation. Parent PID 4171: /bin/sh File descriptor 6 (/proc/loadavg) leaked on vgchange invocation. Parent PID 4171: /bin/sh File descriptor 7 (socket:[5861]) leaked on vgchange invocation. Parent PID 4171: /bin/sh File descriptor 8 (socket:[5479]) leaked on vgchange invocation. Parent PID 4171: /bin/sh File descriptor 9 (socket:[5474]) leaked on vgchange invocation. Parent PID 4171: /bin/sh File descriptor 10 (socket:[5867]) leaked on vgchange invocation. Parent PID 4171: /bin/sh File descriptor 11 (socket:[5476]) leaked on vgchange invocation. Parent PID 4171: /bin/sh File descriptor 12 (socket:[5481]) leaked on vgchange invocation. Parent PID 4171: /bin/sh File descriptor 13 (pipe:[9664]) leaked on vgchange invocation. Parent PID 4171: /bin/sh 2 logical volume(s) in volume group "data" now active
May 21 11:40:59 rbopenqrm01 LVM[4165]: INFO:  Success
May 21 11:40:59 rbopenqrm01 Filesystem[4233]: INFO:  Resource is stopped
May 21 11:40:59 rbopenqrm01 ResourceManager[3844]: info: Running /etc/ha.d/resource.d/Filesystem /dev/data/mysql /var/lib/mysql jfs start
May 21 11:40:59 rbopenqrm01 Filesystem[4303]: INFO: Running start for /dev/data/mysql on /var/lib/mysql
May 21 11:40:59 rbopenqrm01 Filesystem[4297]: INFO:  Success
May 21 11:40:59 rbopenqrm01 Filesystem[4375]: INFO:  Resource is stopped
May 21 11:40:59 rbopenqrm01 ResourceManager[3844]: info: Running /etc/ha.d/resource.d/Filesystem /dev/data/etcmysql /etc/mysql jfs start
May 21 11:40:59 rbopenqrm01 Filesystem[4444]: INFO: Running start for /dev/data/etcmysql on /etc/mysql
May 21 11:41:00 rbopenqrm01 Filesystem[4438]: INFO:  Success
May 21 11:41:00 rbopenqrm01 ResourceManager[3844]: info: Running /etc/init.d/mysql  start
May 21 11:41:00 rbopenqrm01 kernel: [ 5358.319849] type=1505 audit(1274434860.115:10):  operation="profile_replace" pid=4529 name="/usr/sbin/mysqld"
May 21 11:41:01 rbopenqrm01 /etc/mysql/debian-start[4544]: Upgrading MySQL tables if necessary.
May 21 11:41:01 rbopenqrm01 heartbeat: [3830]: info: all HA resource acquisition completed (standby).
May 21 11:41:01 rbopenqrm01 heartbeat: [1893]: info: Standby resource acquisition done [all].
May 21 11:41:01 rbopenqrm01 /etc/mysql/debian-start[4547]: /usr/bin/mysql_upgrade: the '--basedir' option is always ignored
May 21 11:41:01 rbopenqrm01 /etc/mysql/debian-start[4547]: Looking for 'mysql' as: /usr/bin/mysql
May 21 11:41:01 rbopenqrm01 /etc/mysql/debian-start[4547]: Looking for 'mysqlcheck' as: /usr/bin/mysqlcheck
May 21 11:41:01 rbopenqrm01 /etc/mysql/debian-start[4547]: This installation of MySQL is already upgraded to 5.1.41, use --force if you still need to run mysql_upgrade
May 21 11:41:01 rbopenqrm01 /etc/mysql/debian-start[4554]: Checking for insecure root accounts.
May 21 11:41:01 rbopenqrm01 /etc/mysql/debian-start[4558]: Triggering myisam-recover for all MyISAM tables
May 21 11:41:02 rbopenqrm01 heartbeat: [1893]: info: remote resource transition completed.

Bei einem sauberen Schwenk sollte hier am Ende "remote resource transition completed." stehen.

Heartbeat schlägt nicht mehr -> Pacemaker gibt jetzt den Takt an

Pacemaker und Corosync

Pacemaker, welches ursprünglich aus dem Entwicklungsstrang von HA-Linux (Heartbeat) entkoppelt wurde, erledigt hier einen wesentlich besseren Job als Heartbeat. Vor allem lassen sich nicht nur komplette Knoten, sondern auch einzelne Services überwachen und bei Bedarf auf demselben oder einem anderen Knoten starten.

Auch in der offiziellen Doku von DRBD wird eher auf Pacemaker gesetzt:

DRBD is frequently found in system configurations using the Linux-HA cluster manager ("Heartbeat"). Heartbeat has been superseded by the Pacemaker cluster manager and the latter should be used whenever possible — please see Chapter 8, Integrating DRBD with Pacemaker clusters for more information. Nonetheless, this chapter outlines Heartbeat configurations and is intended for users who must maintain existing legacy Heartbeat systems for policy reasons.