Benutzer-Werkzeuge

Webseiten-Werkzeuge


hardware-ueberwachung

Dies ist eine alte Version des Dokuments!


Hardware-Überwachung

Standard-Hardware

Geräte am PCI-Bus

Sie möchten Informationen über die Geräte am PCI-Bus von FreeBSD finden:

> pciconf -lv
...
xhci0@pci0:0:20:0:      class=0x0c0330 rev=0x00 hdr=0x00 vendor=0x8086 device=0x06ed subvendor=0x1849 subdevice=0x06ed
    vendor     = 'Intel Corporation'
    device     = 'Comet Lake USB 3.1 xHCI Host Controller'
    class      = serial bus
    subclass   = USB
...
ahci0@pci0:3:0:0:       class=0x010601 rev=0x02 hdr=0x00 vendor=0x1b21 device=0x0612 subvendor=0x1b21 subdevice=0x1060
    vendor     = 'ASMedia Technology Inc.'
    device     = 'ASM1062 Serial ATA Controller'
    class      = mass storage
    subclass   = SATA
...
em0@pci0:7:0:0: class=0x020000 rev=0x00 hdr=0x00 vendor=0x8086 device=0x10d3 subvendor=0x8086 subdevice=0xa01f
    vendor     = 'Intel Corporation'
    device     = '82574L Gigabit Network Connection'
    class      = network
    subclass   = ethernet

Temperatur

CPU

# /home/sbin/temp_cpu.sh
CPU 0: 38,0° C
CPU 1: 38,0° C
CPU 2: 38,0° C
CPU 3: 38,0° C
CPU 4: 41,0° C
CPU 5: 41,0° C
CPU 6: 36,0° C
CPU 7: 36,0° C
CPU 8: 39,0° C
CPU 9: 39,0° C
CPU 10: 36,0° C
CPU 11: 36,0° C
~/bin/temp_cpu.sh
#!/bin/sh
#
# Print one colour-coded temperature line per CPU, based on the
# dev.cpu.<n>.temperature values reported by sysctl.

#VERSION="v2018040100"          # copied from the internet
VERSION='v2022051100'           # modified; now in colour as well

#------------------------------------------------------------------------------#
### Thresholds (compared against the integer part of the reading)
#
ROT='90'        # above this: too hot (shown in red)
#
GELB='60'       # above this: under load (shown in yellow)

#------------------------------------------------------------------------------#
# ANSI colour escape sequences (interpreted later by "echo -e")
GREEN='\033[32m'
YELLOW='\033[33m'
RED='\033[31m'
OFF='\033[0m'

#------------------------------------------------------------------------------#

#sysctl -a | grep -F dev.cpu | fgrep temperature
#
# The awk stage replaces every "." and ":" with a blank and strips a trailing
# "C", then prints fields 3, 5 and 6: the CPU number, the integer part and the
# fractional part of the reading.
# NOTE(review): assumes lines shaped like "dev.cpu.0.temperature: 38.0C" - confirm.
sysctl -a \
        | grep -F dev.cpu. \
        | awk '/[.]temperature[:]/{gsub("[.:]"," "); sub("C$",""); print $3,$5,$6}' \
        | sort -n \
        | while read KERN GANZ ZEHNTEL
do
        # Pick the colour for this reading; anything not above GELB is green.
        if [ "${GANZ}" -gt "${ROT}" ]; then
                FARBE="${RED}"
        elif [ "${GANZ}" -gt "${GELB}" ]; then
                FARBE="${YELLOW}"
        else
                FARBE="${GREEN}"
        fi

        echo -e "CPU ${KERN}: ${FARBE}${GANZ},${ZEHNTEL}° C${OFF}"
done

#------------------------------------------------------------------------------#
exit 0

HDD

34° C   ahcich0:ada0    600.000MB/s     WD-WMC789HLT21J WDC WD2005FBYZ-01YCBB2 RR07 (1863.02G)
38° C   ahcich1:ada1    600.000MB/s     2DG9876N        WDC WD161KRYZ-01AGBB0 01.01H01 (14902.00G)
36° C   ahcich2:ada2    600.000MB/s     5RGR7F4C        WDC WD121KRYZ-01W0RB0 01.01H01 (11176.00G)
36° C   ahcich3:ada3    600.000MB/s     7LJK2OBC        WDC WD101KRYZ-01JPDB1 01.01H02 (9314.00G)
43° C   ahcich4:ada4    600.000MB/s     Z925MOPJ        ST10000NM0016-1TT101 SND0 (9314.00G)
35° C   ahcich5:ada5    600.000MB/s     15HN9AYZ        HGST HUH721010ALE600 LHGNT384 (9314.00G)
35° C   ahcich6:ada6    600.000MB/s     7LJKENAC        WDC WD101KRYZ-01JPDB1 01.01H02 (9314.00G)
38° C   ahcich7:ada7    600.000MB/s     2DKNG95J        WDC WD161KRYZ-01AGBB0 01.01H01 (14902.00G)
39° C   ahcich10:ada8   600.000MB/s     5RHXEMHF        WDC WD121KRYZ-01W0RB1 02.02H02 (11176.00G)
41° C   ahcich12:ada9   600.000MB/s     Z92BZGRJ        ST10000NM0016-1TT101 SNE0 (9314.00G)
46° C   ahcich13:ada10  600.000MB/s     8OJ285WH        WDC WD121KRYZ-01W0RB0 01.01H01 (11176.00G)
~/bin/temp_hdd.sh
#!/bin/sh
#
# https://github.com/cytopia/freebsd-tools/blob/master/hdd-temp.sh
#
# ----------------------------------------------------------------------------
# "THE BEER-WARE LICENSE" (Revision 42):
# <cytopia@everythingcli.org> wrote this file. As long as you retain this notice you
# can do whatever you want with this stuff. If we meet some day, and you think
# this stuff is worth it, you can buy me a beer in return cytopia
# ----------------------------------------------------------------------------
 
#VERSION="v2018040800"          # copied from the internet
VERSION='v2022051100'           # modified; now also prints the serial number

### Thresholds
#
# A reading above ROT is shown in red.
#ROT="40"       # original value
ROT='45'        # value chosen from my own experience
#
# A reading above GELB (but not above ROT) is shown in yellow.
#GELB="30"      # original value
GELB='42'       # value chosen from my own experience

# ---------------------------------- Global Variables --------------------------------- #
# ANSI colour escape sequences
GREEN='\033[32m'
YELLOW='\033[33m'
RED='\033[31m'
OFF='\033[0m'
 
# ---------------------------------- Misc Function ---------------------------------- #
 
#
# Prerequisites:
#  * check that this script is run by root
#  * check that smartctl is installed
#
# On failure a diagnostic is written to stderr and the script exits with
# status 1. Takes no arguments.
#
check_requirements()
{
        # Check if we are root
        if [ "$(id -u)" != "0" ]; then
                echo "This script must be run as root" 1>&2
                exit 1
        fi

        # Check if smartctl exists on the system; like the root check above,
        # the diagnostic belongs on stderr, not in the regular output.
        if ! command -v smartctl >/dev/null; then
                echo "smartctl not found. (install sysutils/smartmontools)" 1>&2
                exit 1
        fi
}
 
 
#
# Wrap a temperature reading in the colour of its threshold band.
#
# $1 - the reading; it must consist of decimal digits only. An empty string
#      or anything containing another character is reported as "n.a.".
#
# Reads the globals ROT, GELB, RED, YELLOW, GREEN and OFF. The result is
# written to stdout and is also left in the global TEMP.
#
colorize_temperature()
{
        TEMP="${1}"

        # Guard: no usable temperature was obtained.
        case "${TEMP}" in
                ''|*[!0-9]*)
                        TEMP="n.a."
                        echo "${TEMP}"
                        return
                        ;;
        esac

        # A numeric reading: red above ROT, yellow above GELB, green otherwise.
        if [ "${TEMP}" -gt "${ROT}" ]; then
                TEMP="${RED}${TEMP}° C${OFF}"
        elif [ "${TEMP}" -gt "${GELB}" ]; then
                TEMP="${YELLOW}${TEMP}° C${OFF}"
        else
                TEMP="${GREEN}${TEMP}° C${OFF}"
        fi

        echo "${TEMP}"
}
 
# ---------------------------------- Generic Disk Function ---------------------------------- #
 
#
# Get all devices that are attached to the system
#
# Prints the names listed in the kern.disks sysctl, one per line and in
# reverse order of that list, leaving out names that start with "cd"
# followed by a digit. Takes no arguments; the list is also left in the
# global DEVS.
#
get_attached_devices()
{
        # "sysctl -n" prints only the value, so no "kern.disks:" prefix has to
        # be stripped. awk walks the fields backwards, which reverses the list
        # without BSD-only "tail -r" and without a fixed cap on the number of
        # lines.
        DEVS="$(sysctl -n kern.disks | awk '{ for (i = NF; i >= 1; i--) if ($i !~ /^cd[0-9]/) print $i }')"
        echo "${DEVS}"
}
 
#
# Look up what a disk is attached to, according to the boot messages.
#
# $1 - device name, e.g. ada0
# $2 - optional: file to search instead of /var/run/dmesg.boot
#
# Prints the third word of every line of the form "<device> at <name> ..."
# that also mentions "target". The result is also left in the global BUS.
#
get_disk_bus()
{
        DEV="${1}"
        # Compare the first word exactly: a substring search for "da0 at"
        # would also hit the line of ada0.
        BUS="$(awk -v dev="${DEV}" '$1 == dev && $2 == "at" && /target/ { print $3 }' "${2:-/var/run/dmesg.boot}")"
        echo "${BUS}"
}
 
#
# Print the size of a disk in GiB with two decimals.
#
# $1 - device name below /dev, e.g. ada0
#
# Takes the first field of every "diskinfo -v" line that contains the word
# "bytes" and divides it by 1024^3. The result is also left in the global
# SIZE.
# NOTE(review): presumably only the media-size line contains "bytes" - confirm
# against the diskinfo output of the target system.
#
get_disk_size()
{
        DEV="${1}"
        # The device path is quoted so that it always reaches diskinfo as a
        # single argument; awk does the line selection itself.
        SIZE="$(diskinfo -v "/dev/${DEV}" | awk '/bytes/ { printf "%.2f\n", ($1 / (1024 * 1024 * 1024)) }')"
        echo "${SIZE}"
}
 
#
# Look up the transfer speed of a disk in the boot messages.
#
# $1 - device name, e.g. ada0
# $2 - optional: file to search instead of /var/run/dmesg.boot
#
# Prints the second word of every line that starts with "<device>:" and
# mentions "transfers". The result is also left in the global SPEED.
#
get_disk_speed()
{
        DEV="${1}"
        # Compare the first word exactly: a substring search for "da0:" would
        # also hit the line of ada0.
        SPEED="$(awk -v dev="${DEV}" '$1 == (dev ":") && /transfers/ { print $2 }' "${2:-/var/run/dmesg.boot}")"
        echo "${SPEED}"
}
 
#
# Reduce a device name to the digits it contains (e.g. "ada10" -> "10").
#
# $1 - device name
#
# The result is written to stdout and also left in the global DISK_NUM.
#
get_disk_number()
{
        DEV="${1}"
        # Keep digits (and the line break) only; everything else is deleted.
        DISK_NUM="$(echo "${DEV}" | tr -c -d '0-9\n')"
        echo "${DISK_NUM}"
}
 
 
# ---------------------------------- ATA-Device Functions ---------------------------------- #
 
#
# Look up the model string of a disk in the boot messages.
#
# $1 - device name, e.g. ada0
# $2 - optional: file to search instead of /var/run/dmesg.boot
#
# For every line that starts with "<device>:" and contains "<" or ">", prints
# the text between the first pair of angle brackets. The result is also left
# in the global NAME.
#
get_ata_disk_name()
{
        DEV="${1}"
        # Anchor the device name at the start of the line: a substring search
        # for "da0:" would also hit the line of ada0. With "<" and ">" as
        # field separators, NF > 1 means the line contains a bracket.
        NAME="$(awk -F '[<>]' -v dev="${DEV}" 'index($0, dev ":") == 1 && NF > 1 { print $2 }' "${2:-/var/run/dmesg.boot}")"
        echo "${NAME}"
}
 
#
# Read the temperature of an ATA disk from its SMART attributes.
#
# $1 - device name below /dev, e.g. ada0
#
# Prints the tenth field of every "smartctl -d atacam -A" line that contains
# "Temperature_Celsius". The result is also left in the global TEMP.
#
get_ata_disk_temp()
{
        DEV="${1}"
        TEMP="$(smartctl -d atacam -A "/dev/${DEV}" | awk '/Temperature_Celsius/ { print $10 }')"
        echo "${TEMP}"
}
 
# ---------------------------------- CISS-Device Functions ---------------------------------- #
 
#
# Build "<model> <firmware>" from captured smartctl output.
#
# $1 - the complete smartctl output as one (multi-line) string
#
# From each line containing "Device Model" resp. "Firmware" the first two
# words are blanked and leading whitespace is removed; the two results are
# printed separated by one blank. Also sets the globals SMART_CTL, NAME and
# FIRM.
#
get_ciss_disk_name()
{
        SMART_CTL="${1}"
        NAME="$(echo "${SMART_CTL}" | awk '/Device Model/ { $1 = $2 = ""; sub(/^[ \t]+/, ""); print }')"
        FIRM="$(echo "${SMART_CTL}" | awk '/Firmware/ { $1 = $2 = ""; sub(/^[ \t]+/, ""); print }')"
        echo "${NAME} ${FIRM}"
}
 
#
# Extract the temperature from captured smartctl output.
#
# $1 - the complete smartctl output as one (multi-line) string
#
# Prints the tenth field of every line that contains "Temperature_Celsius".
# Also sets the globals SMART_CTL and TEMP.
#
get_ciss_disk_temp()
{
        SMART_CTL="${1}"
        TEMP="$(echo "${SMART_CTL}" | awk '/Temperature_Celsius/ { print $10 }')"
        echo "${TEMP}"
}
 
# ---------------------------------- Main Entry Point ---------------------------------- #
 
# Check if script can be run
check_requirements


# Loop through all attached devices and print one line per device:
#   <temperature> TAB <bus>:<device> TAB <speed> TAB <serial number> TAB <model> (<size>G)
# The unquoted command substitution is intentional: the device list is split
# into words, one device name per iteration.
for DEV in $(get_attached_devices)
do
        SIZE="$(get_disk_size "${DEV}")"
        BUS="$(get_disk_bus "${DEV}")"
        SPEED="$(get_disk_speed "${DEV}")"
        # The redirection belongs to smartctl (the command that talks to the
        # device), not to awk.
        SERIENNR="$(smartctl -i "/dev/${DEV}" 2> /dev/null | awk '/^Serial Number:[ ]*/{print $NF}')"

        # check for HP Smart Array controllers
        # A quoted "ciss*" inside [ ... ] is compared literally and never
        # matches a real bus name; "case" performs the intended glob match.
        case "${BUS}" in
                ciss*)
                        DEVNUM="$(get_disk_number "${DEV}")"
                        SMARTCTL="$(smartctl -a -T permissive -d "cciss,${DEVNUM}" "/dev/${BUS}" 2> /dev/null)"
                        NAME="$(get_ciss_disk_name "${SMARTCTL}")"      # preserve newlines by using "
                        TEMP="$(get_ciss_disk_temp "${SMARTCTL}")"
                        # debug output goes to stderr so it does not mix with the table
                        echo "smartctl -a -T permissive -d cciss,${DEVNUM} /dev/${BUS} 2> /dev/null" 1>&2    # debug
                        ;;
                *)
                        NAME="$(get_ata_disk_name "${DEV}")"
                        TEMP="$(get_ata_disk_temp "${DEV}")"
                        ;;
        esac

        TEMP="$(colorize_temperature "${TEMP}")"

        # "echo -e" interprets the colour escape sequences and the \t separators.
        echo -e "${TEMP}\t${BUS}:${DEV}\t${SPEED}\t${SERIENNR}\t${NAME} (${SIZE}G)"
done
 
#eof

SMART-Monitor-Tools

Mit smartmontools kann man seine Festplatten überwachen. In zeitgemäßen Festplatten sind eine Menge Sensoren enthalten, die mit smartmontools ausgelesen werden können.

Um dieses Werkzeug nutzen zu können, muss S.M.A.R.T. im BIOS aktiviert werden.

in FreeBSD installieren:

# portupgrade -NROD sysutils/smartmontools

in Ubuntu installieren:

# aptitude install smartmontools

So aktiviert man das SMART-Monitoring:

# smartctl -s on /dev/ad6

So lässt man sich alle Fehler einer Festplatte anzeigen:

# smartctl -l error /dev/ad6

So lässt man sich alle Infos einer Festplatte anzeigen:

# smartctl -a /dev/ad6
smartctl 5.39.1 2010-01-28 r3054 [FreeBSD 8.0-RELEASE-p2 amd64] (local build)
Copyright (C) 2002-10 by Bruce Allen, http://smartmontools.sourceforge.net

=== START OF INFORMATION SECTION ===
Model Family:     Seagate Barracuda 7200.11 family
Device Model:     ST31500341AS
Serial Number:    9VS2YSXD
Firmware Version: CC1H
User Capacity:    1,500,301,910,016 bytes
Device is:        In smartctl database [for details use: -P show]
ATA Version is:   8
ATA Standard is:  ATA-8-ACS revision 4
Local Time is:    Thu Jun 10 20:35:50 2010 CEST
SMART support is: Available - device has SMART capability.
SMART support is: Enabled

=== START OF READ SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED
See vendor-specific Attribute list for marginal Attributes.

General SMART Values:
Offline data collection status:  (0x82)       Offline data collection activity
                                      was completed without error.
                                      Auto Offline Data Collection: Enabled.
Self-test execution status:      (   0)       The previous self-test routine completed
                                      without error or no self-test has ever 
                                      been run.
Total time to complete Offline 
data collection:               ( 609) seconds.
Offline data collection
capabilities:                          (0x7b) SMART execute Offline immediate.
                                      Auto Offline data collection on/off support.
                                      Suspend Offline collection upon new
                                      command.
                                      Offline surface scan supported.
                                      Self-test supported.
                                      Conveyance Self-test supported.
                                      Selective Self-test supported.
SMART capabilities:            (0x0003)       Saves SMART data before entering
                                      power-saving mode.
                                      Supports SMART auto save timer.
Error logging capability:        (0x01)       Error logging supported.
                                      General Purpose Logging supported.
Short self-test routine 
recommended polling time:      (   1) minutes.
Extended self-test routine
recommended polling time:      ( 255) minutes.
Conveyance self-test routine
recommended polling time:      (   2) minutes.
SCT capabilities:            (0x103f) SCT Status supported.
                                      SCT Feature Control supported.
                                      SCT Data Table supported.

SMART Attributes Data Structure revision number: 10
Vendor Specific SMART Attributes with Thresholds:
ID# ATTRIBUTE_NAME          FLAG     VALUE WORST THRESH TYPE      UPDATED  WHEN_FAILED RAW_VALUE
  1 Raw_Read_Error_Rate     0x000f   115   099   006    Pre-fail  Always       -       85735962
  3 Spin_Up_Time            0x0003   100   100   000    Pre-fail  Always       -       0
  4 Start_Stop_Count        0x0032   100   100   020    Old_age   Always       -       66
  5 Reallocated_Sector_Ct   0x0033   100   100   036    Pre-fail  Always       -       4
  7 Seek_Error_Rate         0x000f   071   060   030    Pre-fail  Always       -       12888995
  9 Power_On_Hours          0x0032   097   097   000    Old_age   Always       -       2897
 10 Spin_Retry_Count        0x0013   100   100   097    Pre-fail  Always       -       0
 12 Power_Cycle_Count       0x0032   100   100   020    Old_age   Always       -       66
184 End-to-End_Error        0x0032   100   100   099    Old_age   Always       -       0
187 Reported_Uncorrect      0x0032   100   100   000    Old_age   Always       -       0
188 Command_Timeout         0x0032   100   100   000    Old_age   Always       -       0
189 High_Fly_Writes         0x003a   071   071   000    Old_age   Always       -       29
190 Airflow_Temperature_Cel 0x0022   047   042   045    Old_age   Always   In_the_past 53 (0 207 58 18)
194 Temperature_Celsius     0x0022   053   058   000    Old_age   Always       -       53 (0 16 0 0)
195 Hardware_ECC_Recovered  0x001a   052   033   000    Old_age   Always       -       85735962
197 Current_Pending_Sector  0x0012   100   100   000    Old_age   Always       -       0
198 Offline_Uncorrectable   0x0010   100   100   000    Old_age   Offline      -       0
199 UDMA_CRC_Error_Count    0x003e   200   200   000    Old_age   Always       -       0
240 Head_Flying_Hours       0x0000   100   253   000    Old_age   Offline      -       49014166784849
241 Total_LBAs_Written      0x0000   100   253   000    Old_age   Offline      -       3090393005
242 Total_LBAs_Read         0x0000   100   253   000    Old_age   Offline      -       112702149

SMART Error Log Version: 1
No Errors Logged

SMART Self-test log structure revision number 1
No self-tests have been logged.  [To run self-tests, use: smartctl -t]


SMART Selective self-test log data structure revision number 1
 SPAN  MIN_LBA  MAX_LBA  CURRENT_TEST_STATUS
    1        0        0  Not_testing
    2        0        0  Not_testing
    3        0        0  Not_testing
    4        0        0  Not_testing
    5        0        0  Not_testing
Selective self-test flags (0x0):
  After scanning selected spans, do NOT read-scan remainder of disk.
If Selective self-test is pending on power-up, resume after 0 minute delay.

An dem erhöhten Wert in der Zeile 190 Airflow_Temperature_Cel kann man sehen, dass es der Platte zumindest einmal zu warm war.

In der Konfigurationsdatei /usr/local/etc/smartd.conf muss mindestens der Eintrag DEVICESCAN stehen, dann wird der smartd beim Aufruf das System nach Festplatten durchscannen. Man kann allerdings auch alle seine Festplatten (oder zumindest die, die überwacht werden sollen) in der Konfigurationsdatei eintragen.

Der smartd-Dienst sollte beim Hochfahren des Rechners automatisch starten. Dazu trägt man unter FreeBSD die entsprechende Variable in der /etc/rc.conf ein:

# vi /etc/rc.conf 
smartd_enable="YES"

Da wir den Rechner jetzt nicht neu starten wollen, starten wir den Dienst mal fix auf der Kommandozeile:

# /usr/local/etc/rc.d/smartd start

Bewertung der angegebenen S.M.A.R.T.-Informationen

Ausfallrelevant:

  5 Reallocated_Sector_Ct   0x0033
 10 Spin_Retry_Count        0x0013
184 End-to-End_Error        0x0032
187 Reported_Uncorrect      0x0032
188 Command_Timeout         0x0032
196 ???
197 Current_Pending_Sector  0x0012
198 Offline_Uncorrectable   0x0010
201 ???

Informierend:

  4 Start_Stop_Count        0x0032
  9 Power_On_Hours          0x0032
 12 Power_Cycle_Count       0x0032
193 ???
199 UDMA_CRC_Error_Count    0x003e

HDD

# sysctl kern.disks
kern.disks: ada10 ada9 ada8 ada7 ada6 ada5 ada4 ada3 ada2 ada1 ada0 cd0
# camcontrol devlist
<WDC WD2005FBYZ-01YCBB2 RR07>      at scbus0 target 0 lun 0 (ada0,pass0)
<WDC WD161KRYZ-01AGBB0 01.01H01>   at scbus1 target 0 lun 0 (ada1,pass1)
<WDC WD121KRYZ-01W0RB0 01.01H01>   at scbus2 target 0 lun 0 (ada2,pass2)
<WDC WD101KRYZ-01JPDB1 01.01H02>   at scbus3 target 0 lun 0 (ada3,pass3)
<ST10000NM0016-1TT101 SND0>        at scbus4 target 0 lun 0 (ada4,pass4)
<HGST HUH721010ALE600 LHGNT384>    at scbus5 target 0 lun 0 (ada5,pass5)
<WDC WD101KRYZ-01JPDB1 01.01H02>   at scbus6 target 0 lun 0 (ada6,pass6)
<WDC WD161KRYZ-01AGBB0 01.01H01>   at scbus7 target 0 lun 0 (ada7,pass7)
<PIONEER BD-RW   BDR-S09 1.51>     at scbus9 target 0 lun 0 (sg0,cd0,pass8)
<WDC WD121KRYZ-01W0RB1 02.02H02>   at scbus10 target 0 lun 0 (ada8,pass9)
<ST10000NM0016-1TT101 SNE0>        at scbus12 target 0 lun 0 (ada9,pass10)
<WDC WD121KRYZ-01W0RB0 01.01H01>   at scbus13 target 0 lun 0 (ada10,pass11)
<AHCI SGPIO Enclosure 2.00 0001>   at scbus14 target 0 lun 0 (ses0,pass12)
# lsblk 
DEVICE         MAJ:MIN SIZE TYPE                                          LABEL MOUNT
ada0             0:129 1.8T GPT                                               - -
  ada0p1         0:143 260M efi                                    gpt/efiboot0 /boot/efi
  <FREE>         -:-   1.0M -                                                 - -
  ada0p2         0:144 4.0G freebsd-swap                              gpt/swap0 SWAP
  ada0p3         0:145 1.8T freebsd-zfs                                gpt/zfs0 <ZFS>
  <FREE>         -:-    68K -                                                 - -
ada1             0:130  15T -                              diskid/DISK-2CGWXXXX -
ada10            0:141  11T -                                                 - -
ada2             0:131  11T -                              diskid/DISK-5PGRXXXX -
ada3             0:132 9.1T -                              diskid/DISK-7JJKXXXX -
ada4             0:133 9.1T -                              diskid/DISK-ZA25XXXX -
ada5             0:134 9.1T -                              diskid/DISK-1SHNXXXX -
ada6             0:135 9.1T -                              diskid/DISK-7JJKXXXX -
ada7             0:136  15T -                              diskid/DISK-2CKMXXXX -
ada8             0:137  11T -                              diskid/DISK-5PHWXXXX -
ada9             0:139 9.1T -                              diskid/DISK-ZA2CXXXX -
# smartctl --scan
/dev/ada0 -d atacam # /dev/ada0, ATA device
/dev/ada1 -d atacam # /dev/ada1, ATA device
/dev/ada2 -d atacam # /dev/ada2, ATA device
/dev/ada3 -d atacam # /dev/ada3, ATA device
/dev/ada4 -d atacam # /dev/ada4, ATA device
/dev/ada5 -d atacam # /dev/ada5, ATA device
/dev/ada6 -d atacam # /dev/ada6, ATA device
/dev/ada7 -d atacam # /dev/ada7, ATA device
/dev/cd0 -d atacam # /dev/cd0, ATA device
/dev/ada8 -d atacam # /dev/ada8, ATA device
/dev/ada9 -d atacam # /dev/ada9, ATA device
/dev/ada10 -d atacam # /dev/ada10, ATA device
/dev/ses0 -d atacam # /dev/ses0, ATA device

mpt-status installieren - mpt (and other) HW RAID controllers

# aptitude update && aptitude -y safe-upgrade
# aptitude -y install mpt-status
# modprobe mptctl
# aptitude -y remove mpt-status
# aptitude -y install mpt-status
# echo mptctl >> /etc/modules

DELL powerEdge R300

Um auf einem DELL powerEdge R300 den Festplattenstatus der Platten im RAID abfragen zu können, braucht man das Paket mpt-status und die LSIUtils oder MegaCli.

Im Folgenden wird die Installation beschrieben.

LSIUtils

RAID-Controller-PCI-Meldungen

# lspci -vv
05:00.0 SCSI storage controller: LSI Logic / Symbios Logic SAS1068E PCI-Express Fusion-MPT SAS (rev 08)
        Subsystem: Dell Device 1f0e

saugen

# vi /etc/wgetrc
https_proxy = http://192.168.0.10:3128/
http_proxy = http://192.168.0.10:3128/
ftp_proxy = http://192.168.0.10:3128/
# cd /tmp
# wget ftp://ftp.lsil.com/HostAdapterDrivers/linux/lsiutil/lsiutil.tar.gz

oder

# wget http://repo.tvujweb.cz/dell/mpt/lsiutil.tar.gz

bauen

# aptitude -y install gcc
# tar xvzf lsiutil.tar.gz
# cd lsiutil
# rm lsiutil
# make
# cp -p lsiutil /usr/bin/

sauber machen

# cd
# rm -fr /tmp/lsiutil /tmp/lsiutil.tar.gz
# aptitude -y remove gcc_

MegaCLI installieren

RAID-Controller-PCI-Meldungen

# lspci | fgrep -i raid
03:00.0 RAID bus controller: LSI Logic / Symbios Logic MegaRAID SAS 1078 (rev 04)

auspacken

# aptitude install unzip
# mkdir MegaCli
# cd MegaCli
# unzip ../8.00.40_Linux_MegaCLI.zip
# unzip MegaCliLin.zip
Archive:  MegaCliLin.zip
  inflating: MegaCli-8.00.40-1.i386.rpm  
  inflating: Lib_Utils-1.00-08.noarch.rpm  
replace readme.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
# aptitude install alien
# alien -t --scripts Lib_Utils-1.00-08.noarch.rpm
error: incorrect format: unknown tag
Lib_Utils-1.00.tgz generated
# alien -t --scripts MegaCli-8.00.40-1.i386.rpm 
error: incorrect format: unknown tag
MegaCli-8.00.40.tgz generated
# tar xzf Lib_Utils-1.00.tgz
# mv opt/lsi /opt/
# install/doinst.sh
# rm -fr opt install
# tar xzf MegaCli-8.00.40.tgz
# mkdir -p /opt
# mv opt/MegaRAID/ /opt/
# install/doinst.sh
# rm -fr opt install /opt/MegaRAID/MegaCli/install.log
# ln -s /opt/MegaRAID/MegaCli/MegaCli64 /opt/MegaRAID/MegaCli/MegaCli

Test

mpt-status

in Sync

Ein Test mit einem Symbios Logic MegaRAID SAS oder Symbios Logic LSI MegaSAS sieht so aus:

# mpt-status
ioctl: No such device

Das liegt daran, dass hier die "MegaRAID-Treiber" zum Einsatz kommen. ⇒ MegaCli (/opt/MegaRAID/MegaCli/MegaCli64)

Wenn die "MPT-Fusion-Treiber" zum Einsatz kommen, dann sieht das so aus:

# mpt-status
ioc0 vol_id 0 type IM, 2 phy, 465 GB, state OPTIMAL, flags ENABLED
ioc0 phy 1 scsi_id 9 ATA      WDC WD5002ABYS-1 3B04, 465 GB, state ONLINE, flags NONE
ioc0 phy 0 scsi_id 1 ATA      WDC WD5002ABYS-1 3B04, 465 GB, state ONLINE, flags NONE
# mpt-status -i 0
ioc0 vol_id 0 type IM, 2 phy, 465 GB, state OPTIMAL, flags ENABLED
ioc0 phy 1 scsi_id 9 ATA      WDC WD5002ABYS-1 3B04, 465 GB, state ONLINE, flags NONE
ioc0 phy 0 scsi_id 1 ATA      WDC WD5002ABYS-1 3B04, 465 GB, state ONLINE, flags NONE

out of sync

Wenn eine Platte getauscht wurde, und sich das RAID dann wieder synchronisiert, sieht das so aus:

# mpt-status
ioc0 vol_id 0 type IM, 2 phy, 465 GB, state DEGRADED, flags ENABLED RESYNC_IN_PROGRESS
ioc0 phy 1 scsi_id 9 ATA      WDC WD5002ABYS-1 3B04, 465 GB, state ONLINE, flags NONE
ioc0 phy 0 scsi_id 1 ATA      WDC WD5002ABYS-1 3B04, 465 GB, state ONLINE, flags OUT_OF_SYNC

LSIUtils

Infos

# lsiutil -i

LSI Logic MPT Configuration Utility, Version 1.56, March 19, 2008

1 MPT Port found

==============================================================================

/proc/mpt/ioc0    LSI Logic SAS1068E B3    MPT 105   Firmware 00192f00   IOC 0

Seg/Bus/Dev/Fun    Board Name       Board Assembly   Board Tracer
 0   5   0   0     SAS6IR                                            

Current Port State
------------------
SAS1068E's links are 3.0 G, 3.0 G, down, down, down, down, down, down

Software Version Information
----------------------------
Current active firmware version is 00192f00 (0.25.47)
Firmware image's version is MPTFW-00.25.47.00-IE
  LSI Logic
x86 BIOS image's version is MPTBIOS-6.22.03.00 (2008.08.06)

Firmware Settings
-----------------
SAS WWID:                       5a4badb02e6e6800
Multi-pathing:                  Disabled
SATA Native Command Queuing:    Enabled
SATA Write Caching:             Enabled
SATA Maximum Queue Depth:       8
Device Missing Report Delay:    0 seconds
Device Missing I/O Delay:       0 seconds
Phy Parameters for Phynum:      0    1    2    3    4    5    6    7    
  Link Enabled:                 Yes  Yes  Yes  Yes  Yes  Yes  Yes  Yes  
  Link Min Rate:                1.5  1.5  1.5  1.5  1.5  1.5  1.5  1.5  
  Link Max Rate:                3.0  3.0  3.0  3.0  3.0  3.0  3.0  3.0  
  SSP Initiator Enabled:        Yes  Yes  Yes  Yes  Yes  Yes  Yes  Yes  
  SSP Target Enabled:           No   No   No   No   No   No   No   No   
  Port Configuration:           Auto Auto Auto Auto Auto Auto Auto Auto 
Target IDs per enclosure:       1
Persistent mapping:             Enabled
Physical mapping type:          Enclosure/Slot
Target ID 0 reserved for boot:  No
Starting slot (direct attach):  0
Target IDs (physical mapping):  8
Interrupt Coalescing:           Enabled, timeout is 16 us, depth is 4

Persistent Mappings
-------------------
No persistent entries found
# lsiutil

LSI Logic MPT Configuration Utility, Version 1.56, March 19, 2008

1 MPT Port found

     Port Name         Chip Vendor/Type/Rev    MPT Rev  Firmware Rev  IOC
 1.  /proc/mpt/ioc0    LSI Logic SAS1068E B3     105      00192f00     0

Select a device:  [1-1 or 0 to quit] 1

 1.  Identify firmware, BIOS, and/or FCode
 2.  Download firmware (update the FLASH)
 4.  Download/erase BIOS and/or FCode (update the FLASH)
 8.  Scan for devices
10.  Change IOC settings (interrupt coalescing)
13.  Change SAS IO Unit settings
16.  Display attached devices
20.  Diagnostics
21.  RAID actions
22.  Reset bus
23.  Reset target
42.  Display operating system names for devices
45.  Concatenate SAS firmware and NVDATA files
60.  Show non-default settings
61.  Restore default settings
69.  Show board manufacturing information
97.  Reset SAS link, HARD RESET
98.  Reset SAS link
99.  Reset port
 e   Enable expert mode in menus
 p   Enable paged mode
 w   Enable logging

Main menu, select an option:  [1-99 or e/p/w or 0 to quit] 21

 1.  Show volumes
 2.  Show physical disks
 3.  Get volume state
 4.  Wait for volume resync to complete
23.  Replace physical disk
26.  Disable drive firmware update mode
27.  Enable drive firmware update mode
30.  Create volume
31.  Delete volume
32.  Change volume settings
33.  Change volume name
50.  Create hot spare
99.  Reset port
 e   Enable expert mode in menus
 p   Enable paged mode
 w   Enable logging

RAID actions menu, select an option:  [1-99 or e/p/w or 0 to quit] 3

Volume 0 State:  degraded, enabled, resync in progress
Resync Progress:  total blocks 975699968, blocks remaining 868269624, 88%

RAID actions menu, select an option:  [1-99 or e/p/w or 0 to quit] 0

Main menu, select an option:  [1-99 or e/p/w or 0 to quit] 0

     Port Name         Chip Vendor/Type/Rev    MPT Rev  Firmware Rev  IOC
 1.  /proc/mpt/ioc0    LSI Logic SAS1068E B3     105      00192f00     0

Select a device:  [1-1 or 0 to quit] 0

MegaCli

Hilfe:

# /opt/MegaRAID/MegaCli/MegaCli -h

erste Infos:

# /opt/MegaRAID/MegaCli/MegaCli -LDInfo -LALL -aALL
                                   

Adapter 0 -- Virtual Drive Information:
Virtual Drive: 0 (Target Id: 0)
Name                :System
RAID Level          : Primary-1, Secondary-0, RAID Level Qualifier-0
Size                : 544.5 GB
State               : Optimal
Strip Size          : 64 KB
Number Of Drives per span:2
Span Depth          : 4
Default Cache Policy: WriteBack, ReadAdaptive, Direct, No Write Cache if Bad BBU
Current Cache Policy: WriteBack, ReadAdaptive, Direct, No Write Cache if Bad BBU
Access Policy       : Read/Write
Disk Cache Policy   : Disk's Default
Encryption Type     : None
Bad Blocks Exist: No



Exit Code: 0x00

Adapter zählen:

# /opt/MegaRAID/MegaCli/MegaCli -adpCount
                                   

Controller Count: 1.

Exit Code: 0x01

von allen Adaptern nur die Fehler anzeigen:

# /opt/MegaRAID/MegaCli/MegaCli -PhyErrorCounters -aALL

Adapter #0

================
Phy No: 0 
Invalid DWord Count           : 0 
Running Disparity Error Count : 0 
Loss of DWord Synch Count     : 0 
Phy Reset problem Count       : 0 

Phy No: 1 
Invalid DWord Count           : 0 
Running Disparity Error Count : 0 
Loss of DWord Synch Count     : 0 
Phy Reset problem Count       : 0 

Phy No: 2 
Invalid DWord Count           : 0 
Running Disparity Error Count : 0 
Loss of DWord Synch Count     : 0 
Phy Reset problem Count       : 0 

Phy No: 3 
Invalid DWord Count           : 0 
Running Disparity Error Count : 0 
Loss of DWord Synch Count     : 0 
Phy Reset problem Count       : 0 

Phy No: 4 
Invalid DWord Count           : 0 
Running Disparity Error Count : 0 
Loss of DWord Synch Count     : 0 
Phy Reset problem Count       : 0 

Phy No: 5 
Invalid DWord Count           : 0 
Running Disparity Error Count : 0 
Loss of DWord Synch Count     : 0 
Phy Reset problem Count       : 0 

Phy No: 6 
Invalid DWord Count           : 0 
Running Disparity Error Count : 0 
Loss of DWord Synch Count     : 0 
Phy Reset problem Count       : 0 

Phy No: 7 
Invalid DWord Count           : 0 
Running Disparity Error Count : 0 
Loss of DWord Synch Count     : 0 
Phy Reset problem Count       : 0 


Exit Code: 0x00

Statusinformationen vom Adapter "0" (erster Adapter):

# /opt/MegaRAID/MegaCli/MegaCli -AdpAllInfo -a 0
                                   
Adapter #0

==============================================================================
                    Versions
                ================
Product Name    : PERC 6/i Integrated
Serial No       : 1122334455667788
FW Package Build: 6.2.0-0013

                    Mfg. Data
                ================
Mfg. Date       : 06/24/08
Rework Date     : 06/24/08
Revision No     : 
Battery FRU     : N/A

                Image Versions in Flash:
                ================
FW Version         : 1.22.02-0612
BIOS Version       : 2.04.00
WebBIOS Version    : 1.1-46-e_15-Rel
Ctrl-R Version     : 1.02-015B
Preboot CLI Version: 01.00-023:#%00006
Boot Block Version : 1.00.00.01-0011

                Pending Images in Flash
                ================
None

                PCI Info
                ================
Vendor Id       : 1000
Device Id       : 0060
SubVendorId     : 1028
SubDeviceId     : 1f0c

Host Interface  : PCIE

Number of Frontend Port: 0 
Device Interface  : PCIE

Number of Backend Port: 8 
Port  :  Address
0        500000e111efe142 
1        500000e11202e8b2 
2        5000cca009548f09 
3        5000cca0096db7fd 
4        5000cca00967ad8d 
5        0000000000000000 
6        0000000000000000 
7        0000000000000000 

                HW Configuration
                ================
SAS Address      : 50026b904a7cee00
BBU              : Present
Alarm            : Absent
NVRAM            : Present
Serial Debugger  : Present
Memory           : Present
Flash            : Present
Memory Size      : 256MB
TPM              : Absent
On board Expander: Absent
Upgrade Key      : Absent
Temperature sensor for ROC    : Absent
Temperature sensor for controller    : Absent


                Settings
                ================
Current Time                     : 13:56:29 5/24, 2011
Predictive Fail Poll Interval    : 300sec
Interrupt Throttle Active Count  : 16
Interrupt Throttle Completion    : 50us
Rebuild Rate                     : 30%
PR Rate                          : 30%
BGI Rate                         : 30%
Check Consistency Rate           : 30%
Reconstruction Rate              : 30%
Cache Flush Interval             : 4s
Max Drives to Spinup at One Time : 2
Delay Among Spinup Groups        : 12s
Physical Drive Coercion Mode     : 128MB
Cluster Mode                     : Disabled
Alarm                            : Disabled
Auto Rebuild                     : Enabled
Battery Warning                  : Enabled
Ecc Bucket Size                  : 15
Ecc Bucket Leak Rate             : 1440 Minutes
Restore HotSpare on Insertion    : Disabled
Expose Enclosure Devices         : Disabled
Maintain PD Fail History         : Disabled
Host Request Reordering          : Enabled
Auto Detect BackPlane Enabled    : SGPIO/i2c SEP
Load Balance Mode                : Auto
Use FDE Only                     : No
Security Key Assigned            : No
Security Key Failed              : No
Security Key Not Backedup        : No
Default LD PowerSave Policy      : Controller Defined
Maximum number of direct attached drives to spin up in 1 min : 0 
Any Offline VD Cache Preserved   : No
Allow Boot with Preserved Cache  : No
Disable Online Controller Reset  : No
PFK in NVRAM                     : No
Use disk activity for locate     : No

                Capabilities
                ================
RAID Level Supported             : RAID0, RAID1, RAID5, RAID6, RAID00, RAID10, RAID50, RAID60, PRL 11, PRL 11 with spanning, SRL 3 supported, PRL11-RLQ0 DDF layout with no span, PRL11-RLQ0 DDF layout with span
Supported Drives                 : SAS, SATA

Allowed Mixing:

Mix in Enclosure Allowed

                Status
                ================
ECC Bucket Count                 : 0

                Limitations
                ================
Max Arms Per VD          : 32 
Max Spans Per VD         : 8 
Max Arrays               : 128 
Max Number of VDs        : 64 
Max Parallel Commands    : 1008 
Max SGE Count            : 80 
Max Data Transfer Size   : 8192 sectors 
Max Strips PerIO         : 42 
Min Strip Size           : 8 KB
Max Strip Size           : 1.0 MB
Max Configurable CacheCade Size: 0 GB
Current Size of CacheCade      : 0 GB
Current Size of FW Cache       : 0 MB

                Device Present
                ================
Virtual Drives    : 2 
  Degraded        : 0 
  Offline         : 0 
Physical Devices  : 6 
  Disks           : 5 
  Critical Disks  : 0 
  Failed Disks    : 0 

                Supported Adapter Operations
                ================
Rebuild Rate                    : Yes
CC Rate                         : Yes
BGI Rate                        : Yes
Reconstruct Rate                : Yes
Patrol Read Rate                : Yes
Alarm Control                   : Yes
Cluster Support                 : No
BBU                             : Yes
Spanning                        : Yes
Dedicated Hot Spare             : Yes
Revertible Hot Spares           : Yes
Foreign Config Import           : Yes
Self Diagnostic                 : Yes
Allow Mixed Redundancy on Array : No
Global Hot Spares               : Yes
Deny SCSI Passthrough           : No
Deny SMP Passthrough            : No
Deny STP Passthrough            : No
Support Security                : No
Snapshot Enabled                : No
Support the OCE without adding drives : No
Support PFK                     : No

                Supported VD Operations
                ================
Read Policy          : Yes
Write Policy         : Yes
IO Policy            : Yes
Access Policy        : Yes
Disk Cache Policy    : Yes
Reconstruction       : Yes
Deny Locate          : No
Deny CC              : No
Allow Ctrl Encryption: No
Enable LDBBM         : No
Support Breakmirror  : No
Power Savings        : No

                Supported PD Operations
                ================
Force Online                            : Yes
Force Offline                           : Yes
Force Rebuild                           : Yes
Deny Force Failed                       : No
Deny Force Good/Bad                     : No
Deny Missing Replace                    : No
Deny Clear                              : No
Deny Locate                             : No
Support Temperature                     : No
Disable Copyback                        : No
Enable JBOD                             : No
Enable Copyback on SMART                : No
Enable Copyback to SSD on SMART Error   : No
Enable SSD Patrol Read                  : No
PR Correct Unconfigured Areas           : Yes
                Error Counters
                ================
Memory Correctable Errors   : 0 
Memory Uncorrectable Errors : 0 

                Cluster Information
                ================
Cluster Permitted     : No
Cluster Active        : No

                Default Settings
                ================
Phy Polarity                     : 0 
Phy PolaritySplit                : 0 
Background Rate                  : 30 
Strip Size                       : 64kB
Flush Time                       : 4 seconds
Write Policy                     : WB
Read Policy                      : None
Cache When BBU Bad               : Disabled
Cached IO                        : No
SMART Mode                       : Mode 6
Alarm Disable                    : No
Coercion Mode                    : 128MB
ZCR Config                       : Unknown
Dirty LED Shows Drive Activity   : No
BIOS Continue on Error           : No
Spin Down Mode                   : None
Allowed Device Type              : SAS/SATA Mix
Allow Mix in Enclosure           : Yes
Allow HDD SAS/SATA Mix in VD     : No
Allow SSD SAS/SATA Mix in VD     : No
Allow HDD/SSD Mix in VD          : No
Allow SATA in Cluster            : No
Max Chained Enclosures           : 1 
Disable Ctrl-R                   : No
Enable Web BIOS                  : No
Direct PD Mapping                : Yes
BIOS Enumerate VDs               : Yes
Restore Hot Spare on Insertion   : No
Expose Enclosure Devices         : No
Maintain PD Fail History         : No
Disable Puncturing               : No
Zero Based Enclosure Enumeration : Yes
PreBoot CLI Enabled              : No
LED Show Drive Activity          : No
Cluster Disable                  : Yes
SAS Disable                      : No
Auto Detect BackPlane Enable     : SGPIO/i2c SEP
Use FDE Only                     : No
Enable Led Header                : No
Delay during POST                : 0 
EnableCrashDump                  : No
Disable Online Controller Reset  : No
EnableLDBBM                      : No
Un-Certified Hard Disk Drives    : Block
Treat Single span R1E as R10     : No
Max LD per array                 : 16
Power Saving option              : All power saving options are enabled
Default spin down time in minutes: 0 
Enable JBOD                      : No
Time taken to detect CME         : 60s

Exit Code: 0x00

Sensoren

Temperatur

Ubuntu 14.04

Dieses Programm konnte auf einem DELL Optiplex 990 (Baujahr ca. 2012-2014) kaum etwas auslesen:

> aptitude install acpi

> acpi -V
No support for device type: power_supply
No support for device type: power_supply
Cooling 0: pkg-temp-0 no state information available
Cooling 1: intel_powerclamp no state information available
Cooling 2: Processor 0 of 10
Cooling 3: Processor 0 of 10
Cooling 4: Processor 0 of 10
Cooling 5: Processor 0 of 10
Cooling 6: Processor 0 of 10
Cooling 7: Processor 0 of 10
Cooling 8: Processor 0 of 10
Cooling 9: Processor 0 of 10

Beim Aufruf des Detektors sollten alle Fragen mit YES beantwortet werden:

> aptitude install lm-sensors
> /usr/sbin/sensors-detect
> modprobe coretemp
> service kmod restart

> sensors
coretemp-isa-0000
Adapter: ISA adapter
Physical id 0:  +40.0°C  (high = +80.0°C, crit = +98.0°C)
Core 0:         +39.0°C  (high = +80.0°C, crit = +98.0°C)
Core 1:         +40.0°C  (high = +80.0°C, crit = +98.0°C)
Core 2:         +37.0°C  (high = +80.0°C, crit = +98.0°C)
Core 3:         +39.0°C  (high = +80.0°C, crit = +98.0°C)

Die grafischen Oberflächen für sensors heißen psensor und xsensors.

> aptitude install hddtemp
> hddtemp /dev/sda
/dev/sda: WDC WD1003FBYZ-010AB0: 42°C

FreeBSD

Betriebssystembezeichnung:

> uname -mrs
FreeBSD 10.3-RELEASE-p7 amd64

CPU-Bezeichnung:

> sysctl hw.machine hw.model hw.ncpu
hw.machine: amd64
hw.model: Intel(R) Core(TM) i5-10600T CPU @ 2.40GHz
hw.ncpu: 12
> dmesg | grep -i cpu
CPU: AMD Athlon(tm) II X2 245e Processor (2913.04-MHz K8-class CPU)
FreeBSD/SMP: Multiprocessor System Detected: 2 CPUs
 cpu0 (BSP): APIC ID:  0
 cpu1 (AP): APIC ID:  1
cpu0: <ACPI CPU> on acpi0
cpu1: <ACPI CPU> on acpi0
hwpstate0: <Cool`n'Quiet 2.0> on cpu0
SMP: AP CPU #1 Launched!
amdtemp0: <AMD CPU On-Die Thermal Sensors> on hostb4
aibs0: T0: 0x06030000      CPU Temperature   600 /   950  0x10001
aibs0: F0: 0x06040000        CPU FAN Speed   600 /  7200  0x10001

FreeBSD-Kernel-Module beim Booten laden:

> vi /boot/loader.conf
### Intel-CPU-Temperatur
### device driver for Intel Core on-die digital thermal sensor
coretemp_load="YES"

### AMD-CPU-Temperatur
### device driver for AMD processor on-die digital thermal sensor
amdtemp_load="YES"

Temperatur-Kernel-Modul für Intel-CPUs laden:

> kldload coretemp

Temperatur-Kernel-Module für AMD-CPUs laden:

> kldload amdtemp
> kldload acpi
> kldload aibs
> uname -mrs
FreeBSD 13.0-RELEASE-p4 amd64
> sysctl dev.cpu | sort -n | grep -F temperature:
dev.cpu.0.temperature: 28.0C
dev.cpu.1.temperature: 28.0C
> sysctl dev.aibs | grep -F temp
dev.aibs.0.temp.0: 48,0C 60,0C 95,0C
dev.aibs.0.temp.1: 37,0C 45,0C 95,0C

ACPI:

> pkg install sysutils/hwstat
> hwstat
                                        Current                         Unit
[Coretemp]
        CPU0:                           Cannot get temperature
        CPU1:                           Cannot get temperature

Eine Alternative kann auch dieses Programm sein:

> cat /usr/ports/sysutils/mbmon/pkg-descr
This is a X/tty motherboard monitor which supports LM78/79, WINBond
83781D/83782D/83783S, ASUS 991227F, and VIA VT82C686A/B PC-health
chips via 3 methods: ISA-I/O, SMBus, VIA-direct.

Run "mbmon -h" or "xmbmon -help" to see the usage.

***CAUTION*** 

These programs access to the SMBus or the ISA-IO port directly under
the superuser privilege, so it may cause a system crash.  Please test
"mbmon -d" or "xmbmon -debug" first.

WWW: http://www.nt.phys.kyushu-u.ac.jp/shimizu/download/download.html
/home/http/wiki/data/attic/hardware-ueberwachung.1652278305.txt · Zuletzt geändert: von manfred