NVSM Health

Tasks

Tasks

Logs Collected

acpi_video_info

Brief

None

Description

None

Module

dump

Path

/proc/acpi/video/*/info

Exclude

None

application_dump

Brief

None

Description

None

Module

dump

Path

/var/crash/*.crash

Exclude

None

apt_log

Brief

None

Description

None

Module

dump

Path

/var/log/apt/*

Exclude

\d\.gz\Z

apt_preferences_nvidia

Brief

None

Description

None

Module

dump

Path

/etc/apt/preferences.d/nvidia

Exclude

None

apt_sources

Brief

None

Description

None

Module

dump

Path

/etc/apt/sources.list

Exclude

None

apt_sources_list_d

Brief

None

Description

None

Module

dump

Path

/etc/apt/sources.list.d/*

Exclude

None

bmc_sel_log

Brief

None

Description

None

Module

dump

Path

/var/log/bmc_sel*.log

Exclude

None

cmdline

Brief

None

Description

None

Module

dump

Path

/proc/cmdline

Exclude

None

collectd_log

Brief

None

Description

None

Module

dump

Path

/var/log/collectd.log

Exclude

None

comp_fw_log

Brief

None

Description

None

Module

dump

Path

/var/log/comp_fw_log.txt

Exclude

None

cosmos_log

Brief

None

Description

None

Module

dump

Path

/var/log/cosmos/*.log

Exclude

\d\.gz\Z

cpuinfo

Brief

None

Description

None

Module

dump

Path

/proc/cpuinfo

Exclude

None

debian_release

Brief

None

Description

None

Module

dump

Path

/etc/debian_release

Exclude

None

debian_version

Brief

None

Description

None

Module

dump

Path

/etc/debian_version

Exclude

None

dgx_release

Brief

None

Description

None

Module

dump

Path

/etc/dgx-release

Exclude

None

dmesg_log

Brief

None

Description

None

Module

dump

Path

/var/log/dmesg*

Exclude

\d\.gz\Z

docker_volume_netshare_log

Brief

None

Description

None

Module

dump

Path

/var/log/docker-volume-netshare.log

Exclude

None

dshm_log

Brief

None

Description

None

Module

dump

Path

/var/log/nvsm/dshm/*

Exclude

None

etc_netplan

Brief

None

Description

None

Module

dump

Path

/etc/netplan/*.yaml

Exclude

None

fabricmanager_log

Brief

None

Description

None

Module

dump

Path

/var/log/fabricmanager.log*

Exclude

None

fedora_release

Brief

None

Description

None

Module

dump

Path

/etc/fedora-release

Exclude

None

fscache_stats

Brief

None

Description

None

Module

dump

Path

/proc/fs/fscache/stats

Exclude

None

gds_collect

Brief

None

Description

None

Module

dump

Path

None

Exclude

None

gentoo_release

Brief

None

Description

None

Module

dump

Path

/etc/gentoo-release

Exclude

None

installer_syslog

Brief

None

Description

None

Module

dump

Path

/var/log/installer/syslog*

Exclude

\d\.gz\Z

interrupts

Brief

None

Description

None

Module

dump

Path

/proc/interrupts

Exclude

None

iomem

Brief

None

Description

None

Module

dump

Path

/proc/iomem

Exclude

None

issue

Brief

None

Description

None

Module

dump

Path

/etc/issue

Exclude

None

kern_log

Brief

None

Description

None

Module

dump

Path

/var/log/kern.log*

Exclude

None

kernel_log

Brief

None

Description

None

Module

dump

Path

/var/log/kernel.log*

Exclude

\d\.gz\Z

lib_netplan

Brief

None

Description

None

Module

dump

Path

/lib/netplan/*.yaml

Exclude

None

mandrake_release

Brief

None

Description

None

Module

dump

Path

/etc/mandrake-release

Exclude

None

mdstat

Brief

None

Description

None

Module

dump

Path

/proc/mdstat

Exclude

None

meminfo

Brief

None

Description

None

Module

dump

Path

/proc/meminfo

Exclude

None

mesos_master_error

Brief

None

Description

None

Module

dump

Path

/var/log/mesos/mesos-master.ERROR

Exclude

None

mesos_master_fatal

Brief

None

Description

None

Module

dump

Path

/var/log/mesos/mesos-master.FATAL

Exclude

None

mesos_master_info

Brief

None

Description

None

Module

dump

Path

/var/log/mesos/mesos-master.INFO

Exclude

None

mesos_master_warning

Brief

None

Description

None

Module

dump

Path

/var/log/mesos/mesos-master.WARNING

Exclude

None

mesos_slave_error

Brief

None

Description

None

Module

dump

Path

/var/log/mesos/mesos-slave.ERROR

Exclude

None

mesos_slave_fatal

Brief

None

Description

None

Module

dump

Path

/var/log/mesos/mesos-slave.FATAL

Exclude

None

mesos_slave_info

Brief

None

Description

None

Module

dump

Path

/var/log/mesos/mesos-slave.INFO

Exclude

None

mesos_slave_warning

Brief

None

Description

None

Module

dump

Path

/var/log/mesos/mesos-slave.WARNING

Exclude

None

messages

Brief

None

Description

None

Module

dump

Path

/var/log/messages*

Exclude

\d\.gz\Z

modules

Brief

None

Description

None

Module

dump

Path

/proc/modules

Exclude

None

monit_log

Brief

None

Description

None

Module

dump

Path

/var/log/monit

Exclude

None

mtrr

Brief

None

Description

None

Module

dump

Path

/proc/mtrr

Exclude

None

network_interfaces

Brief

None

Description

None

Module

dump

Path

/etc/network/interfaces

Exclude

None

network_interfaces_d

Brief

None

Description

None

Module

dump

Path

/etc/network/interfaces.d/*

Exclude

None

nfsfs_servers

Brief

None

Description

None

Module

dump

Path

/proc/fs/nfsfs/servers

Exclude

None

nfsfs_volumes

Brief

None

Description

None

Module

dump

Path

/proc/fs/nfsfs/volumes

Exclude

None

nginx_log

Brief

None

Description

None

Module

dump

Path

/var/log/nginx/*

Exclude

\d\.gz\Z

nvidia_application_profiles1

Brief

None

Description

None

Module

dump

Path

/etc/nvidia/nvidia-application-profiles-rc

Exclude

None

nvidia_application_profiles2

Brief

None

Description

None

Module

dump

Path

/etc/nvidia/nvidia-application-profiles-rc.d/*

Exclude

None

nvidia_application_profiles3

Brief

None

Description

None

Module

dump

Path

/usr/share/nvidia/nvidia-application-profiles-*-rc

Exclude

None

nvidia_dcshwapikey_conf

Brief

None

Description

None

Module

dump

Path

/etc/nvidia/gridd.conf*

Exclude

None

nvidia_dcshwapikey_license

Brief

None

Description

None

Module

dump

Path

/etc/nvidia/license/NV-*

Exclude

None

nvidia_driver_gpu_information

Brief

None

Description

None

Module

dump

Path

/proc/driver/nvidia/gpus/*/information

Exclude

None

nvidia_driver_gpu_registry

Brief

None

Description

None

Module

dump

Path

/proc/driver/nvidia/gpus/*/registry

Exclude

None

nvidia_driver_params

Brief

None

Description

None

Module

dump

Path

/proc/driver/nvidia/params

Exclude

None

nvidia_driver_registry

Brief

None

Description

None

Module

dump

Path

/proc/driver/nvidia/registry

Exclude

None

nvidia_driver_version

Brief

None

Description

None

Module

dump

Path

/proc/driver/nvidia/version

Exclude

None

nvidia_driver_warnings

Brief

None

Description

None

Module

dump

Path

/proc/driver/nvidia/warnings/*

Exclude

None

nvidia_fs_peer_affinity

Brief

None

Description

None

Module

dump

Path

/proc/driver/nvidia-fs/peer_affinity

Exclude

None

nvidia_fs_peer_distance

Brief

None

Description

None

Module

dump

Path

/proc/driver/nvidia-fs/peer_distance

Exclude

None

nvidia_fs_stats

Brief

None

Description

None

Module

dump

Path

/proc/driver/nvidia-fs/stats

Exclude

None

nvidia_fw_log

Brief

None

Description

None

Module

dump

Path

/var/log/nvidia-fw.log

Exclude

None

nvidia_installer_log

Brief

None

Description

None

Module

dump

Path

/var/log/nvidia-installer.log

Exclude

None

nvidia_uninstall_log

Brief

None

Description

None

Module

dump

Path

/var/log/nvidia-uninstall.log

Exclude

None

nvsm_log

Brief

None

Description

None

Module

dump

Path

/var/log/nvsm/*log*

Exclude

None

pci

Brief

None

Description

None

Module

dump

Path

/proc/pci

Exclude

None

pegasus_dbglog

Brief

None

Description

None

Module

dump

Path

None

Exclude

None

pegasus_syslog

Brief

None

Description

None

Module

dump

Path

None

Exclude

None

redhat_release

Brief

None

Description

None

Module

dump

Path

/etc/redhat-release

Exclude

None

redhat_version

Brief

None

Description

None

Module

dump

Path

/etc/redhat_version

Exclude

None

release

Brief

None

Description

None

Module

dump

Path

/etc/release

Exclude

None

remote_bmc_sel_log

Brief

None

Description

None

Module

dump

Path

/var/log/remote_bmc_sel.log

Exclude

None

run_netplan

Brief

None

Description

None

Module

dump

Path

/run/netplan/*.yaml

Exclude

None

slackware_release

Brief

None

Description

None

Module

dump

Path

/etc/slackware-release

Exclude

None

slackware_version

Brief

None

Description

None

Module

dump

Path

/etc/slackware-version

Exclude

None

sosreport

Brief

None

Description

None

Module

sosreport

Path

None

Exclude

None

sun_release

Brief

None

Description

None

Module

dump

Path

/etc/sun-release

Exclude

None

syslog

Brief

None

Description

None

Module

dump

Path

/var/log/syslog*

Exclude

None

system_map

Brief

None

Description

None

Module

dump

Path

/boot/System.map*

Exclude

None

td_agent_log

Brief

None

Description

None

Module

dump

Path

/var/log/td-agent/td-agent.*

Exclude

\d\.gz\Z

upstart_log

Brief

None

Description

None

Module

dump

Path

/var/log/upstart/*

Exclude

\d\.gz\Z

var_lib_dhcp

Brief

None

Description

None

Module

dump

Path

/var/lib/dhcp/*

Exclude

None

version

Brief

None

Description

None

Module

dump

Path

/proc/version

Exclude

None

xfree86_log

Brief

None

Description

None

Module

dump

Path

/var/log/XFree86.*.log*

Exclude

\.old\Z

xorg_log

Brief

None

Description

None

Module

dump

Path

/var/log/Xorg.*.log*

Exclude

\.old\Z

yellowdog_release

Brief

None

Description

None

Module

dump

Path

/etc/yellowdog-release

Exclude

None

zookeeper_log

Brief

None

Description

None

Module

dump

Path

/var/log/zookeeper/*.log

Exclude

\d\.gz\Z

Commands Executed

bash

Brief

None

Description

None

Module

dump

Command-line

bash --version 

Timeout

300 seconds.

bash_hello_world

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/hello.bash 

Timeout

300 seconds.

collect_fru

Brief

Run ipmitool fru print command

Description

This runs the “ipmitool fru” command to obtain FRU (field replaceable unit) information from the BMC (baseboard management controller). FRU information is important for keeping inventory of the components installed on the system and their serial numbers.

Module

fru

Used By

Command-line

ipmitool fru print 

Timeout

300 seconds.

collect_nvsm

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/collect_nvsm.py 

Timeout

300 seconds.

collect_usb_sysfs

Brief

Collect information for connected USB devices from sysfs

Description

None

Module

usb

Command-line

echo TODO 

Timeout

300 seconds.

date

Brief

None

Description

None

Module

dump

Command-line

date 

Timeout

300 seconds.

date_utc

Brief

None

Description

None

Module

dump

Command-line

date --utc 

Timeout

300 seconds.

dcc_ipmitool_sel_writeraw

Brief

None

Description

None

Module

dump

Command-line

ipmitool -I lanplus -H 192.168.1.42 -U nvsm-admin -P None sel writeraw \
    bin_file 

Timeout

300 seconds.

dcc_passgen

Brief

Run dcc_passgen tool

Description

Run dcc_passgen for DCC BMC. This command requires superuser privileges.

Module

dcs_modules

Command-line

dcc_passgen 

Timeout

300 seconds.

dcs_cam_camera_mapping

Brief

None

Description

None

Module

dump

Command-line

python3 ${NVSMHEALTH_DUMP_TOOLS}/dcs_camera_info.py --cmd camera_mapping \
    --display 0 

Timeout

300 seconds.

dcs_cam_gpus_all

Brief

None

Description

None

Module

dump

Command-line

python3 ${NVSMHEALTH_DUMP_TOOLS}/dcs_camera_info.py --cmd gpus_all --display 0 

Timeout

300 seconds.

dcs_cam_query_gpu_info

Brief

None

Description

None

Module

dump

Command-line

python3 ${NVSMHEALTH_DUMP_TOOLS}/dcs_camera_info.py --cmd query_gpu_info 

Timeout

300 seconds.

df

Brief

None

Description

None

Module

dump

Command-line

df -k 

Timeout

300 seconds.

dmesg

Brief

None

Description

None

Module

dump

Command-line

dmesg 

Timeout

300 seconds.

dmidecode

Brief

None

Description

None

Module

dump

Command-line

dmidecode 

Timeout

300 seconds.

docker_info

Brief

None

Description

None

Module

dump

Command-line

docker info 

Timeout

300 seconds.

docker_ps

Brief

None

Description

None

Module

dump

Command-line

docker ps 

Timeout

300 seconds.

dpkg_list

Brief

None

Description

None

Module

dump

Command-line

dpkg --list 

Timeout

300 seconds.

dpkg_verify

Brief

None

Description

None

Module

dump

Command-line

dpkg --verify 

Timeout

300 seconds.

ethtool

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/ethtool.sh 

Timeout

300 seconds.

fru_dcc_version

Brief

Determine system version using DCC version stored in DCS FRU

Description

This command reads the DCC version stored in the DCS FRU table by reading its value using ipmitool. On C1.1 systems this will be “1.1”. This command does not require superuser privileges.

Module

sysfs

Command-line

ipmitool fru print 0 | grep -E 'Product Extra(\s+):' | head -n 3 | awk 'NR==3 \
    {{print $4}}' 

Timeout

300 seconds.

gcc

Brief

None

Description

None

Module

dump

Command-line

gcc -v 

Timeout

300 seconds.

gds_check

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_GDS_CUDA_PATH}/gds/tools/gdscheck.py -pvV 

Timeout

300 seconds.

gds_stack_trace

Brief

None

Description

None

Module

dump

Command-line

for x in `nvidia-smi --query-compute-apps=pid --format=csv,noheader` ; do cat \
    /proc/$x/task/*/stack; done 

Timeout

300 seconds.

gds_stats

Brief

None

Description

None

Module

dump

Command-line

for x in `nvidia-smi --query-compute-apps=pid --format=csv,noheader` ; do \
    ${NVSMHEALTH_DUMP_GDS_CUDA_PATH}/gds/tools/gds_stats -p $x -l 3; done 

Timeout

300 seconds.

glxinfo

Brief

None

Description

None

Module

dump

Command-line

ldd /usr/bin/glxinfo 

Timeout

300 seconds.

gpp

Brief

None

Description

None

Module

dump

Command-line

g++ -v 

Timeout

300 seconds.

hca_self_test

Brief

None

Description

None

Module

dump

Command-line

hca_self_test.ofed 

Timeout

300 seconds.

ibdev2netdev

Brief

None

Description

None

Module

dump

Command-line

ibdev2netdev 

Timeout

300 seconds.

ibstat

Brief

None

Description

None

Module

dump

Command-line

ibstat 

Timeout

300 seconds.

ibstatus

Brief

None

Description

None

Module

dump

Command-line

ibstatus 

Timeout

300 seconds.

ibv_devinfo

Brief

None

Description

None

Module

dump

Command-line

ibv_devinfo 

Timeout

300 seconds.

ip_addr_show

Brief

None

Description

None

Module

dump

Command-line

ip addr show 

Timeout

300 seconds.

ip_route_show

Brief

None

Description

None

Module

dump

Command-line

ip route show 

Timeout

300 seconds.

ipmitool_bmc_info

Brief

None

Description

None

Module

dump

Command-line

ipmitool bmc info 

Timeout

300 seconds.

ipmitool_chassis_status

Brief

None

Description

None

Module

dump

Command-line

ipmitool chassis status 

Timeout

300 seconds.

ipmitool_fru

Brief

None

Description

None

Module

dump

Command-line

ipmitool fru 

Timeout

300 seconds.

ipmitool_lan_print

Brief

None

Description

None

Module

dump

Command-line

ipmitool lan print 1 

Timeout

300 seconds.

ipmitool_power_led_status

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/ipmitool_power_led_status.sh 

Timeout

300 seconds.

ipmitool_raw

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/ipmitool_raw.sh 

Timeout

300 seconds.

ipmitool_raw_dgxa100

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/ipmitool_raw_dgxa100.sh 

Timeout

300 seconds.

ipmitool_sdr

Brief

None

Description

None

Module

dump

Command-line

ipmitool sdr 

Timeout

300 seconds.

ipmitool_sdr_dump

Brief

None

Description

None

Module

dump

Command-line

out=$(mktemp); ipmitool sdr dump $out > /dev/null 2>&1; cat $out 

Timeout

300 seconds.

ipmitool_sdr_info

Brief

None

Description

None

Module

dump

Command-line

ipmitool sdr info 

Timeout

300 seconds.

ipmitool_sel_elist

Brief

None

Description

None

Module

dump

Command-line

ipmitool sel elist 

Timeout

300 seconds.

ipmitool_sel_info

Brief

None

Description

None

Module

dump

Command-line

ipmitool sel info 

Timeout

300 seconds.

ipmitool_sel_list

Brief

None

Description

None

Module

dump

Command-line

ipmitool sel list 

Timeout

300 seconds.

ipmitool_sel_time_get

Brief

None

Description

None

Module

dump

Command-line

ipmitool sel time get 

Timeout

300 seconds.

ipmitool_sel_writeraw

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/sel_writeraw.sh 

Timeout

300 seconds.

ipmitool_user_list_1

Brief

None

Description

None

Module

dump

Command-line

ipmitool user list 1 

Timeout

300 seconds.

java

Brief

None

Description

None

Module

dump

Command-line

java -version 

Timeout

300 seconds.

java_hello_world

Brief

None

Description

None

Module

dump

Command-line

java -classpath ${NVSMHEALTH_DUMP_TOOLS}/tools hello 

Timeout

300 seconds.

ldconfig

Brief

None

Description

None

Module

dump

Command-line

ldconfig -p 

Timeout

300 seconds.

lsb_release

Brief

None

Description

None

Module

dump

Command-line

lsb_release -a 

Timeout

300 seconds.

lsblk

Brief

None

Description

None

Module

dump

Command-line

lsblk 

Timeout

300 seconds.

lsblk_discard

Brief

None

Description

None

Module

dump

Command-line

lsblk --discard 

Timeout

300 seconds.

lsblk_topology

Brief

None

Description

None

Module

dump

Command-line

lsblk --topology 

Timeout

300 seconds.

lscpu

Brief

None

Description

None

Module

dump

Command-line

lscpu 

Timeout

300 seconds.

lshw

Brief

None

Description

None

Module

dump

Command-line

lshw 

Timeout

300 seconds.

lslocks

Brief

None

Description

None

Module

dump

Command-line

lslocks 

Timeout

300 seconds.

lsmod

Brief

None

Description

None

Module

dump

Command-line

lsmod 

Timeout

300 seconds.

lspci

Brief

None

Description

None

Module

dump

Command-line

lspci -vvn 

Timeout

300 seconds.

lspci_plain

Brief

None

Description

None

Module

dump

Command-line

lspci 

Timeout

300 seconds.

lspci_tree

Brief

None

Description

None

Module

dump

Command-line

lspci -t 

Timeout

300 seconds.

lsusb

Brief

None

Description

None

Module

dump

Command-line

lsusb 

Timeout

300 seconds.

lsusb_tree

Brief

None

Description

None

Module

dump

Command-line

lsusb -t 

Timeout

300 seconds.

lsusb_verbose

Brief

None

Description

None

Module

dump

Command-line

lsusb --verbose 

Timeout

300 seconds.

mdadm_detail

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/mdadm-detail.sh 

Timeout

300 seconds.

mdadm_examine

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/mdadm-examine.sh 

Timeout

300 seconds.

mlx_fetch_arm_log

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/mlnx_arm_logs.sh 

Timeout

300 seconds.

mlxcables

Brief

None

Description

None

Module

dump

Command-line

mst start && mst cable add && mlxcables 

Timeout

300 seconds.

modinfo

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/modinfo.sh 

Timeout

300 seconds.

mount

Brief

None

Description

None

Module

dump

Command-line

mount 

Timeout

300 seconds.

ntpq

Brief

None

Description

None

Module

dump

Command-line

ntpq -p 

Timeout

300 seconds.

numactl

Brief

None

Description

None

Module

dump

Command-line

numactl --hardware 

Timeout

300 seconds.

nvcc

Brief

None

Description

None

Module

dump

Command-line

nvcc --version 

Timeout

300 seconds.

nvidia_address_text

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/nvidia_address_text.py 

Timeout

300 seconds.

nvidia_debugdump

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/nvidia-debugdump.sh 

Timeout

300 seconds.

nvidia_dkms_log

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/nvidia-dkms-log.sh 

Timeout

300 seconds.

nvidia_driver_ko

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/nvidia_driver_ko.py 

Timeout

300 seconds.

nvidia_settings

Brief

None

Description

None

Module

dump

Command-line

nvidia-settings -q all 

Timeout

300 seconds.

nvidia_smi

Brief

None

Description

None

Module

dump

Command-line

nvidia-smi 

Timeout

300 seconds.

nvidia_smi_query

Brief

None

Description

None

Module

dump

Command-line

nvidia-smi -q 

Timeout

300 seconds.

nvidia_smi_query_unit

Brief

None

Description

None

Module

dump

Command-line

nvidia-smi -q -u 

Timeout

300 seconds.

nvidia_smi_topo

Brief

None

Description

None

Module

dump

Command-line

nvidia-smi topo -m 

Timeout

300 seconds.

nvidia_smi_xml

Brief

None

Description

None

Module

dump

Command-line

nvidia-smi -q -x 

Timeout

300 seconds.

nvidia_vm_health_check_show

Brief

None

Description

None

Module

dump

Command-line

nvidia-vm health-check show 

Timeout

300 seconds.

nvidia_vm_image_show

Brief

None

Description

None

Module

dump

Command-line

nvidia-vm image show 

Timeout

300 seconds.

nvidia_vm_resources_show

Brief

None

Description

None

Module

dump

Command-line

nvidia-vm resources show 

Timeout

300 seconds.

nvme_list

Brief

None

Description

None

Module

dump

Command-line

nvme list 

Timeout

300 seconds.

nvme_list

Brief

Collect list of NVMe devices using the nvme-cli tool

Description

None

Module

nvme

Command-line

nvme list --output-format=json 

Timeout

300 seconds.

nvme_logs

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/nvme-logs.sh 

Timeout

300 seconds.

nvsm_health_show_debug

Brief

None

Description

None

Module

dump

Command-line

nvsm-health --show --log-level=debug 

Timeout

300 seconds.

nvsm_show

Brief

None

Description

None

Module

dump

Command-line

nvsm show -level all 

Timeout

300 seconds.

nvsm_show_alerts

Brief

None

Description

None

Module

dump

Command-line

nvsm show alerts 

Timeout

300 seconds.

nvsm_show_debug

Brief

None

Description

None

Module

dump

Command-line

nvsm --log-level=debug show -level all 

Timeout

300 seconds.

ofed_info

Brief

None

Description

None

Module

dump

Command-line

ofed_info 

Timeout

300 seconds.

perl

Brief

None

Description

None

Module

dump

Command-line

perl -v 

Timeout

300 seconds.

perl_hello_world

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/hello.pl 

Timeout

300 seconds.

ping_compute

Brief

None

Description

None

Module

dump

Command-line

ping -w 5 ngc.nvidia.com 

Timeout

300 seconds.

printenv

Brief

None

Description

None

Module

dump

Command-line

printenv 

Timeout

300 seconds.

ps

Brief

None

Description

None

Module

dump

Command-line

ps -wwo pid,uid,pcpu,pmem,etime,state,ppid,user,args --pid 2 --ppid 2 \
    --deselect 

Timeout

300 seconds.

ps_aux

Brief

None

Description

None

Module

dump

Command-line

ps aux 

Timeout

300 seconds.

psu_info_dgx1

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/psu_info_dgx1.sh 

Timeout

300 seconds.

python

Brief

None

Description

None

Module

dump

Command-line

python --version 

Timeout

300 seconds.

python_hello_world

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/hello.py 

Timeout

300 seconds.

run_bmc_boot_slot_task

Brief

Run ipmitool raw 0x3C 0x3 0x0

Description

Get the BMC boot slot. This command requires superuser privileges.

Module

cec_info

Command-line

ipmitool raw 0x3C 0x3 0x0 

Timeout

300 seconds.

run_cec_boot_status

Brief

Run ipmitool raw 0x3C 0x68 0x00

Description

Get boot status. This command requires superuser privileges.

Module

cec_info

Command-line

ipmitool raw 0x3C 0x68 0x00 

Timeout

300 seconds.

run_cec_version

Brief

Run ipmitool raw 0x3C 0xF 0x9

Description

Get CEC version. This command requires superuser privileges.

Module

cec_info

Command-line

ipmitool raw 0x3C 0xF 0x9 

Timeout

300 seconds.

run_dmidecode

Brief

Run the dmidecode command

Description

Verify system as described by SMBIOS/DMI using the dmidecode tool

Module

dmidecode

Command-line

dmidecode 

Timeout

300 seconds.

run_dmidecode_memory

Brief

Run the dmidecode command

Description

Run the “dmidecode” command to get memory DMI type information. Some flags are added to output in a machine-readable format. This command does not require superuser privileges.

Module

dmidecode

Command-line

dmidecode --type memory 

Timeout

300 seconds.

run_dpkg_grep_kvm

Brief

Run dpkg list and grep for kvm package

Description

None

Module

kvm

Used By

Command-line

bash -c "dpkg -l | grep -c dgx-kvm-sw" 

Timeout

300 seconds.

run_gpu_monitor_status

Brief

Execute GET on nvsm_core

Description

This runs the “nvsm_core --mode=client GET /nvsm/v1/Systems/1/GPUs” command to obtain gpumonitor status information.

Module

vgpu

Command-line

nvsm_core --mode=client GET /nvsm/v1/Systems/1/GPUs 

Timeout

300 seconds.

run_ipmi_fru

Brief

Run ipmitool fru print command

Description

This runs the “ipmitool fru” command to obtain FRU (field replaceable unit) information from the BMC (baseboard management controller). FRU information is important for keeping inventory of the components installed on the system and their serial numbers.

Module

ipmitool

Depends On

Command-line

ipmitool fru print 

Timeout

300 seconds.

run_ipmi_getenables

Brief

Run ipmitool mc getenables command

Description

Check BMC status with ipmitool. This command requires superuser privileges.

Module

ipmitool

Depends On

Command-line

ipmitool mc getenables 

Timeout

300 seconds.

run_ipmi_info

Brief

Run ipmitool mc info command

Description

Check BMC status with ipmitool. This command requires superuser privileges.

Command-line

ipmitool mc info 

Timeout

300 seconds.

run_ipmi_sdr_elist

Brief

Run ipmitool sdr elist command

Description

Check BMC BOM devices with ipmitool. This command requires superuser privileges.

Module

ipmitool

Command-line

ipmitool sdr elist 

Timeout

300 seconds.

run_ipmi_sensor

Brief

Run ipmitool sensor command

Description

Check BMC sensor status with ipmitool. This command requires superuser privileges.

Module

ipmitool

Command-line

ipmitool sensor 

Timeout

300 seconds.

run_ipmitool

Brief

Run the ipmitool command

Description

This simply runs the “ipmitool” command to make sure that ipmitool is able to access the BMC (baseboard management controller).

Module

bmc

Command-line

ipmitool 

Timeout

300 seconds.

run_lsblk_scsi_device_info

Brief

Run the lsblk utility

Description

Run the “lsblk” utility to get info for SCSI block devices. The output is requested in key="value" pairs format (the -P flag).

Module

lsblk

Command-line

lsblk -S -P -o NAME,HCTL,TYPE,VENDOR,MODEL,REV,TRAN 

Timeout

300 seconds.

run_lscpu

Brief

Run lscpu command

Description

Verify hyperthreading and NUMA are enabled

Module

lscpu

Used By

Command-line

lscpu 

Timeout

300 seconds.

run_lspci

Brief

Run the lspci command

Description

Run the “lspci” command to list PCI devices. Some flags are added such that lspci output is printed in a machine-readable format. This command does not require superuser privileges.

Module

lspci

Used By

Command-line

lspci -vmm -nn 

Timeout

300 seconds.

run_lspci_n

Brief

Run the lspci command

Description

Run the “lspci” command to list PCI devices. Some flags are added such that lspci output is printed in a machine-readable format. This command does not require superuser privileges.

Module

lspci

Used By

Command-line

lspci -vmm -n 

Timeout

300 seconds.

run_lspci_verbose

Brief

Run the lspci command with verbose flags

Description

Run the “lspci” command with verbose flags to show detailed information about PCI devices. This command requires superuser privileges in order to read privileged PCI device registers. Much of the verbose output from lspci is not necessarily in a machine-readable format.

Module

lspci

Command-line

lspci -vvv -nn -D 

Timeout

300 seconds.

run_mlxfwmanager

Brief

Collect firmware version details of Mellanox devices using the Mellanox Firmware Manager

Description

None

Module

mlnx

Command-line

mlxfwmanager --query-format xml 

Timeout

300 seconds.

run_net_ifconfig

Brief

Run ifconfig command to show all network interfaces

Description

See all network interfaces

Module

net

Command-line

ifconfig -a 

Timeout

300 seconds.

run_nvidia_smi_gpu_bus_id

Brief

Collect GPUs identified with the NVIDIA System Management Interface (nvidia-smi) tool

Description

None

Module

nvidia_smi

Command-line

nvidia-smi --query-gpu=gpu_bus_id --format=csv,noheader 

Timeout

300 seconds.

run_nvidia_smi_p2p_topology

Brief

Collect the GPU peer-to-peer (P2P) topology using the nvidia-smi tool

Description

None

Module

nvidia_smi

Command-line

nvidia-smi topo -p2p rw 

Timeout

300 seconds.

run_nvidia_smi_topology

Brief

Collect GPUDirect topology using the nvidia-smi tool

Description

None

Module

nvidia_smi

Command-line

nvidia-smi topo --matrix 

Timeout

300 seconds.

run_smartctl_scan

Brief

Run the smartctl utility

Description

Run the “smartctl” utility to scan for devices. Some flags are added to output in a machine-readable format. This command requires superuser privileges.

Module

smartctl

Command-line

smartctl --scan 

Timeout

300 seconds.

run_storcli_pall

Brief

Run the storcli command

Description

None

Module

storcli

Command-line

storcli64 /c0/pall show all J 

Timeout

300 seconds.

run_storcli_vall

Brief

Run the storcli command

Description

None

Module

storcli

Command-line

storcli64 /c0/vall show all J 

Timeout

300 seconds.

run_storcli_version

Brief

Run the storcli command

Description

None

Module

storcli

Command-line

storcli64 -v -NoLog 

Timeout

300 seconds.

run_xl_info

Brief

Run the “xl info” command for XenServer information

Description

The “xl info” command prints basic information about the running XenServer hypervisor.

Module

xenserver

Used By

Command-line

xl info 

Timeout

300 seconds.

service_cachefilesd_status

Brief

None

Description

None

Module

dump

Command-line

service cachefilesd status 

Timeout

300 seconds.

service_status_all

Brief

None

Description

None

Module

dump

Command-line

service --status-all 

Timeout

300 seconds.

smartctl

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/smartctl.sh 

Timeout

300 seconds.

smartctl_scan

Brief

None

Description

None

Module

dump

Command-line

smartctl --scan 

Timeout

300 seconds.

storcli_cmds

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/storcli_cmds.sh 

Timeout

300 seconds.

sysctl

Brief

None

Description

None

Module

dump

Command-line

sysctl -a 

Timeout

300 seconds.

sysfs_dmi_bios_version

Brief

Determine BIOS version in DMI table via sysfs

Description

This command reads the BIOS version stored in the DMI table by reading its value using sysfs. The value identifies which BIOS version the system (e.g. DGX-1, DGX-2, or DGX Station) is running. This command does not require superuser privileges.

Module

sysfs

Command-line

cat /sys/devices/virtual/dmi/id/bios_version 

Timeout

300 seconds.

sysfs_dmi_product_name

Brief

Determine product name in DMI table via sysfs

Description

This command reads the product name stored in the DMI table by reading its value using sysfs. The product name is used to determine which platform NVSysinfo is running on, e.g. DGX-1, DGX-2, or DGX Station. This command does not require superuser privileges.

Module

sysfs

Command-line

cat /sys/devices/virtual/dmi/id/product_name 

Timeout

300 seconds.

sysfs_dmi_system_vendor

Brief

Determine system vendor in DMI table via sysfs

Description

This command reads the system vendor name (sometimes also called “Manufacturer”) stored in the DMI table by reading its value using sysfs. On DGX systems this will be “NVIDIA”, but it might be some other string depending on the system. This command does not require superuser privileges.

Module

sysfs

Command-line

cat /sys/devices/virtual/dmi/id/sys_vendor 

Timeout

300 seconds.

timedatectl_status

Brief

None

Description

None

Module

dump

Command-line

timedatectl status 

Timeout

300 seconds.

top

Brief

None

Description

None

Module

dump

Command-line

top -b -n 5 

Timeout

300 seconds.

ulimit

Brief

None

Description

None

Module

dump

Command-line

bash -c "ulimit -a" 

Timeout

300 seconds.

uname

Brief

None

Description

None

Module

dump

Command-line

uname -a 

Timeout

300 seconds.

uptime

Brief

Run uptime command

Description

Check system uptime with the uptime utility

Module

system

Command-line

uptime -p 

Timeout

300 seconds.

virsh_list_all

Brief

None

Description

None

Module

dump

Command-line

virsh list --all 

Timeout

300 seconds.

xenserver_status_report

Brief

None

Description

None

Module

dump

Command-line

${NVSMHEALTH_DUMP_TOOLS}/xenserver-status-report.sh 

Timeout

300 seconds.

xl_info

Brief

None

Description

None

Module

dump

Command-line

xl info 

Timeout

300 seconds.

xrandr

Brief

None

Description

None

Module

dump

Command-line

xrandr --verbose 

Timeout

300 seconds.

xset

Brief

None

Description

None

Module

dump

Command-line

xset -q 

Timeout

300 seconds.

Health Checks Performed

check_blacklist_recommendations

Brief

Check DCGM for GPU blacklist recommendations

Description

None

Source Code Listing

    def run(self):
        """Check DCGM for GPU/NVSwitch blacklist recommendations.

        Does nothing when none of this task's own parameters is present (with
        a truthy value) in the platform parameters.  Otherwise the parameter
        value is appended to the helper command line, the helper's JSON output
        is parsed, and exactly one status is recorded:

        * informational - KVM mode is on, or only retired-pages-pending
          findings were reported by the nvidia-smi sub-checks
        * unknown       - helper failed, output could not be parsed, or the
          host engine is reported as no longer valid
        * unhealthy     - devices recommended for blacklist, or other errors
        * healthy       - none of the above
        """
        own_params = self.getParameters()
        params = self.__parameters_task.getResult()

        # The last matching platform parameter wins; its value is the list of
        # extra arguments passed to the helper script.
        check_blacklist_recommendations = False
        for param in own_params:
            if param in params:
                check_blacklist_recommendations = params[param]

        # Return if parameter is not defined
        if not check_blacklist_recommendations:
            return

        # check if kvm mode is on
        if modules.kvm.kvm_mode_on.getResult() == True:
            self.addCheckMessage('KVM mode is on, skipping blacklist recommendations check.')
            self.addInformational()
            return

        # Run the blacklist_recommendations task
        args = ['./modules/blacklist_recommendations/_gpu_blacklist_recommendations',
                '--detect', '--watches']
        args.extend(check_blacklist_recommendations)
        collect_task = tasks.RunCommand(args=args, timeout=1000)
        collect_task.run()

        # For DCGM failures, GPU blacklist recommendations can
        # exit with returncode 1, handle it gracefully
        # Return, for no response or exitcode > 1
        if (collect_task.getReturnCode() > 1) or not collect_task.getOutput():
            self.addCheckMessage('No response or error while running GPU blacklist recommendations: {}'.format(
                                    collect_task.getError()))
            self.addUnknown()
            return

        healthy = True

        try:
            result = json.loads(collect_task.getOutput())
            blacklist = result.get('blacklistedGpus', {})
            # Check for GPU/NVSwitch blacklist recommendations
            if len(blacklist) > 0:
                healthy = False
                self.addCheckMessage('Found {count} device(s) recommended for blacklist:'.format(
                    count=len(blacklist)))
            else:
                self.addCheckMessage('No devices found recommended for blacklist.')

            for entity_id in sorted(blacklist.keys()):
                details = blacklist[entity_id]
                device_uuid = details.get('UUID')
                device_bdf = details.get('BDF')
                failure_explanation = details.get('Failure Explanation')
                self.addCheckMessage('\t"GPU{entity_id}":\n'
                                     '\t"BDF": "{device_bdf}"\n'
                                     '\t"UUID": "{device_uuid}"\n'
                                     '\t"Failure Explanation": {failure_explanation}'.format(
                                     entity_id=entity_id,
                                     device_bdf=device_bdf,
                                     device_uuid=device_uuid,
                                     failure_explanation=failure_explanation))
            # Check for other errors in blacklist recommendation script
            error_list = result.get('errors', [])
            if error_list:
                nv_hostengine_running = True
                self.addCheckMessage('Errors encountered:')
                for e in error_list:
                    if 'host engine is not valid any longer' in e:
                        nv_hostengine_running = False
                    self.addCheckMessage('\t{}'.format(e))

                # If nv-hostengine is not running return as unknown
                if not nv_hostengine_running:
                    self.addUnknown()
                    return

                healthy = False

        except Exception as e:
            # Python 3 exceptions have no ``.message`` attribute; formatting
            # the exception itself yields its text and cannot fail here.
            self.addCheckMessage('Error while parsing GPU blacklist recommendations: {}'.format(e))
            self.addUnknown()
            return

        # make sure SBE page pending retirements are caught as informational,
        # as the blacklist_recommendations script ignores them as warnings
        if healthy:
            nvidia_smi_res = modules.nvidia_smi.parse_nvidia_smi.getResult()
            if nvidia_smi_res:
                for gpu, info in nvidia_smi_res.items():
                    gpu_dict = xmltodict.parse(info)
                    check_cls = modules.nvidia_smi.GpuCheckRetiredPagesPending(gpu, gpu_dict)
                    check_cls.setCallback(self.getCallback())
                    check_cls.run()
                    bad_count = check_cls.getResult()['unhealthy'] + check_cls.getResult()['unknown']
                    if bad_count:
                        healthy = False
                        self.addCheckMessage(check_cls.getTitle())
                if not healthy:
                    self.addCheckMessage(config.gpu_total_retired_pages_pending_error)
                    self.addInformational()
                    return

        if healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()

check_bom_dimms

Brief

Check Memory DIMMs devices information for consistency

Description

None

Module

bom

Source Code Listing

    def run(self):
        """Delegate the DIMM bill-of-materials comparison to ``generic_bom_check``.

        Passes this task, the parameters task, the parsed dmidecode task and
        the DIMM info/command strings from ``config``.
        """
        generic_bom_check(
            self,
            self.__parameters_task,
            self.__parse_dmidecode_task,
            config.dimms_info_str,
            config.dimms_command_str,
        )

check_bom_disk_controllers

Brief

Check Disk Controllers PCIe devices information for consistency

Description

None

Module

bom

Source Code Listing

    def run(self):
        """Delegate the disk-controller BOM comparison to ``generic_bom_check``.

        Passes this task, the parameters task, the parsed lspci task and the
        disk-controller message strings from ``config``.
        """
        generic_bom_check(
            self,
            self.__parameters_task,
            self.__parse_lspci_task,
            config.disk_controllers_info_str,
            config.disk_controllers_command_str,
            config.disk_controllers_pci_device_missing_str,
            config.pci_device_changed_str,
        )

check_bom_ethernet_controllers

Brief

Check Ethernet Controllers PCIe devices information for consistency

Description

None

Module

bom

Source Code Listing

    def run(self):
        """Compare parsed lspci Ethernet-controller data with the expected BOM.

        Does nothing when none of this task's own parameters is present in the
        platform parameters, or (after emitting the title and command
        messages) when the lspci parse task has no result.  Otherwise the
        parsed data is diffed against the expected configuration:

        * entries present only in the expected configuration -> unhealthy
        * changed values only                                -> informational
        * no such differences                                -> healthy
        """
        # Get parameters from own task
        own_params = self.getParameters()

        # Get parameters from parameter task
        params = self.__parameters_task.getResult()

        # The last matching platform parameter wins
        bom_config = None
        for param in own_params:
            if param in params:
                bom_config = params[param]
        # If parameter is not found in platform parameters - Do Nothing
        if bom_config is None:
            return
        # Print/Stream Task info and command messages
        self.title(config.ethernet_controllers_info_str)
        self.addCheckMessage(config.ethernet_controllers_command_str)

        res = self.__parse_lspci_task.getResult()
        if res is None:
            return

        # The parse task may hand back either a dict or its JSON text
        out_dict = res if isinstance(res, dict) else json.loads(res)

        # dictionary compare
        ddiff = DeepDiff(out_dict, bom_config)

        # Extracts the quoted keys from a diff path such as root['a']['b'].
        # Raw string: '\[' is an invalid escape sequence in a normal literal.
        path_keys = r"\['(.*?)'\]"

        message = ''
        result = 'Healthy'

        # "Added" relative to the parsed output means expected but not found
        if 'dictionary_item_added' in ddiff:
            for item in ddiff['dictionary_item_added']:
                key = re.findall(path_keys, item)
                message += '\n'
                message += config.ethernet_controllers_pci_device_missing_str.format(' -> '.join(key))
            result = 'UnHealthy'

        if 'values_changed' in ddiff:
            for path, value in ddiff['values_changed'].items():
                key = re.findall(path_keys, path)
                message += '\n'
                # Best effort to add additional_message_key information
                # NOTE(review): ``ref_dict`` is not defined in this method; unless
                # it is a module-level name this prefix is never added -- confirm
                # which dictionary was intended.
                try:
                    message += 'For {} with '.format(ref_dict[key[0]]['device'])
                except Exception:
                    pass
                message += config.pci_device_changed_str.format(
                        ' -> '.join(key),
                        value['old_value'],
                        value['new_value'])
            # A missing device outranks a changed value
            if result == 'Healthy':
                result = 'Informational'
        self.addCheckMessage(message)

        if result == 'Healthy':
            # Healthy check - No diffs found in bill-of-materials
            self.addHealthy()
        elif result == 'UnHealthy':
            # UnHealthy check - Print/Stream diffs found in bill-of-materials
            self.addUnHealthy()
        else:
            # Informational status - print change in config
            self.addInformational()

check_bom_gpus

Brief

Check GPUs PCIe devices information for consistency

Description

None

Module

bom

Source Code Listing

    def run(self):
        """Delegate the GPU BOM comparison to ``generic_bom_check``.

        Passes this task, the parameters task, the parsed lspci task and the
        GPU message strings from ``config``.
        """
        generic_bom_check(
            self,
            self.__parameters_task,
            self.__parse_lspci_task,
            config.gpus_info_str,
            config.gpus_command_str,
            config.gpus_pci_device_missing_str,
            config.pci_device_changed_str,
        )

check_bom_ib_controllers

Brief

Check InfiniBand controllers PCIe devices information for consistency

Description

None

Module

bom

Source Code Listing

    def run(self):
        """Delegate the IB-controller BOM comparison to ``generic_bom_check``.

        Passes this task, the parameters task, the parsed lspci task and the
        IB-controller message strings from ``config``.
        """
        generic_bom_check(
            self,
            self.__parameters_task,
            self.__parse_lspci_task,
            config.ib_controllers_info_str,
            config.ib_controllers_command_str,
            config.ib_controllers_pci_device_missing_str,
            config.pci_device_changed_str,
        )

check_bom_nvswitch

Brief

Check NVSwitch controller PCIe devices information for consistency

Description

None

Module

bom

Source Code Listing

    def run(self):
        """Delegate the NVSwitch BOM comparison to ``generic_bom_check``.

        Passes this task, the parameters task, the parsed lspci task and the
        NVSwitch message strings from ``config``.
        """
        generic_bom_check(
            self,
            self.__parameters_task,
            self.__parse_lspci_task,
            config.nvswitch_info_str,
            config.nvswitch_command_str,
            config.nvswitch_pci_device_missing_str,
            config.pci_device_changed_str,
        )

check_bom_pcie_switches

Brief

Check PCIe Switches PCIe devices information for consistency

Description

None

Module

bom

Source Code Listing

    def run(self):
        """Delegate the PCIe-switch BOM comparison to ``generic_bom_check``.

        Passes this task, the parameters task, the parsed lspci task and the
        PCIe-switch message strings from ``config``.
        """
        generic_bom_check(
            self,
            self.__parameters_task,
            self.__parse_lspci_task,
            config.pcie_switches_info_str,
            config.pcie_switches_command_str,
            config.pcie_switches_pci_device_missing_str,
            config.pci_device_changed_str,
        )

check_bom_vgas

Brief

Check VGA Controller PCIe devices information for consistency

Description

None

Module

bom

Source Code Listing

    def run(self):
        """Delegate the VGA-controller BOM comparison to ``generic_bom_check``.

        Passes this task, the parameters task, the parsed lspci task and the
        VGA message strings from ``config``.
        """
        generic_bom_check(
            self,
            self.__parameters_task,
            self.__parse_lspci_task,
            config.vgas_info_str,
            config.vgas_command_str,
            config.vgas_pci_device_missing_str,
            config.pci_device_changed_str,
        )

check_dcc_can_health

Brief

Drive Constellation: Check DCC CAN Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the DCC CAN health verdict from the hardware-health output.

        Reads the ``dcc_can_health`` entry and records exactly one status:
        healthy when ``test_result`` is "Healthy", otherwise unhealthy plus
        the entry's ``test_information``.  A missing or malformed entry is
        recorded as unknown.
        """
        json_output = self.__run_dcc_hardware_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_can_health = json_output['dcc_can_health']
            is_healthy = dcc_can_health['test_result'] == "Healthy"
            information = None if is_healthy else dcc_can_health['test_information']
        except Exception as e:
            logging.debug("Error fetching can health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_can_reachability

Brief

Drive Constellation: Check DCC CAN reachability Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the CAN reachability verdict from the application-config output.

        Reads the ``can_reachability`` entry and records exactly one status:
        healthy when ``test_result`` is "Healthy", otherwise unhealthy plus
        the entry's ``test_information``.  A missing or malformed entry is
        recorded as unknown.
        """
        json_output = self.__run_check_application_config_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_can_reachability = json_output['can_reachability']
            is_healthy = dcc_can_reachability['test_result'] == "Healthy"
            information = None if is_healthy else dcc_can_reachability['test_information']
        except Exception as e:
            logging.debug("Error fetching DCC CAN health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_display_configuration

Brief

Drive Constellation: Check DCC Display Configuration Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the display configuration verdict from the application-config output.

        Reads the ``display_configuration`` entry and records exactly one
        status: healthy when ``test_result`` is "Healthy", otherwise unhealthy
        plus the entry's ``test_information``.  A missing or malformed entry
        is recorded as unknown.
        """
        json_output = self.__run_check_application_config_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_display_configuration = json_output['display_configuration']
            is_healthy = dcc_display_configuration['test_result'] == "Healthy"
            information = None if is_healthy else dcc_display_configuration['test_information']
        except Exception as e:
            logging.debug("Error fetching DCC Display health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_display_synchronization

Brief

Drive Constellation: Check DCC Display Synchronization Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the display synchronization verdict from the application-config output.

        Reads the ``display_synchronization`` entry and records exactly one
        status: healthy when ``test_result`` is "Healthy", otherwise unhealthy
        plus the entry's ``test_information``.  A missing or malformed entry
        is recorded as unknown.
        """
        json_output = self.__run_check_application_config_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_display_synchronization = json_output['display_synchronization']
            is_healthy = dcc_display_synchronization['test_result'] == "Healthy"
            information = None if is_healthy else dcc_display_synchronization['test_information']
        except Exception as e:
            # Logged (not printed) like the sibling DCC checks; the text used
            # to say "ethernet", a copy-paste leftover.
            import logging
            logging.debug("Error fetching DCC display synchronization health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_ecu_tegraA_health

Brief

Drive Constellation: Check DCC ECU TegraA Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the ECU TegraA verdict from the hardware-health output.

        Emits a progress message, then reads the ``tegraA_health`` entry and
        records exactly one status: healthy when ``test_result`` is "Healthy",
        otherwise unhealthy plus the entry's ``test_information``.  A missing
        or malformed entry is recorded as unknown.
        """
        self.addCheckMessage("Checking DCC ECU TegraA Hardware health")
        json_output = self.__run_dcc_hardware_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_tegraA_health = json_output['tegraA_health']
            is_healthy = dcc_tegraA_health['test_result'] == "Healthy"
            information = None if is_healthy else dcc_tegraA_health['test_information']
        except Exception as e:
            # Logged (not printed) like the sibling DCC checks
            import logging
            logging.debug("Error fetching ecu TegraA health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_ecu_tegraA_storage_health

Brief

Drive Constellation: Check DCC ECU TegraA Storage Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the ECU TegraA storage verdict from the ECU application-health output.

        Emits a progress message, then reads the ``tegraA_health`` entry and
        records exactly one status (healthy when ``test_result`` is "Healthy",
        otherwise unhealthy).  The entry's ``test_information`` is reported in
        both cases.  A missing or malformed entry is recorded as unknown.
        """
        self.addCheckMessage("Checking DCC ECU TegraA Storage health")
        json_output = self.__run_dcc_ecu_application_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            tegra_a_health = json_output['tegraA_health']
            is_healthy = tegra_a_health['test_result'] == "Healthy"
            information = tegra_a_health['test_information']
        except Exception as e:
            # Logged (not printed) like the sibling DCC checks
            import logging
            logging.debug("Error fetching ecu TegraA health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
        self.addCheckMessage(information)

check_dcc_ecu_tegraB_health

Brief

Drive Constellation: Check DCC ECU TegraB Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the ECU TegraB verdict from the hardware-health output.

        Emits a progress message, then reads the ``tegraB_health`` entry and
        records exactly one status: healthy when ``test_result`` is "Healthy",
        otherwise unhealthy plus the entry's ``test_information``.  A missing
        or malformed entry is recorded as unknown.
        """
        self.addCheckMessage("Checking DCC ECU TegraB Hardware health")
        json_output = self.__run_dcc_hardware_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_tegraB_health = json_output['tegraB_health']
            is_healthy = dcc_tegraB_health['test_result'] == "Healthy"
            information = None if is_healthy else dcc_tegraB_health['test_information']
        except Exception as e:
            # Logged (not printed) like the sibling DCC checks
            import logging
            logging.debug("Error fetching ecu TegraB health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_ecu_tegraB_storage_health

Brief

Drive Constellation: Check DCC ECU TegraB Storage Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the ECU TegraB storage verdict from the ECU application-health output.

        Emits a progress message, then reads the ``tegraB_health`` entry and
        records exactly one status (healthy when ``test_result`` is "Healthy",
        otherwise unhealthy).  The entry's ``test_information`` is reported in
        both cases.  A missing or malformed entry is recorded as unknown.
        """
        self.addCheckMessage("Checking DCC ECU TegraB Storage health")
        json_output = self.__run_dcc_ecu_application_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_tegraB_health = json_output['tegraB_health']
            is_healthy = dcc_tegraB_health['test_result'] == "Healthy"
            information = dcc_tegraB_health['test_information']
        except Exception as e:
            # Logged (not printed) like the sibling DCC checks
            import logging
            logging.debug("Error fetching ecu TegraB health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
        self.addCheckMessage(information)

check_dcc_ethernet_health

Brief

Drive Constellation: Check DCC Ethernet Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the DCC Ethernet verdict from the hardware-health output.

        Emits a progress message, then reads the ``dcc_ethernet_health`` entry
        and records exactly one status: healthy when ``test_result`` is
        "Healthy", otherwise unhealthy plus the entry's ``test_information``.
        A missing or malformed entry is recorded as unknown.
        """
        self.addCheckMessage("Checking DCS Hardware health")
        json_output = self.__run_dcc_hardware_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_ethernet_health = json_output['dcc_ethernet_health']
            is_healthy = dcc_ethernet_health['test_result'] == "Healthy"
            information = None if is_healthy else dcc_ethernet_health['test_information']
        except Exception as e:
            logging.debug("Error fetching ethernet health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_fan_health

Brief

Drive Constellation: Check DCC Fan Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the DCC fan verdict from the hardware-health output.

        Reads the ``dcc_fan_health`` entry and records exactly one status:
        healthy when ``test_result`` is "Healthy", otherwise unhealthy plus
        the entry's ``test_information``.  A missing or malformed entry is
        recorded as unknown.
        """
        json_output = self.__run_dcc_hardware_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_fan_health = json_output['dcc_fan_health']
            is_healthy = dcc_fan_health['test_result'] == "Healthy"
            information = None if is_healthy else dcc_fan_health['test_information']
        except Exception as e:
            # The text used to say "ethernet", a copy-paste leftover
            logging.debug("Error fetching fan health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously the healthy path fell through to a
        # trailing addHealthy() and was counted twice.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_gpu_health

Brief

Drive Constellation: Check DCC GPU Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the DCC GPU verdict from the hardware-health output.

        Reads the ``dcc_gpu_health`` entry and records exactly one status:
        healthy when ``test_result`` is "Healthy", otherwise unhealthy plus
        the entry's ``test_information``.  A missing or malformed entry is
        recorded as unknown.
        """
        json_output = self.__run_dcc_hardware_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_gpu_health = json_output['dcc_gpu_health']
            is_healthy = dcc_gpu_health['test_result'] == "Healthy"
            information = None if is_healthy else dcc_gpu_health['test_information']
        except Exception as e:
            logging.debug("Error fetching gpu health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_info

Brief

Drive Constellation: Get DCC Info

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Send the ``dcc_info`` key/value pairs from the health API output.

        Each key is shown with underscores replaced by spaces and title-cased.
        Records healthy when every entry can be formatted and unhealthy when
        the section is missing or an entry cannot be formatted.  Nothing is
        recorded when the task produced no output.
        """
        json_output = self.__run_dcc_health_api_task.getOutput()
        if json_output is None:
            return

        # Format every line before recording a status: previously healthy was
        # recorded first, so a formatting failure counted the check as both
        # healthy and unhealthy.
        try:
            dcc_info = json_output['dcc_info']
            lines = [
                "{:20} : {:20}".format(key.replace('_', ' ').title(), value)
                for key, value in dcc_info.items()
            ]
        except Exception as e:
            self.addUnHealthy()
            logging.debug("Error fetching dcc info: {}".format(e))
            self.__output = None
            return

        self.addHealthy()
        for line in lines:
            self.send(line)

check_dcc_network_reachability

Brief

Drive Constellation: Check DCC Network Reachability

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the network reachability verdict from the application-config output.

        Reads the ``network_reachability`` entry and records exactly one
        status: healthy when ``test_result`` is "Healthy", otherwise unhealthy
        plus the entry's ``test_information``.  A missing or malformed entry
        is recorded as unknown.
        """
        json_output = self.__run_check_application_config_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_network_reachability = json_output['network_reachability']
            is_healthy = dcc_network_reachability['test_result'] == "Healthy"
            information = None if is_healthy else dcc_network_reachability['test_information']
        except Exception as e:
            logging.debug("Error fetching network reachability info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_serializer_configuration

Brief

Drive Constellation: Check DCC Serializer Configuration Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the serializer configuration verdict from the application-config output.

        Reads the ``serializer_configuration`` entry and records exactly one
        status: healthy when ``test_result`` is "Healthy", otherwise unhealthy
        plus the entry's ``test_information``.  A missing or malformed entry
        is recorded as unknown.
        """
        json_output = self.__run_check_application_config_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_serializer_configuration = json_output['serializer_configuration']
            is_healthy = dcc_serializer_configuration['test_result'] == "Healthy"
            information = None if is_healthy else dcc_serializer_configuration['test_information']
        except Exception as e:
            logging.debug("Error fetching serializer_configuration info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_usb_health

Brief

Drive Constellation: Check DCC USB Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the DCC USB verdict from the hardware-health output.

        Reads the ``dcc_usb_health`` entry and records exactly one status:
        healthy when ``test_result`` is "Healthy", otherwise unhealthy plus
        the entry's ``test_information``.  A missing or malformed entry is
        recorded as unknown.
        """
        json_output = self.__run_dcc_hardware_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_usb_health = json_output['dcc_usb_health']
            is_healthy = dcc_usb_health['test_result'] == "Healthy"
            information = None if is_healthy else dcc_usb_health['test_information']
        except Exception as e:
            logging.debug("Error fetching usb health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcc_usb_reachability

Brief

Drive Constellation: Check DCC USB reachability Health

Description

None

Module

dcs_modules

Source Code Listing

     def run(self):
        """Record the USB reachability verdict from the application-config output.

        Reads the ``usb_reachability`` entry and records exactly one status:
        healthy when ``test_result`` is "Healthy", otherwise unhealthy plus
        the entry's ``test_information``.  A missing or malformed entry is
        recorded as unknown.
        """
        json_output = self.__run_check_application_config_health.getOutput()
        if json_output is None:
            # NOTE(review): missing task output is still counted as healthy,
            # as before -- confirm it should not be unknown instead.
            self.addHealthy()
            return

        # Read everything first so a malformed entry cannot leave a status
        # recorded before the failure is noticed.
        try:
            dcc_usb_reachability = json_output['usb_reachability']
            is_healthy = dcc_usb_reachability['test_result'] == "Healthy"
            information = None if is_healthy else dcc_usb_reachability['test_information']
        except Exception as e:
            logging.debug("Error fetching DCC USB reachability health info: {}".format(e))
            self.addUnknown()
            return

        # One status per run: previously control fell through to a trailing
        # addHealthy() after the verdict had already been recorded.
        if is_healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()
            self.addCheckMessage(information)

check_dcs_psu_info

Brief

Drive Constellation: Check DCC PSU Info

Description

None

Module

dcs_modules

Used By

Source Code Listing

    def run(self):
        """Validate the Vendor and Model of PSU-0 and PSU-1.

        The valid values come from the ``dcs_psu_attrib_values`` platform
        parameter; when it cannot be read the check is recorded as unknown.
        Each attribute is validated by ``self.check_psu_attrib``, which is
        expected to return an empty string when nothing is wrong; every
        non-empty message is reported and makes the check unhealthy.
        """
        try:
            dcs_psu_attrib_values = self.__parameters_task.getResult()['dcs_psu_attrib_values']
        except Exception:
            # Could not get list of valid values.  ``Exception`` rather than a
            # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
            self.addCheckMessage("Could not get list of valid PSU values")
            self.addUnknown()
            return

        # Get the parsed results
        dcs_psu_results = {
            'PSU-0': self.__parse_psu0_task.getOutput(),
            'PSU-1': self.__parse_psu1_task.getOutput(),
        }

        healthy = True
        for attrib in ('Vendor', 'Model'):
            msg = self.check_psu_attrib(attrib, dcs_psu_attrib_values, dcs_psu_results)
            if msg != '':
                self.addCheckMessage(msg)
                healthy = False

        if healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()

check_dimm_part_number

Brief

Verify DIMM part number

Description

None

Module

dimm

Source Code Listing

    def run(self):
        """Verify each expected DIMM reports one of the accepted part numbers.

        Does nothing when the ``dimm_bom`` or ``dimm_part_number`` platform
        parameter cannot be read.  Records unknown when the DIMM task has no
        result, unhealthy when an expected DIMM that is present has a
        missing or unaccepted part number, and healthy otherwise.
        """
        # Return if parameter is not defined.  ``Exception`` rather than a
        # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        try:
            parameters = self.__parameters_task.getResult()
            dimm_boms = parameters['dimm_bom']
            part_number = parameters['dimm_part_number']
        except Exception:
            return

        # Unknown check for no result from dmidecode
        res = self.__dimm_task.getResult()
        if not res:
            self.addCheckMessage('No result from parse dmidecode output')
            self.addUnknown()
            return

        healthy = True
        for dimm in dimm_boms:
            if dimm not in res:
                # Must be caught on checking DIMMs
                continue
            if 'part_number' not in res[dimm]:
                self.addCheckMessage('DIMM "{}" part number not found'.format(dimm))
                healthy = False
                continue
            # Compared after stripping whitespace; reported as read
            found = res[dimm]['part_number']
            if found.strip() not in part_number:
                self.addCheckMessage('Mismatch in DIMM "{}" part number, expected is "{}" found is "{}"'.format(dimm, " or ".join(part_number), found))
                healthy = False

        if healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()

check_dimm_vendors

Brief

Verify DIMM vendors

Description

None

Module

dimm

Source Code Listing

    def run(self):
        """Verify every reported DIMM vendor is in the accepted vendor list.

        Does nothing when the ``dmidecode_dimm_vendors`` platform parameter
        cannot be read.  Records unknown when the vendor task has no result,
        unhealthy when any reported vendor is not accepted, and healthy
        otherwise.
        """
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            dimm_vendors = self.__parameters_task.getResult()['dmidecode_dimm_vendors']
        except Exception:
            return

        out = self.__get_dimm_vendors_task.getResult()
        if not out:
            self.addCheckMessage('ERROR: Could not parse dmidecode output')
            self.addUnknown()
            return

        healthy = True
        for dimm in out:
            if dimm not in dimm_vendors:
                self.addCheckMessage('Unknown DIMM vendor "{value}"'.format(value=dimm))
                healthy = False

        if healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()

check_ecu_info

Brief

Drive Constellation: Get ECU Info

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Send the ``ecu_info`` key/value pairs from the health API output.

        Each key is shown with underscores replaced by spaces and title-cased.
        Records healthy when every entry can be formatted and unhealthy when
        the section is missing or an entry cannot be formatted.  Nothing is
        recorded when the task produced no output.
        """
        json_output = self.__run_dcc_health_api_task.getOutput()
        if json_output is None:
            return

        # Format every line before recording a status: previously healthy was
        # recorded first, so a formatting failure counted the check as both
        # healthy and unhealthy.
        try:
            ecu_info = json_output['ecu_info']
            lines = [
                "{:20} : {:20}".format(key.replace('_', ' ').title(), value)
                for key, value in ecu_info.items()
            ]
        except Exception as e:
            self.addUnHealthy()
            # The text used to say "dcc info", a copy-paste leftover
            logging.debug("Error fetching ecu info: {}".format(e))
            self.__output = None
            return

        self.addHealthy()
        for line in lines:
            self.send(line)

check_ethernet_controller_info

Brief

None

Description

None

Module

bom

Source Code Listing

    def run(self):
        """Run a link 'speed' and a link 'width' sub-check for every configured device.

        The device list is read from the platform parameter named by
        ``self.__parameter``; nothing is done when it cannot be read.  Each
        sub-check is a ``CheckLink`` instance whose counters and messages are
        folded into this task.  This task's own title is cleared at the end.
        """
        try:
            devices = self.__parameters_task.getResult()[self.__parameter]
        except:
            # Parameter could not be read: nothing to check
            return
        # Work on a deep copy: the entries are modified below
        for device in deepcopy(devices):
            pstate_not_found = False
            if self.__is_gpu_check:
                # For GPU checks the expected speed/width are indexed by the
                # pstate looked up for the device's BDF
                bdf_pstate = self.__bdf_pstate.getResult()
                try:
                    pstate = bdf_pstate[device['bdf']]
                    device['speed'] = device['speed'][pstate]
                    device['width'] = device['width'][pstate]
                except:
                    # not able to find the pstate for this gpu bdf
                    pstate_not_found = True
            # The GPU link check is skipped (reported as informational) when
            # KVM mode is on
            kvm_mode_disabled = True
            if self.__parameter== "gpu_link_info"  and modules.kvm.kvm_mode_on.getResult() == True:
                kvm_mode_disabled = False
            for check_type in ['speed', 'width']:

                # The expected value is read here, before the branches below
                # may overwrite device['speed'] / device['width'] with "None"
                check_cls = CheckLink(device['bdf'], device[check_type], check_type, self.__parse_lspci_task, self.__parameter)
                # The title is formatted with **device below; default the
                # name to an empty string when the entry has none
                if 'name' not in device:
                    device["name"] = ""
                check_cls.setCallback(self.getCallback())
                if kvm_mode_disabled:
                    if pstate_not_found:
                        # No expected values available: report unknown
                        # without running the sub-check
                        device['speed'] = "None"
                        device['width'] = "None"
                        check_cls.addCheckMessage("unknown pstate for the GPU[{}]".format(device['bdf']))
                        check_cls.addUnknown()
                    else:
                        check_cls.run()
                else:
                    # KVM mode: the sub-check is not run; both the sub-check
                    # and this task record an informational result
                    device['speed'] = "None"
                    device['width'] = "None"
                    check_cls.addCheckMessage('KVM mode is on, skipping check.')
                    check_cls.addInformational()
                    self.addInformational()

                check_cls.title(self.__title_str[check_type].format(**device).strip())
                if kvm_mode_disabled:
                    # Fold the sub-check's counters into this task (in KVM
                    # mode the informational result was already added above)
                    self.addHealthy(count=check_cls.getResult()['healthy'])
                    self.addUnHealthy(count=check_cls.getResult()['unhealthy'])
                    self.addUnknown(count=check_cls.getResult()['unknown'])
                    self.addInformational(count=check_cls.getResult()['informational'])
                check_cls.sendComplete()
                super().addMessages(check_cls.getMessages())

        # clear message as this task doesnt print anything
        self.title('')

check_fan_bom

Brief

Verify chassis fan presence

Description

None

Module

ipmitool

Source Code Listing

    def run(self):
        """Verify that every expected chassis fan is present in the SDR device data.

        The expected fans come from the 'fan_bom' platform parameter; if that
        lookup fails the task returns without recording a status. The SDR
        device result is compared against the expectation with
        DictionarySuperset; a non-empty comparison result is reported as a
        check message and marks the task unhealthy, otherwise it is healthy.
        """
        from nvsmhealth.lib import DictionarySuperset
        try:
            expected_fans = self.__parameters_task.getResult()['fan_bom']
        except:
            return

        observed_devices = self.__sdr_device_bom_task.getResult()
        comparator = DictionarySuperset.DictionarySuperset(
            missing_message="Could not detect presence of chassis fan {}")
        mismatch = comparator.compare(observed_devices, expected_fans)
        self.addCheckMessage("Checking output of 'ipmitool sdr elist' for expected chassis fans")
        if not mismatch:
            self.addHealthy()
            return
        self.addCheckMessage(mismatch)
        self.addUnHealthy()

check_fru_consistency

Brief

Check FRU information for consistency

Description

The FRU (field replaceable unit) information recorded in the BMC (baseboard management controller) includes serial numbers for various FRUs on the system. For any given system, these serial numbers should be consistent among all FRUs. However, it is possible for these serial numbers to become inconsistent as the result of normal maintenance (such as FRU replacement). This check makes sure serial numbers are consistent for all FRUs recorded in the BMC.

Module

ipmitool

Source Code Listing

    def run(self):
        """Check that the chassis serial recorded for each expected FRU device is consistent.

        The list of expected FRU devices is taken from the platform parameters
        (the last of this task's own parameter names that is present there).
        If none is present the task returns without recording a status.

        Outcomes:
          * unknown       - no FRU or dmidecode results, no chassis serial in
                            the dmidecode data, or an unexpected error.
          * unhealthy     - an expected FRU device is missing, or a found
                            device has no 'chassis_serial'.
          * informational - every device is present with a serial, but at
                            least one serial differs from the dmidecode
                            chassis serial.
          * healthy       - everything matches.

        A serial mismatch never downgrades an unhealthy finding: missing
        devices or missing serials stay unhealthy even when another device
        also reports a different serial.
        """
        own_params = self.getParameters()
        params = self.__parameters_task.getResult()

        fru_devices = None
        for param in own_params:
            if param in params:
                fru_devices = params[param]

        if fru_devices is None:
            return

        fru_res = self.__fru_task.getResult()
        dmidecode_res = self.__dmidecode_task.getResult()

        # Unknown check for parse tasks failure
        if not (fru_res and dmidecode_res):
            self.addCheckMessage("No results from 'ipmitool fru print' or 'dmidecode' commands")
            self.addUnknown()
            return

        result = "healthy"
        self.addCheckMessage(config.fru_command_str)

        try:
            # Check for FRU devices
            devices_not_found = [device for device in fru_devices if device not in fru_res]
            if devices_not_found:
                self.addCheckMessage("FRU devices not found '{}'".format(", ".join(devices_not_found)))
                if len(devices_not_found) == len(fru_devices):
                    # Nothing left to compare
                    self.addUnHealthy()
                    return
                result = "unhealthy"

            devices_found = [device for device in fru_devices if device not in devices_not_found]
            # Check for FRU devices chassis serial number
            chassis_serial_not_found = [device for device in devices_found
                                        if 'chassis_serial' not in fru_res[device]]
            if chassis_serial_not_found:
                self.addCheckMessage("Chassis serial number not found for FRU devices '{}'".format(
                    ", ".join(chassis_serial_not_found)))
                result = "unhealthy"

            chassis_serial_found = [device for device in devices_found
                                    if device not in chassis_serial_not_found]

            # Get expected serial number from the first dmidecode "chassis information" section
            chassis_info = [v['serial_number'] for k, v in dmidecode_res.items()
                            if 'chassis information' in k.lower() and 'serial_number' in v]
            if not chassis_info:
                self.addCheckMessage("Failed while fetching serial number from chassis information")
                self.addUnknown()
                return
            expected_serial_number = chassis_info[0]

            # Check and print the FRU devices having inconsistent chassis serial numbers
            for device in chassis_serial_found:
                observed_serial = fru_res[device]['chassis_serial']
                if observed_serial == expected_serial_number:
                    continue
                self.addCheckMessage("FRU device '{}' got chassis serial '{}' whereas expected is '{}'"
                                     .format(device, observed_serial, expected_serial_number))
                # A changed serial alone is informational, but it must not mask an
                # unhealthy finding recorded above
                if result != "unhealthy":
                    result = "info"

            if result == "unhealthy":
                self.addUnHealthy()
            elif result == "info":
                self.addInformational()
            else:
                self.addHealthy()

        except Exception:
            self.addCheckMessage("Failed while checking FRU serial number consistency")
            self.addUnknown()

check_gpu_direct_topology

Brief

Check GPUDirect Topology information for consistency

Description

None

Module

nvidia_smi

Source Code Listing

    def run(self):
        """Compare the parsed nvidia-smi GPUDirect topology with the expected topology.

        Steps, in order:
          1. If KVM mode is reported on, record informational and stop.
          2. If any GPU reports MIG enabled, rewrite the expected
             'gpu_direct_topology' table so that links involving a MIG GPU
             become 'PXB' (towards its paired GPU) or 'SYS' (everything else).
          3. Pick the expected topology from the platform parameters using this
             task's own parameter names; return silently if none is present.
          4. Record unknown if the parse task has no result; otherwise decode
             it as JSON and delegate the comparison to genericGpuTopologyCheck.

        NOTE(review): step 2 edits the table obtained from the parameters task
        in place (no copy is taken here), so the change is visible to anything
        else holding that result — confirm this is intended.
        """
        # Get parameters from own task
        own_params = self.getParameters()

        # Get parameters from parameter task
        params = self.__parameters_task.getResult()

        # Get GPU MIG State
        gpu_mig_state = self.__gpu_mig_state.getResult()

        # Check if kvm mode is on then skip
        if modules.kvm.kvm_mode_on.getResult() == True:
            self.addCheckMessage('KVM mode is on, skipping check.')
            self.addInformational()
            return

        # If GPU State Enabled
        if gpu_mig_state != None and any(gpu_mig_state.values()):
            logging.info("MIG State Detected: Modifying to MIG Topology")
            # Same object as params['gpu_direct_topology']: the edits below modify it in place
            _gpu_direct_topology = params['gpu_direct_topology']
            num_gpus = len(gpu_mig_state.values())
            # gpu_mig_state is a dict
            # assumes its keys are the integer GPU indices 0..num_gpus-1 (they are used in
            # arithmetic and compared against range(num_gpus) below) — TODO confirm
            gpus = range(num_gpus)
            mig_enabled = [x for x, y in gpu_mig_state.items() if y == 1]
            mig_enabled_gpus = ['GPU{}'.format(x) for x in mig_enabled]
            # Partner of each MIG GPU: odd index -> index-1, even index -> index+1 (pairs 0/1, 2/3, ...)
            pxb_enabled_gpus = ['GPU{}'.format(
                y) for y in [x-1 if x % 2 != 0 else x+1 for x in mig_enabled]]
            non_mig_gpus = ['GPU{}'.format(x)
                            for x in gpus if x not in mig_enabled]

            # Go over all the MIG enabled GPUs first
            for index, mig_gpu in enumerate(mig_enabled_gpus):
                for k, v in _gpu_direct_topology[mig_gpu].items():
                    if k == mig_gpu:
                        continue  # Already marked as X
                    elif k == pxb_enabled_gpus[index]:
                        _gpu_direct_topology[mig_gpu][k] = 'PXB'
                    else:
                        _gpu_direct_topology[mig_gpu][k] = 'SYS'

            #Go over non-mig gpus next
            for index, gpu in enumerate(non_mig_gpus):
                for k, v in _gpu_direct_topology[gpu].items():
                    if k == gpu:
                        continue
                    elif k in mig_enabled_gpus and gpu in pxb_enabled_gpus:
                        # Only the MIG GPU this GPU is the partner of gets 'PXB'
                        # (same position in both lists); other MIG GPUs get 'SYS'
                        if mig_enabled_gpus.index(k) == pxb_enabled_gpus.index(gpu):
                            _gpu_direct_topology[gpu][k] = 'PXB'
                        else:
                            _gpu_direct_topology[gpu][k] = 'SYS'
                    elif k in mig_enabled_gpus:
                        _gpu_direct_topology[gpu][k] = 'SYS'
                    # Links between two non-MIG GPUs are left untouched

            params['gpu_direct_topology'] = _gpu_direct_topology

        # The last of this task's own parameter names found in the platform parameters wins
        expected_topology = None
        for param, req in own_params.items():
            if param in params.keys():
               expected_topology = params[param]
        # If parameter is not found in platform parameters - Do Nothing
        if expected_topology == None:
            return

        # Print/Stream Task info and command messages
        self.addCheckMessage(config.gpu_direct_topology_command_str)

        # Unknown check for no result from nvidia-smi topology parse task
        if not self.__parse_task.getResult():
            self.addCheckMessage(
                'No result for GPUDirect topology information gathered from nvidia-smi tool')
            self.addUnknown()
            return

        try:
            # Compare task output with expected config
            topology = json.loads(self.__parse_task.getResult())
            healthy, message = genericGpuTopologyCheck(
                topology, expected_topology)
            if not healthy:
                self.addCheckMessage(message)
                self.addUnHealthy()
            else:
                self.addHealthy()
        except:
            # Any failure while decoding or comparing is reported as unknown
            self.addCheckMessage('Error while checking gpu direct topology')
            self.addUnknown()  # Unknown check

check_gpu_p2p_topology

Brief

Check GPUs p2p Topology information for consistency

Description

None

Module

nvidia_smi

Source Code Listing

    def run(self):
        """Compare the parsed nvidia-smi GPU p2p topology with the expected topology.

        The expected topology is looked up in the platform parameters using
        this task's own parameter names (the last match wins); when none is
        present the task returns without recording a status.

        Outcomes:
          * informational - KVM mode is on, the check is skipped.
          * unknown       - the parse task has no result, or decoding or
                            comparing the topology raised an error.
          * unhealthy     - genericGpuTopologyCheck reported a mismatch (its
                            message is added as a check message).
          * healthy       - the observed topology matches.
        """
        # Get parameters from own task
        own_params = self.getParameters()

        # Get parameters from parameter task
        params = self.__parameters_task.getResult()

        expected_topology = None
        for param, req in own_params.items():
            if param in params.keys():
                expected_topology = params[param]
        # If parameter is not found in platform parameters - Do Nothing
        if expected_topology is None:
            return

        if modules.kvm.kvm_mode_on.getResult() == True:
            self.addCheckMessage('KVM mode is on, skipping check.')
            self.addInformational()
            return

        # Print/Stream Task info and command messages
        self.addCheckMessage(config.gpu_p2p_topology_command_str)

        # Unknown check for no result from nvidia-smi topology parse task
        parsed = self.__parse_task.getResult()
        if not parsed:
            self.addCheckMessage(
                'No result for GPUs p2p topology information gathered from nvidia-smi tool')
            self.addUnknown()
            return

        try:
            # Compare task output with expected config
            topology = json.loads(parsed)
            healthy, message = genericGpuTopologyCheck(
                topology, expected_topology)
            if not healthy:
                self.addCheckMessage(message)
                self.addUnHealthy()
            else:
                self.addHealthy()
        except Exception:
            # Name the p2p check here; the message used to say "gpu direct topology"
            self.addCheckMessage('Error while checking gpu p2p topology')
            self.addUnknown()  # Unknown check

check_gpu_vbios_version_consistency

Brief

Verify GPUs VBIOS version consistency

Description

None

Module

nvidia_smi

Source Code Listing

    def run(self):
        """Verify that GPUs sharing a PCI device id all report the same VBIOS version.

        Records informational when KVM mode is on, unknown when the parse task
        has no result or when its per-GPU XML cannot be processed, unhealthy
        when any PCI device id is associated with more than one VBIOS version,
        and healthy otherwise.
        """
        if modules.kvm.kvm_mode_on.getResult() == True:
            self.addCheckMessage('KVM mode is on, skipping check.')
            self.addInformational()
            return

        # If nvidia-smi run got a failure, skip parsing the output
        per_gpu_xml = self.__parse_task.getResult()
        if not per_gpu_xml:
            self.addCheckMessage('No result from nvidia-smi tool')
            self.addUnknown()
            return

        # Layout: {pci_device_id: {vbios_version: ['GPU<n>', ...]}}
        versions_by_device = {}
        self.addCheckMessage(config.vbios_command_str)
        try:
            for gpu, xml_text in per_gpu_xml.items():
                gpu_node = xmltodict.parse(xml_text)['nvidia_smi_log']['gpu']
                vbios = gpu_node['vbios_version']
                device_id = gpu_node['pci']['pci_device_id']
                holders = versions_by_device.setdefault(device_id, {}).setdefault(vbios, [])
                holders.append('GPU{}'.format(gpu))

            # Only device ids carrying more than one version contribute to the report
            report_parts = []
            for versions in versions_by_device.values():
                if len(versions) <= 1:
                    continue
                for version, gpu_names in versions.items():
                    report_parts.append("GPUs: {} has VBIOS version '{}'\n".format(
                        ", ".join(gpu_names), version))
            report = "".join(report_parts)

            if report == "":
                self.addHealthy()
            else:
                self.addCheckMessage(
                    f"Different VBIOS version found on GPUs\n{report}")
                self.addUnHealthy()
        # Unknown check
        except:
            self.addCheckMessage(
                'Error while checking GPUs VBIOS version consistency')
            self.addUnknown()

check_gpus

Brief

Check GPU health: retired page count, retired pages pending, InfoROM storage version, and VBIOS version

Description

None

Module

nvidia_smi

Source Code Listing

    def run(self):
        """Report the VBIOS and InfoROM image version of every GPU found by nvidia-smi.

        For each GPU in the parsed nvidia-smi result the XML is decoded and two
        sub-checks (GpuCheckVbiosVersion, GpuCheckInforomStorageVersion) are
        used to emit the versions as messages. The VBIOS version and product
        name of each GPU are also stored in ``self.__gpu_result``.

        No health status is recorded by the active code: the retired-page
        checks that used to do so are disabled (kept as a string literal
        below). A failure while handling one GPU is logged and the loop moves
        on, leaving whatever was already collected for that GPU in the result.
        """
        own_params = self.getParameters()
        params = self.__parameters_task.getResult()

        # Only referenced by the disabled retired-page checks further down
        gpu_total_retired_page_count = None
        for param, req in own_params.items():
            if param in params.keys():
                gpu_total_retired_page_count = params[param]

        # If nvidia-smi run got a failure, skip parsing the output
        nvidia_smi_res = self.__parse_nvidia_smi_task.getResult()
        if nvidia_smi_res == None:
            return

        gpus_info = {}

        for gpu, info in nvidia_smi_res.items():
            # Entry is created before parsing, so a GPU that fails below still appears (possibly empty)
            gpus_info[gpu] = {}
            try:
                gpu_dict = xmltodict.parse(info)
                check_cls = GpuCheckVbiosVersion()
                vbios_version = gpu_dict['nvidia_smi_log']['gpu']['vbios_version']
                gpus_info[gpu]['vbios_version'] = vbios_version
                product_name = gpu_dict['nvidia_smi_log']['gpu']['product_name']
                gpus_info[gpu]['product_name'] = product_name
                bdf = gpu_dict['nvidia_smi_log']['gpu']['pci']['pci_bus_id']
                msg_str = config.gpu_vbios_version_str.format(
                    gpu_index=gpu, bdf=bdf)
                check_cls.title(msg_str)
                check_cls.setCallback(self.getCallback())
                check_cls.send(msg=vbios_version)
                super().addMessages(check_cls.getMessages())

                # Same reporting pattern for the InfoROM image version
                check_cls = GpuCheckInforomStorageVersion()
                inforom_version = gpu_dict['nvidia_smi_log']['gpu']['inforom_version']['img_version']
                msg_str = config.gpu_inforom_version_str.format(
                    gpu_index=gpu, bdf=bdf)
                check_cls.title(msg_str)
                check_cls.setCallback(self.getCallback())
                check_cls.send(msg=inforom_version)
                super().addMessages(check_cls.getMessages())

                # NVBug-2691112: Disable GPU total retired page count and retired pages pending health checks
                # These checks are already covered in blacklist_recommendations health check
                '''if gpu_total_retired_page_count != None:
                    check_cls = GpuCheckRetiredPagesCount(gpu, gpu_dict, gpu_total_retired_page_count)
                    check_cls.setCallback(self.getCallback())
                    check_cls.run()
                    self.addHealthy(count=check_cls.getResult()['healthy'])
                    self.addUnHealthy(count=check_cls.getResult()['unhealthy'])
                    self.addUnknown(count=check_cls.getResult()['unknown'])
                    check_cls.sendComplete()
                    super().addMessages(check_cls.getMessages())

                    check_cls = GpuCheckRetiredPagesPending(gpu, gpu_dict)
                    check_cls.setCallback(self.getCallback())
                    check_cls.run()
                    self.addHealthy(count=check_cls.getResult()['healthy'])
                    self.addUnHealthy(count=check_cls.getResult()['unhealthy'])
                    self.addUnknown(count=check_cls.getResult()['unknown'])
                    check_cls.sendComplete()
                    super().addMessages(check_cls.getMessages())'''
            except:
                # Best effort: log and continue with the next GPU
                logging.error(
                    "ERROR: Failed to perform GPU {} health check".format(gpu))
                pass
        # This task itself prints nothing; only the sub-check messages are shown
        self.title('')

        self.__gpu_result = gpus_info

check_instant_blacklist_recommendations

Brief

Quick health check of GPU using DCGM

Description

None

Source Code Listing

    def run(self):
        """Run the GPU blacklist-recommendations helper and report its findings.

        The helper's extra arguments come from the platform parameters (looked
        up through this task's own parameter names); when the parameter is
        absent or falsy the task returns without recording a status.

        Outcomes:
          * informational - KVM mode is on (check skipped), or the helper
                            found nothing but a GPU has retired pages pending.
          * unknown       - the helper gave no output or exited with a code
                            above 1, its output could not be parsed, or it
                            reported that the host engine is no longer valid.
          * unhealthy     - devices are recommended for blacklist, or the
                            helper reported other errors.
          * healthy       - none of the above.
        """
        own_params = self.getParameters()
        params = self.__parameters_task.getResult()

        check_blacklist_recommendations = False
        for param, req in own_params.items():
            if param in params.keys():
                check_blacklist_recommendations = params[param]

        # Return if parameter is not defined
        if not check_blacklist_recommendations:
            return

        # check if kvm mode is on
        if modules.kvm.kvm_mode_on.getResult() == True:
            self.addCheckMessage('KVM mode is on, skipping blacklist recommendations check.')
            self.addInformational()
            return

        # Run the blacklist_recommendations task
        args = ['./modules/blacklist_recommendations/_gpu_blacklist_recommendations',
                '--detect', '--watches']
        args.extend(check_blacklist_recommendations)
        collect_task = tasks.RunCommand(args=args, timeout=1000)
        collect_task.run()

        # For DCGM failures, GPU blacklist recommendations can
        # exit with returncode 1, handle it gracefully
        # Return, for no response or exitcode > 1
        if (collect_task.getReturnCode() > 1) or not collect_task.getOutput():
            self.addCheckMessage('No response or error while running GPU blacklist recommendations: {}'.format(
                                    collect_task.getError()))
            self.addUnknown()
            return

        healthy = True

        try:
            result = json.loads(collect_task.getOutput())
            blacklist = result.get('blacklistedGpus', {})
            # Check for GPU/NVSwitch blacklist recommendations
            if len(blacklist) > 0:
                healthy = False
                self.addCheckMessage('Found {count} device(s) recommended for blacklist:'.format(
                    count=len(blacklist)))
            else:
                self.addCheckMessage('No devices found recommended for blacklist.')

            for entity_id in sorted(blacklist.keys()):
                details = blacklist[entity_id]
                device_uuid = details.get('UUID')
                device_bdf = details.get('BDF')
                failure_explanation = details.get('Failure Explanation')
                self.addCheckMessage('\t"GPU{entity_id}":\n'
                                     '\t"BDF": "{device_bdf}"\n'
                                     '\t"UUID": "{device_uuid}"\n'
                                     '\t"Failure Explanation": {failure_explanation}'.format(
                                     entity_id=entity_id,
                                     device_bdf=device_bdf,
                                     device_uuid=device_uuid,
                                     failure_explanation=failure_explanation))
            # Check for other errors in blacklist recommendation script
            error_list = result.get('errors', [])
            if error_list:
                nv_hostengine_running = True
                self.addCheckMessage('Errors encountered:')
                for e in error_list:
                    if 'host engine is not valid any longer' in e:
                        nv_hostengine_running = False
                    self.addCheckMessage('\t{}'.format(e))

                # If nv-hostengine is not running return as unknown
                if not nv_hostengine_running:
                    self.addUnknown()
                    return

                healthy = False

        except Exception as e:
            # Python 3 exceptions have no `.message` attribute; formatting the exception
            # itself keeps this handler from raising AttributeError
            self.addCheckMessage('Error while parsing GPU blacklist recommendations: {}'.format(e))
            self.addUnknown()
            return

        # make sure SBE page pending retirements are caught as informational,
        # as the blacklist_recommendations script ignores them as warnings
        if healthy:
            nvidia_smi_res = modules.nvidia_smi.parse_nvidia_smi.getResult()
            if nvidia_smi_res:
                for gpu, info in nvidia_smi_res.items():
                    gpu_dict = xmltodict.parse(info)
                    check_cls = modules.nvidia_smi.GpuCheckRetiredPagesPending(gpu, gpu_dict)
                    check_cls.setCallback(self.getCallback())
                    check_cls.run()
                    bad_count = check_cls.getResult()['unhealthy'] + check_cls.getResult()['unknown']
                    if bad_count:
                        healthy = False
                        self.addCheckMessage(check_cls.getTitle())
                if not healthy:
                    self.addCheckMessage(config.gpu_total_retired_pages_pending_error)
                    self.addInformational()
                    return

        if healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()

check_ipmi_sensor_thresholds

Brief

Check BMC sensor thresholds

Description

None

Module

ipmitool

Depends On

Source Code Listing

    def run(self):
        """Flag BMC sensors whose reading is outside a critical or non-recoverable threshold.

        Sensors whose reading is not numeric, or whose type is 'discrete', are
        skipped, as are thresholds that are missing or not numeric. A reading
        must be strictly above each lower threshold and strictly below each
        upper threshold to count as within range.

        Records healthy when no violation was seen and unhealthy otherwise.
        Nothing is recorded when the parse task returns no sensors; any
        unexpected error is reported as unknown.
        """
        # (threshold field, True when the field is a lower bound, message template)
        threshold_rules = (
            ('lower_non_recoverable', True,
             '{name}: Observed value "{observed}" ({units}) below non-recoverable lower threshold "{threshold}"'),
            ('lower_critical', True,
             '{name}: Observed value "{observed}" ({units}) below critical lower threshold "{threshold}"'),
            ('upper_non_recoverable', False,
             '{name}: Observed value "{observed}" ({units}) above non-recoverable upper threshold "{threshold}"'),
            ('upper_critical', False,
             '{name}: Observed value "{observed}" ({units}) above critical upper threshold "{threshold}"'),
        )

        # Look for any sensor values that fall outside of critical thresholds
        try:
            sensors = self.__parse_ipmi_sensor_task.getResult()
            if not sensors:
                return

            violations = 0
            for sensor in sensors:
                name = sensor['name']
                reading = sensor['current_reading']
                try:
                    observed = float(reading)
                except:
                    continue  # Non-numeric reading
                units = sensor['type']
                if units.lower() == 'discrete':
                    continue

                for field, is_lower_bound, template in threshold_rules:
                    raw_threshold = sensor.get(field)
                    try:
                        threshold = float(raw_threshold)
                    except:
                        continue  # Threshold absent or non-numeric
                    if is_lower_bound:
                        within = threshold < observed
                    else:
                        within = observed < threshold
                    if within:
                        continue
                    violations += 1
                    self.addCheckMessage(template.format(
                        name=name,
                        observed=observed,
                        units=units,
                        threshold=threshold))

            self.addCheckMessage('Checked {count} sensor values against BMC thresholds.'.format(
                count=len(sensors)))

            if violations:
                self.addUnHealthy()
            else:
                self.addHealthy()

        except:
            self.addCheckMessage('No sensors found in "ipmitool sensor"')
            self.addUnknown()
            return

check_ipmitool_working

Brief

Check that the ipmitool command is working

Description

This checks the exit status of the “ipmitool” command. If the ipmitool command runs with successful exit status, then this is a good indication that ipmitool was able to communicate with the BMC (baseboard management controller).

Module

bmc

Depends On

Source Code Listing

    def run(self):
        """Intentionally perform no work.

        NOTE(review): the task description says this check reflects the exit
        status of the ipmitool command; presumably that outcome is derived
        outside this method (e.g. from the task's dependencies) — confirm.
        """
        return None

check_logical_core_count

Brief

Number of logical CPU cores [{0}]

Description

None

Module

lscpu

Source Code Listing

    def run(self):
        """Compare the logical CPU core count reported by lscpu with the expected count.

        The observed count is always written into the task title. When the
        'lscpu_number_of_cores' platform parameter is unavailable the task
        stops there without recording a status. Otherwise it records healthy
        on an exact match and unhealthy on a mismatch; if exactly half of the
        expected cores are observed and the parsed 'hyperthread' value is not
        2, a hint about Hyper-Threading being disabled is added.
        """
        lscpu = self.__parse_lscpu_task.getResult()

        observed = lscpu['CPU']
        self.title(self.getTitle().format(observed))

        try:
            expected = self.__parameters_task.getResult()['lscpu_number_of_cores']
        except:
            return

        if not (observed == expected):
            self.addCheckMessage('Observed {0} logical CPU cores when {1} cores were expected'.format(
                observed, expected))
            self.addUnHealthy()

            # When only half of the expected logical cores are observed, we
            # suspect hyperthreading might be disabled; look for the hyperthreading flag
            if observed * 2 == expected and not (lscpu['hyperthread'] == 2):
                self.addCheckMessage('It appears that Hyper-Threading is disabled.'
                                     ' Some customers choose to disable Hyper-Threading in'
                                     ' order to improve the performance of certain'
                                     ' workloads. If Hyper-Threading was intentionally'
                                     ' disabled, please ignore this message.')
            return

        self.addCheckMessage('Observed {0} logical CPU cores, matching expectations'.format(
            observed))
        self.addHealthy()

check_mdadm_disks

Brief

Status of software RAID disk superblocks

Description

None

Module

mdadm

Source Code Listing

    def run(self):
        """Check the superblock checksum reported for every software RAID disk.

        Runs only when the 'mdadm_disk_status' platform parameter exists and
        is truthy. Records unknown when the examine parse task has no result,
        unhealthy when any disk lacks a checksum entry or its checksum text
        does not contain 'correct', and healthy otherwise.
        """
        try:
            enabled = self.__parameters_task.getResult()['mdadm_disk_status']
            if not enabled:
                return
        except:
            return

        disks = self.__mdadm_parse_examine.getResult()
        if not disks:
            self.addCheckMessage("No result from parse 'mdadm --examine' for software RAID superblock")
            self.addUnknown()
            return

        self.addCheckMessage("Checking output of 'mdadm --examine' for each software RAID superblock")

        # Count the disks whose superblock checksum is missing or not reported as correct
        problems = 0
        for disk_name, details in disks.items():
            if 'checksum' in details:
                checksum = details['checksum']
                if 'correct' in checksum:
                    continue
                self.addCheckMessage('Observed failed checksum "{0}" on RAID disk "{1}"'.format(
                    checksum, disk_name))
            else:
                self.addCheckMessage('Checksum not known for RAID disk "{0}"'.format(disk_name))
            problems += 1

        # Return healthy/unhealthy status
        if problems:
            self.addUnHealthy()
        else:
            self.addHealthy()

check_mdadm_volumes

Brief

Status of software RAID volumes

Description

None

Module

mdadm

Source Code Listing

    def run(self):
        """Check the state and failed-device count of every software RAID volume.

        Runs only when the 'mdadm_volume_status' platform parameter exists and
        is truthy. Records unknown when the detail parse task has no result.
        A volume counts as a problem when its state is missing, when its state
        is not one of the accepted states (and does not mention recovering or
        resync), or when it reports more than zero failed devices. Recovering
        and resyncing volumes only produce an explanatory message.
        """
        try:
            enabled = self.__parameters_task.getResult()['mdadm_volume_status']
            if not enabled:
                return
        except:
            return

        volumes = self.__mdadm_parse_details.getResult()
        if not volumes:
            self.addCheckMessage("No result from parse 'mdadm --detail' for software RAID volume")
            self.addUnknown()
            return

        self.addCheckMessage("Checking output of 'mdadm --detail' for each software RAID volume")

        acceptable_states = ('clean', 'active', 'write-pending', 'active-idle')

        problems = 0
        for volume_name, details in volumes.items():
            # Check the volume state
            if 'state' in details:
                state = details['state'].lower()
                if 'recovering' in state:
                    self.addCheckMessage('It appears that the RAID volume "{0}" is currently'
                                         ' recovering. This is normal. However, volume performance'
                                         ' might be reduced while the volume is recovering. The'
                                         ' recovery process should complete soon, but if it does'
                                         ' not please contact NVIDIA support.'.format(volume_name))
                elif 'resync' in state:
                    self.addCheckMessage('It appears that the RAID volume "{0}" is currently'
                                         ' resyncing. This is normal. However, volume performance'
                                         ' might be reduced while the volume is resyncing. The'
                                         ' resync process should complete soon, but if it does'
                                         ' not please contact NVIDIA support.'.format(volume_name))
                elif state not in acceptable_states:
                    self.addCheckMessage('Observed unhealthy state "{0}" for RAID volume "{1}"'.format(
                        details['state'], volume_name))
                    problems += 1
            else:
                self.addCheckMessage('State not known for RAID volume "{0}"'.format(volume_name))
                problems += 1

            # Check for failed devices in the volume (a missing count is ignored)
            if 'failed_devices' in details and int(details['failed_devices']) > 0:
                self.addCheckMessage('Observed {0} failed device(s) in RAID volume "{1}"'.format(
                    details['failed_devices'], volume_name))
                problems += 1

        # Return healthy/unhealthy status
        if problems:
            self.addUnHealthy()
        else:
            self.addHealthy()

check_meminfo_mem_size

Brief

Installed memory capacity [{0:.2f}GB]

Description

None

Module

meminfo

Source Code Listing

    def run(self):
        """Compare the installed memory with the expected capacity, within a tolerance of 1.

        The expected size is read from the 'meminfo_memory_size' platform
        parameter and scaled by 9.537e-7 before being written into the task
        title (presumably a kB -> GB conversion, since 9.537e-7 is about
        1/1048576 — confirm). Returns without a status when the parameter is
        unavailable, records unknown when no memory figure was collected,
        healthy when modules.common.almost_equal accepts the two values, and
        unhealthy otherwise.
        """
        try:
            expected_raw = self.__parameters_task.getResult()['meminfo_memory_size']
        except:
            return

        expected = expected_raw * 9.537e-7
        self.title(self.getTitle().format(expected))

        installed = self.__collect_meminfo_task.getResult()
        if not installed:
            self.addUnknown()
            return

        # check for actual mem w.r.t threshold with tolerance of 1GB
        allowed_delta = 1
        if not modules.common.almost_equal(installed, expected, allowed_delta):
            self.addCheckMessage('Amount of memory is {0:.2f} GB'.format(installed))
            self.addUnHealthy()
            return
        self.addHealthy()

check_mlx_fw_version

Brief

Verify Mellanox devices firmware version consistency

Description

None

Module

mlnx

Source Code Listing

    def run(self):
        """Check that Mellanox devices do not report too many firmware versions.

        The number of distinct firmware versions tolerated is looked up in the
        platform parameters under this task's own parameter names; when it is
        missing (or 0) the check is skipped. The firmware manager output is
        parsed as XML and devices are grouped by their current firmware
        version. The check is UnHealthy when there are more groups than
        tolerated, Healthy otherwise, and Unknown when the tool failed or its
        output could not be interpreted.
        """
        own_params = self.getParameters()
        params = self.__parameters_task.getResult()

        # 0 means "parameter not defined for this platform"
        firmware_versions = 0
        for param in own_params.keys():
            if param in params.keys():
                firmware_versions = params[param]

        if firmware_versions == 0:
            return

        # If mlxfwmanager run got a failure, skip parsing the output
        mlnx_res = self.__run_task.getOutput()
        if self.__run_task.getReturnCode() or not mlnx_res:
            self.addCheckMessage('No result from mellanox firmware manager')
            self.addUnknown()
            return

        self.addCheckMessage(config.mlx_fw_ver_cmd_str)
        # Maps firmware version -> list of PCI names reporting that version
        devices_by_version = {}

        try:
            res_dict = xmltodict.parse(mlnx_res)
            devices = res_dict['Devices']['Device']
            # xmltodict returns a single mapping (not a list) when the XML
            # holds exactly one <Device> element; normalise so that systems
            # with one adapter are evaluated instead of reported as Unknown.
            if not isinstance(devices, list):
                devices = [devices]

            for device in devices:
                pci = device['@pciName']
                fw_ver = device['Versions']['FW']['@current']
                devices_by_version.setdefault(fw_ver, []).append(pci)

            # Unhealthy check for multiple versions
            if len(devices_by_version) > firmware_versions:
                res = ""
                for version, pci_names in devices_by_version.items():
                    res += "PCI device: '{}' has firmware version '{}'\n".format(
                        ", ".join(pci_names), version)
                self.addCheckMessage(f"Different firmware version found on Mellanox devices\n{res}")
                self.addUnHealthy()
            else:
                self.addHealthy()
        # Unknown check
        except Exception:
            self.addCheckMessage('Error while checking Mellanox devices firmware version consistency')
            self.addUnknown()

check_net_ping

Brief

Verify Network IP Reachability

Description

None

Module

net

Depends On

Source Code Listing

    def run(self):
        """Ping every configured address and report packet loss.

        The 'net_ping' platform parameter maps an interface name to the IP
        address to ping; when it is missing or empty the check is skipped.
        Each address is pinged with the count and timeout taken from
        ``config``. The check is Healthy only when every ping command
        succeeds and reports 0% packet loss; otherwise it is UnHealthy.
        """
        from re import search
        try:
            net_ping = self.__parameters_task.getResult()['net_ping']
            if not net_ping:
                return
        except Exception:
            return

        # Capture the complete percentage from the ping summary line, e.g.
        # "10 packets transmitted, 9 received, 10% packet loss, time 9012ms".
        # The previous pattern had a greedy ".+" in front of the capture group
        # and therefore kept only the last digit ("10%" was read as "0").
        loss_pattern = r'(\d+(?:\.\d+)?)% packet loss'

        healthy = True
        for interface, ip in net_ping.items():
            run_ping_task = tasks.RunCommand(
                    args=['ping', '-c', str(config.check_net_ping_count), '-W', str(config.check_net_ping_timeout), ip]) \
                    .title('Run ping') \
                    .describe('''Check IP Reachability via Ping''')
            try:
                run_ping_task.run()
                if run_ping_task.getReturnCode() != 0:
                    raise Exception("Unable to ping {} at {}".format(ip, interface))

                output = run_ping_task.getOutput()
            except Exception:
                self.addCheckMessage("Unable to ping {} at {}".format(ip, interface))
                healthy = False
                continue

            match = search(loss_pattern, output or '')
            if match is None:
                # No summary line found: do not crash, but do not claim success either
                self.addCheckMessage("Unable to determine packet loss on {} at {}".format(interface, ip))
                healthy = False
                continue

            packet_loss = match.group(1)
            self.addCheckMessage("Checking Packet Loss on {} at {}: {}%".format(interface, ip, packet_loss))

            if float(packet_loss) != 0:
                healthy = False

        if healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()

check_nvidia_grid_license

Brief

Drive Constellation: GRID License Status

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Report the GRID license status of licensable GPUs.

        Skipped when the 'dcs_grid_license' platform parameter is missing or
        falsy, or when no nvidia-smi result is available. Each GPU entry is
        parsed as XML. Only GPUs exposing a 'grid_licensed_product' section
        with both 'license_status' and 'licensed_product_name' are considered
        (NVBug-2795033), and only the product named
        'Quadro Virtual Data Center Workstation' is evaluated (NVBug:3145085).
        The check is Healthy when at least one such GPU was seen and none was
        found unlicensed; evaluation stops at the first unlicensed one.
        Any error while evaluating reports Unknown.
        """
        try:
            enabled = self.__parameters_task.getResult()['dcs_grid_license']
        except:
            return
        if not enabled:
            return

        nvidia_smi_res = self.__parse_nvidia_smi_task.getResult()
        if nvidia_smi_res is None:
            return

        licensed = False

        try:
            for info in nvidia_smi_res.values():
                gpu_section = xmltodict.parse(info)['nvidia_smi_log']['gpu']
                grid_product = gpu_section.get('grid_licensed_product', None)

                # Cards that cannot be licensed lack these fields entirely
                if grid_product is None:
                    continue
                if grid_product.get('license_status', None) is None:
                    continue
                if grid_product.get('licensed_product_name', None) is None:
                    continue

                product_name = grid_product['licensed_product_name'].strip()
                if product_name != 'Quadro Virtual Data Center Workstation':
                    continue

                if grid_product['license_status'] == 'Licensed':
                    licensed = True
                else:
                    # One unlicensed product is enough to fail the check
                    licensed = False
                    break

            if licensed:
                self.addHealthy()
            else:
                self.addCheckMessage("Check GRID License: Contact Nvidia")
                self.addUnHealthy()

        except:
            self.addCheckMessage("Error while performing GRID License Check")
            self.addUnknown()

check_nvidia_smi_gpu_bus_id

Brief

Verify GPU’s identified using nvidia-smi

Description

None

Module

nvidia_smi

Source Code Listing

    def run(self):
        """Verify that every expected GPU PCI address is listed by nvidia-smi.

        The expected addresses are looked up in the platform parameters under
        this task's own parameter names; the check is skipped when none is
        found. It is Informational when KVM mode is on, Unknown when the
        nvidia-smi bus-id parse task produced no result or an error occurs,
        UnHealthy when at least one expected address is missing, and Healthy
        otherwise.
        """
        # Parameter names owned by this task, and the platform parameter values
        own_params = self.getParameters()
        params = self.__parameters_task.getResult()

        expected_bdfs = None
        for param in own_params.keys():
            if param in params.keys():
                expected_bdfs = params[param]

        # Parameter not defined for this platform - do nothing
        if expected_bdfs is None:
            return

        if modules.kvm.kvm_mode_on.getResult() == True:
            self.addCheckMessage('KVM mode is on, skipping check.')
            self.addInformational()
            return

        # Unknown check for no result from nvidia-smi gpu_bus_id parse task
        if not self.__parse_task.getResult():
            self.addCheckMessage(
                'No result for gpu_bus_id information gathered from nvidia-smi tool')
            self.addUnknown()
            return

        # Print/Stream Task info and command messages
        self.addCheckMessage(
            'Checking output of "nvidia-smi --query-gpu=gpu_bus_id --format=csv,noheader" for expected GPUs')

        try:
            observed_bdfs = self.__parse_task.getResult()
            missing = [gpu for gpu in expected_bdfs.keys() if gpu not in observed_bdfs]

            if missing:
                report = ''.join(
                    '\nGPU not identified at PCI address "{}"'.format(gpu)
                    for gpu in missing)
                self.addCheckMessage(report)
                self.addUnHealthy()
            else:
                self.addHealthy()

        except:
            self.addCheckMessage('Error while identifying GPUs bus_id')
            self.addUnknown()

check_nvme_devices

Brief

Verify installed NVMe devices

Description

None

Module

nvme

Source Code Listing

    def run(self):
        """Verify that the installed NVMe devices match a supported configuration.

        The list of supported configurations (each a mapping of capacity to
        device count) is looked up in the platform parameters under this
        task's own parameter names; the check is skipped when none is found
        or when the NVSM config file disables 'use_standard_config_storage'.
        The parsed NVMe device summary (a JSON mapping of capacity to count)
        must equal one of the supported configurations for the check to be
        Healthy. Otherwise the supported and found configurations are
        printed and the check is UnHealthy. A missing parse result reports
        Unknown.
        """
        import json

        own_params = self.getParameters()
        params = self.__parameters_task.getResult()

        nvme_config = None
        for param in own_params.keys():
            if param in params.keys():
                nvme_config = params[param]

        if not nvme_config:
            return

        nvsm_config = config.read_nvsm_config_file()
        if nvsm_config is not None and not nvsm_config["use_standard_config_storage"]:
            return

        parsed_devices = self.__parse_nvme_devices.getResult()
        if not parsed_devices:
            self.addCheckMessage("No results from parse nvme devices")
            self.addUnknown()
            return

        self.addCheckMessage(config.nvme_command_str)
        devices = json.loads(parsed_devices)
        if any(devices.items() == conf.items() for conf in nvme_config):
            self.addHealthy()
            return

        # Describe each supported configuration and separate alternatives
        # with "or". The previous implementation reused the name `count` both
        # as the separator counter and as the inner loop variable, so the
        # separator was emitted (or not) depending on drive counts.
        supported = []
        for conf in nvme_config:
            supported.append(''.join(
                '"{}" NVMe device(s) with capacity "{}"\n'.format(num_devices, size)
                for size, num_devices in conf.items()))

        res = 'Supported NVMe device(s) configuration:\n'
        res += 'or \n'.join(supported)
        res += 'Found NVMe device(s) configuration:'
        for size, num_devices in devices.items():
            res += '\n"{}" NVMe device(s) with capacity "{}"'.format(
                num_devices, size)
        self.addCheckMessage(res)
        self.addUnHealthy()

check_nvme_smart_log

Brief

Check SMART status of NVMe devices

Description

None

Module

nvme

Source Code Listing

    def run(self):
        """Evaluate the SMART log of every NVMe device.

        Skipped when the 'nvme_check_smart_log' platform parameter is missing
        or falsy. For each device in the SMART log result the following are
        checked: critical warnings, available spare capacity versus its
        threshold, percentage of expected life used (above 0.9), and media
        errors. Media errors are ignored for drives whose model equals the
        optional 'skip_nvme_drive_model' parameter (NVBUG2794792). The check
        is Unknown when no SMART log result exists, UnHealthy when any
        finding was reported, and Healthy otherwise.
        """
        import json

        try:
            check = self.__parameters_task.getResult()['nvme_check_smart_log']
            if not check:
                return
        except Exception:
            return

        # NVBUG2794792: Toshiba/Kioxia CM5: medium error observed in SMART log when I/O sent to the locked drive
        skipDriveModel = None
        try:
            skipDriveModel = self.__parameters_task.getResult()[
                'skip_nvme_drive_model']
        except Exception:
            pass

        # Map device name (last path component) -> model number. Stays empty
        # when the nvme list command failed or its output cannot be parsed.
        nvme_drive_map = {}
        if not self.__nvme_list.getReturnCode() and self.__nvme_list.getOutput():
            try:
                # Read from the task held by this instance, consistent with the
                # guard above (the code previously referenced a bare `nvme_list`)
                nvme_stream = json.loads(self.__nvme_list.getOutput())
                for device in nvme_stream['Devices']:
                    deviceName = device['DevicePath'].split(os.sep)[-1]
                    nvme_drive_map[deviceName] = device['ModelNumber']
            except Exception as e:
                logging.debug("Error while parsing nvsm devices:{}".format(e))

        self.addCheckMessage(
            "Checking output of 'nvme smart-log' for each NVMe device")
        nvme_smart_log = self.__nvme_smart_log_task.getResult()
        if not nvme_smart_log:
            self.addUnknown()
            return

        healthy = True
        for name, device in nvme_smart_log.items():
            # Check for critical warnings, which indicate drive is in error state.
            # Compare as text so that a missing field (or an integer 0) is not
            # mistaken for a warning.
            critical_warnings = device.get('critical_warning', '0')
            if str(critical_warnings) != '0':
                self.addCheckMessage('Found {0} critical warning(s) on NVMe drive "{1}".'.format(
                    critical_warnings, name))
                healthy = False

            # Check that remaining spare capacity is above the threshold
            available_spare = device.get('available_spare', 1.0)
            available_spare_threshold = device.get(
                'available_spare_threshold', 0.1)
            if available_spare < available_spare_threshold:
                self.addCheckMessage('Remaining spare capacity of {remaining}% on NVMe drive "{drive}" fails to meet threshold of {threshold}%.'.format(
                    drive=name,
                    remaining=int(available_spare * 1e2),
                    threshold=int(available_spare_threshold * 1e2)))
                healthy = False

            # Check that vendor estimate of percentage used is below 90%
            used = device.get('percentage_used', 0.0)
            if used > 0.9:
                self.addCheckMessage('Over {used}% expected life used on NVMe drive "{drive}".'.format(
                    used=int(used * 1e2),
                    drive=name))
                healthy = False

            # Skip media error check for locked devices. Drives that are not
            # in the model map are still checked instead of raising KeyError.
            if skipDriveModel is not None and nvme_drive_map.get(name) == skipDriveModel:
                continue
            # Check for media errors, which occur when the controller detects
            # unrecovered data integrity errors
            media_errors = device.get('media_errors', '0')
            if str(media_errors) != '0':
                self.addCheckMessage('Found {0} media error(s) on NVMe drive "{1}".'.format(
                    media_errors, name))
                healthy = False

        if healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()

check_psu_bom

Brief

Verify chassis power supply presence

Description

None

Module

ipmitool

Source Code Listing

    def run(self):
        """Verify chassis PSU presence and look for lost AC input.

        Skipped when the 'psu_bom' platform parameter cannot be read. The SDR
        device BOM result is compared with the expected BOM; any reported
        difference makes the check UnHealthy. When there is no difference,
        the parsed SDR entries whose name is an expected BOM key containing
        'status' are inspected, and a reading containing
        'power supply ac lost' also makes the check UnHealthy
        (NVBUG-200528273).
        """
        from nvsmhealth.lib import DictionarySuperset
        try:
            psu_bom = self.__parameters_task.getResult()['psu_bom']
        except:
            return

        observed_bom = self.__sdr_device_bom_task.getResult()
        comparator = DictionarySuperset.DictionarySuperset(
                    missing_message="Could not detect presence of chassis power supply {}")
        differences = comparator.compare(observed_bom, psu_bom)
        self.addCheckMessage("Checking output of 'ipmitool sdr elist' for expected chassis PSUs")
        if differences:
            self.addCheckMessage(differences)
            self.addUnHealthy()
            return

        # Even if PSU status is ok, readings might have power supply ac lost message
        sdr_entries = self.__parse_ipmi_sdr_elist_task.getResult()
        status_sensor_names = [key for key in psu_bom.keys() if 'status' in key.lower()]
        ac_lost = False
        for entry in sdr_entries:
            if entry['name'] not in status_sensor_names:
                continue
            if 'power supply ac lost' not in entry['reading'].lower():
                continue
            self.addCheckMessage("AC input is lost, {} has reading:\n{}".format(entry['name'], entry['reading']))
            ac_lost = True

        if ac_lost:
            self.addUnHealthy()
        else:
            self.addHealthy()

check_psu_info

Brief

Check PSU Info (Vendor, Model) for Consistency

Description

None

Module

psu

Depends On

Source Code Listing

    def run(self):
        """Check that each group of collected PSU attributes holds one value.

        Reports Unknown when the PSU info collection task has a non-zero
        return code. Otherwise each element of the collected result is
        treated as a mapping; if its values are not all identical the check
        is UnHealthy and stops at that element. The label used in the message
        is the part of the last visited key before the first underscore.
        Reports Healthy when every element is consistent.
        """
        collector = self.__collect_psu_info_task
        if collector.getReturnCode():
            self.addCheckMessage("Unable to collect PSU (Vendor, Model) Information.")
            self.addUnknown()
            return

        for group in collector.getResult():
            distinct_values = set()
            for field_name, field_value in group.items():
                label = field_name.split("_")[0]
                distinct_values.add(field_value)

            if len(distinct_values) <= 1:
                continue

            self.addCheckMessage("Multiple PSU {}s found. {}".format(label, list(distinct_values)))
            self.addUnHealthy()
            return

        self.addHealthy()

check_smartctl_disk_count

Brief

Verify installed disks

Description

Verify that all of the expected disks are installed

Module

disk

Source Code Listing

    def run(self):
        """Verify that the disks found by smartctl match the expected disks.

        The expected disks are read from the platform parameters under the
        first of this task's own parameter names; the check is skipped when
        that lookup fails. The observed disks come from the disk count task
        (its result when the task is flagged as RAID, its per-size summary
        otherwise). The check is Unknown when no disks were observed,
        Healthy when the comparison reports no difference, and UnHealthy
        (with the differences as message) otherwise.
        """
        try:
            platform_params = self.__parameters_task.getResult()
            exp_disk_count = platform_params[list(self.getParameters())[0]]
        except:
            return

        self.addCheckMessage("Checking output of 'smartctl' for expected disks")
        disk_count = (self.__get_disk_count_task.getResult()
                      if self.__is_raid
                      else self.__get_disk_count_task.getDisksBySize())
        if not disk_count:
            self.addCheckMessage("No disk(s) found")
            self.addUnknown()
            return

        comparator = DictionarySuperset.DictionarySuperset(
                missing_message="No disks of capacity '{}' were found",
                changed_message="Disks of capacity '{}' were found '{}' when '{}' disk(s) were expected")
        differences = comparator.compare(disk_count, exp_disk_count)
        if differences is not None:
            # UnHealthy check - Print/Stream diffs found
            self.addUnHealthy()
            self.addCheckMessage(differences)
            return
        # Healthy check - No diffs found
        self.addHealthy()

check_smartctl_megaraid_disk_count

Brief

Verify installed MegaRAID disks

Description

Count the disks attached to the MegaRAID controller using the smartctl command

Module

disk

Source Code Listing

    def run(self):
        """Verify that the MegaRAID disks found by smartctl match expectations.

        The expected disks are read from the platform parameters under the
        first of this task's own parameter names; the check is skipped when
        that lookup fails. Observed disks are taken from the disk count task
        (its result when flagged as RAID, its per-size summary otherwise).
        Reports Unknown when nothing was observed, Healthy when the
        comparison yields no difference, and UnHealthy with the differences
        otherwise.
        """
        try:
            known_params = self.__parameters_task.getResult()
            exp_disk_count = known_params[list(self.getParameters())[0]]
        except:
            return

        self.addCheckMessage("Checking output of 'smartctl' for expected disks")
        if self.__is_raid:
            found_disks = self.__get_disk_count_task.getResult()
        else:
            found_disks = self.__get_disk_count_task.getDisksBySize()

        if not found_disks:
            self.addCheckMessage("No disk(s) found")
            self.addUnknown()
            return

        diff = DictionarySuperset.DictionarySuperset(
                missing_message="No disks of capacity '{}' were found",
                changed_message="Disks of capacity '{}' were found '{}' when '{}' disk(s) were expected"
        ).compare(found_disks, exp_disk_count)

        if diff is None:
            # Healthy check - No diffs found
            self.addHealthy()
            return

        # UnHealthy check - Print/Stream diffs found
        self.addUnHealthy()
        self.addCheckMessage(diff)

check_smartctl_ssd_brick

Brief

Check for SSD health

Description

None

Module

disk

Source Code Listing

    def run(self):
        """Look for disks whose firmware version starts with "ERRORMOD".

        Skipped when the 'smartctl_check_ssd_brick' platform parameter is
        missing or falsy, or when the smartctl BOM list is None. The first
        disk whose 'firmware_version' matches ERRORMOD (case-insensitive, at
        the start of the string) makes the check UnHealthy; otherwise it is
        Healthy. An exception while scanning reports Unknown with the
        exception text as message.
        """
        try:
            enabled = self.__parameters_task.getResult()['smartctl_check_ssd_brick']
            if not enabled:
                return
        except:
            return

        errormod_regex = re.compile(r'ERRORMOD', flags=re.IGNORECASE)
        # Name of the first disk showing the bricked firmware condition
        bricked_disk = False
        try:
            disk_bom = self.__smartctl_info_task.getBomList()

            if disk_bom is None:
                return

            for disk_name, attributes in disk_bom.items():
                # NOTE: It is likely that the disk capacity is also incorrect, but
                # we do not check for this here. Disk capacity is checked elsewhere
                # as part of the "smartctl-disk-count" check.
                if 'firmware_version' in attributes and errormod_regex.match(attributes['firmware_version']):
                    bricked_disk = disk_name
                    break
        except Exception as e:
            self.addCheckMessage(str(e))
            self.addUnknown()
            return

        # Print details message
        if not bricked_disk:
            self.addHealthy()
            self.addCheckMessage('No disks with firmware bug found')
            return
        self.addUnHealthy()
        self.addCheckMessage('Possible firmware bug on disk "{0}"'.format(bricked_disk))

check_storcli_disk_state

Brief

None

Description

None

Module

storcli

Source Code Listing

    def run(self):
        """Run one CheckDisk sub-check per configured storcli disk entry.

        Does nothing when the storcli command has a non-zero return code or
        when the 'storcli_disk_stats' platform parameter cannot be read.
        The command output is parsed as JSON and, for each configured
        (name, key, idx, exp_val) entry, a CheckDisk is run with this task's
        callback; its healthy/unhealthy/unknown counts and its messages are
        added to this task. Finally the title of this task is cleared.
        """
        if self.__run_command.getReturnCode():
            # command failed to execute
            return
        try:
            disk_entries = self.__parameters_task.getResult()['storcli_disk_stats']
        except:
            return
        storcli_json = json.loads(self.__run_command.getOutput().strip())

        for name, key, idx, exp_val in disk_entries:
            disk_check = CheckDisk(storcli_json, name, key, idx, exp_val)
            disk_check.setCallback(self.getCallback())
            disk_check.run()
            # Fold the sub-check counters into this task's own counters
            for report, bucket in ((self.addHealthy, 'healthy'),
                                   (self.addUnHealthy, 'unhealthy'),
                                   (self.addUnknown, 'unknown')):
                report(count=disk_check.getResult()[bucket])
            disk_check.sendComplete()
            super().addMessages(disk_check.getMessages())

        # clear message as this task doesnt print anything
        self.title('')

check_storcli_sanity_installed

Brief

[sanity] MegaRAID storcli utility installed

Description

None

Module

storcli

Source Code Listing

    def run(self):
        """Report whether the storcli utility could be executed.

        Skipped when the 'storcli_platform_string' platform parameter cannot
        be read. A non-zero return code of the storcli command is reported
        as UnHealthy with installation hints; otherwise the check is Healthy.
        """
        try:
            # Only the presence of the parameter matters here
            self.__parameters_task.getResult()['storcli_platform_string']
        except:
            # parameter not found
            return

        command_failed = self.__run_command.getReturnCode()
        if not command_failed:
            self.addHealthy()
            return

        self.addUnHealthy()
        for hint in ('The storcli utility does not appear to be installed',
                     'Please ensure storcli64 is installed in the /opt/MegaRAID/storcli/ directory'):
            self.addCheckMessage(hint)

check_storcli_sanity_supported

Brief

[sanity] {} BaseOS support for storcli utility

Description

None

Module

storcli

Source Code Listing

    def run(self):
        """Check whether the installed BaseOS version supports storcli.

        Skipped when the 'storcli_platform_string' platform parameter cannot
        be read. The task title is formatted with that platform string. The
        check is Unknown when the base OS version result is empty or has no
        'sw_version' entry, Healthy when the version is at least 3.1.6, and
        UnHealthy otherwise.
        """
        try:
            storcli_platform_string = self.__parameters_task.getResult()['storcli_platform_string']
        except Exception:
            # parameter not found
            return

        self.title(self.getTitle().format(storcli_platform_string))

        baseos_version = self.__base_os_version_task.getResult()
        if not baseos_version or 'sw_version' not in baseos_version:
            self.addCheckMessage("Error checking {} BaseOS version".format(storcli_platform_string))
            self.addUnknown()
            return

        # Report the version string itself; formatting the whole result
        # mapping printed its dict representation in the message.
        installed_version = baseos_version['sw_version']

        # Check DGX BaseOS version for storcli support
        message = 'Installed {} BaseOS version "{}" '.format(storcli_platform_string, installed_version)
        if Version(installed_version) >= Version('3.1.6'):
            # DGX BaseOS 3.1.6 introduces support for the storcli64 utility
            message += 'should support storcli'
            self.addHealthy()
        else:
            message += 'does not support storcli'
            self.addUnHealthy()
        self.addCheckMessage(message)

check_superuser_privileges

Brief

Check for superuser privileges

Description

This checks that NVSM Health is running with an effective user ID of 0, which indicates superuser or “root” privileges. Many NVSM Health checks require superuser privileges in order to run certain privileged commands or access privileged log files.

Module

common

Source Code Listing

    def run(self):
        """Query the effective user ID; no health result is reported yet.

        The effective user ID is compared with 0, but the outcome is
        currently not communicated anywhere (see the TODO below).
        """
        import os
        # TODO: How should classes implementing ICheck communicate
        #       health check results?
        is_superuser = os.geteuid() == 0
        if not is_superuser:
            pass

check_xenserver_logical_core_count

Brief

Number of logical CPU cores [{0}]

Description

None

Module

xenserver

Source Code Listing

    def run(self):
        """Compare the logical CPU core count reported by xl info with expectations.

        Skipped when the 'xenserver_number_of_cores' parameter cannot be
        read. Reports Unknown when no xl info result exists. The task title
        is formatted with the observed core count. The check is Healthy when
        the observed count equals the expected one. When exactly half of the
        expected cores are observed and 'threads_per_core' is not 2, a
        Hyper-Threading hint is printed with the UnHealthy result. Every other
        mismatch is UnHealthy with a generic message.
        """
        try:
            expected = modules.parameter.parameters.getResult()['xenserver_number_of_cores']
        except:
            return

        xlinfo_result = parse_xl_info.getResult()
        if not xlinfo_result:
            self.addUnknown()
            return

        observed = xlinfo_result['nr_cpus']
        self.title(self.getTitle().format(observed))

        if observed == expected:
            self.addCheckMessage('Observed {0} logical CPU cores, matching expectations'.format(
                        observed))
            self.addHealthy()
            return

        # When only half of the expected logical cores are observed, we
        # suspect hyperthreading might be disabled; the hyperthreading flag
        # is only consulted in that case.
        if observed * 2 == expected and not xlinfo_result['threads_per_core'] == 2:
            self.addCheckMessage(
                'It appears that Hyper-Threading is disabled.'
                ' Some customers choose to disable Hyper-Threading in'
                ' order to improve the performance of certain'
                ' workloads. If Hyper-Threading was intentionally'
                ' disabled, please ignore this message.')
            self.addUnHealthy()
            return

        self.addCheckMessage('Observed {0} logical CPU cores when {1} cores were expected'.format(
                    observed, expected))
        self.addUnHealthy()

dcv_check_fan_bom

Brief

Drive Constellation: Verify chassis fan presence for DCC

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Verify that every expected DCC chassis fan is present.

        Skipped when the 'dcv_fan_bom' platform parameter cannot be read.
        The DCC SDR device BOM result is compared with the expected BOM; any
        reported difference is printed and makes the check UnHealthy,
        otherwise the check is Healthy.
        """
        from nvsmhealth.lib import DictionarySuperset
        try:
            expected_fans = self.__parameters_task.getResult()['dcv_fan_bom']
        except:
            return

        observed_bom = self.__dcv_sdr_device_bom_task.getResult()
        comparator = DictionarySuperset.DictionarySuperset(
                    missing_message="Could not detect presence of DCC chassis fan: {}")
        differences = comparator.compare(observed_bom, expected_fans)
        self.addCheckMessage("Checking output of 'ipmitool sdr elist' for expected chassis fans on DCC")
        if not differences:
            self.addHealthy()
            return
        self.addCheckMessage(differences)
        self.addUnHealthy()

dcv_check_fru_consistency

Brief

Drive Constellation: Check FRU information for consistency

Description

The FRU (field replaceable unit) information recorded in the BMC (baseboard management controller) includes serial numbers for various FRUs on the system. For any given system, these serial numbers should be consistent among all FRUs. However, it is possible for these serial numbers to become inconsistent as the result of normal maintenance (such as FRU replacement). This check makes sure serial numbers are consistent for all FRUs recorded in the DCC BMC.

Module

dcs_modules

Source Code Listing

    def run(self):
        """Placeholder for the DCC FRU consistency check; performs no work yet."""
        # TODO:[Kenzen-499] Check if FRU in self.__fru_task is consistent and report health using ICheck interface
        # TODO:Similar approach needs to be used by DCC BMC
        return None

dcv_check_ipmi_sensor_thresholds

Brief

Drive Constellation: Check DCC BMC sensor thresholds

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Check DCC BMC sensor readings against their critical thresholds.

        Skipped when the 'dcs_dcv_sensor_threshold' platform parameter is
        missing or falsy. For every parsed sensor with a numeric current
        reading and a type other than 'discrete', the reading is compared
        with each of the lower/upper critical and non-recoverable thresholds
        that has a numeric value. A reading that is not strictly inside a
        threshold is reported and makes the check UnHealthy. If the sensor
        data cannot be evaluated the check is Unknown (and nothing else is
        reported); otherwise it is Healthy or UnHealthy.
        """
        try:
            dcs_dcv_sensor_threshold = self.__parameters_task.getResult()['dcs_dcv_sensor_threshold']
            if not dcs_dcv_sensor_threshold:
                return
        except Exception:
            return

        # Each predicate returns True when the observed value is within the threshold
        threshold_check_dispatch = {
            'lower_non_recoverable': lambda observed, threshold: threshold < observed,
            'lower_critical': lambda observed, threshold: threshold < observed,
            'upper_non_recoverable': lambda observed, threshold: observed < threshold,
            'upper_critical': lambda observed, threshold: observed < threshold
        }

        threshold_display = {
            'lower_non_recoverable': '{name}: Observed value "{observed}" ({units}) below non-recoverable lower threshold "{threshold}"',
            'lower_critical': '{name}: Observed value "{observed}" ({units}) below critical lower threshold "{threshold}"',
            'upper_non_recoverable': '{name}: Observed value "{observed}" ({units}) above non-recoverable upper threshold "{threshold}"',
            'upper_critical': '{name}: Observed value "{observed}" ({units}) above critical upper threshold "{threshold}"'
        }

        # Look for any sensor values that fall outside of critical thresholds
        healthy = True
        try:
            sensors = self.__dcv_parse_ipmi_sensor_task.getResult()
            for sensor in sensors:
                name = sensor['name']
                observed = sensor['current_reading']
                try:
                    observed = float(observed)
                except (TypeError, ValueError):
                    # Non-numeric reading: nothing to compare
                    continue
                units = sensor['type']
                if units.lower() == 'discrete':
                    continue
                for field in [
                        'lower_non_recoverable',
                        'lower_critical',
                        'upper_non_recoverable',
                        'upper_critical' ]:
                    threshold = sensor.get(field)
                    try:
                        threshold = float(threshold)
                    except (TypeError, ValueError):
                        # Threshold not defined for this sensor
                        continue
                    check = threshold_check_dispatch[field]
                    if check(observed, threshold):
                        continue  # Observed value is within threshold
                    healthy = False
                    display = threshold_display[field]
                    self.addCheckMessage(display.format(
                            name=name,
                            observed=observed,
                            units=units,
                            threshold=threshold))
            self.addCheckMessage('Checked {count} sensor values against DCC BMC thresholds.'.format(
                 count=len(sensors)))
        except Exception:
            # Stop here: previously execution fell through and a Healthy or
            # UnHealthy result was reported in addition to Unknown.
            self.addUnknown()
            return

        if healthy:
            self.addHealthy()
        else:
            self.addUnHealthy()

dcv_check_psu_bom

Brief

Drive Constellation: Verify chassis power supply presence on DCC

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Verify DCC chassis PSU presence and look for lost AC input.

        Skipped when the 'dcv_psu_bom' platform parameter cannot be read.
        The DCC SDR device BOM result is compared with the expected BOM; any
        reported difference makes the check UnHealthy. Without differences,
        parsed SDR entries whose name is an expected BOM key containing
        'status' are inspected, and a reading containing
        'power supply ac lost' also makes the check UnHealthy
        (NVBUG-200554527).
        """
        from nvsmhealth.lib import DictionarySuperset
        try:
            dcv_psu_bom = self.__parameters_task.getResult()['dcv_psu_bom']
        except:
            return

        observed_bom = self.__dcv_sdr_device_bom_task.getResult()
        comparator = DictionarySuperset.DictionarySuperset(
                    missing_message="Could not detect presence of DCC chassis power supply: {}")
        differences = comparator.compare(observed_bom, dcv_psu_bom)
        self.addCheckMessage("Checking output of 'ipmitool sdr elist' for expected chassis PSUs on DCC")
        if differences:
            self.addCheckMessage(differences)
            self.addUnHealthy()
            return

        # Even if PSU status is ok, readings might have power supply ac lost message
        sdr_entries = self.__dcv_parse_ipmi_sdr_elist_task.getResult()
        # Only BOM keys that name a status sensor are relevant for readings
        status_sensor_names = [key for key in dcv_psu_bom.keys() if 'status' in key.lower()]
        problems = 0
        for entry in sdr_entries:
            is_status_sensor = entry['name'] in status_sensor_names
            if is_status_sensor and 'power supply ac lost' in entry['reading'].lower():
                self.addCheckMessage("AC input is lost, {} has reading:\n{}".format(entry['name'], entry['reading']))
                problems += 1

        if problems == 0:
            self.addHealthy()
        else:
            self.addUnHealthy()

Miscellaneous Tasks

base_os_version

Brief

Get base OS version

Description

None

Module

system

Depends On

Source Code Listing

    def run(self):
        """Extract the base OS software version from the platform release file.

        The release file name comes from the 'sw_rel_file' platform parameter
        and is expanded through config.etc_rel_file. The result is a dict that
        holds 'sw_version' when a version line was found and is empty otherwise.
        Nothing is stored when the parameter is missing.
        """
        try:
            rel_file = self.__parameters_task.getResult()['sw_rel_file']
        except:
            return
        rel_file = config.etc_rel_file.format(rel_file)
        with open(rel_file, encoding="utf-8") as f:
            output = f.read()

        res = {}
        # Get the latest OTA version, always appended to release file:
        # every match overwrites the previous one, so the last entry wins.
        for m in re.finditer(r"_OTA_VERSION=(?P<ver>.*)", output):
            res['sw_version'] = m.group('ver').strip().strip('"')

        # Look for build version if OTA version not found. Patterns are tried
        # in priority order and only the first match of the first pattern that
        # hits is used. Raw strings avoid the invalid "\=" escape sequence the
        # previous non-raw literals contained.
        if not res:
            fallback_patterns = (r"_SWBUILD_VERSION=(?P<ver>.*)",
                                 r"VERSION=(?P<ver>.*)")
            for pattern in fallback_patterns:
                m = re.search(pattern, output)
                if m:
                    res['sw_version'] = m.group('ver').strip().strip('"')
                    break

        self.__result = res

check_application_config_health

Brief

Drive Constellation: Check DCC Application config health

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Pick the 'application_config_health' section out of the DCC health API output.

        Leaves the stored output untouched when the API task produced nothing;
        stores None when the section cannot be read from the output.
        """
        health_report = self.__run_dcc_health_api_task.getOutput()
        if health_report == None:
            return
        try:
            section = health_report['application_config_health']
        except:
            section = None
        self.__output = section

check_dcc_ecu_application_health

Brief

Drive Constellation: Check DCC ECU Application health

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Pick the 'ecu_application_health' section out of the DCC ECU health API output.

        Leaves the stored output untouched when the API task produced nothing;
        stores None when the section cannot be read from the output.
        """
        ecu_report = self.__run_dcc_ecu_health_api_task.getOutput()
        if ecu_report == None:
            return
        try:
            section = ecu_report['ecu_application_health']
        except:
            section = None
        self.__output = section

check_dcc_ecu_hardware_health

Brief

Drive Constellation: Check DCC ECU Hardware health

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Pick the 'ecu_hardware_health' section out of the DCC ECU health API output.

        Leaves the stored output untouched when the API task produced nothing;
        stores None when the section cannot be read from the output.
        """
        ecu_report = self.__run_dcc_ecu_health_api_task.getOutput()
        if ecu_report == None:
            return
        try:
            hardware_section = ecu_report['ecu_hardware_health']
        except:
            hardware_section = None
        self.__output = hardware_section

check_dcc_ecu_health

Brief

Drive Constellation: Check DCC ECU health

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Pick the 'ecu_health' section out of the DCC health API output.

        Leaves the stored output untouched when the API task produced nothing;
        stores None when the section cannot be read from the output.
        """
        health_report = self.__run_dcc_health_api_task.getOutput()
        if health_report == None:
            return
        try:
            ecu_section = health_report['ecu_health']
        except:
            ecu_section = None
        self.__output = ecu_section

check_dcc_hardware_health

Brief

Drive Constellation: Check DCC Hardware health

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Pick the 'dcc_hardware_health' section out of the DCC health API output.

        Leaves the stored output untouched when the API task produced nothing;
        stores None when the section cannot be read from the output.
        """
        health_report = self.__run_dcc_health_api_task.getOutput()
        if health_report == None:
            return
        try:
            hardware_section = health_report['dcc_hardware_health']
        except:
            hardware_section = None
        self.__output = hardware_section

check_gpu_monitor_active

Brief

Check if nvsm_core GPU URI is populated

Description

None

Module

vgpu

Source Code Listing

    def run(self):
        """Set the result to True when the probe output has a line containing both 'Code' and '200'.

        Nothing is stored when the probe command failed, printed nothing, or no
        such line exists.
        """
        probe = self.__run_task
        if probe.getReturnCode() or not probe.getOutput():
            return

        for entry in probe.getOutput().split('\n'):
            if 'Code' in entry and '200' in entry:
                self.__result = True

check_usb_device_presence

Brief

Check for the presence of USB devices

Description

None

Module

usb

Depends On

Source Code Listing

    def run(self):
        """Placeholder: the USB device presence check is not implemented yet and does nothing."""
        pass  # TODO

collect_mem_info

Brief

Verify memory information using the content of /proc/meminfo

Description

None

Module

meminfo

Source Code Listing

    def run(self):
        """Read MemTotal from /proc/meminfo and store it scaled by 9.537e-7.

        The result is left unset when no MemTotal line is found.
        NOTE(review): 9.537e-7 is approximately 1 / 2**20, so this presumably
        converts the kB figure to GiB - confirm the intended unit.
        """
        import os
        meminfo_path = os.path.join(os.sep, 'proc', 'meminfo')

        with open(meminfo_path) as f:
            meminfo_data = f.read()

        # Search line by line instead of matching only at the very start of
        # the file, so the lookup does not depend on MemTotal being line one.
        match = re.search(r'^MemTotal:\s+(?P<MemTotal>\d+)', meminfo_data,
                          flags=re.MULTILINE)
        if not match:
            return

        # The capture is digits only, so the float conversion cannot fail
        self.__result = float(match.group('MemTotal')) * 9.537e-7

collect_nvme_smart_log

Brief

Collect nvme smartlogs

Description

None

Module

nvme

Source Code Listing

    def run(self):
        """Collect 'nvme smart-log' output for every device reported by the nvme list task.

        Runs only when the 'nvme_check_smart_log' platform parameter is truthy
        and the nvme list task succeeded with output. The result maps each
        device path to the stripped smart-log text; devices whose command fails
        are left out.
        """
        try:
            check = self.__parameters_task.getResult()['nvme_check_smart_log']
            if not check:
                return
        except:
            return

        # Return when no output from nvme list
        if self.__nvme_list.getReturnCode() or not self.__nvme_list.getOutput():
            return
        import json
        nvme_devices = []
        try:
            # Decoding happens inside the try so that output which is not valid
            # JSON is logged and treated like a device list that cannot be read,
            # instead of raising out of the task.
            nvme_stream = json.loads(self.__nvme_list.getOutput())
            for device in nvme_stream['Devices']:
                nvme_devices.append(device['DevicePath'])
        except Exception as e:
            logging.debug("Error while parsing nvsm devices:{}".format(e))

        nvme_smart_log = {}
        for device in nvme_devices:
            # Pass the argument vector directly so a device path is never
            # re-split on whitespace.
            cmd_task = tasks.RunCommand(['nvme', 'smart-log', device]).depends(
                modules.common.check_superuser_privileges)
            cmd_task.run()
            if cmd_task.getReturnCode():
                # could not run the info command
                continue
            nvme_smart_log[device] = cmd_task.getOutput().strip()
        self.__result = nvme_smart_log

collect_psu_info

Brief

Collect PSU Information

Description

None

Module

psu

Source Code Listing

    def run(self):
        """Query PSU vendor and model information through 'ipmitool raw' commands.

        The raw command arguments come from the 'psu_vendor_info' and
        'psu_model_info' platform parameters, e.g.
        {'PSU0': '0x30 0x0e 0x00 0x02', ..} and {'PSU0': '0x30 0x0e 0x00 0x01', ..}.
        On success the result is the list [vendor_answers, model_answers] and
        the return code is set to 0. If running the commands raises, the result
        is None and the return code 1. A command that exits non-zero ends the
        task without touching result or return code.
        """
        if self.__ipmi_info_task.getReturnCode():
            # ipmitool isn't working so return
            return

        # Both command maps must be present in the platform parameters
        try:
            psu_vendor_info = self.__parameters_task.getResult()['psu_vendor_info']
            psu_model_info = self.__parameters_task.getResult()['psu_model_info']
        except:
            return None

        collected = []
        # Vendor commands are run first, then model commands
        for raw_cmds in (psu_vendor_info, psu_model_info):
            answers = {}
            try:
                for psu in raw_cmds.keys():
                    cmd = 'ipmitool raw {}'.format(raw_cmds[psu])
                    cmd_task = tasks.RunCommand(shell=True, args=[cmd])
                    cmd_task.run()
                    if cmd_task.getReturnCode() != 0:
                        return None
                    answers[psu] = self.get_data(cmd_task.getOutput())
            except:
                self.__result = None
                self.__retCode = 1
                return
            collected.append(answers)

        # Result is [vendor information, model information]
        self.__result = collected
        self.__retCode = 0

dcv_bmc_firmware_revision_info

Brief

Drive Constellation: DCC BMC Firmware Revision

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Report the DCC BMC firmware revision taken from the parsed 'mc info' result.

        Does nothing unless the 'dcs_dcv_bmc' platform parameter is present and
        truthy. Falls back to "UNKNOWN" when the parsed result has no
        'firmware_revision' entry.
        """
        try:
            if not self.__parameters_task.getResult()['dcs_dcv_bmc']:
                return
        except:
            return

        try:
            firmware_revision = self.__dcv_bmc_parse_ipmi_info_task.getResult()['firmware_revision']
        except:
            firmware_revision = "UNKNOWN"

        #TODO: Bug[200383435] - fix the defect after platform abstraction work
        self.send(msg=firmware_revision)
        self.__result = firmware_revision

dcv_bmc_parse_ipmi_info

Brief

Parse output from ipmitool mc info of DCC BMC

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Parse the DCC BMC 'ipmitool mc info' output into a dictionary.

        Does nothing unless the 'dcs_dcv_bmc' platform parameter is present and
        truthy and the mc info command succeeded. The result is empty when the
        output does not have the expected layout. The two multi-line sections
        are stored as lists of stripped lines, and the first auxiliary firmware
        byte is appended to 'firmware_revision'.
        """
        try:
            if not self.__parameters_task.getResult()['dcs_dcv_bmc']:
                return
        except:
            return

        if self.__ipmi_info_task.getReturnCode():
            # command failed
            return
        output = self.__ipmi_info_task.getOutput()

        # One fragment per line of the expected output
        ipmi_info_pattern = re.compile(
            r"^Device ID\s*:\s*(?P<device_id>.*)\n"
            r"Device Revision\s*:\s*(?P<device_revision>.*)\n"
            r"Firmware Revision\s*:\s*(?P<firmware_revision>.*)\n"
            r"IPMI Version\s*:\s*(?P<ipmi_version>.*)\n"
            r"Manufacturer ID\s*:\s*(?P<manufacturer_id>.*)\n"
            r"Manufacturer Name\s*:\s*(?P<manufacturer_name>.*)\n"
            r"Product ID\s*:\s*(?P<product_id>.*)\n"
            r"Product Name\s*:\s*(?P<product_name>.*)\n"
            r"Device Available\s*:\s*(?P<device_available>.*)\n"
            r"Provides Device SDRs\s*:\s*(?P<provides_device_sdrs>.*)\n"
            r"Additional Device Support\s*:\s*\n"
            r"(?P<additional_device_support>(.*\n)*)"
            r"Aux Firmware Rev Info\s*:\s*\n"
            r"(?P<aux_firmware_rev_info>(.*\n)*)")

        res = {}
        match = ipmi_info_pattern.match(output)
        if match:
            res = match.groupdict()
            aux_block = res['aux_firmware_rev_info']

            res['additional_device_support'] = [
                entry.strip()
                for entry in res['additional_device_support'].split('\n') if entry]

            # BUG-2547367: Use Byte 13 or aux[0] (Most significant byte of Auxiliary firmware revision information)
            # to update firmware revision number
            rev_match = re.match(
                r"^\s*0x(?P<hex1>.*)\n\s*0x(?P<hex2>.*)\n\s*0x(?P<hex3>.*)\n\s*0x(?P<hex4>.*)",
                aux_block)
            if rev_match:
                res['firmware_revision'] = '{}.{}'.format(
                    res['firmware_revision'], rev_match.group('hex1'))

            res['aux_firmware_rev_info'] = [
                entry.strip() for entry in aux_block.split('\n') if entry]

        self.__result = res

dcv_bmc_run_ipmi_info

Brief

Run ipmitool mc info command

Description

Check DCC BMC status with ipmitool. This command requires superuser privileges.

Module

dcs_modules

Source Code Listing

    def run(self):
        """Run 'ipmitool mc info' against the DCC BMC over lanplus.

        Runs only when the password generator task exited with 0; its stripped
        output is used as the BMC password. Stores the command's return code
        and output. Any exception is swallowed and leaves both unset.
        """
        import shlex
        try:
            if self.__dcc_passgen_task.getReturnCode() == 0:
                psswd = self.__dcc_passgen_task.getOutput().strip()
                bmc = platforms.cgx.dcv_bmc_config
                # The command goes through a shell, so every interpolated value
                # is quoted: a password containing characters such as $ & ; or
                # spaces would otherwise be reinterpreted by the shell.
                cmd = 'ipmitool -I lanplus -H {} -U {} -P {} mc info'.format(
                    shlex.quote(str(bmc['ip'])),
                    shlex.quote(str(bmc['user'])),
                    shlex.quote(psswd))
                cmd_task = tasks.RunCommand(shell=True, args=[cmd])
                cmd_task.run()
                self.__ret = cmd_task.getReturnCode()
                self.__result = cmd_task.getOutput()
        except:
            return

dcv_parse_ipmi_fru

Brief

Parse output from ipmitool fru print for DCC BMC

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Parse the DCC BMC 'ipmitool fru print' output into per-FRU field dictionaries.

        Does nothing unless the 'dcs_dcv_bmc' platform parameter is present and
        truthy and the fru print command succeeded. The result maps each FRU
        description name to a dict of its "name : value" lines, with field
        names lower-cased and spaces replaced by underscores.
        """
        try:
            if not self.__parameters_task.getResult()['dcs_dcv_bmc']:
                return
        except:
            return

        fru_task = self.__dcv_run_ipmi_fru_task
        if fru_task.getReturnCode():
            return
        output = fru_task.getOutput().strip()

        fru_pattern = re.compile(
                r'^FRU Device Description\s*:\s*(?P<fru_name>[^\(]*)\((?P<fru_id>[^\)\n]*)\)?\n(?P<fru_body>(?:[^\n]*\S+[^\n]*\n?)+)',
                flags=re.MULTILINE)

        # Attempt to parse FRU values; whatever was parsed before an error is kept
        try:
            result = {}
            for section in fru_pattern.finditer(output):
                fields = {}
                result[section.group('fru_name').strip()] = fields
                for raw_line in section.group('fru_body').split('\n'):
                    # Lines without a colon (including blank ones) carry no field
                    key, colon, value = raw_line.strip().partition(':')
                    if not colon:
                        continue
                    fields[key.strip().lower().replace(' ', '_')] = value.strip()
        except Exception as e:
            print("EXCEPTION:", e)

        self.__result = result

dcv_parse_ipmi_getenables

Brief

Parse output from ipmitool get enables for DCC BMC

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Parse the DCC BMC 'ipmitool mc getenables' output into a dictionary.

        Does nothing unless the 'dcs_dcv_bmc' platform parameter is present and
        truthy and the getenables command succeeded. The result holds one entry
        per recognised option, or is empty when the output does not have the
        expected layout.
        """
        try:
            if not self.__parameters_task.getResult()['dcs_dcv_bmc']:
                return
        except:
            return

        getenables_task = self.__dcv_run_ipmi_getenables_task
        if getenables_task.getReturnCode():
            return

        output = getenables_task.getOutput().strip()
        res = {}

        try:
            # One fragment per line of the expected output
            ipmi_getenables_pattern = re.compile(
                r"^Receive Message Queue Interrupt\s*:\s*(?P<receive_message_queue_interrupt>.*)\n"
                r"Event Message Buffer Full Interrupt\s*:\s*(?P<event_message_buffer_full_interrupt>.*)\n"
                r"Event Message Buffer\s*:\s*(?P<event_message_buffer>.*)\n"
                r"System Event Logging\s*:\s*(?P<system_event_logging>.*)\n"
                r"OEM 0\s*:\s*(?P<oem_0>.*)\n"
                r"OEM 1\s*:\s*(?P<oem_1>.*)\n"
                r"OEM 2\s*:\s*(?P<oem_2>.*)")
            parsed = ipmi_getenables_pattern.match(output)
            if parsed:
                res = parsed.groupdict()
        except Exception as e:
            print("EXCEPTION:", e)

        self.__result = res

dcv_parse_ipmi_sdr_elist

Brief

Parse output from ipmitool sdr elist for DCC BMC

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Parse the DCC BMC 'ipmitool sdr elist' table into a list of sensor dictionaries.

        Does nothing unless the 'dcs_dcv_bmc' platform parameter is present and
        truthy and the sdr elist command succeeded. Each row with five
        '|'-separated columns becomes a dict with the stripped values of
        'name', 'id', 'status', 'entity_id' and 'reading'.
        """
        try:
            if not self.__parameters_task.getResult()['dcs_dcv_bmc']:
                return
        except:
            return

        if self.__ipmi_sdr_elist_task.getReturnCode():
            return

        output = self.__ipmi_sdr_elist_task.getOutput().strip()

        sensor_pattern = re.compile(
                r'^(?P<name>[^|\n]+)\|'
                r'(?P<id>[^|\n]+)\|'
                r'(?P<status>[^|\n]+)\|'
                r'(?P<entity_id>[^|\n]+)\|'
                r'(?P<reading>[^|\n]+)',
                flags=re.MULTILINE)
        columns = ('name', 'id', 'status', 'entity_id', 'reading')

        try:
            sensors = []
            for row in sensor_pattern.finditer(output):
                sensors.append({column: row.group(column).strip() for column in columns})
        except:
            pass

        self.__result = sensors

dcv_parse_ipmi_sensor

Brief

Parse output from ipmitool sensor for DCC BMC

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Parse the DCC BMC 'ipmitool sensor' table into a list of sensor dictionaries.

        Does nothing unless the 'dcs_dcv_bmc' platform parameter is present and
        truthy and the sensor command succeeded. Each row with ten
        '|'-separated columns becomes a dict holding the stripped name,
        reading, type, status and the six threshold columns.
        """
        try:
            if not self.__parameters_task.getResult()['dcs_dcv_bmc']:
                return
        except:
            return

        if self.__ipmi_sensor_task.getReturnCode():
            # command failed
            return
        output = self.__ipmi_sensor_task.getOutput().strip()

        sensor_pattern = re.compile(
                r'^(?P<name>[^|\n]+)\|'
                r'(?P<current_reading>[^|\n]+)\|'
                r'(?P<type>[^|\n]+)\|'
                r'(?P<status>[^|\n]+)\|'
                r'(?P<lower_non_recoverable>[^|\n]+)\|'
                r'(?P<lower_critical>[^|\n]+)\|'
                r'(?P<lower_non_critical>[^|\n]+)\|'
                r'(?P<upper_non_critical>[^|\n]+)\|'
                r'(?P<upper_critical>[^|\n]+)\|'
                r'(?P<upper_non_recoverable>[^|\n]+)',
                flags=re.MULTILINE)
        # Key order of each emitted sensor dictionary
        keys = ('name', 'status', 'type', 'current_reading',
                'lower_non_recoverable', 'lower_critical', 'lower_non_critical',
                'upper_non_recoverable', 'upper_critical', 'upper_non_critical')

        # Attempt to parse the table of sensor values
        sensors = []
        try:
            for row in sensor_pattern.finditer(output):
                sensors.append({key: row.group(key).strip() for key in keys})
        except:
            pass

        self.__result = sensors

dcv_run_ipmi_fru

Brief

Run ipmitool fru print command

Description

Check DCC BMC status with ipmitool. This command requires superuser privileges.

Module

dcs_modules

Source Code Listing

    def run(self):
        """Run 'ipmitool fru print' against the DCC BMC over lanplus.

        Runs only when the password generator task exited with 0; its stripped
        output is used as the BMC password. Stores the command's return code
        and output. Any exception is swallowed and leaves both unset.
        """
        import shlex
        try:
            if self.__dcc_passgen_task.getReturnCode() == 0:
                psswd = self.__dcc_passgen_task.getOutput().strip()
                bmc = platforms.cgx.dcv_bmc_config
                # The command goes through a shell, so every interpolated value
                # is quoted: a password containing characters such as $ & ; or
                # spaces would otherwise be reinterpreted by the shell.
                cmd = 'ipmitool -I lanplus -H {} -U {} -P {} fru print'.format(
                    shlex.quote(str(bmc['ip'])),
                    shlex.quote(str(bmc['user'])),
                    shlex.quote(psswd))
                cmd_task = tasks.RunCommand(shell=True, args=[cmd])
                cmd_task.run()
                self.__ret = cmd_task.getReturnCode()
                self.__result = cmd_task.getOutput()
        except:
            return

dcv_run_ipmi_getenables

Brief

Run ipmitool mc getenables command

Description

Check DCC BMC status with ipmitool. This command requires superuser privileges.

Module

dcs_modules

Source Code Listing

    def run(self):
        """Run 'ipmitool mc getenables' against the DCC BMC over lanplus.

        Runs only when the password generator task exited with 0; its stripped
        output is used as the BMC password. Stores the command's return code
        and output. Any exception is swallowed and leaves both unset.
        """
        import shlex
        try:
            if self.__dcc_passgen_task.getReturnCode() == 0:
                psswd = self.__dcc_passgen_task.getOutput().strip()
                bmc = platforms.cgx.dcv_bmc_config
                # The command goes through a shell, so every interpolated value
                # is quoted: a password containing characters such as $ & ; or
                # spaces would otherwise be reinterpreted by the shell.
                cmd = 'ipmitool -I lanplus -H {} -U {} -P {} mc getenables'.format(
                    shlex.quote(str(bmc['ip'])),
                    shlex.quote(str(bmc['user'])),
                    shlex.quote(psswd))
                cmd_task = tasks.RunCommand(shell=True, args=[cmd])
                cmd_task.run()
                self.__ret = cmd_task.getReturnCode()
                self.__result = cmd_task.getOutput()
        except:
            return

dcv_run_ipmi_sdr_elist

Brief

Run ipmitool sdr elist command

Description

Check DCC BMC status with ipmitool. This command requires superuser privileges.

Module

dcs_modules

Source Code Listing

    def run(self):
        """Run 'ipmitool sdr elist' against the DCC BMC over lanplus.

        Runs only when the password generator task exited with 0; its stripped
        output is used as the BMC password. Stores the command's return code
        and output. Any exception is swallowed and leaves both unset.
        """
        import shlex
        try:
            if self.__dcc_passgen_task.getReturnCode() == 0:
                psswd = self.__dcc_passgen_task.getOutput().strip()
                bmc = platforms.cgx.dcv_bmc_config
                # The command goes through a shell, so every interpolated value
                # is quoted: a password containing characters such as $ & ; or
                # spaces would otherwise be reinterpreted by the shell.
                cmd = 'ipmitool -I lanplus -H {} -U {} -P {} sdr elist'.format(
                    shlex.quote(str(bmc['ip'])),
                    shlex.quote(str(bmc['user'])),
                    shlex.quote(psswd))
                cmd_task = tasks.RunCommand(shell=True, args=[cmd])
                cmd_task.run()
                self.__ret = cmd_task.getReturnCode()
                self.__result = cmd_task.getOutput()
        except:
            return

dcv_run_ipmi_sensor

Brief

Run ipmitool sensor command

Description

Check DCC BMC status with ipmitool. This command requires superuser privileges.

Module

dcs_modules

Source Code Listing

    def run(self):
        """Run 'ipmitool sensor' against the DCC BMC over lanplus.

        Runs only when the password generator task exited with 0; its stripped
        output is used as the BMC password. Stores the command's return code
        and output. Any exception is swallowed and leaves both unset.
        """
        import shlex
        try:
            if self.__dcc_passgen_task.getReturnCode() == 0:
                psswd = self.__dcc_passgen_task.getOutput().strip()
                bmc = platforms.cgx.dcv_bmc_config
                # The command goes through a shell, so every interpolated value
                # is quoted: a password containing characters such as $ & ; or
                # spaces would otherwise be reinterpreted by the shell.
                cmd = 'ipmitool -I lanplus -H {} -U {} -P {} sensor'.format(
                    shlex.quote(str(bmc['ip'])),
                    shlex.quote(str(bmc['user'])),
                    shlex.quote(psswd))
                cmd_task = tasks.RunCommand(shell=True, args=[cmd])
                cmd_task.run()
                self.__ret = cmd_task.getReturnCode()
                self.__result = cmd_task.getOutput()
        except:
            return

dcv_sdr_device_bom

Brief

Prepare Device Bom from ipmitool sdr elist

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Build a device BOM from the parsed 'sdr elist' sensors.

        Every sensor whose status is not "ns" / "n/s" (any letter case) is
        recorded as present (name -> True). Parsing errors are ignored and
        whatever was collected up to that point is kept.
        """
        try:
            bom = {}
            not_present = re.compile(r'^[nN]/?[sS]$')
            for sensor in self.__dcv_parse_ipmi_sdr_elist_task.getResult():
                sensor_name, sensor_status = sensor['name'], sensor['status']
                if not_present.match(sensor_status):
                    continue
                # Sensor status is not ns, so device appears to be present
                bom[sensor_name] = True
        except:
            pass

        self.__result = bom

determine_platform

Brief

Determine platform using IPlatform objects

Description

None

Module

platform

Used By

Source Code Listing

    def run(self):
        """Select the platform object whose matches() reports True.

        Raises Exception when matching platforms with different names are
        found, and PlatformNotFound when none matches.
        """
        # Walk every IPlatform object to find the correct variant
        selected = None
        for candidate in self.__platforms:
            if candidate.matches():
                if selected != None and selected.getName() != candidate.getName():
                    # IPlatform objects *must* be mutually exclusive
                    logging.error('ERROR: Multiple matching platform configs found')
                    raise Exception('Multiple matching platform configs found')
                selected = candidate

        if selected == None:
            raise PlatformNotFound

        logging.debug("Platform Selection Complete: {} {} ".format(selected.getName(),
            selected.getPrettyConfig()))
        self.__platform = selected

export_storcli_path

Brief

Export storcli path

Description

None

Module

storcli

Source Code Listing

    def run(self):
        """Make sure /opt/MegaRAID/storcli is on this process's PATH.

        The directory is appended only when it is not already listed, so
        running the task repeatedly does not keep growing PATH. When PATH is
        unset, os.defpath is used as the base instead of raising KeyError.
        """
        import os
        storcli_dir = '/opt/MegaRAID/storcli'
        current_path = os.environ.get("PATH", os.defpath)
        if storcli_dir not in current_path.split(os.pathsep):
            os.environ["PATH"] = current_path + os.pathsep + storcli_dir

firmware_revision_info

Brief

BMC Firmware Revision

Description

None

Module

ipmitool

Source Code Listing

    def run(self):
        """Report the BMC firmware revision produced by the firmware-revision parse task.

        Skipped when the 'skip_bmc_revision_check' platform parameter is
        truthy; a missing parameter means the check runs.
        """
        try:
            skip_check = self.__parameters_task.getResult()['skip_bmc_revision_check']
        except:
            skip_check = False
        if skip_check:
            return

        try:
            firmware_revision = self.__parse_firmware_revision_task.getResult()
        except:
            firmware_revision = "UNKNOWN"

        #TODO: Bug[200383435] - fix the defect after platform abstraction work
        self.send(msg=firmware_revision)
        self.__result = firmware_revision

get_dimm_vendors

Brief

Collect the DIMM manufacturer names reported by SMBIOS/DMI using the dmidecode tool

Description

None

Module

dmidecode

Depends On

Source Code Listing

    def run(self):
        """Collect the set of DIMM manufacturer names from the parsed dmidecode output.

        Only entries whose key contains 'Memory Device' and whose body has a
        'manufacturer' field are considered. Manufacturer values that are empty
        or that contain "no dimm" / "not specified" (any letter case) are
        treated as unpopulated slots and skipped. Nothing is stored when the
        dmidecode parse task produced no result; on an unexpected error the
        vendors collected so far are kept.
        """
        out = self.__parse_dmidecode_task.getResult()
        if not out:
            return

        self.__result = set()
        no_dimm_markers = ('no dimm', 'not specified')

        def checkIfNoDimm(vendor):
            # The marker must be contained in the vendor text. The previous
            # check tested the opposite direction, so a value such as
            # "No DIMM Installed" was reported as a real vendor.
            text = vendor.strip().lower()
            return not text or any(marker in text for marker in no_dimm_markers)

        try:
            for key, body in out.items():
                if 'Memory Device' not in key or 'manufacturer' not in body:
                    continue
                vendor = body['manufacturer']
                if not checkIfNoDimm(vendor):
                    self.__result.add(vendor)
        except:
            return

gpu_bdf_pstate

Brief

Get gpu pstate mapped to bdfs

Description

None

Module

nvidia_smi

Source Code Listing

    def run(self):
        """Map each configured GPU BDF to the power state reported by nvidia-smi.

        Does nothing when the 'gpu_link_info' platform parameter is missing or
        None, or when the nvidia-smi parse task has no result. GPUs whose XML
        cannot be matched to a configured BDF, or that carry no power state,
        are left out of the mapping.
        """
        try:
            gpus = self.__parameters_task.getResult()['gpu_link_info']
        except:
            return

        # If parameter is not found in platform parameters - Do Nothing
        if gpus == None:
            return

        nvidia_smi = self.__parse_nvidia_smi.getResult()
        if not nvidia_smi:
            return

        def configured_bdf(gpu_dict):
            # First configured BDF contained in this GPU's id; None if no entry fits
            for gpu in gpus:
                try:
                    if gpu['bdf'].lower() in gpu_dict['nvidia_smi_log']['gpu']['@id'].lower():
                        return gpu['bdf']
                except:
                    pass
            return None

        bdf_pstate = {}
        for smi_xml in nvidia_smi.values():
            gpu_dict = xmltodict.parse(smi_xml)
            bdf = configured_bdf(gpu_dict)
            if not bdf:
                continue
            try:
                bdf_pstate[bdf] = gpu_dict['nvidia_smi_log']['gpu']['power_readings']['power_state']
            except:
                pass
        self.__result = bdf_pstate

gpu_mig_status

Brief

GPUs MIG Status Task

Description

None

Module

nvidia_smi

Source Code Listing

    def run(self):
        """Record, per GPU, whether MIG mode is currently enabled (1) or not (0).

        Nothing is stored when the nvidia-smi parse task has no result or when
        any GPU's XML cannot be read.
        """
        # Without nvidia-smi results there is nothing to parse
        nvidia_smi_res = self.__parse_task.getResult()
        if not nvidia_smi_res:
            logging.debug("No result from nvidia-smi tool")
            return

        try:
            gpus_mig_enabled = {}
            for gpu, smi_xml in nvidia_smi_res.items():
                current_mode = xmltodict.parse(smi_xml)['nvidia_smi_log']['gpu']['mig_mode']['current_mig']
                gpus_mig_enabled[gpu] = int(current_mode == 'Enabled')
        except:
            return

        self.__result = gpus_mig_enabled

mdadm_parse_detail

Brief

Parse mdadm detail

Description

None

Module

mdadm

Depends On

Source Code Listing

    def run(self):
        """Parse 'mdadm --detail' style output per volume into field dictionaries.

        The result maps each key of the detail task's result to a dict of the
        "name : value" lines found below the "<volume>:" header line, with
        field names lower-cased and spaces replaced by underscores. Volumes
        whose output has no such header are left out. Nothing is stored when
        the detail task has no result.
        """
        volume_detail_out = self.__run_mdadm_detail.getResult()
        if not volume_detail_out:
            return

        # Splits the mdadm output into the volume name line and the body below it
        header_pattern = re.compile(
                r'^(?P<volume_name>[^:]*):\n(?P<volume_body>(?:.*\n?)*)',
                flags=re.MULTILINE)
        # One "name : value" line of the body
        field_pattern = re.compile(
                r'^(?P<name>[^:]*):(?P<value>.*)$',
                flags=re.MULTILINE)

        volume_info = {}
        for volume, text in volume_detail_out.items():
            parsed = header_pattern.match(text)
            if not parsed:
                continue  # Could not parse mdadm output for this volume
            volume_info[volume] = {
                field.group('name').strip().lower().replace(' ', '_'): field.group('value').strip()
                for field in field_pattern.finditer(parsed.group('volume_body'))
            }

        self.__result = volume_info

mdadm_parse_examine

Brief

Parse mdadm examine

Description

None

Module

mdadm

Depends On

Source Code Listing

    def run(self):
        """Parse 'mdadm --examine' style output per device into field dictionaries.

        The result maps each device name to a dict of the "name : value" lines
        found in its output, with field names lower-cased and spaces replaced
        by underscores. Nothing is stored when the examine task has no result.
        """
        disk_info_out = self.__run_mdadm_examine_task.getResult()
        if not disk_info_out:
            return

        # One "name : value" line of the mdadm output
        field_pattern = re.compile(
                r'^(?P<name>[^:]*):(?P<value>.*)$',
                flags=re.MULTILINE)

        disk_info = {}
        for dev_name, text in disk_info_out.items():
            disk_info[dev_name] = {
                field.group('name').strip().lower().replace(' ', '_'): field.group('value').strip()
                for field in field_pattern.finditer(text)
            }

        self.__result = disk_info

nvidia_driver_version

Brief

Checking nvidia driver version

Description

None

Module

nvidia_smi

Depends On

Source Code Listing

    def run(self):
        """Query the NVIDIA driver version with nvidia-smi and report it.

        Uses the first line of the command output. Nothing is reported or
        stored when nvidia-smi exits with a non-zero code.
        """
        driver_query = tasks.RunCommand(
            args=['nvidia-smi', '--query-gpu=driver_version', '--format=csv,noheader'])
        driver_query.run()
        # A failed nvidia-smi run leaves nothing to parse
        if driver_query.getReturnCode() != 0:
            return
        first_line, _, _ = driver_query.getOutput().partition('\n')
        self.title(config.nvidia_driver_version_info_str)
        self.send(msg=first_line)
        self.__result = first_line

nvidia_vbios_version

Brief

Get the nvidia vbios version for health summary

Description

None

Module

nvidia_smi

Depends On

Source Code Listing

    def run(self):
        """Query the VBIOS version of GPU index 0 with nvidia-smi.

        Stores the raw command output; nothing is stored when nvidia-smi exits
        with a non-zero code.
        """
        vbios_query = tasks.RunCommand(
            args=['nvidia-smi', '-i', '0', '--query-gpu=vbios_version', '--format=csv,noheader'])
        vbios_query.run()
        # Only a successful nvidia-smi run yields a usable version string
        if vbios_query.getReturnCode() == 0:
            self.__result = vbios_query.getOutput()

parameters

Brief

Initialise platform parameters after determining the platform

Description

None

Module

parameter

Used By

Source Code Listing

    def run(self):
        """Copy the detected platform's parameters into this task's parameter store."""
        platform_parameters = self.__platform_task.getPlatform().getParameters()
        # Parameters stored in this task are global for all ICheck tasks
        for name, setting in platform_parameters.items():
            self.__parameters[name] = setting

parse_cec_info

Brief

Parse output from CEC

Description

None

Module

cec_info

Depends On

Source Code Listing

    def run(self):
        """Decode the CEC version, firmware authentication state and BMC boot slot.

        Nothing is stored when the CEC version command failed. Otherwise the
        result holds 'cec_version', 'EC_FW_TAG0', 'EC_FW_TAG1',
        'bmc_fw_auth_state' and 'bmc_boot_slot'.

        All three command outputs are treated as whitespace-separated hex
        bytes, the same way the version output is split. The boot status and
        boot slot used to be indexed per character, so the 0x40 bit test could
        never succeed and the slot comparison against "00" was never true.
        NOTE(review): byte positions 1 and 3 of the boot status and byte 0 of
        the boot slot are taken from the original indices - confirm against
        the CEC command specification.
        """
        if self.__cec_version_task.getReturnCode():
            # command failed
            return
        res = {}

        version_bytes = self.__cec_version_task.getOutput().strip().split()
        res['cec_version'] = "Not available"
        if len(version_bytes) == 3:
            try:
                res['cec_version'] = "{}.{}".format(int(version_bytes[1], 16),
                                                    int(version_bytes[2], 16))
            except ValueError:
                # Non-hex output is treated like output of the wrong length
                pass

        not_authenticated, authenticated = 'Not Authenticated', 'Authenticated'
        tag0 = tag1 = bmc_auth = not_authenticated
        if not self.__cec_boot_status_task.getReturnCode():
            status_bytes = self.__cec_boot_status_task.getOutput().strip().split()
            try:
                fw_flags = int(status_bytes[3], 16)
                bmc_flags = int(status_bytes[1], 16)
            except (IndexError, ValueError):
                # Short or non-hex output: report the same as a failed command
                pass
            else:
                tag0 = authenticated if fw_flags & 0x01 else not_authenticated
                tag1 = authenticated if fw_flags & 0x02 else not_authenticated
                bmc_auth = authenticated if bmc_flags & 0x40 else not_authenticated
        res['EC_FW_TAG0'] = tag0
        res['EC_FW_TAG1'] = tag1
        res['bmc_fw_auth_state'] = bmc_auth

        res['bmc_boot_slot'] = "Not Available"
        if not self.__bmc_boot_slot_task.getReturnCode():
            slot_bytes = self.__bmc_boot_slot_task.getOutput().strip().split()
            if slot_bytes:
                res['bmc_boot_slot'] = 'Slot 0' if slot_bytes[0] == "00" else "Slot 1"
        self.__result = res

parse_dmidecode

Brief

Parse dmidecode tool output

Description

None

Module

dmidecode

Depends On

Source Code Listing

    def run(self):
        """Parse the dmidecode command output into nested Python containers.

        Nothing is stored when the dmidecode command task reports a non-zero
        return code. Otherwise the parsed value (a dictionary with a
        'header' entry plus one entry per DMI record, or None when the
        overall layout is not recognised) is stored as this task's result.

        NOTE(review): VariantRef is defined outside this listing. From its
        use here it appears to expose `attribute`, `array` and `dict`
        setters and a `value()` getter that yields None until one of them
        is assigned -- confirm against its definition.
        """
        if self.__dmi_command_task.getReturnCode():
            return

        def to_underscore_name(name):
            # Keep the text before the first ':', split it on single spaces,
            # lower-case every word and join the words with underscores.
            return '_'.join(map(lambda w: w.lower(), name.split(':')[0].split(' ')))

        # A sub-block is one line starting with a non-whitespace character,
        # followed by zero or more tab-indented body lines.
        sub_block_pattern = re.compile(r'^(?P<block_name>\S.*)\n(?P<block_body>(\t.*\n?)*)', flags=re.MULTILINE)
        # A "name: value" line; the value must end in a non-whitespace character.
        attribute_pattern = re.compile(r'^(?P<attribute_name>[^:]+):\s*(?P<attribute_value>.*\S)\s*$', flags=re.MULTILINE)
        def parse_block(block, blockValue):
            """Recursively parse `block` and report the outcome through `blockValue`.

            blockValue is left untouched when the block contains no
            sub-blocks, receives a list of raw strings when every sub-block
            is "empty" (no nested value and no attribute), and receives a
            dictionary otherwise.
            """
            dictionary = {}
            array = []
            # Iterate over the sub-blocks contained in this block
            match_count = 0
            empty_count = 0
            for m in re.finditer(sub_block_pattern, block):
                match_count += 1
                block_name = to_underscore_name(m.group('block_name'))
                # Remove leading tabs from the sub-block body
                block_body = re.sub(r'^\t', '', m.group('block_body'), flags=re.MULTILINE)
                sub_block_value = VariantRef()
                parse_block(block_body, sub_block_value)
                if sub_block_value.value() == None:
                    # Check for an attribute
                    m2 = re.search(attribute_pattern, m.group(0))
                    if m2:
                        sub_block_value.attribute = m2.group('attribute_value')
                    else:
                        # Neither nested content nor "name: value": keep the raw text
                        array.append(m.group(0).rstrip())
                        empty_count += 1
                # Check for this sub-block in our dictionary
                # (this also runs for "empty" sub-blocks, whose value() is stored as-is)
                if block_name not in dictionary:
                    dictionary[block_name] = sub_block_value.value()
                else:
                    if not isinstance(dictionary[block_name], list):
                        # Convert the existing element into an array
                        old = dictionary[block_name]
                        dictionary[block_name] = []
                        dictionary[block_name].append(old)
                    dictionary[block_name].append(sub_block_value.value())
            # Determine the return value for this block
            if match_count == 0:
                pass  # The blockValue is None
            else:
                if empty_count == match_count:
                    # All of the sub-blocks are empty; return an array of strings
                    blockValue.array = array
                else:
                    # At least some of the sub-blocks are not empty; return a dictionary
                    blockValue.dict = dictionary

        def parse_record(record, recordValue):
            # Placeholder: does nothing and is not called anywhere in this method.
            pass

        # Header layout: "# dmidecode <version>", later "SMBIOS <version> present.",
        # later "Table at <address>." -- arbitrary lines may sit in between.
        header_pattern = re.compile(r'^# dmidecode\s+(?P<dmidecode_version>.+)\n(.*\n)*SMBIOS\s+(?P<smbios_version>.+)\s+present.\n(.*\n)*Table at\s+(?P<dmi_table_address>.+)\.$', flags=re.MULTILINE)
        def parse_header(header, headerValue):
            """Extract the three version/address fields of the header into headerValue.dict.

            When the header does not match, an info message is sent and
            headerValue is left unassigned.
            """
            dictionary = {}
            # Grab each field from the header
            m = re.search(header_pattern, header)
            if not m:
                self.send(info="Could not parse dmidecode header")
                return
            for key in ['dmidecode_version',
                      'smbios_version',
                      'dmi_table_address']:
                dictionary[key] = m.group(key)
            headerValue.dict = dictionary

        # Whole output: a header of non-indented lines, one blank line, then the
        # body starting at the first "Handle" line.
        dmidecode_pattern = re.compile(r'^(?P<dmidecode_header># dmidecode.*\n(\S.*\n)*)\n(?P<dmidecode_body>Handle.*\n(.*\n)+)', flags=re.MULTILINE)
        # One record: "Handle 0xNNNN, DMI type T, B bytes", a name line, then
        # one or more tab-indented body lines.
        record_pattern = re.compile(r'^Handle (?P<record_handle>0x[0-9A-F]{4}), DMI type (?P<record_dmi_type>\d+), (?P<record_bytes>\d+) bytes?\n(?P<record_name>.*)\n(?P<record_body>(\t.*\n)+)', flags=re.MULTILINE)
        def parse_dmidecode(output, dmidecodeValue):
            """Parse the complete dmidecode text into dmidecodeValue.dict.

            The dictionary holds the parsed header under 'header' and each
            record under "<record name> (<key>)", where <key> is the record
            handle, or the 'locator' field for Memory Device records.
            """
            dictionary = {}
            # Get the header and body
            m1 = re.search(dmidecode_pattern, output)
            if not m1:
                self.send(info="Could not find dmidecode header")
                return
            # Parse the header
            header_value = VariantRef()
            parse_header(m1.group('dmidecode_header'), header_value)
            dictionary['header'] = header_value.value()
            # Iterate over each record in the body
            for m2 in re.finditer(record_pattern, m1.group('dmidecode_body')):
                # Remove leading tabs from the record body
                record_body = re.sub(r'^\t', '', m2.group('record_body'), flags=re.MULTILINE)
                # Parse the record body
                record_value = VariantRef()
                parse_block(record_body, record_value)
                # Add the record value to the output dictionary
                if m2.group('record_name') == 'Memory Device':
                    # NOTE(review): .get() assumes the parsed record is a dict; a
                    # record parsed as a list or None would raise here -- confirm.
                    record_name = '{0} ({1})'.format(
                            m2.group('record_name'),
                            record_value.value().get('locator', m2.group('record_handle')))
                else:
                    record_name = '{0} ({1})'.format(m2.group('record_name'), m2.group('record_handle'))
                # Records that resolve to the same name overwrite earlier ones
                dictionary[record_name] = record_value.value()
            dmidecodeValue.dict = dictionary

        # Parse the dmidecode output into a Python dictionary
        dmidecode_value = VariantRef()
        parse_dmidecode(self.__dmi_command_task.getOutput().strip(), dmidecode_value)

        self.__result = dmidecode_value.value()

parse_firmware_revision

Brief

Parse BMC Firmware Revision

Description

None

Module

ipmitool

Source Code Listing

    def run(self):
        """Derive the BMC firmware revision string and store it as the result.

        The revision is read from the parsed ``mc info`` result. When the
        platform parameter ``aux_firmware_version_type`` is ``'hex'``, the
        third dot-separated component is converted from hex to decimal, but
        only if the decoded value is greater than 9 (smaller values read the
        same in both bases). Any failure falls back to the best value known
        so far; ``"UNKNOWN"`` is stored when no revision is available at all.
        """
        version = "UNKNOWN"
        try:
            version = self.__parse_ipmi_info_task.getResult()['firmware_revision']
            is_type = self.__parameters_task.getResult()['aux_firmware_version_type']
            if is_type == 'hex':
                # Best effort - try to convert hex to decimal
                try:
                    ver = version.split(".")
                    dec = int(ver[2], 16)
                    # Only format the version when greater than '09'
                    if dec > 9:
                        ver[2] = "{}".format(dec)
                        version = ".".join(ver)
                except Exception as ex:
                    logging.debug('Error while converting hex to decimal: {}'.format(ex))
        except Exception as ex:
            # Skip conversion. Only ordinary errors are absorbed here, so that
            # KeyboardInterrupt/SystemExit still propagate to the caller.
            logging.debug('Skipping firmware revision conversion: {}'.format(ex))

        self.__result =  version

parse_fru

Brief

Parse output from ipmitool fru print

Description

None

Module

fru

Depends On

Source Code Listing

    def run(self):
        """Turn the FRU listing text into a dictionary keyed by FRU name.

        Every "FRU Device Description : <name> (<id>)" section becomes one
        entry whose value maps the normalised field names (lower-case,
        spaces replaced by underscores) to their stripped values. Lines
        without a ':' separator are ignored. Nothing is stored when the FRU
        command reported a non-zero return code.
        """
        fru_task = self.__run_ipmi_fru_task
        if fru_task.getReturnCode():
            return

        listing = fru_task.getOutput().strip()
        section_re = re.compile(
                r'^FRU Device Description\s*:\s*(?P<fru_name>[^\(]*)\((?P<fru_id>[^\)\n]*)\)?\n(?P<fru_body>(?:[^\n]*\S+[^\n]*\n?)+)',
                flags=re.MULTILINE)

        parsed = {}
        # Attempt to parse FRU values
        try:
            for section in section_re.finditer(listing):
                entries = {}
                parsed[section.group('fru_name').strip()] = entries
                for raw_line in section.group('fru_body').split('\n'):
                    label, separator, content = raw_line.strip().partition(':')
                    if not separator:
                        # Blank line or a line without a "name: value" shape
                        continue
                    entries[label.strip().lower().replace(' ', '_')] = content.strip()
        except Exception as e:
            print("EXCEPTION:", e)

        self.__result = parsed

parse_ipmi_fru

Brief

Parse output from ipmitool fru print

Description

None

Module

ipmitool

Depends On

Source Code Listing

    def run(self):
        """Build a per-FRU dictionary from the FRU print output.

        Sections start at a "FRU Device Description : <name> (<id>)" line;
        each following "field : value" line is stored under the section's
        name with the field lower-cased and its spaces turned into
        underscores. Lines lacking a ':' are skipped. No result is stored
        when the FRU command returned a non-zero code.
        """
        source_task = self.__run_ipmi_fru_task
        if source_task.getReturnCode():
            return

        fru_text = source_task.getOutput().strip()
        fru_section = re.compile(
                r'^FRU Device Description\s*:\s*(?P<fru_name>[^\(]*)\((?P<fru_id>[^\)\n]*)\)?\n(?P<fru_body>(?:[^\n]*\S+[^\n]*\n?)+)',
                flags=re.MULTILINE)

        frus = {}
        # Attempt to parse FRU values
        try:
            for found in fru_section.finditer(fru_text):
                fields = {}
                frus[found.group('fru_name').strip()] = fields
                body_lines = [entry.strip() for entry in found.group('fru_body').split('\n')]
                for entry in body_lines:
                    pieces = entry.split(':', 1)
                    if len(pieces) == 2:
                        key = pieces[0].strip().lower().replace(' ', '_')
                        fields[key] = pieces[1].strip()
        except Exception as e:
            print("EXCEPTION:", e)

        self.__result = frus

parse_ipmi_getenables

Brief

Parse output from ipmitool get enables

Description

None

Module

ipmitool

Source Code Listing

    def run(self):
        """Parse the seven "get enables" lines into a dictionary.

        The stripped output must start with the seven expected lines in
        their fixed order; the stored dictionary then maps each setting
        (for example ``system_event_logging``) to the text after the colon.
        An empty dictionary is stored when the output does not match, and
        nothing is stored when the command returned a non-zero code.
        """
        enables_task = self.__run_ipmi_getenables_task
        if enables_task.getReturnCode():
            return

        text = enables_task.getOutput().strip()
        enables = {}
        try:
            layout = re.compile(
                r"^Receive Message Queue Interrupt\s*:\s*(?P<receive_message_queue_interrupt>.*)\n"
                r"Event Message Buffer Full Interrupt\s*:\s*(?P<event_message_buffer_full_interrupt>.*)\n"
                r"Event Message Buffer\s*:\s*(?P<event_message_buffer>.*)\n"
                r"System Event Logging\s*:\s*(?P<system_event_logging>.*)\n"
                r"OEM 0\s*:\s*(?P<oem_0>.*)\n"
                r"OEM 1\s*:\s*(?P<oem_1>.*)\n"
                r"OEM 2\s*:\s*(?P<oem_2>.*)")
            found = layout.match(text)
            if found is not None:
                enables = found.groupdict()
        except Exception as e:
            print("EXCEPTION:", e)

        self.__result = enables

parse_ipmi_info

Brief

Parse output from ipmitool mc info

Description

None

Depends On

Source Code Listing

    def run(self):
        """Parse the "mc info" output into a dictionary of its fields.

        The output must begin with the fixed sequence of fields from
        "Device ID" through "Aux Firmware Rev Info". The two multi-line
        fields are stored as lists of stripped, non-empty lines. For the
        manufacturer ids listed in the project configuration, the first aux
        firmware byte is appended to the firmware revision (BUG-2547367).
        An empty dictionary is stored when the output does not match, and
        nothing is stored when the command failed.
        """
        if self.__ipmi_info_task.getReturnCode():
            # command failed
            return
        raw_info = self.__ipmi_info_task.getOutput()

        info_layout = re.compile(
            r"^Device ID\s*:\s*(?P<device_id>.*)\n"
            r"Device Revision\s*:\s*(?P<device_revision>.*)\n"
            r"Firmware Revision\s*:\s*(?P<firmware_revision>.*)\n"
            r"IPMI Version\s*:\s*(?P<ipmi_version>.*)\n"
            r"Manufacturer ID\s*:\s*(?P<manufacturer_id>.*)\n"
            r"Manufacturer Name\s*:\s*(?P<manufacturer_name>.*)\n"
            r"Product ID\s*:\s*(?P<product_id>.*)\n"
            r"Product Name\s*:\s*(?P<product_name>.*)\n"
            r"Device Available\s*:\s*(?P<device_available>.*)\n"
            r"Provides Device SDRs\s*:\s*(?P<provides_device_sdrs>.*)\n"
            r"Additional Device Support\s*:\s*\n(?P<additional_device_support>(.*\n)*)"
            r"Aux Firmware Rev Info\s*:\s*\n(?P<aux_firmware_rev_info>(.*\n)*)")

        info = {}
        found = info_layout.match(raw_info)
        if found:
            info = found.groupdict()

            info['additional_device_support'] = [
                entry.strip()
                for entry in info['additional_device_support'].split('\n')
                if entry]

            # BUG-2547367: Use Byte 13 or aux[0] (Most significant byte of Auxiliary firmware revision information)
            # to update firmware revision number
            if info['manufacturer_id'] in [config.quanta_manufacturer_id, config.nvidia_manufacturer_id]:
                aux_layout = re.compile(
                    r"^\s*0x(?P<hex1>.*)\n\s*0x(?P<hex2>.*)\n\s*0x(?P<hex3>.*)\n\s*0x(?P<hex4>.*)")
                aux_found = aux_layout.match(info['aux_firmware_rev_info'])
                if aux_found:
                    info['firmware_revision'] = '{}.{}'.format(
                        info['firmware_revision'], aux_found.group('hex1'))

            # Split only after the raw text has been used for the revision above
            info['aux_firmware_rev_info'] = [
                entry.strip()
                for entry in info['aux_firmware_rev_info'].split('\n')
                if entry]

        self.__result = info

parse_ipmi_sdr_elist

Brief

Parse output from ipmitool sdr elist

Description

None

Module

ipmitool

Source Code Listing

    def run(self):
        """Parse the "sdr elist" table into a list of sensor dictionaries.

        Each output line with at least five '|'-separated, non-empty columns
        becomes a dictionary with the keys ``name``, ``id``, ``status``,
        ``entity_id`` and ``reading`` (all stripped). Lines with fewer
        columns are ignored. Nothing is stored when the command returned a
        non-zero code or when parsing fails unexpectedly.
        """
        import logging

        if self.__ipmi_sdr_elist_task.getReturnCode():
            return

        output = self.__ipmi_sdr_elist_task.getOutput().strip()

        sensor_pattern = re.compile(
                r'^(?P<name>[^|\n]+)\|'
                r'(?P<id>[^|\n]+)\|'
                r'(?P<status>[^|\n]+)\|'
                r'(?P<entity_id>[^|\n]+)\|'
                r'(?P<reading>[^|\n]+)',
                flags=re.MULTILINE)
        columns = ('name', 'id', 'status', 'entity_id', 'reading')

        try:
            sensors = [
                {column: m.group(column).strip() for column in columns}
                for m in sensor_pattern.finditer(output)
            ]
            self.__result = sensors
        except Exception as ex:
            # Stay best-effort, but leave a trace instead of failing silently
            logging.debug('Failed to parse ipmitool sdr elist output: {}'.format(ex))

parse_ipmi_sensor

Brief

Parse output from ipmitool sensor

Description

None

Module

ipmitool

Depends On

Source Code Listing

    def run(self):
        """Parse the "ipmitool sensor" table into a list of sensor dictionaries.

        Each output line with at least ten '|'-separated, non-empty columns
        becomes a dictionary holding the sensor name, reading, type, status
        and the six threshold columns (all stripped). Lines with fewer
        columns are ignored. Nothing is stored when the command failed; if
        parsing fails unexpectedly, the sensors collected so far are stored.
        """
        import logging

        if self.__ipmi_sensor_task.getReturnCode():
            # command failed
            return
        output = self.__ipmi_sensor_task.getOutput().strip()

        sensor_pattern = re.compile(
                r'^(?P<name>[^|\n]+)\|'
                r'(?P<current_reading>[^|\n]+)\|'
                r'(?P<type>[^|\n]+)\|'
                r'(?P<status>[^|\n]+)\|'
                r'(?P<lower_non_recoverable>[^|\n]+)\|'
                r'(?P<lower_critical>[^|\n]+)\|'
                r'(?P<lower_non_critical>[^|\n]+)\|'
                r'(?P<upper_non_critical>[^|\n]+)\|'
                r'(?P<upper_critical>[^|\n]+)\|'
                r'(?P<upper_non_recoverable>[^|\n]+)',
                flags=re.MULTILINE)
        # Key order of the produced dictionaries (kept stable for serialisation)
        columns = ('name', 'status', 'type', 'current_reading',
                   'lower_non_recoverable', 'lower_critical', 'lower_non_critical',
                   'upper_non_recoverable', 'upper_critical', 'upper_non_critical')

        # Attempt to parse the table of sensor values
        sensors = []
        try:
            for m in sensor_pattern.finditer(output):
                sensors.append({column: m.group(column).strip() for column in columns})
        except Exception as ex:
            # Stay best-effort, but leave a trace instead of failing silently
            logging.debug('Failed to parse ipmitool sensor output: {}'.format(ex))

        self.__result = sensors

parse_lsblk_scsi_device_info

Brief

Parse lsblk scsi block devices information gathered from lsblk tool

Description

None

Module

lsblk

Source Code Listing

    def run(self):
        """Parse ``lsblk -S -P`` key="value" lines into a list of device dictionaries.

        Every line that carries NAME, HCTL, TYPE, VENDOR, MODEL and REV (in
        that order) becomes a dictionary with the lower-case keys ``name``,
        ``hctl``, ``type``, ``vendor``, ``model``, ``rev`` and ``tran``;
        values are stripped and ``tran`` is "" when the line has no TRAN
        pair. Devices whose model contains "vdisk" (case-insensitive) are
        skipped. Nothing is stored when the command failed or printed
        nothing.
        """
        # Parse the result as a JSON object from parent task
        if self.__lsblk_task.getReturnCode():
            return

        output = self.__lsblk_task.getOutput()
        if not output:
            return

        try:
            # Filter physical scsi devices
            devices = []
            # The trailing TRAN pair is optional. It has to be reached with a
            # lazy '.*?' and must include its closing quote; a greedy '.*'
            # in front of an optional group always wins and leaves TRAN empty.
            info_regex = re.compile(
                r'^(NAME=\"(?P<name>[^\"]*)).*(HCTL=\"(?P<hctl>[^\"]*)).*(TYPE=\"(?P<type>[^\"]*))'
                r'.*(VENDOR=\"(?P<vendor>[^\"]*)).*(MODEL=\"(?P<model>[^\"]*)).*(REV=\"(?P<rev>[^\"]*))'
                r'.*?(?:TRAN=\"(?P<tran>[^\"]*)\")?\s*$')
            # Run regex pattern on each line
            for device in output.splitlines():
                m = info_regex.match(device)
                if not m:
                    continue
                if 'vdisk' in m.group('model').strip().lower():
                    # Virtual disks are not physical SCSI devices
                    continue
                device_data = { k:v.strip() if v is not None else "" for k, v in m.groupdict().items()}
                devices.append(device_data)
            self.__result = devices

        except Exception as e:
            logging.error("ERROR: Failed to parse 'lsblk -S -P -o NAME,HCTL,TYPE,VENDOR,MODEL,REV,TRAN' information: {}".format(str(e)))

parse_lscpu

Brief

Parse output from lscpu

Description

None

Module

lscpu

Depends On

Source Code Listing

    def run(self):
        """Extract thread, NUMA, CPU, core and flag information from lscpu output.

        Stores a dictionary with any of the keys ``hyperthread``, ``NUMA``,
        ``CPU``, ``core_per_socket`` and ``cpu_flags`` that could be found.
        Only the first occurrence of each field is used. Values that are
        plain integers are stored as ``int``; everything else (the flags
        list) is stored as stripped text.
        """
        res = {}
        output = self.__lscpu_cmd_task.getOutput().strip()

        # Raw strings: '\(' and '\s' are regex escapes, not string escapes
        keys = [('hyperthread', r"Thread\(s\) per core:\s+(?P<{}>\d+)"),
                ('NUMA', r"NUMA node\(s\):\s+(?P<{}>\d+)"),
                ('CPU', r"CPU\(s\):\s+(?P<{}>\d+)"),
                ('core_per_socket', r"Core\(s\) per socket:\s+(?P<{}>\d+)"),
                ('cpu_flags', r"Flags:\s*(?P<{}>.*)")]

        for key, pattern in keys:
            # consider first match only
            m = re.search(pattern.format(key), output)
            if m is None:
                continue
            value = m.group(key).strip()
            try:
                # Command output is never evaluated as code; only literal
                # integers are converted.
                res[key] = int(value)
            except ValueError:
                res[key] = value
        self.__result = res

parse_lspci

Brief

Parse output from the lspci command

Description

None

Module

lspci

Depends On

Source Code Listing

    def run(self):
        """Parse blank-line separated lspci records into a dictionary keyed by slot.

        The first line of every record supplies the slot. Each further
        "field: value" line is stored under the lower-cased field name; when
        the value ends in a four-hex-digit "[id]", the text and the id are
        additionally stored as "<field>_string" and "<field>_id".

        Returns the parsed dictionary and stores its JSON encoding as the
        task result. Raises RuntimeError for any line that is not of the
        "field: value" form.
        """
        with_id = re.compile(r'^\s*(?P<field>[^:]*):\s*(?P<value>(?P<string>.*)\s*\[(?P<id>[0-9a-fA-F]{4})\])\s*$')
        plain = re.compile(r'^\s*(?P<field>[^:]*):\s*(?P<value>.*)\s*$')

        parsed = {}
        for record in self.__lspci_task.getOutput().split('\n\n'):
            if not record:
                continue

            lines = record.split('\n')

            # The first field is guaranteed to be the slot
            slot_line = plain.match(lines[0])
            if not slot_line:
                raise RuntimeError("Unable to parse the output of lspci")
            entry = {}
            parsed[slot_line.group('value').strip()] = entry

            for line in lines[1:]:
                # First look for the ID
                hit = with_id.match(line)
                if hit:
                    key = hit.group('field').lower()
                    entry[key] = hit.group('value').strip()
                    entry['{0}_string'.format(key)] = hit.group('string').strip()
                    entry['{0}_id'.format(key)] = hit.group('id')
                else:
                    # If the ID cannot be found, use the entire field value
                    hit = plain.match(line)
                    if not hit:
                        raise RuntimeError("Unable to parse the output of lspci")
                    entry[hit.group('field').lower()] = hit.group('value').strip()

        # Add the parsed result as JSON object
        self.__result = json.dumps(parsed, ensure_ascii=True)
        return parsed

parse_lspci_n

Brief

Parse output from the lspci command

Description

None

Module

lspci

Depends On

Source Code Listing

    def run(self):
        """Convert lspci records (separated by blank lines) into a slot-keyed dictionary.

        A record's first line names the slot; the remaining "field: value"
        lines are stored under lower-cased field names. Values that end in
        a four-hex-digit "[id]" also yield "<field>_string" and
        "<field>_id" entries.

        The dictionary is returned, and its JSON form is stored as the task
        result. A line that does not look like "field: value" raises
        RuntimeError.
        """
        id_field_re = re.compile(r'^\s*(?P<field>[^:]*):\s*(?P<value>(?P<string>.*)\s*\[(?P<id>[0-9a-fA-F]{4})\])\s*$')
        any_field_re = re.compile(r'^\s*(?P<field>[^:]*):\s*(?P<value>.*)\s*$')

        def fail():
            raise RuntimeError("Unable to parse the output of lspci")

        by_slot = {}
        blocks = [b for b in self.__lspci_task.getOutput().split('\n\n') if b != '']
        for block in blocks:
            rows = block.split('\n')

            # The first field is guaranteed to be the slot
            head = any_field_re.match(rows.pop(0))
            if not head:
                fail()
            slot_fields = by_slot[head.group('value').strip()] = {}

            for row in rows:
                # First look for the ID
                parsed_row = id_field_re.match(row)
                if parsed_row:
                    label = parsed_row.group('field').lower()
                    slot_fields[label] = parsed_row.group('value').strip()
                    slot_fields['{0}_string'.format(label)] = parsed_row.group('string').strip()
                    slot_fields['{0}_id'.format(label)] = parsed_row.group('id')
                    continue
                # If the ID cannot be found, use the entire field value
                parsed_row = any_field_re.match(row)
                if not parsed_row:
                    fail()
                slot_fields[parsed_row.group('field').lower()] = parsed_row.group('value').strip()

        # Add the parsed result as JSON object
        self.__result = json.dumps(by_slot, ensure_ascii=True)
        return by_slot

parse_lspci_verbose

Brief

Parse verbose output from the lspci command

Description

None

Module

lspci

Depends On

Source Code Listing

    def run(self):
        """Parse the verbose lspci output and store the parsed value.

        Parsing is delegated to ``self.parse_lspci``, which fills the
        VariantRef handed to it. Nothing is stored when the lspci command
        returned a non-zero code.
        """
        # Only a successful lspci run is worth parsing
        if self.__lspci_task.getReturnCode() == 0:
            verbose_text = self.__lspci_task.getOutput()
            # Parse the lspci output
            parsed = VariantRef()
            self.parse_lspci(verbose_text, parsed)
            self.__result = parsed.value()

parse_memory_dimms

Brief

Parse memory DMI type information gathered from dmidecode tool

Description

None

Module

dmidecode

Source Code Listing

    def run(self):
        """Collect the "Size:" value of every "Memory Device" section.

        The sizes are stored, in order of appearance, as a JSON-encoded list
        of stripped strings. Nothing is stored when the source task produced
        no output.
        """
        size_re = re.compile(r'Memory\s*Device(.*\n)*?\s*Size:\s*(?P<size>.*)', flags=re.MULTILINE)
        dmi_text = self.__dimm_task.getOutput()
        if not dmi_text:
            return
        sizes = [found.group('size').strip() for found in size_re.finditer(dmi_text)]

        # Add the parsed result as JSON object
        self.__result = json.dumps(sizes, ensure_ascii=True)

parse_net_ifconfig

Brief

Parse network interfaces

Description

None

Module

net

Depends On

Source Code Listing

    def run(self):
        """Hand the ifconfig output to ``getInterfaceNames`` when the command succeeded."""
        ifconfig_task = self.__run_net_ifconfig_task
        # A non-zero return code means there is nothing to parse
        if ifconfig_task.getReturnCode() == 0:
            self.getInterfaceNames(ifconfig_task.getOutput())

parse_nvidia_smi

Brief

Parse nvidia-smi tool output for GPUs consistency checks

Description

None

Module

nvidia_smi

Source Code Listing

    def run(self):
        """Collect the XML query output of every GPU listed by nvidia-smi.

        One GPU is assumed per line of the stripped ``--list-gpus`` output.
        The stored dictionary maps the GPU index to the XML text of its
        query; GPUs whose query fails are left out. Nothing is stored when
        listing the GPUs fails.
        """
        gpu_listing = tasks.RunCommand(args=['nvidia-smi', '--list-gpus'])
        gpu_listing.run()
        # If nvidia-smi run got a failure, skip parsing the output
        if gpu_listing.getReturnCode() != 0:
            return

        listed_gpus = gpu_listing.getOutput().strip().split('\n')
        xml_by_gpu = {}
        for index, _ in enumerate(listed_gpus):
            gpu_query = tasks.RunCommand(
                args=['nvidia-smi', '-i', '{}'.format(index), '--query', '--xml-format'])
            gpu_query.run()
            if gpu_query.getReturnCode() != 0:
                continue
            xml_by_gpu[index] = gpu_query.getOutput()

        self.__result = xml_by_gpu

parse_nvidia_smi_gpu_bus_id

Brief

Parse GPUs identified with the NVIDIA System Management Interface (nvidia-smi) tool

Description

None

Module

nvidia_smi

Source Code Listing

    def run(self):
        """Convert nvidia-smi bus ids into "bus:device.function" strings.

        Each output line of the form ``domain:bus:device.function`` yields
        one entry with the bus lower-cased and the domain dropped; lines
        that do not have this shape are ignored. Nothing is stored when the
        nvidia-smi task failed, printed nothing, or parsing fails
        unexpectedly.
        """
        import logging

        # If nvidia-smi run got a failure, skip parsing the output
        if self.__nvidia_smi_task.getReturnCode():
            return
        res = self.__nvidia_smi_task.getOutput()
        if not res:
            return

        try:
            # Raw string: '\.' is a regex escape, not a string escape
            pattern = re.compile(
                r"^(?P<domain>.*):(?P<bus>.*):(?P<device>.*)\.(?P<function>.*)")
            bdfs = []
            for bdf in res.strip().split('\n'):
                match = pattern.search(bdf)
                if match:
                    bdfs.append('{}:{}.{}'.format(
                        match.group('bus').lower(),
                        match.group('device'),
                        match.group('function')))

            self.__result = bdfs

        except Exception as ex:
            # Stay best-effort, but leave a trace instead of failing silently
            logging.debug('Failed to parse nvidia-smi bus ids: {}'.format(ex))

parse_nvidia_smi_p2p_topology

Brief

Parse GPUs p2p topology information gathered from nvidia-smi tool

Description

None

Module

nvidia_smi

Source Code Listing

    def run(self):
        """Store the nvidia-smi p2p topology matrix as a JSON string.

        The matrix text is parsed by ``genericParseNvidiaSmi`` with its
        second argument set to True. Nothing is stored when the nvidia-smi
        task returned a non-zero code.
        """
        smi_task = self.__nvidia_smi_task
        # If nvidia-smi run got a failure, skip parsing the output
        if smi_task.getReturnCode() == 0:
            p2p_matrix = genericParseNvidiaSmi(smi_task.getOutput(), True)
            # Add the parsed result as JSON object
            self.__result = json.dumps(p2p_matrix, ensure_ascii=True)

parse_nvidia_smi_topology

Brief

Parse GPUDirect topology information gathered from nvidia-smi tool

Description

None

Module

nvidia_smi

Source Code Listing

    def run(self):
        """Store the nvidia-smi GPUDirect topology matrix as a JSON string.

        The matrix text is parsed by ``genericParseNvidiaSmi`` using its
        default options. Nothing is stored when the nvidia-smi task
        returned a non-zero code.
        """
        smi_task = self.__nvidia_smi_task
        # If nvidia-smi run got a failure, skip parsing the output
        if smi_task.getReturnCode() == 0:
            topology_matrix = genericParseNvidiaSmi(smi_task.getOutput())
            # Add the parsed result as JSON object
            self.__result = json.dumps(topology_matrix, ensure_ascii=True)

parse_nvme_devices

Brief

Parse nvme list information gathered from nvme-cli tool

Description

None

Module

nvme

Depends On

Source Code Listing

    def run(self):
        """Count NVMe devices per quantized physical size.

        The ``nvme list`` JSON output is expected to hold a ``Devices``
        list; each device's ``PhysicalSize`` (0 when absent) is passed to
        ``self.quantize_bytes`` and the devices are counted per returned
        label. The counts are stored as a JSON string. Output that cannot
        be parsed yields the counts gathered so far (``{}`` for invalid
        JSON). Nothing is stored when the command failed or printed
        nothing.
        """
        import json
        import logging
        # Return if nvme run throws error or no results
        if self.__nvme_list.getReturnCode() or not self.__nvme_list.getOutput():
            return

        count = {}
        try:
            # Parse the result as a JSON object from parent task. Decoding
            # happens inside the guard so that malformed output is treated
            # like any other unparsable listing instead of crashing.
            nvme_stream = json.loads(self.__nvme_list.getOutput())
            # Attempt to quantize NVMe device counts in 100GB increments
            for d in nvme_stream['Devices']:
                quantized = self.quantize_bytes(d.get("PhysicalSize", 0))
                count[quantized] = count.get(quantized, 0) + 1
        except Exception as ex:
            # unable to parse or no devices in the nvme output
            logging.debug('Unable to parse nvme list output: {}'.format(ex))
        # Add the parsed result as JSON object
        self.__result = json.dumps(count, ensure_ascii=True)

parse_nvme_smart_log

Brief

Parse and create NVME smart log objects

Description

None

Module

nvme

Source Code Listing

    def run(self):
        """Turn the collected NVMe smart-log texts into per-device dictionaries.

        The collected result maps a device path to its smart-log text. Each
        "name : value" pair becomes an entry whose key is the lower-cased
        name with spaces replaced by underscores. Values written as a
        percentage (e.g. "100%") are stored as a fraction (float); every
        other value is stored as stripped text. The stored dictionary is
        keyed by the last component of the device path. Nothing is stored
        when no smart logs were collected.
        """
        collected = self.__collect_nvme_smart_log.getResult()
        if not collected:
            return

        percent_re = re.compile(r'^(?P<percent_value>\d+(?:\.\d+)?)\s*%$')
        pair_re = re.compile(
            r'^(?P<name>[^:]*):(?P<value>.*)$', flags=re.MULTILINE)

        logs_by_device = {}
        for device_path, log_text in collected.items():
            fields = {}
            # Iterate over fields of each device
            for pair in pair_re.finditer(log_text):
                # Normalize the field name and store the field value
                key = pair.group('name').strip().lower().replace(' ', '_')
                content = pair.group('value').strip()
                # Convert percentage values to floats
                as_percent = percent_re.match(content)
                if as_percent:
                    content = float(as_percent.group('percent_value')) * 1e-2
                fields[key] = content
            logs_by_device[device_path.split(os.sep)[-1]] = fields

        self.__result = logs_by_device

parse_psu0

Brief

Parse PSU-0 output

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Assemble model, vendor, serial number and firmware version of PSU-0.

        Model, vendor and serial number are decoded with
        ``parse_task_output_ascii``. For the vendor 'DELTA' the firmware
        version is taken from ``parse_task_output`` and its spaces are
        replaced by dots; for every other vendor it is decoded as ASCII.
        The dictionary (keys 'Model', 'Vendor', 'SN', 'FW Ver') is stored
        as this task's output.
        """
        psu_info = {}
        psu_info['Model'] = self.parse_task_output_ascii(self.__run_psu_model_task, 6)
        psu_info['Vendor'] = self.parse_task_output_ascii(self.__run_psu_vendor_task, 6)
        psu_info['SN'] = self.parse_task_output_ascii(self.__run_psu_serial_number_task, 6)
        if psu_info.get('Vendor', '') == 'DELTA':
            fw_ver = self.parse_task_output(self.__run_psu_fw_ver_task, 3)
            try:
                fw_ver = fw_ver.replace(" ", ".")
            except Exception:
                # Not a string (e.g. None): keep the value as it was returned.
                # Only ordinary errors are absorbed, not KeyboardInterrupt/SystemExit.
                pass
        else:
            fw_ver = self.parse_task_output_ascii(self.__run_psu_fw_ver_task, 3)
        psu_info['FW Ver'] = fw_ver

        self.__output = psu_info

parse_psu1

Brief

Parse PSU-1 output

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Assemble model, vendor, serial number and firmware version of PSU-1.

        Model, vendor and serial number are decoded with
        ``parse_task_output_ascii``. For the vendor 'DELTA' the firmware
        version is taken from ``parse_task_output`` and its spaces are
        replaced by dots; for every other vendor it is decoded as ASCII.
        The dictionary (keys 'Model', 'Vendor', 'SN', 'FW Ver') is stored
        as this task's output.
        """
        psu_info = {}
        psu_info['Model'] = self.parse_task_output_ascii(self.__run_psu_model_task, 6)
        psu_info['Vendor'] = self.parse_task_output_ascii(self.__run_psu_vendor_task, 6)
        psu_info['SN'] = self.parse_task_output_ascii(self.__run_psu_serial_number_task, 6)
        if psu_info.get('Vendor', '') == 'DELTA':
            fw_ver = self.parse_task_output(self.__run_psu_fw_ver_task, 3)
            try:
                fw_ver = fw_ver.replace(" ", ".")
            except Exception:
                # Not a string (e.g. None): keep the value as it was returned.
                # Only ordinary errors are absorbed, not KeyboardInterrupt/SystemExit.
                pass
        else:
            fw_ver = self.parse_task_output_ascii(self.__run_psu_fw_ver_task, 3)
        psu_info['FW Ver'] = fw_ver

        self.__output = psu_info

parse_smartctl_device_info

Brief

Parse smartctl device information for the devices found by the smartctl scan

Description

None

Module

smartctl

Source Code Listing

    def run(self):
        """Collect ``smartctl --info`` fields per device and count disks by capacity.

        For every device reported by the smartctl scan task, ``smartctl
        --info`` is run and its "field: value" lines are stored with the
        field lower-cased and whitespace runs replaced by underscores.
        Devices whose info command fails are skipped.

        Three attributes are set:
          * the BOM list: device arguments (joined by spaces) -> field dict;
          * the result: number of MegaRAID disks per human-readable
            capacity (the text in square brackets of ``user_capacity``);
          * the disks-by-size map: number of disks per capacity in TB,
            rounded to one decimal place.

        Nothing is set when the scan task reported no devices.
        """
        # Parse the result as a JSON object from parent task
        smartctl_field_pattern = re.compile(r'^\s*(?P<field>[^:]+):\s*(?P<value>.*)\s*$')
        space_pattern = re.compile(r'\s+')

        devices = self.__smartctl_scan_task.getResult()
        if not devices:
            return
        bom_list = {}

        for d in devices:
            # Invoke smartctl for this device
            args = ['smartctl', '--info']
            args.extend(d)
            info_task = tasks.RunCommand(args).depends(modules.common.check_superuser_privileges)
            info_task.run()
            if info_task.getReturnCode():
                # could not run the info command
                continue
            out = info_task.getOutput().strip()

            # Extract BOM fields for this device
            bom = {}
            for line in out.split('\n'):
                m = smartctl_field_pattern.match(line.strip())
                if not m:
                    continue

                field = space_pattern.sub('_', m.group('field').strip().lower())
                bom[field] = m.group('value').strip()
            bom_list[' '.join(d)] = bom
        self.__bom_list = bom_list

        # Count the disks by capacity
        disk_count = {}
        human_readable_pattern = re.compile(r'^.*\[(?P<human_readable>.*)\]$')
        megaraid_pattern = re.compile(r'^.*megaraid.*$')
        for key, value in bom_list.items():
            if not megaraid_pattern.match(key):
                # Skip disks not in MegaRAID
                continue
            capacity = value.get('user_capacity')
            if not capacity:
                continue
            # Get the human-readable disk capacity
            m = human_readable_pattern.match(capacity)
            if not m:
                continue
            capacity = m.group('human_readable')
            # Increment the disk count for this capacity
            disk_count[capacity] = disk_count.get(capacity, 0) + 1
        self.__result = disk_count

        # Quantize disk
        def quantize_bytes(num_bytes):
            # Express the size in TB with one decimal place (100GB steps)
            num_terabytes = float(num_bytes) / float(1e12)
            return '{0:.1f}TB'.format(num_terabytes)

        disks_by_size = {}
        # Attempt to quantize disk counts in 100GB increments
        byte_count_pattern = re.compile(r'^(?P<bytes>[\d,]+)\s*bytes')
        for value in bom_list.values():
            # A device without a usable capacity is skipped on its own; it
            # must not stop the remaining devices from being counted.
            m = byte_count_pattern.match(value.get('user_capacity', ''))
            if not m:
                continue
            try:
                num_bytes = int(m.group('bytes').replace(',', ''))
            except ValueError:
                # e.g. a capacity consisting of separators only
                continue
            quantized = quantize_bytes(num_bytes)
            disks_by_size[quantized] = disks_by_size.get(quantized, 0) + 1
        self.__disks_by_size = disks_by_size

parse_smartctl_scan

Brief

Parse smartctl scan devices information gathered from smartctl tool

Description

None

Module

smartctl

Source Code Listing

    def run(self):
        """Turn the smartctl scan output into a filtered list of device argument lists.

        Each non-blank line of the smartctl task output has its ``#`` comment
        removed and is split on whitespace into an argument list.  Only devices
        whose ``/dev/<name>`` appears in the lsblk task result are kept, plus
        every device with an argument containing ``megaraid`` (lsblk does not
        report those).  The filtered list is stored in ``__result``.

        Returns early, without assigning ``__result``, when the lsblk task has
        no result.  As a side effect ``LC_NUMERIC`` is set to ``POSIX`` in the
        process environment.
        """
        # Avoid locale differences in numbers printed by smartctl
        os.environ['LC_NUMERIC'] = 'POSIX'

        devices = []
        comment_pattern = re.compile(r'^(?P<value>[^#]*)(#(?P<comment>.*))?$')
        space_pattern = re.compile(r'\s+')
        if self.__smartctl_task.getReturnCode() == 0:
            out = self.__smartctl_task.getOutput().strip()
            for l in out.split('\n'):
                l = l.strip()
                if not l:
                    # Ignore blank lines
                    continue
                m = comment_pattern.match(l)
                if m:
                    # Drop the trailing comment
                    l = m.group('value').strip()
                if not l:
                    # The line held nothing but a comment
                    continue
                # Split the string into arguments
                devices.append(space_pattern.split(l))

        # NVBugs-200587168: NGC clusters contain SCSI VDisks.
        # These disks need to be filtered out to avoid smartctl info hangs, so
        # smartctl is only run on the physical devices reported by lsblk.
        lsblk_devices = self.__lsblk_task.getResult()
        if not lsblk_devices:
            return

        filter_devices = []
        for lsblk_device in lsblk_devices:
            device_name = "/dev/{}".format(lsblk_device['name'])
            for smart_device in devices:
                if device_name in smart_device:
                    filter_devices.append(smart_device)

        # Append MegaRAID devices, as lsblk does not provide their information.
        # A device already selected through lsblk is not listed a second time.
        for device in devices:
            if any("megaraid" in arg for arg in device) and device not in filter_devices:
                filter_devices.append(device)
        self.__result = filter_devices

parse_smartctl_system_disk_info

Brief

Verify disk information using the smartctl system command

Description

None

Module

smartctl

Source Code Listing

    def run(self):
        """Collect rotation rate and wear-levelling count for every scanned device.

        Runs ``smartctl -a <device args>`` for each argument list returned by
        the smartctl scan task and stores, in ``__result``, a dict keyed by
        disk identifier (the third argument for MegaRAID devices, the first
        argument otherwise).  Each value is a dict that may contain:

        * ``rotation_rate``: text following "Rotation Rate:" (first match).
        * ``wear_leveling_count``: integer from the first attribute line for
          "177 Wear_Leveling_Count" or "233 Media_Wearout_Indicator".

        Devices for which smartctl returns a non-zero code are omitted.
        Returns early, without assigning ``__result``, when there are no
        devices.
        """
        devices = self.__smartctl_scan_task.getResult()
        if not devices:
            return

        # Compile once: the patterns do not depend on the device
        megaraid_pattern = re.compile(r'megaraid,[0-9]+')
        field_patterns = (
            ('rotation_rate',
             re.compile(r'Rotation Rate:\s+(?P<rotation_rate>[\w ]+)')),
            ('wear_leveling_count',
             re.compile(r'(?:177 Wear_Leveling_Count|233 Media_Wearout_Indicator)'
                        r'\s+0[xX][0-9a-fA-F]+\s+(?P<wear_leveling_count>\d+)')),
        )

        disk_info = {}

        for d in devices:
            if megaraid_pattern.search(' '.join(d)):
                disk = d[2]
            else:
                disk = d[0]

            # Invoke smartctl for this device
            args = ['smartctl', '-a']
            args.extend(d)
            info_task = tasks.RunCommand(args).depends(modules.common.check_superuser_privileges)
            info_task.run()
            if info_task.getReturnCode():
                # could not run the info command
                continue
            out = info_task.getOutput().strip()

            info = {}
            for key, pattern in field_patterns:
                # Only the first occurrence of each field is considered
                m = pattern.search(out)
                if m:
                    info[key] = m.group(key).strip()
            if 'wear_leveling_count' in info:
                # The pattern only captures digits, so the conversion is safe
                info['wear_leveling_count'] = int(info['wear_leveling_count'])
            disk_info[disk] = info
        self.__result = disk_info

parse_uname

Brief

Run and parse the uname tool

Description

None

Module

uname

Source Code Listing

    def run(self):
        """Run ``uname`` once per flag and store the stripped outputs in ``__result``.

        The result is a dict keyed by a descriptive name (``kernel-name``,
        ``kernel-release``, ...) with the stripped output of the matching
        ``uname`` invocation as value.
        """
        uname_flags = (
            ('kernel-name', '-s'),
            ('kernel-release', '-r'),
            ('kernel-version', '-v'),
            ('kernel-nodename', '-n'),
            ('kernel-machine', '-m'),
            ('kernel-processor', '-p'),
            ('kernel-hardware-platform', '-i'),
            ('kernel-operating-system', '-o'),
        )

        collected = {}
        for label, flag in uname_flags:
            command = tasks.RunCommand(args=['uname', flag])
            command.run()
            collected[label] = command.getOutput().strip()

        self.__result = collected

parse_usb_sysfs

Brief

Parse USB information gathered from sysfs

Description

None

Module

usb

Depends On

Source Code Listing

    def run(self):
        """Placeholder: parsing of the USB sysfs information is not implemented yet."""
        return None  # TODO

parse_xl_info

Brief

Parse output from the “xl info” command

Description

The output from “xl info” is in a simple colon-separated format that this task parses into a Python dict.

Module

xenserver

Depends On

Source Code Listing

    def run(self):
        """Parse the ``name : value`` lines of the ``run_xl_info`` output into ``__result``.

        ``__result`` is always assigned a dict; it stays empty when
        ``run_xl_info`` reported a non-zero return code.  Values that are valid
        integer literals are stored as ``int``, everything else as the
        stripped string.
        """
        self.__result = {}
        if run_xl_info.getReturnCode() != 0:
            return
        output = run_xl_info.getOutput()
        item_pattern = re.compile(r'^(?P<name>\S+)\s+:\s+(?P<value>.*)$', flags=re.MULTILINE)
        for m in item_pattern.finditer(output):
            value = m.group('value').strip()
            try:
                self.__result[m.group('name')] = int(value)
            except ValueError:
                # Not an integer: keep the textual value
                self.__result[m.group('name')] = value

run_dcc_health_api

Brief

Run nvpm dcc health command

Description

Fetch nvpm DCC API Info.

Module

dcs_modules

Source Code Listing

    def run(self):
        """Fetch and decode the DCC health API output, caching it on the instance.

        When ``self.dcc_health_api_output`` is already set it is returned
        unchanged.  Otherwise the nvpm DCC health task is queried with a GET
        request and the reply is handed to ``self.decodeDccHealthOutput``
        (presumably that method populates ``self.dcc_health_api_output`` --
        confirm against its definition).

        Returns:
            The cached or freshly decoded output, or ``None`` when the task
            returned nothing, decoding raised, or decoding produced an empty
            string.  A successful first fetch now returns the decoded output,
            the same value a later (cached) call returns.
        """
        if self.dcc_health_api_output is not None:
            return self.dcc_health_api_output

        output = self.__run_nvpm_dcc_health_api_task.getOutput(dcc_health_api, "GET")
        if output is None:
            print("DCC Health Output is None")
            self.dcc_health_api_output = None
            return None

        try:
            self.decodeDccHealthOutput(output)
            if self.dcc_health_api_output == "":
                print("Unable to decode DCC Health Output")
                return None
        except Exception as e:
            print("DCC Health unable to decode JSON {}".format(e)) 
            return None

        return self.dcc_health_api_output

run_mdadm_detail

Brief

Capture mdadm detail output for all the disks

Description

None

Module

mdadm

Source Code Listing

    def run(self):
        """Capture ``mdadm --detail`` output for every path matching ``/dev/md*``.

        ``__result`` is set to a dict mapping each path to the stripped
        command output.  Paths for which mdadm returns a non-zero code are
        omitted.
        """
        import glob

        volume_info = {}
        for volume in glob.glob("/dev/md*"):
            # Pass the path as its own argument instead of building a string
            # and splitting it, so the path can never be broken apart.
            cmd_task = tasks.RunCommand(['mdadm', '--detail', volume])
            cmd_task.depends(modules.common.check_superuser_privileges)
            cmd_task.run()
            if cmd_task.getReturnCode():
                # could not run the info command
                continue
            volume_info[volume] = cmd_task.getOutput().strip()

        self.__result = volume_info

run_mdadm_examine

Brief

Capture mdadm examine output for all the disks

Description

None

Module

mdadm

Source Code Listing

    def run(self):
        """Capture ``mdadm --examine`` output for every member disk of every md array.

        Member disks are discovered through ``/sys/block/md*/md/dev-*``; the
        device name of each is read from the ``DEVNAME=`` line of its
        ``block/uevent`` file.  ``__result`` is set to a dict mapping device
        name to the stripped command output.  Members without a uevent file,
        without a ``DEVNAME`` line, or for which mdadm returns a non-zero code
        are omitted.
        """
        import glob

        devname_pattern = re.compile(r'DEVNAME=(?P<dev_name>.*)')
        disks = [disk
                 for volume in glob.glob("/sys/block/md*")
                 for disk in glob.glob(volume + "/md/dev-*")]

        # capture disk info with mdadm examine command
        disk_info = {}
        for disk in disks:
            uevent_path = disk + '/block/uevent'
            if not os.path.exists(uevent_path):
                continue
            dev_name = ''
            with open(uevent_path) as f:
                for line in f:
                    match = devname_pattern.match(line)
                    if match:
                        dev_name = match.group('dev_name').strip()
                        break
            if dev_name == '':
                continue

            # Pass the device path as a single argument rather than splitting
            # a formatted command string.
            cmd_task = tasks.RunCommand(['mdadm', '--examine', '/dev/' + dev_name])
            cmd_task.depends(modules.common.check_superuser_privileges)
            cmd_task.run()
            if cmd_task.getReturnCode():
                # could not run the info command
                continue
            disk_info[dev_name] = cmd_task.getOutput().strip()
        self.__result = disk_info

run_psu0_fw_version

Brief

Run ipmitool PSU-0 Firmware Version command

Description

None

Module

dcs_modules

Used By

Source Code Listing

    def run(self):
        """Run the PSU raw IPMI command against the BMC over lanplus.

        The raw command comes from ``self.getPsuCmd()`` and the BMC password
        from the passgen task output.  On completion ``__ret`` holds the
        ipmitool return code and ``__result`` its output.  Every failure path
        (no raw command, passgen task failed, any exception) sets ``__ret`` to
        1 and ``__result`` to ``None``.
        """
        # Get the raw command to use
        psu_raw_cmd = self.getPsuCmd()
        if not psu_raw_cmd:
            self.__ret = 1
            self.__result = None
            return

        try:
            if self.__dcc_passgen_task.getReturnCode() != 0:
                # No password available: report failure instead of leaving
                # the return code and result unassigned.
                self.__ret = 1
                self.__result = None
                return

            psswd = self.__dcc_passgen_task.getOutput().strip()
            # NOTE(review): the password is interpolated unquoted into a shell
            # command line; shell metacharacters in it would alter the command.
            cmd = 'ipmitool -I lanplus -H {} -U {} -P {} raw {}'.format(platforms.cgx.dcv_bmc_config['ip'], platforms.cgx.dcv_bmc_config['user'], psswd, psu_raw_cmd)
            cmd_task = tasks.RunCommand(shell=True, args=[cmd])
            cmd_task.run()
            self.__ret = cmd_task.getReturnCode()
            self.__result = cmd_task.getOutput()
        except Exception:
            self.__ret = 1
            self.__result = None

run_psu0_model

Brief

Run ipmitool PSU-0 Model command

Description

None

Module

dcs_modules

Used By

Source Code Listing

    def run(self):
        """Run the PSU raw IPMI command against the BMC over lanplus.

        The raw command comes from ``self.getPsuCmd()`` and the BMC password
        from the passgen task output.  On completion ``__ret`` holds the
        ipmitool return code and ``__result`` its output.  Every failure path
        (no raw command, passgen task failed, any exception) sets ``__ret`` to
        1 and ``__result`` to ``None``.
        """
        # Get the raw command to use
        psu_raw_cmd = self.getPsuCmd()
        if not psu_raw_cmd:
            self.__ret = 1
            self.__result = None
            return

        try:
            if self.__dcc_passgen_task.getReturnCode() != 0:
                # No password available: report failure instead of leaving
                # the return code and result unassigned.
                self.__ret = 1
                self.__result = None
                return

            psswd = self.__dcc_passgen_task.getOutput().strip()
            # NOTE(review): the password is interpolated unquoted into a shell
            # command line; shell metacharacters in it would alter the command.
            cmd = 'ipmitool -I lanplus -H {} -U {} -P {} raw {}'.format(platforms.cgx.dcv_bmc_config['ip'], platforms.cgx.dcv_bmc_config['user'], psswd, psu_raw_cmd)
            cmd_task = tasks.RunCommand(shell=True, args=[cmd])
            cmd_task.run()
            self.__ret = cmd_task.getReturnCode()
            self.__result = cmd_task.getOutput()
        except Exception:
            self.__ret = 1
            self.__result = None

run_psu0_serial_number

Brief

Run ipmitool PSU-0 Serial Number command

Description

None

Module

dcs_modules

Used By

Source Code Listing

    def run(self):
        """Run the PSU raw IPMI command against the BMC over lanplus.

        The raw command comes from ``self.getPsuCmd()`` and the BMC password
        from the passgen task output.  On completion ``__ret`` holds the
        ipmitool return code and ``__result`` its output.  Every failure path
        (no raw command, passgen task failed, any exception) sets ``__ret`` to
        1 and ``__result`` to ``None``.
        """
        # Get the raw command to use
        psu_raw_cmd = self.getPsuCmd()
        if not psu_raw_cmd:
            self.__ret = 1
            self.__result = None
            return

        try:
            if self.__dcc_passgen_task.getReturnCode() != 0:
                # No password available: report failure instead of leaving
                # the return code and result unassigned.
                self.__ret = 1
                self.__result = None
                return

            psswd = self.__dcc_passgen_task.getOutput().strip()
            # NOTE(review): the password is interpolated unquoted into a shell
            # command line; shell metacharacters in it would alter the command.
            cmd = 'ipmitool -I lanplus -H {} -U {} -P {} raw {}'.format(platforms.cgx.dcv_bmc_config['ip'], platforms.cgx.dcv_bmc_config['user'], psswd, psu_raw_cmd)
            cmd_task = tasks.RunCommand(shell=True, args=[cmd])
            cmd_task.run()
            self.__ret = cmd_task.getReturnCode()
            self.__result = cmd_task.getOutput()
        except Exception:
            self.__ret = 1
            self.__result = None

run_psu0_vendor

Brief

Run ipmitool PSU-0 Vendor command

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Run the PSU raw IPMI command against the BMC over lanplus.

        The raw command comes from ``self.getPsuCmd()`` and the BMC password
        from the passgen task output.  On completion ``__ret`` holds the
        ipmitool return code and ``__result`` its output.  Every failure path
        (no raw command, passgen task failed, any exception) sets ``__ret`` to
        1 and ``__result`` to ``None``.
        """
        # Get the raw command to use
        psu_raw_cmd = self.getPsuCmd()
        if not psu_raw_cmd:
            self.__ret = 1
            self.__result = None
            return

        try:
            if self.__dcc_passgen_task.getReturnCode() != 0:
                # No password available: report failure instead of leaving
                # the return code and result unassigned.
                self.__ret = 1
                self.__result = None
                return

            psswd = self.__dcc_passgen_task.getOutput().strip()
            # NOTE(review): the password is interpolated unquoted into a shell
            # command line; shell metacharacters in it would alter the command.
            cmd = 'ipmitool -I lanplus -H {} -U {} -P {} raw {}'.format(platforms.cgx.dcv_bmc_config['ip'], platforms.cgx.dcv_bmc_config['user'], psswd, psu_raw_cmd)
            cmd_task = tasks.RunCommand(shell=True, args=[cmd])
            cmd_task.run()
            self.__ret = cmd_task.getReturnCode()
            self.__result = cmd_task.getOutput()
        except Exception:
            self.__ret = 1
            self.__result = None

run_psu1_fw_version

Brief

Run ipmitool PSU-1 Firmware Version command

Description

None

Module

dcs_modules

Used By

Source Code Listing

    def run(self):
        """Run the PSU raw IPMI command against the BMC over lanplus.

        The raw command comes from ``self.getPsuCmd()`` and the BMC password
        from the passgen task output.  On completion ``__ret`` holds the
        ipmitool return code and ``__result`` its output.  Every failure path
        (no raw command, passgen task failed, any exception) sets ``__ret`` to
        1 and ``__result`` to ``None``.
        """
        # Get the raw command to use
        psu_raw_cmd = self.getPsuCmd()
        if not psu_raw_cmd:
            self.__ret = 1
            self.__result = None
            return

        try:
            if self.__dcc_passgen_task.getReturnCode() != 0:
                # No password available: report failure instead of leaving
                # the return code and result unassigned.
                self.__ret = 1
                self.__result = None
                return

            psswd = self.__dcc_passgen_task.getOutput().strip()
            # NOTE(review): the password is interpolated unquoted into a shell
            # command line; shell metacharacters in it would alter the command.
            cmd = 'ipmitool -I lanplus -H {} -U {} -P {} raw {}'.format(platforms.cgx.dcv_bmc_config['ip'], platforms.cgx.dcv_bmc_config['user'], psswd, psu_raw_cmd)
            cmd_task = tasks.RunCommand(shell=True, args=[cmd])
            cmd_task.run()
            self.__ret = cmd_task.getReturnCode()
            self.__result = cmd_task.getOutput()
        except Exception:
            self.__ret = 1
            self.__result = None

run_psu1_model

Brief

Run ipmitool PSU-1 Model command

Description

None

Module

dcs_modules

Used By

Source Code Listing

    def run(self):
        """Run the PSU raw IPMI command against the BMC over lanplus.

        The raw command comes from ``self.getPsuCmd()`` and the BMC password
        from the passgen task output.  On completion ``__ret`` holds the
        ipmitool return code and ``__result`` its output.  Every failure path
        (no raw command, passgen task failed, any exception) sets ``__ret`` to
        1 and ``__result`` to ``None``.
        """
        # Get the raw command to use
        psu_raw_cmd = self.getPsuCmd()
        if not psu_raw_cmd:
            self.__ret = 1
            self.__result = None
            return

        try:
            if self.__dcc_passgen_task.getReturnCode() != 0:
                # No password available: report failure instead of leaving
                # the return code and result unassigned.
                self.__ret = 1
                self.__result = None
                return

            psswd = self.__dcc_passgen_task.getOutput().strip()
            # NOTE(review): the password is interpolated unquoted into a shell
            # command line; shell metacharacters in it would alter the command.
            cmd = 'ipmitool -I lanplus -H {} -U {} -P {} raw {}'.format(platforms.cgx.dcv_bmc_config['ip'], platforms.cgx.dcv_bmc_config['user'], psswd, psu_raw_cmd)
            cmd_task = tasks.RunCommand(shell=True, args=[cmd])
            cmd_task.run()
            self.__ret = cmd_task.getReturnCode()
            self.__result = cmd_task.getOutput()
        except Exception:
            self.__ret = 1
            self.__result = None

run_psu1_serial_number

Brief

Run ipmitool PSU-1 Serial Number command

Description

None

Module

dcs_modules

Used By

Source Code Listing

    def run(self):
        """Run the PSU raw IPMI command against the BMC over lanplus.

        The raw command comes from ``self.getPsuCmd()`` and the BMC password
        from the passgen task output.  On completion ``__ret`` holds the
        ipmitool return code and ``__result`` its output.  Every failure path
        (no raw command, passgen task failed, any exception) sets ``__ret`` to
        1 and ``__result`` to ``None``.
        """
        # Get the raw command to use
        psu_raw_cmd = self.getPsuCmd()
        if not psu_raw_cmd:
            self.__ret = 1
            self.__result = None
            return

        try:
            if self.__dcc_passgen_task.getReturnCode() != 0:
                # No password available: report failure instead of leaving
                # the return code and result unassigned.
                self.__ret = 1
                self.__result = None
                return

            psswd = self.__dcc_passgen_task.getOutput().strip()
            # NOTE(review): the password is interpolated unquoted into a shell
            # command line; shell metacharacters in it would alter the command.
            cmd = 'ipmitool -I lanplus -H {} -U {} -P {} raw {}'.format(platforms.cgx.dcv_bmc_config['ip'], platforms.cgx.dcv_bmc_config['user'], psswd, psu_raw_cmd)
            cmd_task = tasks.RunCommand(shell=True, args=[cmd])
            cmd_task.run()
            self.__ret = cmd_task.getReturnCode()
            self.__result = cmd_task.getOutput()
        except Exception:
            self.__ret = 1
            self.__result = None

run_psu1_vendor

Brief

Run ipmitool PSU-1 Vendor command

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Run the PSU raw IPMI command against the BMC over lanplus.

        The raw command comes from ``self.getPsuCmd()`` and the BMC password
        from the passgen task output.  On completion ``__ret`` holds the
        ipmitool return code and ``__result`` its output.  Every failure path
        (no raw command, passgen task failed, any exception) sets ``__ret`` to
        1 and ``__result`` to ``None``.
        """
        # Get the raw command to use
        psu_raw_cmd = self.getPsuCmd()
        if not psu_raw_cmd:
            self.__ret = 1
            self.__result = None
            return

        try:
            if self.__dcc_passgen_task.getReturnCode() != 0:
                # No password available: report failure instead of leaving
                # the return code and result unassigned.
                self.__ret = 1
                self.__result = None
                return

            psswd = self.__dcc_passgen_task.getOutput().strip()
            # NOTE(review): the password is interpolated unquoted into a shell
            # command line; shell metacharacters in it would alter the command.
            cmd = 'ipmitool -I lanplus -H {} -U {} -P {} raw {}'.format(platforms.cgx.dcv_bmc_config['ip'], platforms.cgx.dcv_bmc_config['user'], psswd, psu_raw_cmd)
            cmd_task = tasks.RunCommand(shell=True, args=[cmd])
            cmd_task.run()
            self.__ret = cmd_task.getReturnCode()
            self.__result = cmd_task.getOutput()
        except Exception:
            self.__ret = 1
            self.__result = None

sdr_device_bom

Brief

Prepare Device Bom from ipmitool sdr elist

Description

None

Module

ipmitool

Source Code Listing

    def run(self):
        """Build a presence map of devices from the parsed ``ipmitool sdr elist`` sensors.

        A sensor counts as present when its ``status`` is not "ns" / "n/s"
        (case-insensitive) and its ``reading`` is non-empty.  ``__result`` is
        set to a dict mapping each such sensor name to ``True``; it is an
        empty dict when the parent task has no usable result.

        Malformed sensor entries (missing keys, wrong types) are skipped
        individually so that they do not hide the sensors listed after them.
        """
        ns_pattern = re.compile(r'^[nN]/?[sS]$')
        bom = {}
        try:
            sensors = self.__parse_ipmi_sdr_elist_task.getResult() or ()
            for s in sensors:
                try:
                    name = s['name']
                    status = s['status']
                    reading = s.get('reading')
                    if not ns_pattern.match(status) and reading:
                        # Sensor status is not ns and reading is not empty, so device appears to be present
                        bom[name] = True
                except (KeyError, TypeError, AttributeError):
                    # Malformed entry: ignore it and keep going
                    continue
        except Exception:
            # Best effort: keep whatever was collected so far
            pass

        self.__result = bom

show_dcs_psu_info

Brief

Run ipmitool for DCS PSU information

Description

None

Module

dcs_modules

Source Code Listing

    def run(self):
        """Report vendor, model, serial number and firmware version for each entry of ``self.psus``.

        For every PSU the sensor is disabled, the PSU channel is selected, the
        four values are fetched and sent under a "DCS PSU-<index> <field>"
        title, and the sensor is enabled again.  Any failure stops the
        iteration, re-enables the sensor and is otherwise ignored.
        """
        # (title suffix, name of the getter method) in reporting order
        sections = (
            ("Vendor", "get_manufacturer"),
            ("Model", "get_model"),
            ("Serial Number", "get_serial"),
            ("FW Version", "get_fw_version"),
        )
        try:
            for index, psu in enumerate(self.psus):
                heading = "DCS PSU-{}".format(index)
                self.disable_sensor()
                self.switch_psu_channel(psu)
                for label, getter_name in sections:
                    # Look the getter up lazily so calls happen in the same order as before
                    value = getattr(self, getter_name)()
                    self.title(heading + " {}".format(label))
                    self.send(msg=value)
                self.enable_sensor()
        except:
            # Never leave the sensor disabled after a failure
            self.enable_sensor()

show_psu0

Brief

Drive Constellation: DCC PSU-0

Description

None

Module

dcs_modules

Used By

Source Code Listing

    def run(self):
        """Send the parsed PSU model, vendor, serial number and firmware version.

        Each value is read from the parse task output (``'UNKNOWN'`` when the
        key is absent) and sent under the task title suffixed with the field
        label.  The original title is restored afterwards and the PSU info is
        kept in ``__output``.
        """
        psu_info = self.__parse_psu_task.getOutput()
        base_title = self.getTitle()

        # (title suffix, key in the parsed PSU info) in reporting order
        fields = (
            ("Model", 'Model'),
            ("Vendor", 'Vendor'),
            ("Serial Number", 'SN'),
            ("FW Version", 'FW Ver'),
        )
        for label, key in fields:
            self.title(base_title + " {}".format(label))
            self.send(msg=psu_info.get(key, 'UNKNOWN'))

        # Restore original title
        self.title(base_title)

        self.__output = psu_info

show_psu1

Brief

Drive Constellation: DCC PSU-1

Description

None

Module

dcs_modules

Depends On

Source Code Listing

    def run(self):
        """Send the parsed PSU model, vendor, serial number and firmware version.

        Each value is read from the parse task output (``'UNKNOWN'`` when the
        key is absent) and sent under the task title suffixed with the field
        label.  The original title is restored afterwards and the PSU info is
        kept in ``__output``.
        """
        psu_info = self.__parse_psu_task.getOutput()
        base_title = self.getTitle()

        # (title suffix, key in the parsed PSU info) in reporting order
        fields = (
            ("Model", 'Model'),
            ("Vendor", 'Vendor'),
            ("Serial Number", 'SN'),
            ("FW Version", 'FW Ver'),
        )
        for label, key in fields:
            self.title(base_title + " {}".format(label))
            self.send(msg=psu_info.get(key, 'UNKNOWN'))

        # Restore original title
        self.title(base_title)

        self.__output = psu_info

system_summary

Brief

Prepare System Summary

Description

None

Module

system

Source Code Listing

    def run(self):
        """Fill ``__system_summary`` from the parent tasks and render it as text.

        Sources, in order: uptime task, ``ipmitool fru print <index>`` (chassis
        serial number), the DMI table (chassis, system, BIOS and base board
        sections), the BaseOS release task, uname, the NVIDIA driver and GPU
        tasks, the BMC info task and the CEC data task.  The rendered report
        is stored in ``__result``.

        Returns early, without assigning ``__result``, when the parameters
        task does not provide ``summary_serial_num_string`` and
        ``summary_base_os_string``.

        GPU VBIOS versions and product names are de-duplicated and joined
        with " ; " in sorted order; when no GPU reports a value the existing
        summary entry is left untouched.
        """
        def find_dmi_section(table, title):
            # First DMI entry whose key contains ``title``; None when absent
            for section_name, section in table.items():
                if title in section_name:
                    return section
            return None

        def copy_fields(source, target, keys):
            # Copy the listed keys that exist in ``source`` into ``target``
            for key in keys:
                if key in source:
                    target[key] = source[key]

        report = ""

        # Get the system uptime
        uptime_output = self.__uptime_task.getOutput().strip()
        if uptime_output:
            if uptime_output.lower() == 'up':
                self.__system_summary['system']['uptime'] = 'up less than 1 minute'
            else:
                self.__system_summary['system']['uptime'] = uptime_output

        # Get info from ipmitool FRU
        fru_output = self.__ipmi_fru_task.getResult()

        # Get the system serial number from ipmitool FRU (best effort)
        chassis_serial_number = None
        try:
            index = self.__parameters_task.getResult()[
                'chassis_serial_number_index']
            fru_task = tasks.RunCommand(args=['ipmitool', 'fru', 'print', index]) \
                .depends(modules.common.check_superuser_privileges)
            fru_task.run()
            output = fru_task.getOutput().strip()
            for line in output.split('\n'):
                line = line.strip()
                if not line:
                    continue
                field = line.split(':', 1)
                if len(field) != 2:
                    continue
                field_name = field[0].strip().lower().replace(' ', '_')
                if 'chassis_serial' == field_name:
                    chassis_serial_number = field[1].strip()
        except Exception:
            pass

        # Get info from DMI table
        dmi_output = self.__dmidecode_task.getResult()
        if dmi_output:
            chassis_info = find_dmi_section(dmi_output, 'Chassis Information')
            if chassis_info:
                copy_fields(chassis_info, self.__system_summary['chassis'],
                            ('manufacturer', 'serial_number'))

            system_info = find_dmi_section(dmi_output, 'System Information')
            if system_info:
                copy_fields(system_info, self.__system_summary['system'],
                            ('manufacturer', 'product_name', 'serial_number'))
            if 'DGX-2' in self.__system_summary['system']['product_name']:
                # Obtain system serial number from mid-plane
                try:
                    self.__system_summary['system']['serial_number'] = fru_output['Mid_FRU']['chassis_serial']
                except Exception:
                    self.__system_summary['system']['serial_number'] = 'Unknown'

            bios_info = find_dmi_section(dmi_output, 'BIOS Information')
            if bios_info:
                if 'version' in bios_info:
                    self.__system_summary['motherboard']['firmware_version'] = bios_info['version']
                if 'bios_revision' in bios_info:
                    self.__system_summary['motherboard']['bios_revision'] = bios_info['bios_revision']

            base_board_info = find_dmi_section(dmi_output, 'Base Board Information')
            if base_board_info:
                copy_fields(base_board_info, self.__system_summary['motherboard'],
                            ('serial_number',))

        # Overwrite serial number if found in FRU
        if chassis_serial_number:
            self.__system_summary['system']['serial_number'] = chassis_serial_number

        # Look for BaseOS version in /etc/<platform>-release
        sw_rel_output = self.__base_os_version_task.getResult()
        if sw_rel_output and 'sw_version' in sw_rel_output:
            self.__system_summary['software']['baseos_version'] = sw_rel_output['sw_version']

        # Look for kernel version from uname
        uname_output = self.__parse_uname_task.getResult()
        if uname_output and 'kernel-release' in uname_output:
            self.__system_summary['software']['kernel_version'] = uname_output['kernel-release']

        # Look for GPU information from nvidia-smi
        driver_version = self.__nvidia_driver_version_task.getResult()
        if driver_version is not None:
            self.__system_summary['gpu']['driver_version'] = driver_version

        gpus_dict = self.__check_gpus_task.getGPUResult()
        if gpus_dict is not None:
            vbios_versions = set()
            product_names = set()
            for info in gpus_dict.values():
                for field, seen in (('vbios_version', vbios_versions),
                                    ('product_name', product_names)):
                    try:
                        value = info[field]
                    except (KeyError, TypeError, IndexError):
                        # This GPU does not report the field
                        continue
                    if value is not None:
                        seen.add(str(value))

            # Only overwrite the summary when something was actually found, so
            # an empty GPU list no longer raises on an unbound local.
            if vbios_versions:
                self.__system_summary['gpu']['vbios_version'] = ' ; '.join(sorted(vbios_versions))
            if product_names:
                self.__system_summary['gpu']['product_name'] = ' ; '.join(sorted(product_names))

        # Look for the BMC firmware version
        bmc_output = self.__ipmi_info_task.getResult()
        if bmc_output:
            if 'firmware_revision' in bmc_output:
                self.__system_summary['bmc']['firmware_version'] = self.__firmware_revision_task.getResult()
            if 'ipmi_version' in bmc_output:
                self.__system_summary['bmc']['ipmi_version'] = bmc_output['ipmi_version']

        # Look for the CEC Info
        cec_output = self.__cec_data_task.getResult()
        if cec_output:
            # Caller will take care of filling all values
            self.__system_summary['CEC'] = cec_output

        try:
            parameters = self.__parameters_task.getResult()
            summary_serial_num_string = parameters['summary_serial_num_string']
            summary_base_os_string = parameters['summary_base_os_string']
        except Exception:
            return

        # RTX systems additionally list the chassis serial number
        chassis_line = ''
        if 'RTX' in self.__system_summary['system']['product_name']:
            chassis_line = '    Chassis Serial Number: {chassis[serial_number]}\n'

        system_template = (
            '\n'
            'System Summary\n'
            '--------------\n'
            '    Product Name: {system[product_name]}\n'
            '    Manufacturer: {system[manufacturer]}\n'
            + chassis_line +
            '    {0}: {system[serial_number]}\n'
            '    Uptime: {system[uptime]}\n'
            'Motherboard:\n'
            '    BIOS Version: {motherboard[firmware_version]}\n'
            '    Serial Number: {motherboard[serial_number]}'
        )
        report += system_template.format(summary_serial_num_string, **self.__system_summary)

        # Only print the BMC section if the system appears to have a BMC
        if bmc_output:
            report += ('\nBMC:\n'
                       '    Firmware Version: {bmc[firmware_version]}\n'
                       '    IPMI Version: {bmc[ipmi_version]}').format(
                           **self.__system_summary)

        report += ('\nGPU:\n'
                   '    NVIDIA Driver Version: {gpu[driver_version]}\n'
                   '    Product Name(s): {gpu[product_name]}\n'
                   '    VBIOS Version(s): {gpu[vbios_version]}\n'
                   'Software:\n'
                   '    {0}: {software[baseos_version]}\n'
                   '    Kernel Version: {software[kernel_version]}').format(
                       summary_base_os_string, **self.__system_summary)

        if cec_output:
            report += ('\nCEC:\n'
                       '    CEC Version: {CEC[cec_version]}\n'
                       '    EC_FW_TAG0: {CEC[EC_FW_TAG0]}\n'
                       '    EC_FW_TAG1: {CEC[EC_FW_TAG1]}\n'
                       '    BMC FW authentication state: {CEC[bmc_fw_auth_state]}\n'
                       '    Currently running BMC Slot : {CEC[bmc_boot_slot]}').format(
                           **self.__system_summary)

        self.__result = report