NVSM Health

Summary Information

Summary Information

Table of Logs Collected

Name Log Path
acpi_video_info /proc/acpi/video/*/info
application_dump /var/crash/*.crash
apt_log /var/log/apt/*
apt_preferences_nvidia /etc/apt/preferences.d/nvidia
apt_sources /etc/apt/sources.list
apt_sources_list_d /etc/apt/sources.list.d/*
bmc_sel_log /var/log/bmc_sel*.log
cmdline /proc/cmdline
collectd_log /var/log/collectd.log
comp_fw_log /var/log/comp_fw_log.txt
cosmos_log /var/log/cosmos/*.log
cpuinfo /proc/cpuinfo
debian_release /etc/debian_release
debian_version /etc/debian_version
dgx_release /etc/dgx-release
dmesg_log /var/log/dmesg*
docker_volume_netshare_log /var/log/docker-volume-netshare.log
dshm_log /var/log/nvsm/dshm/*
etc_netplan /etc/netplan/*.yaml
fabricmanager_log /var/log/fabricmanager.log*
fedora_release /etc/fedora-release
fscache_stats /proc/fs/fscache/stats
gds_collect None
gentoo_release /etc/gentoo-release
installer_syslog /var/log/installer/syslog*
interrupts /proc/interrupts
iomem /proc/iomem
issue /etc/issue
kern_log /var/log/kern.log*
kernel_log /var/log/kernel.log*
lib_netplan /lib/netplan/*.yaml
mandrake_release /etc/mandrake-release
mdstat /proc/mdstat
meminfo /proc/meminfo
mesos_master_error /var/log/mesos/mesos-master.ERROR
mesos_master_fatal /var/log/mesos/mesos-master.FATAL
mesos_master_info /var/log/mesos/mesos-master.INFO
mesos_master_warning /var/log/mesos/mesos-master.WARNING
mesos_slave_error /var/log/mesos/mesos-slave.ERROR
mesos_slave_fatal /var/log/mesos/mesos-slave.FATAL
mesos_slave_info /var/log/mesos/mesos-slave.INFO
mesos_slave_warning /var/log/mesos/mesos-slave.WARNING
messages /var/log/messages*
modules /proc/modules
monit_log /var/log/monit
mtrr /proc/mtrr
network_interfaces /etc/network/interfaces
network_interfaces_d /etc/network/interfaces.d/*
nfsfs_servers /proc/fs/nfsfs/servers
nfsfs_volumes /proc/fs/nfsfs/volumes
nginx_log /var/log/nginx/*
nvidia_application_profiles1 /etc/nvidia/nvidia-application-profiles-rc
nvidia_application_profiles2 /etc/nvidia/nvidia-application-profiles-rc.d/*
nvidia_application_profiles3 /usr/share/nvidia/nvidia-application-profiles-*-rc
nvidia_dcshwapikey_conf /etc/nvidia/gridd.conf*
nvidia_dcshwapikey_license /etc/nvidia/license/NV-*
nvidia_driver_gpu_information /proc/driver/nvidia/gpus/*/information
nvidia_driver_gpu_registry /proc/driver/nvidia/gpus/*/registry
nvidia_driver_params /proc/driver/nvidia/params
nvidia_driver_registry /proc/driver/nvidia/registry
nvidia_driver_version /proc/driver/nvidia/version
nvidia_driver_warnings /proc/driver/nvidia/warnings/*
nvidia_fs_peer_affinity /proc/driver/nvidia-fs/peer_affinity
nvidia_fs_peer_distance /proc/driver/nvidia-fs/peer_distance
nvidia_fs_stats /proc/driver/nvidia-fs/stats
nvidia_fw_log /var/log/nvidia-fw.log
nvidia_installer_log /var/log/nvidia-installer.log
nvidia_uninstall_log /var/log/nvidia-uninstall.log
nvsm_log /var/log/nvsm/*log*
pci /proc/pci
pegasus_dbglog None
pegasus_syslog None
redhat_release /etc/redhat-release
redhat_version /etc/redhat_version
release /etc/release
remote_bmc_sel_log /var/log/remote_bmc_sel.log
run_netplan /run/netplan/*.yaml
slackware_release /etc/slackware-release
slackware_version /etc/slackware-version
sosreport None
sun_release /etc/sun-release
syslog /var/log/syslog*
system_map /boot/System.map*
td_agent_log /var/log/td-agent/td-agent.*
upstart_log /var/log/upstart/*
var_lib_dhcp /var/lib/dhcp/*
version /proc/version
xfree86_log /var/log/XFree86.*.log*
xorg_log /var/log/Xorg.*.log*
yellowdog_release /etc/yellowdog-release
zookeeper_log /var/log/zookeeper/*.log

Table of Commands Executed

Name Command Line
bash bash --version
bash_hello_world ${NVSMHEALTH_DUMP_TOOLS}/hello.bash
collect_fru ipmitool fru print
collect_nvsm ${NVSMHEALTH_DUMP_TOOLS}/collect_nvsm.py
collect_usb_sysfs echo TODO
date date
date_utc date --utc
dcc_ipmitool_sel_writeraw ipmitool -I lanplus -H 192.168.1.42 -U n.....
dcc_passgen dcc_passgen
dcgmi_nvlink dcgmi nvlink -s
dcs_cam_camera_mapping python3 ${NVSMHEALTH_DUMP_TOOLS}/dcs_cam.....
dcs_cam_gpus_all python3 ${NVSMHEALTH_DUMP_TOOLS}/dcs_cam.....
dcs_cam_query_gpu_info python3 ${NVSMHEALTH_DUMP_TOOLS}/dcs_cam.....
df df -k
dmesg dmesg
dmidecode dmidecode
docker_info docker info
docker_ps docker ps
dpkg_list dpkg --list
dpkg_verify dpkg --verify
ethtool ${NVSMHEALTH_DUMP_TOOLS}/ethtool.sh
fru_dcc_version ipmitool fru print 0 | grep -E 'Product .....
gcc gcc -v
gds_check ${NVSMHEALTH_DUMP_GDS_CUDA_PATH}/gds/too.....
gds_stack_trace for x in `nvidia-smi --query-compute-app.....
gds_stats for x in `nvidia-smi --query-compute-app.....
glxinfo ldd /usr/bin/glxinfo
gpp g++ -v
hca_self_test hca_self_test.ofed
ibdev2netdev ibdev2netdev
ibstat ibstat
ibstatus ibstatus
ibv_devinfo ibv_devinfo
ip_addr_show ip addr show
ip_link_show ip link show
ip_route_show ip route show
ipmitool_bmc_info ipmitool bmc info
ipmitool_chassis_status ipmitool chassis status
ipmitool_fru ipmitool fru
ipmitool_lan_print ipmitool lan print 1
ipmitool_power_led_status ${NVSMHEALTH_DUMP_TOOLS}/ipmitool_power_.....
ipmitool_raw ${NVSMHEALTH_DUMP_TOOLS}/ipmitool_raw.sh
ipmitool_raw_dgxa100 ${NVSMHEALTH_DUMP_TOOLS}/ipmitool_raw_dg.....
ipmitool_sdr ipmitool sdr
ipmitool_sdr_dump out=$(mktemp); ipmitool sdr dump $out > .....
ipmitool_sdr_info ipmitool sdr info
ipmitool_sel_elist ipmitool sel elist
ipmitool_sel_info ipmitool sel info
ipmitool_sel_list ipmitool sel list
ipmitool_sel_time_get ipmitool sel time get
ipmitool_sel_writeraw ${NVSMHEALTH_DUMP_TOOLS}/sel_writeraw.sh
ipmitool_user_list_1 ipmitool user list 1
java java -version
java_hello_world java -classpath ${NVSMHEALTH_DUMP_TOOLS}.....
ldconfig ldconfig -p
lsb_release lsb_release -a
lsblk lsblk
lsblk_discard lsblk --discard
lsblk_topology lsblk --topology
lscpu lscpu
lshw lshw
lslocks lslocks
lsmod lsmod
lspci lspci -vvn
lspci_plain lspci
lspci_tree lspci -t
lsusb lsusb
lsusb_tree lsusb -t
lsusb_verbose lsusb --verbose
mdadm_detail ${NVSMHEALTH_DUMP_TOOLS}/mdadm-detail.sh
mdadm_examine ${NVSMHEALTH_DUMP_TOOLS}/mdadm-examine.sh
mlx_fetch_arm_log ${NVSMHEALTH_DUMP_TOOLS}/mlnx_arm_logs.sh
mlxcables mst start && mst cable add && mlxcables
modinfo ${NVSMHEALTH_DUMP_TOOLS}/modinfo.sh
mount mount
ntpq ntpq -p
numactl numactl --hardware
nvcc nvcc --version
nvidia_address_text ${NVSMHEALTH_DUMP_TOOLS}/nvidia_address_.....
nvidia_debugdump ${NVSMHEALTH_DUMP_TOOLS}/nvidia-debugdum.....
nvidia_dkms_log ${NVSMHEALTH_DUMP_TOOLS}/nvidia-dkms-log.....
nvidia_driver_ko ${NVSMHEALTH_DUMP_TOOLS}/nvidia_driver_k.....
nvidia_settings nvidia-settings -q all
nvidia_smi nvidia-smi
nvidia_smi_nvlink nvidia-smi topo -p2p rw >/dev/null && nv.....
nvidia_smi_query nvidia-smi -q
nvidia_smi_query_unit nvidia-smi -q -u
nvidia_smi_topo nvidia-smi topo -m
nvidia_smi_xml nvidia-smi -q -x
nvidia_vm_health_check_show nvidia-vm health-check show
nvidia_vm_image_show nvidia-vm image show
nvidia_vm_resources_show nvidia-vm resources show
nvme_list nvme list
nvme_list nvme list --output-format=json
nvme_logs ${NVSMHEALTH_DUMP_TOOLS}/nvme-logs.sh
nvsm_health_show_debug nvsm-health --show --log-level=debug
nvsm_show nvsm show -level all
nvsm_show_alerts nvsm show alerts
nvsm_show_debug nvsm --log-level=debug show -level all
ofed_info ofed_info
perl perl -v
perl_hello_world ${NVSMHEALTH_DUMP_TOOLS}/hello.pl
ping_compute ping -w 5 ngc.nvidia.com
printenv printenv
ps ps -wwo pid,uid,pcpu,pmem,etime,state,pp.....
ps_aux ps aux
psu_info_dgx1 ${NVSMHEALTH_DUMP_TOOLS}/psu_info_dgx1.sh
python python --version
python_hello_world ${NVSMHEALTH_DUMP_TOOLS}/hello.py
run_bmc_boot_slot_task ipmitool raw 0x3C 0x3 0x0
run_cec_boot_status ipmitool raw 0x3C 0x68 0x00
run_cec_version ipmitool raw 0x3C 0xF 0x9
run_dmidecode dmidecode
run_dmidecode_memory dmidecode --type memory
run_dpkg_grep_kvm bash -c "dpkg -l | grep -c dgx-kvm-sw"
run_gpu_monitor_status nvsm_core --mode=client GET /nvsm/v1/Sys.....
run_ipmi_fru ipmitool fru print
run_ipmi_getenables ipmitool mc getenables
run_ipmi_info ipmitool mc info
run_ipmi_sdr_elist ipmitool sdr elist
run_ipmi_sensor ipmitool sensor
run_ipmitool ipmitool
run_lsblk_scsi_device_info lsblk -S -P -o NAME,HCTL,TYPE,VENDOR,MOD.....
run_lscpu lscpu
run_lspci lspci -vmm -nn
run_lspci_n lspci -vmm -n
run_lspci_verbose lspci -vvv -nn -D
run_mlxfwmanager mlxfwmanager --query-format xml
run_net_ifconfig ifconfig -a
run_nvidia_smi_gpu_bus_id nvidia-smi --query-gpu=gpu_bus_id --form.....
run_nvidia_smi_p2p_topology nvidia-smi topo -p2p rw
run_nvidia_smi_topology nvidia-smi topo --matrix
run_smartctl_scan smartctl --scan
run_storcli_pall storcli64 /c0/pall show all J
run_storcli_vall storcli64 /c0/vall show all J
run_storcli_version storcli64 -v -NoLog
run_xl_info xl info
service_cachefilesd_status service cachefilesd status
service_status_all service --status-all
smartctl ${NVSMHEALTH_DUMP_TOOLS}/smartctl.sh
smartctl_scan smartctl --scan
storcli_cmds ${NVSMHEALTH_DUMP_TOOLS}/storcli_cmds.sh
sysctl sysctl -a
sysfs_dmi_bios_version cat /sys/devices/virtual/dmi/id/bios_ver.....
sysfs_dmi_product_name cat /sys/devices/virtual/dmi/id/product_.....
sysfs_dmi_system_vendor cat /sys/devices/virtual/dmi/id/sys_vend.....
timedatectl_status timedatectl status
top top -b -n 5
ulimit bash -c "ulimit -a"
uname uname -a
uptime uptime -p
virsh_list_all virsh list --all
xenserver_status_report ${NVSMHEALTH_DUMP_TOOLS}/xenserver-statu.....
xl_info xl info
xrandr xrandr --verbose
xset xset -q

Table of Health Checks Performed

Name Brief
check_blacklist_recommendations Check DCGM for GPU blacklist recommendations
check_bom_dimms Check Memory DIMMs devices information for consistency
check_bom_disk_controllers Check Disk Controllers PCIe devices information for consistency
check_bom_ethernet_controllers Check Ethernet Controllers PCIe devices information for consistency
check_bom_gpus Check GPUs PCIe devices information for consistency
check_bom_ib_controllers Check InfiniBand controllers PCIe devices information for consistency
check_bom_nvswitch Check NVSwitch controller PCIe devices information for consistency
check_bom_pcie_switches Check PCIe Switches PCIe devices information for consistency
check_bom_vgas Check VGA Controller PCIe devices information for consistency
check_dcc_can_health Drive Constellation: Check DCC CAN Health
check_dcc_can_reachability Drive Constellation: Check DCC CAN reachability Health
check_dcc_display_configuration Drive Constellation: Check DCC Display Configuration Health
check_dcc_display_synchronization Drive Constellation: Check DCC Display Synchronization Health
check_dcc_ecu_tegraA_health Drive Constellation: Check DCC ECU TegraA Health
check_dcc_ecu_tegraA_storage_health Drive Constellation: Check DCC ECU TegraA Storage Health
check_dcc_ecu_tegraB_health Drive Constellation: Check DCC ECU TegraB Health
check_dcc_ecu_tegraB_storage_health Drive Constellation: Check DCC ECU TegraB Storage Health
check_dcc_ethernet_health Drive Constellation: Check DCC Ethernet Health
check_dcc_fan_health Drive Constellation: Check DCC Fan Health
check_dcc_gpu_health Drive Constellation: Check DCC GPU Health
check_dcc_info Drive Constellation: Get DCC Info
check_dcc_network_reachability Drive Constellation: Check DCC Network Reachability
check_dcc_serializer_configuration Drive Constellation: Check DCC Serializer Configuration Health
check_dcc_usb_health Drive Constellation: Check DCC USB Health
check_dcc_usb_reachability Drive Constellation: Check DCC USB reachability Health
check_dcs_psu_info Drive Constellation: Check DCC PSU Info
check_dimm_part_number Verify DIMM part number
check_dimm_vendors Verify DIMM vendors
check_ecu_info Drive Constellation: Get ECU Info
check_ethernet_controller_info None
check_fan_bom Verify chassis fan presence
check_fru_consistency Check FRU information for consistency
check_gpu_direct_topology Check GPUDirect Topology information for consistency
check_gpu_link_info None
check_gpu_p2p_topology Check GPUs p2p Topology information for consistency
check_gpu_vbios_version_consistency Verify GPUs VBIOS version consistency
check_gpus Check GPU health: retired page count, retired pages pending, inforom storage version and vbios version
check_ib_controller_link_info None
check_instant_blacklist_recommendations Quick health check of GPU using DCGM
check_ipmi_sensor_thresholds Check BMC sensor thresholds
check_ipmitool_working Check that the ipmitool command is working
check_logical_core_count Number of logical CPU cores [{0}]
check_mdadm_disks Status of software RAID disk superblocks
check_mdadm_volumes Status of software RAID volumes
check_meminfo_mem_size Installed memory capacity [{0:.2f}GB]
check_mlx_fw_version Verify Mellanox devices firmware version consistency
check_net_link Verify Network Interfaces Link
check_net_ping Verify Network IP Reachability
check_nvidia_grid_license Drive Constellation: GRID License Status
check_nvidia_smi_gpu_bus_id Verify GPUs identified using nvidia-smi
check_nvidia_smi_nvlink_status Parse nvlink status with NVIDIA System Management Interface (nvidia-smi)
check_nvme_devices Verify installed NVMe devices
check_nvme_link_info None
check_nvme_smart_log Check SMART status of NVMe devices
check_psu_bom Verify chassis power supply presence
check_psu_info Check PSU Info (Vendor, Model) for Consistency
check_smartctl_disk_count Verify installed disks
check_smartctl_megaraid_disk_count Verify installed MegaRAID disks
check_smartctl_ssd_brick Check for SSD health
check_storcli_disk_state None
check_storcli_phy_links None
check_storcli_sanity_installed [sanity] MegaRAID storcli utility installed
check_storcli_sanity_supported [sanity] {} BaseOS support for storcli utility
check_superuser_privileges Check for superuser privileges
check_xenserver_logical_core_count Number of logical CPU cores [{0}]
dcv_check_fan_bom Drive Constellation: Verify chassis fan presence for DCC
dcv_check_fru_consistency Drive Constellation: Check FRU information for consistency
dcv_check_ipmi_sensor_thresholds Drive Constellation: Check DCC BMC sensor thresholds
dcv_check_psu_bom Drive Constellation: Verify chassis power supply presence on DCC