NVSM Health
Summary Information
Summary Information
Table of Logs Collected
Name | Log Path |
---|---|
acpi_video_info | /proc/acpi/video/*/info |
application_dump | /var/crash/*.crash |
apt_log | /var/log/apt/* |
apt_preferences_nvidia | /etc/apt/preferences.d/nvidia |
apt_sources | /etc/apt/sources.list |
apt_sources_list_d | /etc/apt/sources.list.d/* |
bmc_sel_log | /var/log/bmc_sel*.log |
cmdline | /proc/cmdline |
collectd_log | /var/log/collectd.log |
comp_fw_log | /var/log/comp_fw_log.txt |
cosmos_log | /var/log/cosmos/*.log |
cpuinfo | /proc/cpuinfo |
debian_release | /etc/debian_release |
debian_version | /etc/debian_version |
dgx_release | /etc/dgx-release |
dmesg_log | /var/log/dmesg* |
docker_volume_netshare_log | /var/log/docker-volume-netshare.log |
dshm_log | /var/log/nvsm/dshm/* |
etc_netplan | /etc/netplan/*.yaml |
fabricmanager_log | /var/log/fabricmanager.log* |
fedora_release | /etc/fedora-release |
fscache_stats | /proc/fs/fscache/stats |
gds_collect | None |
gentoo_release | /etc/gentoo-release |
installer_syslog | /var/log/installer/syslog* |
interrupts | /proc/interrupts |
iomem | /proc/iomem |
issue | /etc/issue |
kern_log | /var/log/kern.log* |
kernel_log | /var/log/kernel.log* |
lib_netplan | /lib/netplan/*.yaml |
mandrake_release | /etc/mandrake-release |
mdstat | /proc/mdstat |
meminfo | /proc/meminfo |
mesos_master_error | /var/log/mesos/mesos-master.ERROR |
mesos_master_fatal | /var/log/mesos/mesos-master.FATAL |
mesos_master_info | /var/log/mesos/mesos-master.INFO |
mesos_master_warning | /var/log/mesos/mesos-master.WARNING |
mesos_slave_error | /var/log/mesos/mesos-slave.ERROR |
mesos_slave_fatal | /var/log/mesos/mesos-slave.FATAL |
mesos_slave_info | /var/log/mesos/mesos-slave.INFO |
mesos_slave_warning | /var/log/mesos/mesos-slave.WARNING |
messages | /var/log/messages* |
modules | /proc/modules |
monit_log | /var/log/monit |
mtrr | /proc/mtrr |
network_interfaces | /etc/network/interfaces |
network_interfaces_d | /etc/network/interfaces.d/* |
nfsfs_servers | /proc/fs/nfsfs/servers |
nfsfs_volumes | /proc/fs/nfsfs/volumes |
nginx_log | /var/log/nginx/* |
nvidia_application_profiles1 | /etc/nvidia/nvidia-application-profiles-rc |
nvidia_application_profiles2 | /etc/nvidia/nvidia-application-profiles-rc.d/* |
nvidia_application_profiles3 | /usr/share/nvidia/nvidia-application-profiles-*-rc |
nvidia_dcshwapikey_conf | /etc/nvidia/gridd.conf* |
nvidia_dcshwapikey_license | /etc/nvidia/license/NV-* |
nvidia_driver_gpu_information | /proc/driver/nvidia/gpus/*/information |
nvidia_driver_gpu_registry | /proc/driver/nvidia/gpus/*/registry |
nvidia_driver_params | /proc/driver/nvidia/params |
nvidia_driver_registry | /proc/driver/nvidia/registry |
nvidia_driver_version | /proc/driver/nvidia/version |
nvidia_driver_warnings | /proc/driver/nvidia/warnings/* |
nvidia_fs_peer_affinity | /proc/driver/nvidia-fs/peer_affinity |
nvidia_fs_peer_distance | /proc/driver/nvidia-fs/peer_distance |
nvidia_fs_stats | /proc/driver/nvidia-fs/stats |
nvidia_fw_log | /var/log/nvidia-fw.log |
nvidia_installer_log | /var/log/nvidia-installer.log |
nvidia_uninstall_log | /var/log/nvidia-uninstall.log |
nvsm_log | /var/log/nvsm/*log* |
pci | /proc/pci |
pegasus_dbglog | None |
pegasus_syslog | None |
redhat_release | /etc/redhat-release |
redhat_version | /etc/redhat_version |
release | /etc/release |
remote_bmc_sel_log | /var/log/remote_bmc_sel.log |
run_netplan | /run/netplan/*.yaml |
slackware_release | /etc/slackware-release |
slackware_version | /etc/slackware-version |
sosreport | None |
sun_release | /etc/sun-release |
syslog | /var/log/syslog* |
system_map | /boot/System.map* |
td_agent_log | /var/log/td-agent/td-agent.* |
upstart_log | /var/log/upstart/* |
var_lib_dhcp | /var/lib/dhcp/* |
version | /proc/version |
xfree86_log | /var/log/XFree86.*.log* |
xorg_log | /var/log/Xorg.*.log* |
yellowdog_release | /etc/yellowdog-release |
zookeeper_log | /var/log/zookeeper/*.log |
Table of Commands Executed
Name | Command Line |
---|---|
bash | bash --version |
bash_hello_world | ${NVSMHEALTH_DUMP_TOOLS}/hello.bash |
collect_fru | ipmitool fru print |
collect_nvsm | ${NVSMHEALTH_DUMP_TOOLS}/collect_nvsm.py |
collect_usb_sysfs | echo TODO |
date | date |
date_utc | date --utc |
dcc_ipmitool_sel_writeraw | ipmitool -I lanplus -H 192.168.1.42 -U n..... |
dcc_passgen | dcc_passgen |
dcgmi_nvlink | dcgmi nvlink -s |
dcs_cam_camera_mapping | python3 ${NVSMHEALTH_DUMP_TOOLS}/dcs_cam..... |
dcs_cam_gpus_all | python3 ${NVSMHEALTH_DUMP_TOOLS}/dcs_cam..... |
dcs_cam_query_gpu_info | python3 ${NVSMHEALTH_DUMP_TOOLS}/dcs_cam..... |
df | df -k |
dmesg | dmesg |
dmidecode | dmidecode |
docker_info | docker info |
docker_ps | docker ps |
dpkg_list | dpkg --list |
dpkg_verify | dpkg --verify |
ethtool | ${NVSMHEALTH_DUMP_TOOLS}/ethtool.sh |
fru_dcc_version | ipmitool fru print 0 \| grep -E 'Product ..... |
gcc | gcc -v |
gds_check | ${NVSMHEALTH_DUMP_GDS_CUDA_PATH}/gds/too..... |
gds_stack_trace | for x in `nvidia-smi --query-compute-app..... |
gds_stats | for x in `nvidia-smi --query-compute-app..... |
glxinfo | ldd /usr/bin/glxinfo |
gpp | g++ -v |
hca_self_test | hca_self_test.ofed |
ibdev2netdev | ibdev2netdev |
ibstat | ibstat |
ibstatus | ibstatus |
ibv_devinfo | ibv_devinfo |
ip_addr_show | ip addr show |
ip_link_show | ip link show |
ip_route_show | ip route show |
ipmitool_bmc_info | ipmitool bmc info |
ipmitool_chassis_status | ipmitool chassis status |
ipmitool_fru | ipmitool fru |
ipmitool_lan_print | ipmitool lan print 1 |
ipmitool_power_led_status | ${NVSMHEALTH_DUMP_TOOLS}/ipmitool_power_..... |
ipmitool_raw | ${NVSMHEALTH_DUMP_TOOLS}/ipmitool_raw.sh |
ipmitool_raw_dgxa100 | ${NVSMHEALTH_DUMP_TOOLS}/ipmitool_raw_dg..... |
ipmitool_sdr | ipmitool sdr |
ipmitool_sdr_dump | out=$(mktemp); ipmitool sdr dump $out > ..... |
ipmitool_sdr_info | ipmitool sdr info |
ipmitool_sel_elist | ipmitool sel elist |
ipmitool_sel_info | ipmitool sel info |
ipmitool_sel_list | ipmitool sel list |
ipmitool_sel_time_get | ipmitool sel time get |
ipmitool_sel_writeraw | ${NVSMHEALTH_DUMP_TOOLS}/sel_writeraw.sh |
ipmitool_user_list_1 | ipmitool user list 1 |
java | java -version |
java_hello_world | java -classpath ${NVSMHEALTH_DUMP_TOOLS}..... |
ldconfig | ldconfig -p |
lsb_release | lsb_release -a |
lsblk | lsblk |
lsblk_discard | lsblk --discard |
lsblk_topology | lsblk --topology |
lscpu | lscpu |
lshw | lshw |
lslocks | lslocks |
lsmod | lsmod |
lspci | lspci -vvn |
lspci_plain | lspci |
lspci_tree | lspci -t |
lsusb | lsusb |
lsusb_tree | lsusb -t |
lsusb_verbose | lsusb --verbose |
mdadm_detail | ${NVSMHEALTH_DUMP_TOOLS}/mdadm-detail.sh |
mdadm_examine | ${NVSMHEALTH_DUMP_TOOLS}/mdadm-examine.sh |
mlx_fetch_arm_log | ${NVSMHEALTH_DUMP_TOOLS}/mlnx_arm_logs.sh |
mlxcables | mst start && mst cable add && mlxcables |
modinfo | ${NVSMHEALTH_DUMP_TOOLS}/modinfo.sh |
mount | mount |
ntpq | ntpq -p |
numactl | numactl --hardware |
nvcc | nvcc --version |
nvidia_address_text | ${NVSMHEALTH_DUMP_TOOLS}/nvidia_address_..... |
nvidia_debugdump | ${NVSMHEALTH_DUMP_TOOLS}/nvidia-debugdum..... |
nvidia_dkms_log | ${NVSMHEALTH_DUMP_TOOLS}/nvidia-dkms-log..... |
nvidia_driver_ko | ${NVSMHEALTH_DUMP_TOOLS}/nvidia_driver_k..... |
nvidia_settings | nvidia-settings -q all |
nvidia_smi | nvidia-smi |
nvidia_smi_nvlink | nvidia-smi topo -p2p rw >/dev/null && nv..... |
nvidia_smi_query | nvidia-smi -q |
nvidia_smi_query_unit | nvidia-smi -q -u |
nvidia_smi_topo | nvidia-smi topo -m |
nvidia_smi_xml | nvidia-smi -q -x |
nvidia_vm_health_check_show | nvidia-vm health-check show |
nvidia_vm_image_show | nvidia-vm image show |
nvidia_vm_resources_show | nvidia-vm resources show |
nvme_list | nvme list |
nvme_list | nvme list --output-format=json |
nvme_logs | ${NVSMHEALTH_DUMP_TOOLS}/nvme-logs.sh |
nvsm_health_show_debug | nvsm-health --show --log-level=debug |
nvsm_show | nvsm show -level all |
nvsm_show_alerts | nvsm show alerts |
nvsm_show_debug | nvsm --log-level=debug show -level all |
ofed_info | ofed_info |
perl | perl -v |
perl_hello_world | ${NVSMHEALTH_DUMP_TOOLS}/hello.pl |
ping_compute | ping -w 5 ngc.nvidia.com |
printenv | printenv |
ps | ps -wwo pid,uid,pcpu,pmem,etime,state,pp..... |
ps_aux | ps aux |
psu_info_dgx1 | ${NVSMHEALTH_DUMP_TOOLS}/psu_info_dgx1.sh |
python | python --version |
python_hello_world | ${NVSMHEALTH_DUMP_TOOLS}/hello.py |
run_bmc_boot_slot_task | ipmitool raw 0x3C 0x3 0x0 |
run_cec_boot_status | ipmitool raw 0x3C 0x68 0x00 |
run_cec_version | ipmitool raw 0x3C 0xF 0x9 |
run_dmidecode | dmidecode |
run_dmidecode_memory | dmidecode --type memory |
run_dpkg_grep_kvm | bash -c "dpkg -l \| grep -c dgx-kvm-sw" |
run_gpu_monitor_status | nvsm_core --mode=client GET /nvsm/v1/Sys..... |
run_ipmi_fru | ipmitool fru print |
run_ipmi_getenables | ipmitool mc getenables |
run_ipmi_info | ipmitool mc info |
run_ipmi_sdr_elist | ipmitool sdr elist |
run_ipmi_sensor | ipmitool sensor |
run_ipmitool | ipmitool |
run_lsblk_scsi_device_info | lsblk -S -P -o NAME,HCTL,TYPE,VENDOR,MOD..... |
run_lscpu | lscpu |
run_lspci | lspci -vmm -nn |
run_lspci_n | lspci -vmm -n |
run_lspci_verbose | lspci -vvv -nn -D |
run_mlxfwmanager | mlxfwmanager --query-format xml |
run_net_ifconfig | ifconfig -a |
run_nvidia_smi_gpu_bus_id | nvidia-smi --query-gpu=gpu_bus_id --form..... |
run_nvidia_smi_p2p_topology | nvidia-smi topo -p2p rw |
run_nvidia_smi_topology | nvidia-smi topo --matrix |
run_smartctl_scan | smartctl --scan |
run_storcli_pall | storcli64 /c0/pall show all J |
run_storcli_vall | storcli64 /c0/vall show all J |
run_storcli_version | storcli64 -v -NoLog |
run_xl_info | xl info |
service_cachefilesd_status | service cachefilesd status |
service_status_all | service --status-all |
smartctl | ${NVSMHEALTH_DUMP_TOOLS}/smartctl.sh |
smartctl_scan | smartctl --scan |
storcli_cmds | ${NVSMHEALTH_DUMP_TOOLS}/storcli_cmds.sh |
sysctl | sysctl -a |
sysfs_dmi_bios_version | cat /sys/devices/virtual/dmi/id/bios_ver..... |
sysfs_dmi_product_name | cat /sys/devices/virtual/dmi/id/product_..... |
sysfs_dmi_system_vendor | cat /sys/devices/virtual/dmi/id/sys_vend..... |
timedatectl_status | timedatectl status |
top | top -b -n 5 |
ulimit | bash -c "ulimit -a" |
uname | uname -a |
uptime | uptime -p |
virsh_list_all | virsh list --all |
xenserver_status_report | ${NVSMHEALTH_DUMP_TOOLS}/xenserver-statu..... |
xl_info | xl info |
xrandr | xrandr --verbose |
xset | xset -q |
Table of Health Checks Performed
Name | Brief |
---|---|
check_blacklist_recommendations | Check DCGM for GPU blacklist recommendations |
check_bom_dimms | Check Memory DIMMs devices information for consistency |
check_bom_disk_controllers | Check Disk Controllers PCIe devices information for consistency |
check_bom_ethernet_controllers | Check Ethernet Controllers PCIe devices information for consistency |
check_bom_gpus | Check GPUs PCIe devices information for consistency |
check_bom_ib_controllers | Check InfiniBand controllers PCIe devices information for consistency |
check_bom_nvswitch | Check NVSwitch controller PCIe devices information for consistency |
check_bom_pcie_switches | Check PCIe Switches PCIe devices information for consistency |
check_bom_vgas | Check VGA Controller PCIe devices information for consistency |
check_dcc_can_health | Drive Constellation: Check DCC CAN Health |
check_dcc_can_reachability | Drive Constellation: Check DCC CAN reachability Health |
check_dcc_display_configuration | Drive Constellation: Check DCC Display Configuration Health |
check_dcc_display_synchronization | Drive Constellation: Check DCC Display Synchronization Health |
check_dcc_ecu_tegraA_health | Drive Constellation: Check DCC ECU TegraA Health |
check_dcc_ecu_tegraA_storage_health | Drive Constellation: Check DCC ECU TegraA Storage Health |
check_dcc_ecu_tegraB_health | Drive Constellation: Check DCC ECU TegraB Health |
check_dcc_ecu_tegraB_storage_health | Drive Constellation: Check DCC ECU TegraB Storage Health |
check_dcc_ethernet_health | Drive Constellation: Check DCC Ethernet Health |
check_dcc_fan_health | Drive Constellation: Check DCC Fan Health |
check_dcc_gpu_health | Drive Constellation: Check DCC GPU Health |
check_dcc_info | Drive Constellation: Get DCC Info |
check_dcc_network_reachability | Drive Constellation: Check DCC Network Reachability |
check_dcc_serializer_configuration | Drive Constellation: Check DCC Serializer Configuration Health |
check_dcc_usb_health | Drive Constellation: Check DCC USB Health |
check_dcc_usb_reachability | Drive Constellation: Check DCC USB reachability Health |
check_dcs_psu_info | Drive Constellation: Check DCC PSU Info |
check_dimm_part_number | Verify DIMM part number |
check_dimm_vendors | Verify DIMM vendors |
check_ecu_info | Drive Constellation: Get ECU Info |
check_ethernet_controller_info | None |
check_fan_bom | Verify chassis fan presence |
check_fru_consistency | Check FRU information for consistency |
check_gpu_direct_topology | Check GPUDirect Topology information for consistency |
check_gpu_link_info | None |
check_gpu_p2p_topology | Check GPUs p2p Topology information for consistency |
check_gpu_vbios_version_consistency | Verify GPUs VBIOS version consistency |
check_gpus | Check GPU health: retired page count, retired pages pending, inforom storage version, and vbios version |
check_ib_controller_link_info | None |
check_instant_blacklist_recommendations | Quick health check of GPU using DCGM |
check_ipmi_sensor_thresholds | Check BMC sensor thresholds |
check_ipmitool_working | Check that the ipmitool command is working |
check_logical_core_count | Number of logical CPU cores [{0}] |
check_mdadm_disks | Status of software RAID disk superblocks |
check_mdadm_volumes | Status of software RAID volumes |
check_meminfo_mem_size | Installed memory capacity [{0:.2f}GB] |
check_mlx_fw_version | Verify Mellanox devices firmware version consistency |
check_net_link | Verify Network Interfaces Link |
check_net_ping | Verify Network IP Reachability |
check_nvidia_grid_license | Drive Constellation: GRID License Status |
check_nvidia_smi_gpu_bus_id | Verify GPUs identified using nvidia-smi |
check_nvidia_smi_nvlink_status | Parse nvlink status with NVIDIA System Management Interface (nvidia-smi) |
check_nvme_devices | Verify installed NVMe devices |
check_nvme_link_info | None |
check_nvme_smart_log | Check SMART status of NVMe devices |
check_psu_bom | Verify chassis power supply presence |
check_psu_info | Check PSU Info (Vendor, Model) for Consistency |
check_smartctl_disk_count | Verify installed disks |
check_smartctl_megaraid_disk_count | Verify installed MegaRAID disks |
check_smartctl_ssd_brick | Check for SSD health |
check_storcli_disk_state | None |
check_storcli_phy_links | None |
check_storcli_sanity_installed | [sanity] MegaRAID storcli utility installed |
check_storcli_sanity_supported | [sanity] {} BaseOS support for storcli utility |
check_superuser_privileges | Check for superuser privileges |
check_xenserver_logical_core_count | Number of logical CPU cores [{0}] |
dcv_check_fan_bom | Drive Constellation: Verify chassis fan presence for DCC |
dcv_check_fru_consistency | Drive Constellation: Check FRU information for consistency |
dcv_check_ipmi_sensor_thresholds | Drive Constellation: Check DCC BMC sensor thresholds |
dcv_check_psu_bom | Drive Constellation: Verify chassis power supply presence on DCC |