一、查看顯卡
dnf install pciutils
# 查看顯卡信息
lspci | grep -i vga
lshw -numeric -C display
二、安裝NVIDIA顯卡驅動
禁用 Nouveau 驅動
設置blacklist文件
cat > /etc/modprobe.d/blacklist.conf <<EOF
blacklist nouveau
options nouveau modeset=0
EOF
更新 GRUB 配置文件 /etc/default/grub,在 GRUB_CMDLINE_LINUX 中添加:
rd.driver.blacklist=nouveau nouveau.modeset=0
生成新的 GRUB 配置:
sudo grub2-mkconfig -o /boot/grub2/grub.cfg
重啟系統:
sudo reboot
安裝必要的開發工具和內核頭文件
安裝必要工具
dnf install epel-release
sudo dnf install -y gcc make dkms
安裝頭文件和開發工具
wget https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/kernel-headers-$(uname -r).rpm
wget https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/kernel-devel-$(uname -r).rpm
yum install -y kernel-headers-$(uname -r).rpm
yum install -y kernel-devel-$(uname -r).rpm
- 網上有建議
yum install -y kernel-headers kernel-devel
這種方式安裝,但必須保證它們的版本和當前運行的系統內核版本(uname -r 的輸出)完全一致,否則會導致安裝NVIDIA驅動時報錯:ERROR: Unable to find the kernel source tree .......
- 這裡是針對 Rocky Linux(CentOS)系統,如果你的是 Ubuntu 或其他系統,可能這倆文件不能使用(未測試)
查看安裝的版本
rpm -qa | egrep kernel
下載並安裝 NVIDIA 驅動
從 NVIDIA 官方網站下載適用於你的顯卡型號和系統版本的 .run 文件。
https://www.nvidia.cn/drivers/lookup/
以A40顯卡為例
下載後的文件為:NVIDIA-Linux-x86_64-570.86.15.run
上傳到服務器上
賦予下載的 .run 文件可執行權限:
chmod +x NVIDIA-Linux-x86_64-*.run
運行安裝程序:
sudo ./NVIDIA-Linux-x86_64-*.run
按照提示完成安裝過程。
驗證安裝
安裝完成後,重啟系統:
sudo reboot
使用 nvidia-smi 命令驗證驅動是否安裝成功:
[root@bogon ~]# nvidia-smi
Mon Feb 10 13:45:19 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.86.15 Driver Version: 570.86.15 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA A40 Off | 00000000:00:1A.0 Off | 0 |
| 0% 43C P0 88W / 300W | 1MiB / 46068MiB | 5% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
三、docker支持nvidia gpu
前提是已安裝docker環境
安裝nvidia-container-toolkit
#Configure the repository
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
| tee /etc/yum.repos.d/nvidia-container-toolkit.repo
#Install the NVIDIA Container Toolkit packages
yum install -y nvidia-container-toolkit
#Configure Docker to use Nvidia driver
# 會更新/etc/docker/daemon.json文件
sudo nvidia-ctk runtime configure --runtime=docker
# 重啟docker
sudo systemctl restart docker
測試容器是否能使用gpu
docker run --rm --gpus all nvidia/cuda:12.6.3-base-rockylinux9 nvidia-smi
顯示如下則說明docker已支持nvidia gpu
[root@bogon ~]# docker run --rm --gpus=all nvidia/cuda:12.6.3-base-rockylinux9 nvidia-smi
Mon Feb 10 05:46:25 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.86.15 Driver Version: 570.86.15 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA A40 Off | 00000000:00:1A.0 Off | 0 |
| 0% 44C P0 89W / 300W | 1MiB / 46068MiB | 5% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+