Getting Full PyTorch Performance on CentOS
1. Environment Setup and GPU Readiness
For a CPU-only build, install with:

pip install torch torchvision torchaudio

For a CUDA 11.7 build, point pip at the official wheel index:

pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117

Then verify the installation:

python -c "import torch; print(torch.__version__); print(torch.cuda.is_available())"

This should print the installed version number and True.

2. Key Performance Optimizations
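A few standard PyTorch-level switches illustrate the kind of tuning this section is about: cuDNN autotuning for static input shapes, TF32 matmuls on Ampere-class GPUs, and channels-last memory format for convolutional models. The sketch below uses an illustrative Conv2d model and tensor shapes, not measured recommendations:

import torch
import torch.nn as nn

# Autotune convolution algorithms; pays off when input shapes do not vary.
torch.backends.cudnn.benchmark = True

# Allow TF32 on Ampere and newer GPUs (matmul TF32 is off by default since 1.12).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

if torch.cuda.is_available():
    # channels_last often unlocks faster cuDNN kernels for CNNs.
    model = nn.Conv2d(3, 64, kernel_size=3).cuda().to(memory_format=torch.channels_last)
    x = torch.randn(8, 3, 224, 224, device="cuda").to(memory_format=torch.channels_last)
    print(model(x).shape)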
3. Inference Deployment and Portability
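One common portability route is exporting a TorchScript artifact, which can later be loaded from Python or from libtorch in C++ without the original model source. A minimal sketch, where the Linear model and file name are illustrative placeholders:

import torch
import torch.nn as nn

model = nn.Linear(784, 10).eval()
example = torch.randn(1, 784)

# Trace the model into a self-contained TorchScript program.
scripted = torch.jit.trace(model, example)
scripted.save("model_ts.pt")

# Reload and run without the original class definition.
restored = torch.jit.load("model_ts.pt")
with torch.no_grad():
    print(restored(example).argmax(dim=-1))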
4. Monitoring, Troubleshooting, and System-Level Tuning
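At the system level, watch -n 1 nvidia-smi shows GPU utilization and memory pressure; inside PyTorch, torch.profiler (available since 1.8) breaks time down per operator. A minimal profiling sketch, with an illustrative Linear workload:

import torch
import torch.nn as nn
from torch.profiler import ProfilerActivity, profile

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Linear(784, 10).to(device)
x = torch.randn(256, 784, device=device)

activities = [ProfilerActivity.CPU]
if device == "cuda":
    activities.append(ProfilerActivity.CUDA)

# Profile a handful of forward passes and print the hottest operators.
with profile(activities=activities) as prof:
    for _ in range(10):
        model(x)
print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))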
5. A Minimal Ready-to-Run Template
The following minimal template covers single-GPU, CPU-only, and multi-GPU training launched via torchrun.

# train.py
import os
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import autocast, GradScaler
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def main(rank=0, world_size=1):
    torch.manual_seed(42)
    use_cuda = torch.cuda.is_available()
    device = torch.device(f"cuda:{rank}" if use_cuda else "cpu")
    if use_cuda:
        torch.cuda.set_device(device)

    model = nn.Linear(784, 10).to(device)
    if world_size > 1:
        # Single-node torchrun launch, so RANK doubles as the local GPU index.
        model = DDP(model, device_ids=[rank])

    opt = optim.Adam(model.parameters(), lr=1e-3)
    scaler = GradScaler(enabled=use_cuda)  # silently a no-op on CPU
    loss_fn = nn.CrossEntropyLoss()

    # Example data: build it on the CPU so DataLoader workers and pin_memory
    # can handle it, then move each batch to the device inside the loop.
    x = torch.randn(1024, 784)
    y = torch.randint(0, 10, (1024,))
    loader = DataLoader(TensorDataset(x, y), batch_size=256, shuffle=True,
                        num_workers=4, pin_memory=use_cuda)

    model.train()
    for epoch in range(3):
        for data, target in loader:
            data = data.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)
            opt.zero_grad()
            with autocast(enabled=use_cuda):
                out = model(data)
                loss = loss_fn(out, target)
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()
        print(f"Epoch {epoch} done on rank {rank}")

    # Validation on a small sample
    model.eval()
    with torch.no_grad():
        logits = model(x[:32].to(device))
        preds = logits.argmax(dim=-1)
    print("Sample preds:", preds.cpu().numpy())

if __name__ == "__main__":
    # Multi-GPU launch example: torchrun --nproc_per_node=2 train.py
    rank = int(os.environ.get("RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    if world_size > 1:
        dist.init_process_group("nccl", rank=rank, world_size=world_size)
    main(rank, world_size)
    if world_size > 1:
        dist.destroy_process_group()
Run it single-card with python train.py, or multi-card with torchrun --nproc_per_node=2 train.py. On PyTorch 2.x you can additionally wrap the model with model = torch.compile(model) for extra speed.
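If you adopt torch.compile in the template, one straightforward placement is right after the model is moved to its device; a minimal self-contained sketch, with a guard so it stays a no-op on PyTorch 1.x:

import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.Linear(784, 10).to(device)

if hasattr(torch, "compile"):
    # PyTorch 2.x: JIT-compile the forward pass.
    model = torch.compile(model)

x = torch.randn(32, 784, device=device)
print(model(x).shape)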