
How to Make Full Use of PyTorch on CentOS

小樊
2025-11-16 04:54:00
Category: Intelligent O&M

Getting the Most out of PyTorch Performance on CentOS

1. Environment Setup and GPU Readiness
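
After installing a CUDA-enabled PyTorch wheel that matches the NVIDIA driver on the CentOS host (via pip or conda), a short script like the sketch below can confirm that the GPU stack is actually visible to PyTorch. The printed fields come from standard torch APIs; the file name is just a placeholder.

# check_env.py -- verify that PyTorch sees the CUDA/cuDNN stack on this host
import torch

print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("CUDA runtime:", torch.version.cuda)
    print("cuDNN:", torch.backends.cudnn.version())
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"GPU {i}: {props.name}, "
              f"{props.total_memory / 1024**3:.1f} GiB, "
              f"compute capability {props.major}.{props.minor}")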

2. Key Performance Optimizations
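
A handful of knobs account for most of the easy wins on NVIDIA GPUs: cuDNN autotuning for fixed input shapes, TF32 matmuls on Ampere and newer cards, the channels_last memory format for convolutional nets, and torch.compile on PyTorch 2.x. The sketch below strings them together on a placeholder model; treat it as a checklist rather than a drop-in recipe.

# perf_knobs.py -- common speed knobs; the model and input shape are placeholders
import torch
import torch.nn as nn

torch.backends.cudnn.benchmark = True          # autotune conv algorithms for fixed shapes
torch.backends.cuda.matmul.allow_tf32 = True   # TF32 matmuls on Ampere+ GPUs
torch.backends.cudnn.allow_tf32 = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.Sequential(nn.Conv2d(3, 64, 3, padding=1), nn.ReLU()).to(device)

# channels_last often speeds up conv nets on recent GPUs
model = model.to(memory_format=torch.channels_last)
x = torch.randn(32, 3, 224, 224, device=device).to(memory_format=torch.channels_last)

# torch.compile (PyTorch >= 2.0) can fuse kernels; skip it on older versions
if hasattr(torch, "compile"):
    model = torch.compile(model)

with torch.inference_mode():
    out = model(x)
print(out.shape)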

3. Inference Deployment and Portability
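
For serving outside the training environment, a trained model can be exported to TorchScript (for libtorch or TorchServe) or to ONNX (for ONNX Runtime, TensorRT and similar backends). The sketch below uses a stand-in nn.Linear and placeholder file names; the export calls themselves are the stock torch.jit / torch.onnx APIs.

# export_model.py -- export a trained model for deployment; names are placeholders
import torch
import torch.nn as nn

model = nn.Linear(784, 10)          # stand-in for your trained network
model.eval()
example = torch.randn(1, 784)

# TorchScript: loadable from libtorch / TorchServe without the Python source
scripted = torch.jit.trace(model, example)
scripted.save("model_ts.pt")

# ONNX: portable to ONNX Runtime, TensorRT, etc.
torch.onnx.export(model, example, "model.onnx",
                  input_names=["input"], output_names=["logits"],
                  dynamic_axes={"input": {0: "batch"}, "logits": {0: "batch"}})
print("exported model_ts.pt and model.onnx")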

4. Monitoring, Troubleshooting, and System-Level Tuning
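
On the PyTorch side, torch.profiler and the CUDA memory counters usually pinpoint where time and memory go; pair them with nvidia-smi and the usual CentOS tools (top, iostat, dmesg) for the system view. A minimal profiling sketch, again on a placeholder model:

# profile_step.py -- profile a few forward/backward steps and report GPU memory
import torch
import torch.nn as nn
from torch.profiler import profile, ProfilerActivity

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.Linear(784, 10).to(device)
x = torch.randn(256, 784, device=device)

activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

with profile(activities=activities, record_shapes=True) as prof:
    for _ in range(10):
        model(x).sum().backward()

print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))

if torch.cuda.is_available():
    print(f"allocated: {torch.cuda.memory_allocated() / 1024**2:.1f} MiB, "
          f"reserved: {torch.cuda.memory_reserved() / 1024**2:.1f} MiB")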

5. A Minimal, Ready-to-Run Template
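
The single-file template below ties the pieces together: mixed-precision training with autocast/GradScaler, an optional DistributedDataParallel path for multi-GPU runs launched via torchrun, and a plain CPU fallback when no GPU is present. The model and data are deliberately tiny placeholders; swap in your own Dataset and network.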

# train.py
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import autocast, GradScaler
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import os

def main(rank=0, world_size=1):
    torch.manual_seed(42)
    device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    model = nn.Linear(784, 10).to(device)
    if world_size > 1:
        model = DDP(model, device_ids=[rank])

    opt = optim.Adam(model.parameters(), lr=1e-3)
    scaler = GradScaler(enabled=torch.cuda.is_available())
    loss_fn = nn.CrossEntropyLoss()

    # Example data: keep the dataset on the CPU so DataLoader workers and
    # pin_memory work; batches are moved to the device in the training loop.
    # (With a real dataset under DDP, add a DistributedSampler as well.)
    x = torch.randn(1024, 784)
    y = torch.randint(0, 10, (1024,))
    loader = DataLoader(TensorDataset(x, y), batch_size=256, shuffle=True,
                        num_workers=4, pin_memory=torch.cuda.is_available())

    model.train()
    for epoch in range(3):
        for data, target in loader:
            # Batches live on the CPU; move them to the training device here
            # (non_blocking pairs with pin_memory for async host-to-device copies).
            data = data.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)
            opt.zero_grad(set_to_none=True)
            with autocast(enabled=torch.cuda.is_available()):
                out = model(data)
                loss = loss_fn(out, target)
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()
        print(f"Epoch {epoch} done on rank {rank}")

    # Quick sanity check on a few samples
    model.eval()
    with torch.no_grad():
        logits = model(x[:32].to(device))
        preds = logits.argmax(dim=-1)
    print("Sample preds:", preds.cpu().numpy())

if __name__ == "__main__":
    # Multi-GPU launch example (single node): torchrun --nproc_per_node=2 train.py
    rank = int(os.environ.get("RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    if world_size > 1:
        # torchrun also sets MASTER_ADDR/MASTER_PORT; rank == local rank on one node
        dist.init_process_group("nccl", rank=rank, world_size=world_size)
        torch.cuda.set_device(rank)
    main(rank, world_size)
    if world_size > 1:
        dist.destroy_process_group()
