
How PyTorch Manages Memory on Linux


A Practical Guide to PyTorch Memory Management on Linux

1. Memory Composition and Key Monitoring Points

2. GPU Memory Optimization Strategies

3. System-Level and Host Memory Management

4. Hands-On Code Templates

import gc

import torch
from torch.cuda.amp import GradScaler, autocast

# Mixed-precision training with gradient accumulation.
# Assumes model, criterion, train_loader and epochs are defined elsewhere.
model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scaler = GradScaler()
accum_steps = 4  # accumulate gradients over 4 micro-batches

for epoch in range(epochs):
    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.cuda(), targets.cuda()
        with autocast():
            outputs = model(inputs)
            loss = criterion(outputs, targets) / accum_steps

        scaler.scale(loss).backward()

        if (i + 1) % accum_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)  # set_to_none frees gradient memory

        # Release this iteration's temporary tensors
        del outputs, loss, inputs, targets
        if (i + 1) % 50 == 0:  # clean up periodically; doing it every step is too costly
            gc.collect()
            torch.cuda.empty_cache()
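If fragmentation persists despite the periodic empty_cache() calls, the CUDA caching allocator can also be tuned through the PYTORCH_CUDA_ALLOC_CONF environment variable. A minimal sketch, assuming a recent PyTorch build; the option value shown is illustrative and must take effect before CUDA memory is first allocated:

import os

# Illustrative allocator tuning; set this before any CUDA allocation
# (simplest: export it in the shell, or set it before importing torch).
# "expandable_segments:True" lets the allocator grow existing segments instead of
# splitting new ones, which can reduce fragmentation; "max_split_size_mb:<N>" is
# another commonly used option.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")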
The helper below probes for the largest batch size that fits within a given GPU memory budget, doubling the batch size until usage approaches the limit or an out-of-memory error occurs:

def find_max_batch(model, input_shape, max_mem=8 * 1024**3, start=1):
    """Probe the largest batch size that fits under max_mem bytes.
    input_shape is the per-sample shape, e.g. (3, 224, 224).
    Forward pass only: training needs extra memory for gradients and optimizer state."""
    bsz = start
    while True:
        try:
            torch.cuda.reset_peak_memory_stats()
            inp = torch.randn(bsz, *input_shape, device='cuda')
            with torch.no_grad(), torch.cuda.amp.autocast():
                _ = model(inp)
            used = torch.cuda.max_memory_allocated()
            del inp
            torch.cuda.empty_cache()
            if used > 0.9 * max_mem:   # close to the budget: back off to the last doubling
                return max(1, bsz // 2)
            bsz *= 2
        except RuntimeError:           # OOM: the previous doubling was the last that fit
            torch.cuda.empty_cache()
            return max(1, bsz // 2)
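A usage sketch for the helper above, assuming input_shape is the per-sample shape; the toy model is purely illustrative:

# Hypothetical usage: probe the largest batch for a toy model and 3x224x224 inputs
net = torch.nn.Sequential(torch.nn.Flatten(),
                          torch.nn.Linear(3 * 224 * 224, 1000)).cuda().eval()
best_bsz = find_max_batch(net, (3, 224, 224), max_mem=8 * 1024**3)
print(f"Largest batch size within the budget: {best_bsz}")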
# GPU: memory actually allocated by tensors vs. memory reserved by the caching allocator
print(f"Alloc: {torch.cuda.memory_allocated()/1024**2:.1f}MB  "
      f"Reserved: {torch.cuda.memory_reserved()/1024**2:.1f}MB")
print(torch.cuda.memory_summary())
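For per-epoch peaks rather than instantaneous usage, the peak counters can be reset and read back each epoch; a minimal sketch:

# Track the peak GPU memory used during one epoch
torch.cuda.reset_peak_memory_stats()
# ... run one training epoch ...
peak_mb = torch.cuda.max_memory_allocated() / 1024**2
print(f"Peak GPU memory this epoch: {peak_mb:.1f} MB")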

# System side (shell): monitor host RAM and, if needed, add an 8 GB swap file
# watch -n 1 'free -h'
sudo fallocate -l 8G /swapfile
sudo chmod 600 /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile
echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab

The templates above can be combined as needed with gradient checkpointing, FSDP, lighter-weight optimizers, and similar techniques to obtain a more robust training workflow on limited hardware.
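As one example of the techniques just mentioned, here is a minimal gradient-checkpointing sketch using torch.utils.checkpoint; the two-stage model is hypothetical. Activations of the checkpointed segment are recomputed during the backward pass instead of being stored, trading compute for memory:

import torch
from torch.utils.checkpoint import checkpoint

# Hypothetical two-stage model: only stage1 is checkpointed
stage1 = torch.nn.Sequential(torch.nn.Linear(1024, 4096), torch.nn.ReLU(),
                             torch.nn.Linear(4096, 4096), torch.nn.ReLU()).cuda()
stage2 = torch.nn.Linear(4096, 10).cuda()

x = torch.randn(32, 1024, device='cuda', requires_grad=True)
h = checkpoint(stage1, x, use_reentrant=False)  # stage1 activations recomputed on backward
out = stage2(h)
out.sum().backward()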
