A Practical Guide to PyTorch Memory Management on Linux
1. Memory composition and key monitoring points
2. GPU memory optimization strategies
3. System-level and host memory management
4. Hands-on code templates
import gc
import torch
from torch.cuda.amp import autocast, GradScaler

# Template 1: mixed-precision training with gradient accumulation.
# Assumes model, criterion, train_loader, and epochs are defined elsewhere.
model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scaler = GradScaler()
accum_steps = 4  # effective batch size = loader batch size * accum_steps

for epoch in range(epochs):
    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.cuda(), targets.cuda()
        with autocast():
            outputs = model(inputs)
            loss = criterion(outputs, targets) / accum_steps  # rescale for accumulation
        scaler.scale(loss).backward()
        if (i + 1) % accum_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)  # set_to_none=True frees gradient tensors
        # Drop references to this step's temporaries so they can be reclaimed
        del outputs, loss, inputs, targets
        if (i + 1) % 50 == 0:  # clean up periodically; doing this every step is slow
            gc.collect()
            torch.cuda.empty_cache()  # returns cached blocks to the driver; does not speed up PyTorch itself
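One edge case the loop above glosses over: if an epoch's step count is not a multiple of accum_steps, the final partial accumulation is never applied. A minimal flush, to be placed inside the epoch loop right after the inner batch loop:

    # Flush any leftover partial accumulation at the end of the epoch
    if (i + 1) % accum_steps != 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)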
# Template 2: doubling search for the largest batch size that fits in a memory budget.
# input_shape is the per-sample shape (no batch dimension); max_mem is the budget in bytes.
def find_max_batch(model, input_shape, max_mem=8 * 1024**3, start=1):
    bsz = start
    while True:
        try:
            torch.cuda.reset_peak_memory_stats()  # measure this attempt in isolation
            inp = torch.randn(bsz, *input_shape, device='cuda')
            with torch.cuda.amp.autocast():
                out = model(inp)  # probes the forward pass only
            used = torch.cuda.max_memory_allocated()
            del out, inp
            if used > 0.9 * max_mem:
                return bsz  # fits, but the next doubling would likely overshoot
            bsz *= 2
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise  # not an OOM; surface real errors
            torch.cuda.empty_cache()
            return max(1, bsz // 2)  # the last size that succeeded
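A quick usage sketch; the tiny convolutional network and the 3×224×224 input shape below are placeholders standing in for a real model:

import torch.nn as nn

# Throwaway stand-in network; substitute your actual model
probe_model = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, 10),
).cuda()
best = find_max_batch(probe_model, input_shape=(3, 224, 224))
print(f"Largest batch size within budget: {best}")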
# Template 3: monitoring
# GPU: allocated vs. reserved (reserved includes the caching allocator's pool)
print(f"Alloc: {torch.cuda.memory_allocated()/1024**2:.1f}MB "
      f"Reserved: {torch.cuda.memory_reserved()/1024**2:.1f}MB")
print(torch.cuda.memory_summary())  # detailed per-pool breakdown
# For an out-of-process view: watch -n 1 nvidia-smi
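Point-in-time readings miss transient spikes; the peak counters capture a step's high-water mark instead. A minimal sketch:

torch.cuda.reset_peak_memory_stats()
# ... run one training step here ...
peak = torch.cuda.max_memory_allocated() / 1024**2
print(f"Peak allocated during the step: {peak:.1f}MB")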
# System: watch host memory from a second terminal
# watch -n 1 'free -h'
# One-time 8 GB swap file setup; the fstab entry makes it persist across reboots
sudo fallocate -l 8G /swapfile
sudo chmod 600 /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile
echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
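On the host-memory side, pinned (page-locked) DataLoader buffers let host-to-GPU copies overlap with compute. A minimal sketch, assuming a train_dataset object already exists; batch_size and num_workers here are illustrative:

from torch.utils.data import DataLoader

# pin_memory keeps batches in page-locked RAM so non_blocking copies can
# overlap with GPU compute; tune num_workers to your CPU and storage
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                          num_workers=4, pin_memory=True)
for inputs, targets in train_loader:
    inputs = inputs.cuda(non_blocking=True)
    targets = targets.cuda(non_blocking=True)
    # ... forward/backward as in Template 1 ...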
The templates above can be combined as needed with gradient checkpointing, FSDP, lighter-weight optimizers, and similar techniques to get a more robust training pipeline out of limited hardware.
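Of those, gradient checkpointing is usually the easiest to drop in: it recomputes activations during the backward pass instead of storing them, trading extra compute for memory. A minimal sketch using torch.utils.checkpoint; the two-block toy network is a placeholder:

import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class CheckpointedNet(nn.Module):
    # Placeholder two-block network; real models checkpoint each large stage
    def __init__(self):
        super().__init__()
        self.block1 = nn.Sequential(nn.Linear(512, 512), nn.ReLU())
        self.block2 = nn.Sequential(nn.Linear(512, 512), nn.ReLU())
        self.head = nn.Linear(512, 10)

    def forward(self, x):
        # Each block's activations are recomputed in backward rather than stored
        x = checkpoint(self.block1, x, use_reentrant=False)
        x = checkpoint(self.block2, x, use_reentrant=False)
        return self.head(x)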