In PyTorch, training on multiple GPUs can significantly speed up training. Below is a simple example showing how to train on multiple GPUs with torch.nn.DataParallel or torch.nn.parallel.DistributedDataParallel.
torch.nn.DataParallel
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc = nn.Linear(784, 10)

    def forward(self, x):
        # Flatten MNIST batches of shape (N, 1, 28, 28) to (N, 784) before the linear layer
        return self.fc(x.view(x.size(0), -1))

# Create the model instance
model = SimpleModel()

# Wrap the model with DataParallel if more than one GPU is available
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs")
    model = nn.DataParallel(model)

# Move the model to the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Data preprocessing
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# Load the dataset
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
```
torch.nn.parallel.DistributedDataParallel
```python
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import torch.distributed as dist
import torch.multiprocessing as mp

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc = nn.Linear(784, 10)

    def forward(self, x):
        # Flatten MNIST batches of shape (N, 1, 28, 28) to (N, 784) before the linear layer
        return self.fc(x.view(x.size(0), -1))

def train(rank, world_size):
    # Rendezvous address for the process group (single machine)
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")

    # Initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    # Create the model instance and move it to this process's GPU
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(rank)
    model = SimpleModel().to(device)

    # Wrap the model with DistributedDataParallel
    # (the model must already be on the target device before wrapping)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[rank])

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    # Data preprocessing
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

    # Load the dataset; DistributedSampler gives each process a disjoint shard of the data
    train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, sampler=train_sampler)

    # Train the model
    num_epochs = 10
    for epoch in range(num_epochs):
        train_sampler.set_epoch(epoch)  # reshuffle the shards each epoch
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
        print(f"Rank {rank}, Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Tear down the process group
    dist.destroy_process_group()

def main():
    world_size = torch.cuda.device_count()
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

if __name__ == "__main__":
    main()
```
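As an alternative to mp.spawn, the same training function can be launched with torchrun (for example `torchrun --nproc_per_node=<num_gpus> train_ddp.py`), which starts one process per GPU and sets RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT for each worker. The sketch below shows roughly how the entry point might look in that case; the script name train_ddp.py is just an assumed file name, and the model/training loop are the same as above.

```python
# Launch with: torchrun --nproc_per_node=<num_gpus> train_ddp.py
import os
import torch
import torch.distributed as dist

def main():
    # torchrun provides these environment variables for each worker process
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    # MASTER_ADDR/MASTER_PORT are also set by torchrun, so no manual setup is needed
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(local_rank)

    # ... build the model, wrap it in DistributedDataParallel with device_ids=[local_rank],
    # and run the same training loop as in the mp.spawn example above ...

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```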
- torch.nn.DataParallel: works on a single machine with multiple GPUs and is simple and easy to use.
- torch.nn.parallel.DistributedDataParallel: designed for distributed setups, with better performance, but more complex to configure and use.

Which method to choose depends on your specific requirements and environment.
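If you want a single script that adapts to whatever hardware is present, one illustrative pattern (an assumption about how you might organize your own code, not something from the examples above) is to pick the wrapper based on torch.cuda.device_count():

```python
import torch
import torch.nn as nn

def wrap_for_available_hardware(model):
    # Illustrative helper: choose a parallelism strategy based on the current machine.
    if not torch.cuda.is_available():
        return model, torch.device("cpu")
    device = torch.device("cuda:0")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        # Simple single-process fallback; for best performance prefer
        # DistributedDataParallel launched via mp.spawn or torchrun as shown above.
        model = nn.DataParallel(model)
    return model, device
```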