Implementing parallel computation with PyTorch on CentOS typically involves the following:
- Single-node multi-GPU: use DataParallel or DistributedDataParallel to train the model across the GPUs of one machine.
- Multi-node multi-GPU: use distributed training (torch.distributed) to train across several compute nodes.
Detailed steps and example code for each are given below.
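Before either approach, it is worth confirming that PyTorch can actually see the GPUs and that the NCCL backend (the usual choice on Linux) is available on the CentOS machine. A minimal check, assuming a CUDA-enabled PyTorch build is already installed:

import torch
import torch.distributed as dist

# Report whether CUDA is usable, how many GPUs are visible,
# and whether PyTorch was built with NCCL support.
print("CUDA available:", torch.cuda.is_available())
print("GPU count:", torch.cuda.device_count())
print("NCCL available:", dist.is_nccl_available())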
Using DataParallel
DataParallel is a simple parallelization approach, suitable for the single-node, multi-GPU case.
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc = nn.Linear(784, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Check whether a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the model and move it to the GPU
model = SimpleModel().to(device)

# Wrap the model with DataParallel when more than one GPU is visible
if torch.cuda.device_count() > 1:
    print(f"Let's use {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Load the dataset
transform = transforms.Compose([transforms.ToTensor()])
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Train the model
for epoch in range(5):
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
Using DistributedDataParallel
DistributedDataParallel is a more efficient parallelization approach, suitable for the multi-node, multi-GPU case.
import os
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

# Same simple model as in the DataParallel example
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc = nn.Linear(784, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return self.fc(x)

def train(rank, world_size):
    # Initialize the distributed environment.
    # init_method='env://' needs MASTER_ADDR/MASTER_PORT; provide defaults for a single node.
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12345')
    dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
    torch.cuda.set_device(rank)

    # Create the model and move it to this process's GPU
    model = SimpleModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    # Load the dataset; DistributedSampler gives each process its own shard
    transform = transforms.Compose([transforms.ToTensor()])
    train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
    sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
    train_loader = DataLoader(train_dataset, batch_size=64, sampler=sampler)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(ddp_model.parameters(), lr=0.01)

    # Train the model
    for epoch in range(5):
        sampler.set_epoch(epoch)  # reshuffle the shards each epoch
        for data, target in train_loader:
            data, target = data.to(rank), target.to(rank)
            optimizer.zero_grad()
            output = ddp_model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
        print(f"Rank {rank}, Epoch {epoch+1}, Loss: {loss.item()}")

    # Clean up the distributed environment
    dist.destroy_process_group()

def main():
    world_size = torch.cuda.device_count()
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

if __name__ == "__main__":
    main()
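With DistributedDataParallel every process holds an identical copy of the model, so checkpoints are usually written by rank 0 only. A minimal sketch (hypothetical file name) that could be placed at the end of each epoch inside the train() function above:

# Only rank 0 writes the checkpoint; the barrier keeps the other
# processes from starting the next epoch before the file is written.
if rank == 0:
    torch.save(ddp_model.module.state_dict(), "ddp_checkpoint.pth")
dist.barrier()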
Multi-node parallelism
Multi-node parallelism requires several compute nodes, each running the same training script. You need to configure the network and the distributed-training environment variables on every node.
Set the following environment variables on each node:
# Replace master_ip, number_of_nodes, number_of_gpus_per_node and node_rank with your own values.
export MASTER_ADDR='master_ip'                                     # IP address of the rank-0 (master) node
export MASTER_PORT='12345'                                         # a free port on the master node
export WORLD_SIZE=$((number_of_nodes * number_of_gpus_per_node))   # total number of worker processes
export RANK=node_rank                                              # rank of this node (0 for the master node)
Then run the same command on each node, telling the launcher how many nodes participate and which rank this node has:
python -m torch.distributed.launch \
    --nproc_per_node=number_of_gpus_per_node \
    --nnodes=number_of_nodes \
    --node_rank=$RANK \
    --master_addr=$MASTER_ADDR \
    --master_port=$MASTER_PORT \
    your_training_script.py
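On each node the launcher starts number_of_gpus_per_node worker processes and tells each one its local rank: older PyTorch releases pass a --local_rank command-line argument, while newer launchers such as torchrun set the LOCAL_RANK environment variable instead. A minimal sketch, under those assumptions, of how your_training_script.py could pick this up and join the process group:

import argparse
import os
import torch
import torch.distributed as dist

# Accept --local_rank from torch.distributed.launch, falling back to the
# LOCAL_RANK environment variable set by newer launchers such as torchrun.
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int,
                    default=int(os.environ.get("LOCAL_RANK", 0)))
args = parser.parse_args()

# Bind this process to its GPU and join the process group.
# init_method='env://' reads MASTER_ADDR, MASTER_PORT, WORLD_SIZE and RANK,
# which the launcher exports for every worker process.
torch.cuda.set_device(args.local_rank)
dist.init_process_group(backend="nccl", init_method="env://")
print(f"global rank {dist.get_rank()} of {dist.get_world_size()}, local rank {args.local_rank}")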
With the steps above you can run parallel PyTorch training on CentOS: DataParallel covers the single-node multi-GPU case, while DistributedDataParallel covers multi-node multi-GPU training. Make sure the network and environment variables are configured correctly so that distributed training runs smoothly.