在Linux环境下,PyTorch可以通过多种方式进行并行计算,主要包括数据并行和模型并行。以下是具体的方法:
使用torch.nn.DataParallel
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Build a small feed-forward model.
model = nn.Sequential(
    nn.Linear(10, 10),
    nn.ReLU(),
    nn.Linear(10, 10),
)

# Replicate the model across GPUs when more than one is available.
if torch.cuda.device_count() > 1:
    print(f"Let's use {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)
model.to('cuda')  # move the (possibly wrapped) model to GPU

# BUG FIX: the original snippet called `optimizer.step()` without ever
# defining an optimizer.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# Hoist the loss module out of the loop instead of rebuilding it per batch.
criterion = nn.CrossEntropyLoss()

# Create the data loader.
dataset = ...  # define your dataset here
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Training loop.
for inputs, targets in dataloader:
    inputs, targets = inputs.to('cuda'), targets.to('cuda')
    optimizer.zero_grad()  # clear stale gradients before the backward pass
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
使用torch.nn.parallel.DistributedDataParallel
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler
import torch.distributed as dist

# Initialize the distributed process group (NCCL backend for GPU training).
dist.init_process_group(backend='nccl')

# BUG FIX: each process must drive its OWN GPU. The original moved every
# rank's model to the default "cuda" device, so all ranks would share cuda:0.
local_rank = dist.get_rank() % torch.cuda.device_count()
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)

# Define the model and place it on this rank's GPU.
model = nn.Sequential(
    nn.Linear(10, 10),
    nn.ReLU(),
    nn.Linear(10, 10),
).to(device)

# Wrap for gradient synchronization; device_ids pins DDP to this rank's GPU.
model = DDP(model, device_ids=[local_rank])

# BUG FIX: the original snippet used `optimizer` without defining it.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()  # hoisted out of the loop

# Create the data loader; DistributedSampler shards the dataset across ranks.
dataset = ...  # define your dataset here
sampler = DistributedSampler(dataset)
dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)

# NOTE: in a multi-epoch loop, call sampler.set_epoch(epoch) each epoch so
# shuffling differs across epochs.
for inputs, targets in dataloader:
    inputs, targets = inputs.to(device), targets.to(device)
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
模型并行适用于模型太大无法放入单个GPU内存的情况。可以将模型的不同部分放在不同的GPU上。
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
class ModelParallelModel(nn.Module):
    """A two-stage network split across two GPUs (model parallelism).

    Stage 1 lives on cuda:0 and stage 2 on cuda:1; the forward pass
    ships the intermediate activation between devices.
    """

    def __init__(self):
        super().__init__()
        # Each linear layer is pinned to its own GPU at construction time.
        self.part1 = nn.Linear(10, 10).to('cuda:0')
        self.part2 = nn.Linear(10, 10).to('cuda:1')

    def forward(self, x):
        # Run stage 1 on GPU 0, then move the activation to GPU 1 for stage 2.
        hidden = self.part1(x.to('cuda:0'))
        return self.part2(hidden.to('cuda:1'))
model = ModelParallelModel()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()  # hoisted out of the loop

# BUG FIX: the original snippet iterated over `dataloader` without ever
# defining it here.
dataset = ...  # define your dataset here
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Training loop.
for inputs, targets in dataloader:
    # Inputs enter on the first stage's GPU; targets must sit on the GPU
    # that holds the final outputs (cuda:1) so the loss can be computed.
    inputs, targets = inputs.to('cuda:0'), targets.to('cuda:1')
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
DataParallel采用单进程多线程模式,在多GPU环境下可能遇到性能瓶颈(受GIL影响,且每次迭代都要在主GPU上复制模型、聚合梯度)。DistributedDataParallel为每个GPU启动独立进程,通信开销更低,即使在单机多卡场景下也是PyTorch官方推荐的方式。通过合理使用这些方法,可以在Linux环境下高效地进行PyTorch的并行计算。