Using PyTorch for natural language processing (NLP) on a Linux system is a fairly straightforward process involving several steps: environment setup, installing the necessary libraries, data processing, model training, and evaluation. Here is a detailed guide.
First, update the package index, install Python 3 and pip, then create and activate a virtual environment:

```bash
sudo apt-get update
sudo apt-get install python3 python3-pip
python3 -m venv myenv
source myenv/bin/activate
```
Install PyTorch (the default CPU build):

```bash
pip install torch torchvision torchaudio
```

For GPU acceleration, install the CUDA build instead:

```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
```

Replace cu118 with your actual CUDA version, and make sure your NVIDIA driver and CUDA toolkit are installed correctly.
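To confirm that the install can actually see the GPU, a quick check from Python:

```python
import torch

print(torch.__version__)          # installed PyTorch version
print(torch.cuda.is_available())  # True if the CUDA build detects a GPU
```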
Next, install the NLP libraries (torchtext is also needed by the data-loading code below) and download spaCy's small English model:

```bash
pip install transformers nltk spacy torchtext
python -m spacy download en_core_web_sm
```
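A quick sanity check that the spaCy model loads and tokenizes (illustrative, not part of the original guide):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
print([token.text for token in nlp("PyTorch makes NLP on Linux straightforward.")])
```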
With the environment ready, load the IMDB dataset via torchtext, build a vocabulary from the training split, and define the preprocessing pipelines:

```python
import torch
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

# spaCy-backed word tokenizer
tokenizer = get_tokenizer('spacy', language="en_core_web_sm")

def yield_tokens(data_iter):
    for _, text in data_iter:  # each IMDB item is a (label, text) pair
        yield tokenizer(text)

train_iter, test_iter = IMDB(split=('train', 'test'))

# Index 0 is <unk>, index 1 is <pad>; unknown words map to <unk>
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

def text_pipeline(text):
    return vocab(tokenizer(text))  # raw text -> list of token ids

def label_pipeline(x):
    return int(x) - 1              # IMDB labels 1/2 -> 0/1
```
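A quick usage example makes the two pipelines concrete (the exact ids depend on the vocabulary that was built):

```python
print(text_pipeline("this movie was great"))  # e.g. [14, 32, 18, 115] -- token ids
print(label_pipeline(2))                      # 1  (positive)
```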
Building the vocabulary consumes the streaming iterators, so re-create the train and test splits before batching:

```python
train_iter, test_iter = IMDB(split=('train', 'test'))
```
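The DataLoaders below pass a collate_fn named collate_batch that the guide never defines. A minimal sketch, assuming the pipelines above and padding each batch with the <pad> token:

```python
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    label_list, text_list = [], []
    for label, text in batch:
        label_list.append(label_pipeline(label))
        text_list.append(torch.tensor(text_pipeline(text), dtype=torch.int64))
    labels = torch.tensor(label_list, dtype=torch.int64)
    # Pad every sequence to the longest in the batch using the <pad> index (1)
    texts = pad_sequence(text_list, batch_first=True, padding_value=vocab["<pad>"])
    return labels, texts
```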
Wrap both splits in DataLoaders (shuffling only the training data):

```python
BATCH_SIZE = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataloader = DataLoader(list(train_iter), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(list(test_iter), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
```
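Pulling a single batch is a cheap sanity check on the shapes before training (illustrative output):

```python
labels, texts = next(iter(train_dataloader))
print(labels.shape)  # torch.Size([64])
print(texts.shape)   # torch.Size([64, L]) where L is the longest sequence in the batch
```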
Now define a simple LSTM-based classifier:

```python
import torch.nn as nn

class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super(TextClassifier, self).__init__()
        # padding_idx=1 matches the <pad> token's index in the vocabulary
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=1)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_class)

    def forward(self, text):
        embedded = self.embedding(text)            # (batch, seq_len, embed_dim)
        output, (hidden, _) = self.lstm(embedded)  # hidden: (1, batch, hidden_dim)
        # Classify from the final hidden state
        return self.fc(hidden.squeeze(0))
```
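A dummy forward pass confirms the expected tensor shapes (toy numbers, not from the guide):

```python
dummy = torch.randint(0, 100, (8, 20))  # batch of 8 sequences, 20 token ids each
m = TextClassifier(vocab_size=100, embed_dim=64, hidden_dim=128, num_class=2)
print(m(dummy).shape)                   # torch.Size([8, 2]) -- one logit pair per example
```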
Instantiate the model, optimizer, and loss function:

```python
EMBED_DIM = 64
EPOCHS = 10

model = TextClassifier(len(vocab), EMBED_DIM, hidden_dim=128, num_class=2).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.04)
criterion = torch.nn.CrossEntropyLoss().to(device)
```
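The guide uses plain SGD; Adam is a common drop-in alternative that often converges faster on text models (an option, not part of the original setup):

```python
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
```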
Train for a fixed number of epochs:

```python
for epoch in range(EPOCHS):
    model.train()
    for i, (labels, text) in enumerate(train_dataloader):
        labels = labels.to(device)
        text = text.to(device)

        optimizer.zero_grad()
        outputs = model(text)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}/{EPOCHS}, last batch loss: {loss.item():.4f}")
```
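LSTMs can suffer from exploding gradients; an optional safeguard (not in the original loop) is to clip the gradient norm between loss.backward() and optimizer.step():

```python
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
```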
Then evaluate accuracy on the held-out test set:

```python
model.eval()
correct = 0
total = len(test_dataloader.dataset)

with torch.no_grad():
    for labels, text in test_dataloader:
        labels = labels.to(device)
        text = text.to(device)
        outputs = model(text)
        predicted = outputs.argmax(1)
        correct += (predicted == labels).sum().item()

print("Accuracy: {:.2f}%".format(100 * correct / total))
```
Finally, save the trained weights with torch.save and reload them into a freshly constructed model when needed:

```python
# Save the model weights
torch.save(model.state_dict(), 'my_model.pt')

# Load the model back
model = TextClassifier(len(vocab), EMBED_DIM, hidden_dim=128, num_class=2)
model.load_state_dict(torch.load('my_model.pt'))
model.to(device)
```
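Inference after a restart also needs the vocabulary; since the torchtext Vocab object is picklable, one option (an assumption, not from the guide) is to persist it with torch.save too:

```python
torch.save(vocab, 'vocab.pt')   # persist the vocabulary alongside the weights
vocab = torch.load('vocab.pt')  # on newer PyTorch, pass weights_only=False
```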
With these steps, you can use PyTorch for natural language processing on a Linux system. Depending on your specific needs, you can extend and optimize the code further.