我有 4 种不同语言的句子数据集,我想要一个能够识别给定测试句子的模型。
这是我的代码:
import torch
import torch.nn as nn
import torch.optim as optim
import json
from torch.utils.data import DataLoader, Dataset
# getting data from JSON file
with open('data.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# Data preprocessing
all_sentences = []
all_labels = []
for lang, sentences in data.items():
all_sentences.extend(sentences)
all_labels.extend([lang] * len(sentences))
# Creation of vocabulary
vocab = set()
for sentence in all_sentences:
words = sentence.split()
vocab.update(words)
# Mapping words to index and vice versa
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for word, i in word_to_ix.items()}
# Mapping language labels to index and vice versa
label_to_ix = {label: i for i, label in enumerate(set(all_labels))}
ix_to_label = {i: label for label, i in label_to_ix.items()}
# Conversion of sentences into index, with management of unknown words
UNK_IDX = len(vocab)
MAX_SEQ_LENGTH = 200
def sentence_to_indices(sentence):
indexed_sentence = []
for word in sentence.split():
indexed_sentence.append(word_to_ix.get(word, UNK_IDX))
# Rembourrage ou troncature pour avoir une longueur maximale fixe
if len(indexed_sentence) > MAX_SEQ_LENGTH:
indexed_sentence = indexed_sentence[:MAX_SEQ_LENGTH]
else:
indexed_sentence += [UNK_IDX] * (MAX_SEQ_LENGTH - len(indexed_sentence))
return indexed_sentence
indexed_sentences = [sentence_to_indices(sentence) for sentence in all_sentences]
indexed_labels = [label_to_ix[label] for label in all_labels]
class LanguageDataset(Dataset):
def __init__(self, sentences, labels):
self.sentences = sentences
self.labels = labels
def __len__(self):
return len(self.sentences)
def __getitem__(self, idx):
return torch.tensor(self.sentences[idx]), torch.tensor(self.labels[idx])
dataset = LanguageDataset(indexed_sentences, indexed_labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
class LanguageClassifier(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
super(LanguageClassifier, self).__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim)
self.fc1 = nn.Linear(embedding_dim * MAX_SEQ_LENGTH, hidden_dim)
self.fc2 = nn.Linear(hidden_dim, output_dim)
def forward(self, x):
embedded = self.embedding(x)
flattened = embedded.view(x.size(0), -1) # Aplatir les tenseurs
output = torch.relu(self.fc1(flattened))
output = self.fc2(output)
return output
EMBEDDING_DIM = 50
HIDDEN_DIM = 128
OUTPUT_DIM = len(label_to_ix)
LEARNING_RATE = 0.001
NUM_EPOCHS = 10
model = LanguageClassifier(len(vocab) + 1, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# model training
for epoch in range(NUM_EPOCHS):
model.train()
total_correct = 0
total_samples = 0
running_loss = 0.0
for sentences, labels in dataloader:
optimizer.zero_grad()
output = model(sentences)
loss = loss_function(output, labels)
loss.backward()
optimizer.step()
_, predicted = torch.max(output, 1)
# Update the running total of correct predictions and samples
total_correct += (predicted == labels).sum().item()
total_samples += labels.size(0)
running_loss += loss.item()
print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {running_loss}")
# Calculate the accuracy for this epoch
accuracy = 100 * total_correct / total_samples
print(f'Epoch {epoch+1}: Accuracy = {accuracy:.2f}%')
# saving the model
torch.save(model.state_dict(), 'language_classifier_model.pth')
def predict_language(model, sentence):
model.eval()
indexed_sentence = [word_to_ix.get(word, UNK_IDX) for word in sentence.split()]
tensor_sentence = torch.tensor(indexed_sentence).unsqueeze(0)
output = model(tensor_sentence)
_, predicted_idx = torch.max(output, 1)
return predicted_idx.item()
def test_model(test_sentence):
model = LanguageClassifier(len(vocab) + 1, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model.load_state_dict(torch.load("language_classifier_model.pth"))
predicted_idx = predict_language(model, test_sentence)
predicted_language = list(data.keys())[predicted_idx]
print(f"Phrase: '{test_sentence}', Langue prédite: {predicted_language}")
# test the model with a french sentence
test_sentence = "Votre modèle fonctionne-t-il bien ?"
test_model(test_sentence)
但是我有以下错误:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[1], line 138
136 # Teste le modèle avec une phrase donnée
137 test_sentence = "Votre modèle fonctionne-t-il bien ?"
--> 138 test_model(test_sentence)
Cell In[1], line 132
130 model = LanguageClassifier(len(vocab) + 1, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
131 model.load_state_dict(torch.load("language_classifier_model.pth"))
--> 132 predicted_idx = predict_language(model, test_sentence)
133 predicted_language = list(data.keys())[predicted_idx]
134 print(f"Phrase: '{test_sentence}', Langue prédite: {predicted_language}")
Cell In[1], line 125
123 indexed_sentence = [word_to_ix.get(word, UNK_IDX) for word in sentence.split()]
124 tensor_sentence = torch.tensor(indexed_sentence).unsqueeze(0)
--> 125 output = model(tensor_sentence)
126 _, predicted_idx = torch.max(output, 1)
127 return predicted_idx.item()
File c:\Users\natha\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File c:\Users\natha\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
Cell In[1], line 77
75 embedded = self.embedding(x)
76 flattened = embedded.view(x.size(0), -1) # Aplatir les tenseurs
---> 77 output = torch.relu(self.fc1(flattened))
78 output = self.fc2(output)
79 return output
File c:\Users\natha\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File c:\Users\natha\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File c:\Users\natha\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\linear.py:116, in Linear.forward(self, input)
115 def forward(self, input: Tensor) -> Tensor:
--> 116 return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x250 and 10000x128)
我尝试了几种方法,但没有成功,谢谢你的帮助。
我尝试更改 LanguageClassifier 中 self.embedding 和 self.fc1 的维度值。
这是在
nn.Linear
中发生形状不匹配时的常见错误:
运行时错误:mat1 和 mat2 形状无法相乘(1x250 和 10000x128)
第一个形状对应于输入,第二个形状是线性层的形状,可能是您定义的第一个形状。我们可以检查:您的
fc1
被定义为 embedding_dim * MAX_SEQ_LENGTH
x hidden_dim
ie. 50*200
x 128
。这与错误消息中的第二个形状相符。另一方面,接收到的张量的形状为(1,250)
。所以你的第一个线性层应该是 250
x 128
,而不是 1000
x 128
。