Unable to test a language identification model


I have a dataset of sentences in 4 different languages, and I want a model that can identify the language of a given test sentence.
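
For context: the code below reads data.json and iterates over data.items(), so the file is presumably shaped like this (the language names and sentences here are placeholders of mine, not the actual data):

{
    "english": ["Hello, how are you?", "This is a sentence."],
    "french": ["Bonjour, comment allez-vous ?", "Ceci est une phrase."],
    ...
}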

Here is my code:

import torch
import torch.nn as nn
import torch.optim as optim
import json
from torch.utils.data import DataLoader, Dataset

# getting data from JSON file
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Data preprocessing
all_sentences = []
all_labels = []
for lang, sentences in data.items():
    all_sentences.extend(sentences)
    all_labels.extend([lang] * len(sentences))

# Creation of vocabulary
vocab = set()
for sentence in all_sentences:
    words = sentence.split()
    vocab.update(words)

# Mapping words to index and vice versa
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for word, i in word_to_ix.items()}

# Mapping language labels to index and vice versa
label_to_ix = {label: i for i, label in enumerate(set(all_labels))}
ix_to_label = {i: label for label, i in label_to_ix.items()}

# Convert sentences to indices, handling unknown words
UNK_IDX = len(vocab)
MAX_SEQ_LENGTH = 200

def sentence_to_indices(sentence):
    indexed_sentence = []
    for word in sentence.split():
        indexed_sentence.append(word_to_ix.get(word, UNK_IDX))
    # Pad or truncate to a fixed maximum length
    if len(indexed_sentence) > MAX_SEQ_LENGTH:
        indexed_sentence = indexed_sentence[:MAX_SEQ_LENGTH]
    else:
        indexed_sentence += [UNK_IDX] * (MAX_SEQ_LENGTH - len(indexed_sentence))
    return indexed_sentence

indexed_sentences = [sentence_to_indices(sentence) for sentence in all_sentences]
indexed_labels = [label_to_ix[label] for label in all_labels]

class LanguageDataset(Dataset):
    def __init__(self, sentences, labels):
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return torch.tensor(self.sentences[idx]), torch.tensor(self.labels[idx])

dataset = LanguageDataset(indexed_sentences, indexed_labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

class LanguageClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LanguageClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * MAX_SEQ_LENGTH, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        flattened = embedded.view(x.size(0), -1)  # Flatten the tensors
        output = torch.relu(self.fc1(flattened))
        output = self.fc2(output)
        return output

EMBEDDING_DIM = 50
HIDDEN_DIM = 128
OUTPUT_DIM = len(label_to_ix)
LEARNING_RATE = 0.001
NUM_EPOCHS = 10

model = LanguageClassifier(len(vocab) + 1, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# model training
for epoch in range(NUM_EPOCHS):
    model.train()
    total_correct = 0
    total_samples = 0
    running_loss = 0.0

    for sentences, labels in dataloader:
        optimizer.zero_grad()
        output = model(sentences)
        loss = loss_function(output, labels)
        loss.backward()
        optimizer.step()
        _, predicted = torch.max(output, 1)
        # Update the running total of correct predictions and samples
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)
        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {running_loss}")
    # Calculate the accuracy for this epoch
    accuracy = 100 * total_correct / total_samples
    print(f'Epoch {epoch+1}: Accuracy = {accuracy:.2f}%')

# saving the model
torch.save(model.state_dict(), 'language_classifier_model.pth')

def predict_language(model, sentence):
    model.eval()
    indexed_sentence = [word_to_ix.get(word, UNK_IDX) for word in sentence.split()]
    tensor_sentence = torch.tensor(indexed_sentence).unsqueeze(0)
    output = model(tensor_sentence)
    _, predicted_idx = torch.max(output, 1)
    return predicted_idx.item()

def test_model(test_sentence):
    model = LanguageClassifier(len(vocab) + 1, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
    model.load_state_dict(torch.load("language_classifier_model.pth"))
    predicted_idx = predict_language(model, test_sentence)
    predicted_language = list(data.keys())[predicted_idx]
    print(f"Phrase: '{test_sentence}', Langue prédite: {predicted_language}")

# test the model with a French sentence
test_sentence = "Votre modèle fonctionne-t-il bien ?"
test_model(test_sentence)

But I get the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[1], line 138
    136 # test the model with a French sentence
    137 test_sentence = "Votre modèle fonctionne-t-il bien ?"
--> 138 test_model(test_sentence)

Cell In[1], line 132
    130 model = LanguageClassifier(len(vocab) + 1, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
    131 model.load_state_dict(torch.load("language_classifier_model.pth"))
--> 132 predicted_idx = predict_language(model, test_sentence)
    133 predicted_language = list(data.keys())[predicted_idx]
    134 print(f"Phrase: '{test_sentence}', Langue prédite: {predicted_language}")

Cell In[1], line 125
    123 indexed_sentence = [word_to_ix.get(word, UNK_IDX) for word in sentence.split()]
    124 tensor_sentence = torch.tensor(indexed_sentence).unsqueeze(0)
--> 125 output = model(tensor_sentence)
    126 _, predicted_idx = torch.max(output, 1)
    127 return predicted_idx.item()

File c:\Users\natha\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File c:\Users\natha\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

Cell In[1], line 77
     75 embedded = self.embedding(x)
     76 flattened = embedded.view(x.size(0), -1)  # Flatten the tensors
---> 77 output = torch.relu(self.fc1(flattened))
     78 output = self.fc2(output)
     79 return output

File c:\Users\natha\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File c:\Users\natha\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File c:\Users\natha\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\linear.py:116, in Linear.forward(self, input)
    115 def forward(self, input: Tensor) -> Tensor:
--> 116     return F.linear(input, self.weight, self.bias)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x250 and 10000x128)

I have tried several things without success. Thanks for your help.

I tried changing the dimension values of self.embedding and self.fc1 in LanguageClassifier.

python pytorch neural-network
1 Answer

This is the usual error when a shape mismatch occurs in nn.Linear:

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x250 and 10000x128)

The first shape corresponds to the input; the second is the shape of the linear layer, most likely the first one you defined. We can check: your fc1 is defined as embedding_dim * MAX_SEQ_LENGTH x hidden_dim, i.e. 50*200 x 128 = 10000 x 128, which matches the second shape in the error message. The tensor it receives, however, has shape (1, 250): predict_language indexes the test sentence without the padding/truncation that sentence_to_indices applies during training, so the 5-word sentence flattens to only 5 * 50 = 250 features. So either your first linear layer would have to be 250 x 128 instead of 10000 x 128, or the test input has to be padded to MAX_SEQ_LENGTH exactly as at training time.
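
Since the saved weights were trained against the 10000 x 128 layer, the fix that keeps the checkpoint usable is to pad the test sentence the same way as the training data, by reusing sentence_to_indices at inference. A minimal sketch against the code above (the torch.no_grad() context and the ix_to_label lookup are additions of mine, not in the original code):

def predict_language(model, sentence):
    model.eval()
    # Reuse the training-time preprocessing so the input is padded/truncated
    # to MAX_SEQ_LENGTH and flattens to 200 * 50 = 10000 features, as fc1 expects.
    indexed_sentence = sentence_to_indices(sentence)
    tensor_sentence = torch.tensor(indexed_sentence).unsqueeze(0)
    with torch.no_grad():  # no gradients needed at inference
        output = model(tensor_sentence)
    _, predicted_idx = torch.max(output, 1)
    # Map the index back through ix_to_label rather than list(data.keys()):
    # label_to_ix was built from a set, so its order need not match data.keys().
    return ix_to_label[predicted_idx.item()]

With this version, test_model can print the returned label directly instead of indexing into data.keys().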
