Accuracy of a skip-gram (word2vec) model for word similarity on the Brown corpus (NLTK)

Question (votes: 0, answers: 1)

I want to build a word-similarity matrix from the Brown corpus in the NLTK library. The problem is that the loss

tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weight, biases = softmax_bias, inputs = embed,
                  labels = y, num_sampled = num_sampled, num_classes = num_words))

drops from about 4.2 to 2.0 and then starts oscillating up and down. The question is: how can I improve the model's accuracy?

Here is my complete code:

import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding,Layer
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from numpy.random import choice
import random
from itertools import repeat
import tensorflow as tf
import nltk
import re
from nltk.corpus import stopwords
from nltk.corpus import brown
import string
nltk.download('brown')
nltk.download('stopwords')


#Dataset loading and preparation:
dataset = brown.sents()

punct = list(string.punctuation)
punct.append("``")
punct.append("''")
punct.append("--")
stops = set(stopwords.words("english")) 

dataset = [[word.lower() for word in sentence if word not in punct and word.lower() not in stops] for sentence in dataset] 


#tokenization
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(dataset)

word2index = tokenizer.word_index
index_word = tokenizer.index_word

total_words = 5000

data_prep = tokenizer.texts_to_sequences(dataset) 
data_prep = [sentence for sentence in data_prep if len(sentence) >2] 

#word2vec
def word2vec_preparation(data,window_size,num_skips):
    grams = []
    context = []
    target = []

    assert window_size >= 1, 'window_size argument is < 1!'
    assert num_skips >= 1, 'num_skips argument is < 1!'
    for sentence in data:
        if len(sentence) - window_size > 1:
            #print(sentence)

            for i in range(len(sentence)):
                if i - window_size < 0:  # context window clipped at the start of the sentence
                    gram = sentence[i+1:i+window_size + 1]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(set(gram), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(set(gram), num_skips))
                        target.extend(repeat(sentence[i], num_skips))

                elif i + window_size > len(sentence) - 1:  # window clipped at the end of the sentence
                    gram = sentence[i-window_size:i]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(set(gram), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(set(gram), num_skips))
                        target.extend(repeat(sentence[i], num_skips))

                else:  # full window on both sides of the target word
                    gram = sentence[i-window_size:i] + sentence[i+1:i+window_size + 1]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(set(gram), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(set(gram), num_skips))
                        target.extend(repeat(sentence[i], num_skips))

        #print('----------------------')

    return grams, context, target

grams,context,target = word2vec_preparation(data_prep,window_size = 2,num_skips = 3)

target = np.array(target,dtype= np.int64)
context = np.array(context,dtype= np.int64)


context = context.reshape(len(context),1)
dataset_train = tf.data.Dataset.from_tensor_slices((target, context))
dataset_train = dataset_train.shuffle(buffer_size=1024).batch(64)

#Parameters:
num_words = 5000
embed_size = 300
num_sampled = 64
initializer_softmax = tf.keras.initializers.GlorotUniform()
#Variables:
embeddings_weight = tf.Variable(tf.random.uniform([num_words,embed_size],-1.0,1.0))
softmax_weight = tf.Variable(initializer_softmax([num_words,embed_size]))
softmax_bias = tf.Variable(initializer_softmax([num_words]))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

@tf.function
def training(X,y):
  with tf.GradientTape() as tape:
    embed = tf.nn.embedding_lookup(embeddings_weight, X)  # embeddings_weight is the trainable embedding matrix; X holds the indices of the target (center) words
    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weight, biases = softmax_bias, inputs = embed,
                  labels = y, num_sampled = num_sampled, num_classes = num_words))
  variables = [embeddings_weight,softmax_weight,softmax_bias]  
  gradients = tape.gradient(loss,variables)
  optimizer.apply_gradients(zip(gradients,variables))
  return loss
  #tf.print('Loss:',loss)



EPOCHS = 100

for epoch in range(EPOCHS):
  for step, (X,y) in enumerate(dataset_train):
    loss = training(X,y)
  tf.print('Epoch:',epoch + 1, 'loss:',loss)
tensorflow deep-learning word2vec
1 Answer

0 votes

The reported loss is not the gold standard of a model's usefulness; the real test is trying the resulting word vectors on your actual downstream task.
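
For example, a quick sanity check of the vectors trained by the code above could look like the following sketch (my addition, not part of the original answer). It assumes the training loop has finished, that embeddings_weight, word2index and index_word are still in scope, and that the probe word is among the 5000 words kept by the tokenizer:

import numpy as np

# Normalize the learned embeddings so dot products become cosine similarities.
vectors = embeddings_weight.numpy()
normed = vectors / np.maximum(np.linalg.norm(vectors, axis=1, keepdims=True), 1e-8)

def most_similar(word, topn=5):
    idx = word2index[word]        # assumes idx < 5000 (the embedding table size)
    sims = normed @ normed[idx]   # cosine similarity of `word` to every vocabulary word
    order = np.argsort(-sims)
    # skip the padding index 0 (not in index_word) and the query word itself
    return [(index_word[i], float(sims[i])) for i in order if i != idx and i in index_word][:topn]

print(most_similar('government'))

If semantically related words show up near the top, the embeddings are learning something useful, whatever the exact loss value.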

Also, a loss plateau (followed by jitter up and down) is the natural and expected behavior of this kind of optimization. (The model can never predict the training data perfectly unless it is oversized for the data, in which case it would "overfit" the training data and do poorly on real tasks.) You want the loss to get as low as it can for the given algorithm and model parameters, but it will never reach 0.0.

There may be other errors in your code, but I haven't reviewed it. I'd suggest either using an off-the-shelf, already-debugged Word2Vec implementation, or, if writing your own is important to you (perhaps for learning purposes), at least using one as a baseline for judging whether your code is working correctly.
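
As an illustration of the off-the-shelf route, here is a minimal sketch (my suggestion; the gensim library and these hyperparameters are not from the original answer) that trains gensim's Word2Vec on the same preprocessed Brown sentences, using gensim 4.x parameter names and settings that only roughly mirror the TensorFlow model above:

from gensim.models import Word2Vec

# `dataset` is the same list of lowercased, punctuation/stopword-filtered token lists
# built at the top of the question.
reference = Word2Vec(
    sentences=dataset,
    vector_size=300,   # same embedding size as the TF model
    window=2,          # same context window
    sg=1,              # skip-gram
    negative=15,       # negative sampling (rough analogue of the sampled softmax)
    min_count=5,
    epochs=10,
)

print(reference.wv.most_similar('government', topn=5))

Comparing its most_similar output with the neighbours produced by your own embeddings gives a concrete baseline for judging whether the custom training loop is working.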
