For background, I am referring to the Hierarchical Attention Network used for sentiment classification.
For the code: my full code is posted below, but it is just a slight revision of the original code posted by the author at the link above. I explain my changes below. For the training data: here. For the word embeddings: these are the GloVe embeddings, here. Key configuration: Keras 2.0.9, Scikit-Learn 0.19.1, Theano 0.9.0.
The original code posted at the link above takes a 3D-shaped input, i.e. (reviews, sentences, words). The attention mechanism is applied to the sentences as well as to the words, so it has two attention components, as you can see in the fourth code block on that page.
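For reference, the two-level structure of that original model looks roughly like this (a minimal sketch, not the author's verbatim code; it reuses the embedding_layer and AttLayer defined in my code below, and assumes a MAX_SENTS constant for the number of sentences per review):
# word-level encoder: attention over the words of a single sentence
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_att = AttLayer(100)(l_lstm)
sentEncoder = Model(sentence_input, l_att)
# review-level encoder: the sentence encoder is applied to every sentence,
# then a second attention layer weights the resulting sentence vectors
review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
l_att_sent = AttLayer(100)(l_lstm_sent)
preds = Dense(2, activation='softmax')(l_att_sent)
model = Model(review_input, preds)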
I wanted to change it to one that takes only a 2D-shaped input, i.e. (sentences, words). This is how I did it.
However, the code produces an error when model.fit is called. I have posted the full code and the error below.
Code:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import os
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import plot_model
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, Bidirectional, TimeDistributed
from keras.models import Model, clone_model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers
MAX_SENT_LENGTH = 100
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
def clean_str(string):
"""
Tokenization/string cleaning for dataset
Every dataset is lower cased.
"""
string = re.sub(r"\\", "", string)
string = re.sub(r"\'", "", string)
string = re.sub(r"\"", "", string)
return string.strip().lower()
#replace this to your own file path
data_train = pd.read_csv('/home/zz/Work/wop/data/sentiment/labeledTrainData_small.tsv', sep='\t')
print(data_train.shape)
labels = []
texts = []
for idx in range(data_train.review.shape[0]):
text = BeautifulSoup(data_train.review[idx])
text = clean_str(text.get_text().encode('ascii', 'ignore').decode('ascii'))
texts.append(text)
labels.append(data_train.sentiment[idx])
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
##################################
# Change 1. The input shape is now 2D (sentence, words) instead of 3D
##################################
data = np.zeros((len(texts), MAX_SENT_LENGTH), dtype='int32')
for i, content in enumerate(texts):
wordTokens = text_to_word_sequence(content)
k = 0
for _, word in enumerate(wordTokens):
if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
data[i, k] = tokenizer.word_index[word]
k = k + 1
##################################
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
print('Number of positive and negative reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))
#replace with your own embedding file path
GLOVE_DIR = "/home/zz/Work/data/glove.6B"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
print('Total %s word vectors.' % len(embeddings_index))
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
# building Hierarchical Attention network
embedding_layer = Embedding(len(word_index) + 1,
EMBEDDING_DIM,
weights=[embedding_matrix],
input_length=MAX_SENT_LENGTH,
trainable=True,
mask_zero=True)
class AttLayer(Layer):
def __init__(self, attention_dim,**kwargs):
self.init = initializers.get('normal')
self.supports_masking = True
self.attention_dim = attention_dim
super(AttLayer, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
self.b = K.variable(self.init((self.attention_dim,)))
self.u = K.variable(self.init((self.attention_dim, 1)))
self.trainable_weights = [self.W, self.b, self.u]
super(AttLayer, self).build(input_shape)
def compute_mask(self, inputs, mask=None):
return None
def call(self, x, mask=None):
# size of x :[batch_size, sel_len, attention_dim]
# size of u :[batch_size, attention_dim]
# uit = tanh(xW+b)
uit = K.tile(K.expand_dims(self.W, axis=0), (K.shape(x)[0], 1, 1))
uit = tf.matmul(x, uit)
uit = K.tanh(K.bias_add(uit, self.b))
ait = K.dot(uit, self.u)
ait = K.squeeze(ait, -1)
ait = K.exp(ait)
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting in theano
ait *= K.cast(mask, K.floatx())
ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
ait = K.expand_dims(ait)
weighted_input = x * ait
output = K.sum(weighted_input, axis=1)
return output
def compute_output_shape(self, input_shape):
return (input_shape[0], input_shape[-1])
#################################################
# Change 2. The model contains only one attention block now
#################################################
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(200))(l_lstm)
l_att = AttLayer(100)(l_dense)
############################################
preds = Dense(2, activation='softmax')(l_att)
model = Model(sentence_input, preds)
#### clone the model #### Line X
model_copy = clone_model(model)
plot_model(model, to_file="model.png")
model.summary()
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
print("model fitting - Hierachical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val),
nb_epoch=10, batch_size=50,verbose=2)
Error: the last line of the code produces this error trace:
Epoch 1/10
Traceback (most recent call last):
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 884, in __call__
self.fn() if output_subset is None else\
ValueError: Input dimension mis-match. (input[0].shape[1] = 50, input[1].shape[1] = 100)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/zz/Work/wop/code/python/src/3rdparty/han/textClassfierHATT2D.py", line 187, in <module>
nb_epoch=10, batch_size=50,verbose=2)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1631, in fit
validation_steps=validation_steps)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1213, in _fit_loop
outs = f(ins_batch)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/backend/theano_backend.py", line 1223, in __call__
return self.function(*inputs)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 898, in __call__
storage_map=getattr(self.fn, 'storage_map', None))
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/gof/link.py", line 325, in raise_with_op
reraise(exc_type, exc_value, exc_trace)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/six.py", line 692, in reraise
raise value.with_traceback(tb)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 884, in __call__
self.fn() if output_subset is None else\
ValueError: Input dimension mis-match. (input[0].shape[1] = 50, input[1].shape[1] = 100)
Apply node that caused the error: Elemwise{mul,no_inplace}(InplaceDimShuffle{x,0}.0, Elemwise{Cast{float32}}.0)
Toposort index: 459
Inputs types: [TensorType(float32, row), TensorType(float32, matrix)]
Inputs shapes: [(1, 50), (50, 100)]
Inputs strides: [(200, 4), (400, 4)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0)]]
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
I would really appreciate some advice on this, many thanks!
The reference tutorial chooses to use theano rather than tensorflow because the behavior of dot in tensorflow differs from its behavior in numpy. But I am not familiar with theano, so I had a hard time making it work with the theano backend. I would rather mimic the behavior of numpy's dot with a series of operations, so I changed K.dot into a sequence of operations.
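Concretely, the change inside call boils down to the following (a minimal before/after sketch; I am assuming the original line computed K.dot(x, self.W), as in the linked tutorial):
# before (tutorial version): a single K.dot between the 3D input and the 2D weight matrix
#     uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
# after: tile W across the batch dimension and use an explicit batched matmul
uit = K.tile(K.expand_dims(self.W, axis=0), (K.shape(x)[0], 1, 1))  # (batch, input_dim, attention_dim)
uit = tf.matmul(x, uit)                                             # (batch, timesteps, attention_dim)
uit = K.tanh(K.bias_add(uit, self.b))
The complete modified layer is below.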
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import os
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import plot_model
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, Bidirectional, TimeDistributed, Lambda
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers
class AttLayer(Layer):
def __init__(self, attention_dim, **kwargs):
self.init = initializers.get('normal')
self.supports_masking = True
self.attention_dim = attention_dim
super(AttLayer, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
self.b = K.variable(self.init((self.attention_dim,)))
self.u = K.variable(self.init((self.attention_dim, 1)))
self.trainable_weights = [self.W, self.b, self.u]
super(AttLayer, self).build(input_shape)
def compute_mask(self, inputs, mask=None):
return None
def call(self, x, mask=None):
# size of x :[batch_size, sel_len, attention_dim]
# size of u :[batch_size, attention_dim]
# uit = tanh(xW+b)
uit = K.tile(K.expand_dims(self.W, axis=0), (K.shape(x)[0], 1, 1))
uit = tf.matmul(x, uit)
uit = K.tanh(K.bias_add(uit, self.b))
ait = K.dot(uit, self.u)
ait = K.squeeze(ait, -1)
ait = K.exp(ait)
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting in theano
ait *= K.cast(mask, K.floatx())
ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
ait = K.expand_dims(ait)
weighted_input = x * ait
output = K.sum(weighted_input, axis=1)
return output
def compute_output_shape(self, input_shape):
return (input_shape[0], input_shape[-1])
# https://github.com/keras-team/keras/issues/5401
# solve the problem of keras.models.clone_model
# and model.save_weights, model.load_weights
def get_config(self):
config = {'attention_dim': self.attention_dim}
base_config = super(AttLayer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
Also, compute_mask now returns None, since there is no longer a sel_len axis in the output of AttLayer.
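A quick way to see this is to check the static shapes in your 2D model (a small sketch reusing l_dense and l_att from the model definition above; the sizes follow from MAX_SENT_LENGTH=100, the bidirectional GRU of size 100 and Dense(200)):
from keras import backend as K
print(K.int_shape(l_dense))  # (None, 100, 200): the sequence axis is still present here
print(K.int_shape(l_att))    # (None, 200): AttLayer sums over the sequence axis, so there is nothing left to mask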
Here is a script that verifies the two operations are equivalent:
B = 8
S = 100
E = 200
A = 50
X = np.random.randn(B, S, E)
W = np.random.randn(E, A)
np_result = np.dot(X, W) #shape correct
X_ph = tf.placeholder(tf.float64)
W_ph = tf.placeholder(tf.float64)
tf_dot = tf.matmul(X_ph,
tf.tile(
tf.expand_dims(W_ph, axis=0),
(K.shape(X_ph)[0], 1, 1)))
with tf.Session() as sess:
tf_result = sess.run(tf_dot,
feed_dict = {X_ph:X, W_ph:W})
print(np.allclose(np_result, tf_result)) #True
Training history (I set batch_size to 8):
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
20000/20000 [==============================] - 1247s 62ms/step - loss: 0.4203 - acc: 0.8044 - val_loss: 0.3520 - val_acc: 0.8468
Epoch 2/10
20000/20000 [==============================] - 985s 49ms/step - loss: 0.2344 - acc: 0.9070 - val_loss: 0.3411 - val_acc: 0.8586
Epoch 3/10
20000/20000 [==============================] - 996s 50ms/step - loss: 0.0982 - acc: 0.9628 - val_loss: 0.4474 - val_acc: 0.8512
Epoch 4/10
20000/20000 [==============================] - 966s 48ms/step - loss: 0.0285 - acc: 0.9904 - val_loss: 0.7837 - val_acc: 0.8408
Epoch 5/10
20000/20000 [==============================] - 912s 46ms/step - loss: 0.0179 - acc: 0.9936 - val_loss: 1.0177 - val_acc: 0.8440
Epoch 6/10
20000/20000 [==============================] - 910s 45ms/step - loss: 0.0105 - acc: 0.9963 - val_loss: 1.0635 - val_acc: 0.8418
Epoch 7/10
20000/20000 [==============================] - 909s 45ms/step - loss: 0.0101 - acc: 0.9964 - val_loss: 1.0966 - val_acc: 0.8372
Epoch 8/10
20000/20000 [==============================] - 909s 45ms/step - loss: 0.0057 - acc: 0.9981 - val_loss: 1.2678 - val_acc: 0.8392
Epoch 9/10
20000/20000 [==============================] - 910s 46ms/step - loss: 0.0077 - acc: 0.9974 - val_loss: 1.2166 - val_acc: 0.8258
Epoch 10/10
20000/20000 [==============================] - 910s 46ms/step - loss: 0.0056 - acc: 0.9985 - val_loss: 1.4640 - val_acc: 0.8204