我正在使用 TensorFlow(DistilBERT)对文本进行主题分类,并在 Flutter 中通过 tflite_flutter 包运行转换后的模型来完成文本分类。训练模型的代码如下所示:
# Fixed token length every sentence is padded/truncated to.
max_len = 32

# Accumulators for the tokenized training sentences (filled further below).
input_ids, attention_masks = [], []

# Pretrained DistilBERT tokenizer and encoder, both from the same checkpoint.
dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
def read_data(train_path='datasets/clean_train.csv',
              test_path='datasets/cleaned_test_data.csv'):
    """Load the training and test CSV files as DataFrames.

    Args:
        train_path: Path of the training CSV. Defaults to the original
            hard-coded location.
        test_path: Path of the test CSV. Defaults to the original
            hard-coded location.

    Returns:
        A ``(train_csv, test_csv)`` tuple. The first data row of the test
        file is discarded; the remaining rows keep their original index.
    """
    train_csv = pd.read_csv(train_path)
    test_csv = pd.read_csv(test_path)
    # Positional slice instead of drop(index[0]) so that a test file with
    # no data rows yields an empty frame rather than raising IndexError.
    test_csv = test_csv.iloc[1:]
    return train_csv, test_csv
df_train, df_test = read_data()

# Balance the training set: draw 2000 random rows from each of the classes
# 1..10 and stack them in class order with a single concat.
# NOTE: DataFrame.sample raises ValueError if a class has fewer than 2000 rows.
df_balanced = pd.concat(
    [df_train[df_train['class'] == index].sample(2000) for index in range(1, 11)]
)

x_train = df_balanced['text']
labels = df_balanced['class']

# Tokenize every sentence to exactly `max_len` ids plus a matching mask.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
input_ids = []
attention_masks = []
for sent in x_train:
    dbert_inps = dbert_tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
    )
    input_ids.append(dbert_inps['input_ids'])
    attention_masks.append(dbert_inps['attention_mask'])

input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
labels = np.array(labels)

# train_test_split returns a (train, validation) pair for each array, in the
# order the arrays are passed: ids, labels, masks. 20% is held out.
train_inp, val_inp, train_label, val_label, train_mask, val_mask = train_test_split(
    input_ids, labels, attention_masks, test_size=0.2
)
def create_model():
    """Build the DistilBERT-based text classifier.

    The model takes two int64 inputs of length ``max_len`` (token ids and
    attention mask), runs them through the module-level ``dbert_model`` and
    classifies the vector at sequence position 0 with a small dense head.

    Returns:
        An uncompiled ``tf.keras.Model`` whose output is an 11-way softmax
        distribution.
    """
    inps = Input(shape=(max_len,), dtype='int64')
    masks = Input(shape=(max_len,), dtype='int64')
    # Take the first element of the encoder output and keep only sequence
    # position 0 (the [CLS] token when special tokens are added).
    dbert_layer = dbert_model(inps, attention_mask=masks)[0][:, 0, :]
    dense = Dense(512, activation='relu',
                  kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    dropout = Dropout(0.5)(dense)
    # 11 units: the training labels are the integers 1..10, so output
    # index 0 is never a target.
    pred = Dense(11, activation='softmax',
                 kernel_regularizer=regularizers.l2(0.01))(dropout)
    model = tf.keras.Model(inputs=[inps, masks], outputs=pred)
    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted a stray "None" line.
    model.summary()
    return model
log_dir = 'dbert_model_new'
model_save_path = './dbert_model.h5'

# Keep only the weights of the epoch with the lowest validation loss, and
# write TensorBoard logs. (The original built this identical list twice.)
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath=model_save_path,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    ),
    tf.keras.callbacks.TensorBoard(log_dir=log_dir),
]

# create_model() ends in a softmax layer, so the loss receives probabilities,
# not logits: from_logits must be False.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

# The original compiled `model` without ever creating it.
model = create_model()
model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
model.fit(
    [train_inp, train_mask],
    train_label,
    batch_size=16,
    epochs=5,
    validation_data=([val_inp, val_mask], val_label),
    callbacks=callbacks,
)

# Rebuild the architecture and restore the best checkpoint before export.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
trained_model = create_model()
trained_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
trained_model.load_weights(model_save_path)

# Convert the restored Keras model to a TFLite flatbuffer.
converter = tf.lite.TFLiteConverter.from_keras_model(trained_model)
tflite_model = converter.convert()
# Context manager guarantees the file is flushed and closed.
with open("distilbert_slim_model.tflite", "wb") as tflite_file:
    tflite_file.write(tflite_model)
上面的代码是模型训练的地方,在 python 上运行时效果非常好。然后该模型被转换为 tflite,用于在 flutter 中对文本进行分类,这将用于根据给定输入预测主题。 尽管格式和类型满足输入张量的要求,但输出张量对于不同的输入总是给出相同的结果。 这是 flutter 中输入张量所需的格式:
The input tensor:
[Tensor{_tensor: Pointer: address=0x7c028b8522c0, name: serving_default_input_1:0, type: int64, shape: [1, 32], data: 256}, Tensor{_tensor: Pointer: address=0x7c028b852330, name: serving_default_input_2:0, type: int64, shape: [1, 32], data: 256}]
The output tensor:
Tensor{_tensor: Pointer: address=0x7c028b8658f0, name: StatefulPartitionedCall:0, type: float32, shape: [1, 11], data: 44}
flutter中的代码:
/// Classifies [rawText] with the TFLite DistilBERT model and returns a
/// human-readable report containing the raw text, the tokenized input, the
/// raw output scores, the highest score, its index and the category name.
String classifyText({required String rawText}) {
  inputId = tokenizeInputText(rawText);
  // Class ids start at 1; output index 0 has no category and maps to null.
  const Map<int, String> category = {
    1: 'Society & Culture',
    2: 'Science & Mathematics',
    3: 'Health',
    4: 'Education & Reference',
    5: 'Computers & Internet',
    6: 'Sports',
    7: 'Business & Finance',
    8: 'Entertainment & Music',
    9: 'Family & Relationships',
    10: 'Politics & Government'
  };
  // Output buffer matching the model's [1, 11] float32 output tensor.
  final List<List<double>> output = [List<double>.filled(11, 0.0)];
  // The model has TWO input tensors (token ids and attention mask).
  // `run` wraps its argument as a single input, so the second tensor was
  // never fed and the output did not depend on the text.
  // NOTE(review): assumes tokenizeInputText returns [inputIds, attentionMask],
  // each shaped [1, 32] and in the interpreter's input order — confirm.
  _interpreter.runForMultipleInputs(inputId, <int, Object>{0: output});
  final List<double> scores = output[0];
  final double maximum =
      scores.reduce((curr, next) => curr > next ? curr : next);
  final int index = scores.indexOf(maximum);
  final string =
      '$rawText\n$inputId\noutput: $output\nhighest: $maximum\nindex: $index\ncategory: ${category[index]}';
  return string;
}
tokenizeInputText 返回与下面 python 中的 tflite 输入相同的格式和类型。 我尝试使用tensorflow模块在python中实现它,该模块根据输入给出不同的输出。 这是 python 中 tflite 的代码:
dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load the converted model and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path="distilbert_slim_model.tflite")
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Take the sequence length from the model itself ([batch, seq_len]) rather
# than hard-coding 256, which does not match the [1, 32] input tensors.
input_shape = input_details[0]['shape']
seq_len = int(input_shape[1])

string = input()
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
dbert_inps = dbert_tokenizer.encode_plus(
    string,
    add_special_tokens=True,
    max_length=seq_len,
    padding='max_length',
    return_attention_mask=True,
    truncation=True,
)
# Wrap in a list to add the batch dimension expected by the interpreter.
input_list = [dbert_inps['input_ids']]
mask_list = [dbert_inps['attention_mask']]
input_id = np.array(input_list, dtype=np.int64)
mask = np.array(mask_list, dtype=np.int64)

# NOTE(review): assumes input_details[0] is the token-id tensor and
# input_details[1] the attention mask — confirm via each entry's 'name'.
interpreter.set_tensor(input_details[0]['index'], input_id)
interpreter.set_tensor(input_details[1]['index'], mask)
interpreter.invoke()
output_data = interpreter.get_tensor(output_details[0]['index'])
# Index of the highest-scoring class.
print(np.argmax(output_data))
上面的代码使用不同的输入给出不同的输出。 任何帮助将不胜感激。预先感谢
我尝试了很多可能的方法,使用 tflite_flutter 包提供的功能来解决这个问题,但问题仍然存在。我也尝试用 python 检查模型本身是否有问题,但它工作得很好并给出了所需的结果。
你找到解决方法了吗?我也面临同样的问题