我训练了一个包含 38 个类别的 BERT 文本分类模型。现在,对于这 38 个类别中的每一个,我想找出前 N 个词。
为此,我使用 sklearn 的 CountVectorizer 从训练数据集中创建词汇表。
我将该词汇表传递给标记器,再将得到的标记传递给模型并获得最后一层激活。所以现在我有一个
vocab x num categories
大小的数据框。
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import torch, os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Select the compute device: prefer CUDA when available, else fall back to CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print('GPU:', torch.cuda.get_device_name(0))
def get_hidden_state_activations(input_ids, model, layer_idx):
    """Run `model` on `input_ids` with gradients disabled and return the
    element of the model's output selected by `layer_idx`.

    NOTE(review): for a sequence-classification model this indexes the
    output tuple (layer_idx=-1 is its last element), which is presumably
    the logits unless output_hidden_states was enabled — confirm.
    """
    with torch.no_grad():
        model_outputs = model(input_ids)
    return model_outputs[layer_idx]
# create the vocabulary from the training corpus (unigrams only, capped at
# 10000 features, ignoring terms seen in fewer than 50 documents)
vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=10000, min_df=50)
# NOTE(review): `corpus` must be defined earlier in the file — confirm.
vecs = vectorizer.fit_transform(corpus)
print('collecting frequencies')
feature_names = vectorizer.get_feature_names_out()
# Perf fix: sum the sparse doc-term matrix column-wise instead of calling
# todense() — the original materialized an n_docs x 10000 dense matrix (and a
# Python list of lists) just to add the columns up.
frequencies = vecs.sum(axis=0).tolist()[0]
print('sorting them around')
# get all data into a DF: one row per vocabulary term with its corpus frequency,
# sorted most-frequent first
df = pd.DataFrame({'word': feature_names, 'frequency': frequencies})
df = df.sort_values('frequency', ascending=False).reset_index(drop=True)
# drop purely numeric tokens
df = df[~df.word.str.isdigit()].reset_index(drop=True)
# load models
# NOTE(review): TOKENIZER_MODEL_NAME / BERT_MODEL_NAME are expected to be
# defined elsewhere in the file — confirm before running this chunk.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL_NAME).to(device)
# data and labels
data = df.word.tolist()
# NOTE(review): `labels` is never populated in this chunk; downstream code
# treats it as the list of category names — confirm it is filled elsewhere.
labels = []
# Convert each word to input ids; encode() yields a (1, seq_len) tensor
# because of return_tensors='pt'.
input_ids = [tokenizer.encode(example, add_special_tokens=True, return_tensors='pt') for example in data]
# Compute the activations for each input in the data
layer_idx = -1  # index into the model's output tuple (-1 = last element)
# Bug fix: tqdm was used below without ever being imported (NameError at
# runtime). Import it here and degrade to a plain pass-through when the
# package is not installed, so the loop still runs without a progress bar.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **kwargs):
        return iterable
activations = [get_hidden_state_activations(input_id.to(device), model, layer_idx) for input_id in tqdm(input_ids)]
# get all activations into a df: one row per word, one column per category
# NOTE(review): reshape(len(labels)) assumes each activation tensor holds
# exactly one value per category; with labels == [] this reshape fails —
# confirm `labels` is populated with the 38 category names before this runs.
activations_df = pd.DataFrame([i.cpu().detach().numpy().reshape(len(labels)) for i in activations])
activations_df.columns = labels
activations_df['word'] = data
# Bug fix: `labels` is a plain Python list, which has no .tolist() — the
# original `labels.tolist()` raised AttributeError. list() handles both
# plain lists and array-likes.
activations_df = activations_df[['word'] + list(labels)]
# activations_df = activations_df.set_index('word')
activations_df.head(2)