# I am doing topic modeling, but when I compute the probability of each topic, the values do not sum to 1.
# Topic representation: n-gram counts (1- to 3-grams) weighted by class-based
# TF-IDF.
ctfidf = ClassTfidfTransformer()
bow = CountVectorizer(ngram_range=(1, 3))

# Reduce the dimensionality of the document embeddings before clustering.
# random_state is fixed so repeated runs give the same projection.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Cluster the reduced embeddings. prediction_data=True makes HDBSCAN keep the
# extra data it needs to compute soft-cluster membership vectors.
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# BERTopic ignores `language` whenever an explicit `embedding_model` is passed,
# so the embedding model itself has to support the corpus language. The
# previous 'all-MiniLM-L6-v2' is an English model; use the multilingual
# sentence-transformers model (the one BERTopic picks for non-English
# languages) so Portuguese documents are embedded properly.
embedding_model = 'paraphrase-multilingual-MiniLM-L12-v2'

model = BERTopic(
    language='Portuguese',
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=bow,
    ctfidf_model=ctfidf,
    nr_topics=11,
    calculate_probabilities=True,
)

# `topics` holds one topic id per document; with calculate_probabilities=True,
# `prob` holds one row per document with a membership value per topic.
# NOTE(review): these values come from HDBSCAN soft clustering, so a row is not
# expected to sum to 1 -- presumably the remainder is the probability that the
# document is an outlier; confirm against the hdbscan soft-clustering docs.
topics, prob = model.fit_transform(corpus)
# set(topics)  # {0, 1}
# prob[0]      # array([0.01342076, 0.78132301])