我正在使用 k-prototypes 库对同时包含数值型和分类型属性的混合数据进行聚类。参考 https://github.com/nicodv/kmodes/issues/46 ,为了计算 k-prototypes 聚类结果的轮廓系数(silhouette score),我对分类属性使用汉明距离(简单匹配距离),对数值属性使用欧几里得距离,再把两者加权求和。但是我写的代码非常慢:计算 60000 条记录的轮廓系数需要 10 个小时。我的笔记本电脑配置是 12 GB 内存和 Core i7 处理器。请问有什么办法可以提高这段代码的速度吗?
import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
# -------- import data
df = pd.read_csv(r'C:\Users\data.csv')
# ------------- Normalize the data ---------------
# Min-max scale the numeric columns R and F column by column.
x_df = df[['R', 'F']]
_col_min = x_df.min(axis=0)
_col_range = x_df.max(axis=0) - _col_min
# A constant column has a zero range; dividing by it would fill the column
# with NaN. Dividing by 1 instead maps such a column to all zeros.
_col_range = _col_range.replace(0, 1)
x_df_norm = (x_df - _col_min) / _col_range
# The categorical column is carried over unscaled as the third column.
x_df_norm['COType'] = df['COType']
def calc_euclian_dis(_s1, _s2):
    """Return the Euclidean distance between two numeric points.

    Both points may be NumPy arrays or plain sequences (list/tuple) of the
    same length, e.g. ``np.array((3, 5))`` or ``(3, 5)``.
    """
    # Coerce to float arrays so that tuples and lists can be subtracted too.
    _p1 = np.asarray(_s1, dtype=float)
    _p2 = np.asarray(_s2, dtype=float)
    return np.linalg.norm(_p2 - _p1)
def calc_simpleMatching_dis(_s1, _s2):
    """Return the simple-matching distance between two categorical values.

    The result is 0 when the two values compare equal and 1 otherwise.
    """
    return int(_s1 != _s2)
k = 3
# Fit k-prototypes for one cluster count; column index 2 is categorical.
kproto = KPrototypes(n_clusters=k, init='Cao', verbose=2)
clusters_label = kproto.fit_predict(x_df_norm, categorical=[2])
# Distinct cluster labels, in first-seen order.
_identical_cluster_labels = list(dict.fromkeys(clusters_label))
# Attach the cluster labels to the dataset.
x_df_norm['Cluster_label'] = clusters_label

# ------------- calculate _silhouette_Index -------------
# Pull the columns out into NumPy arrays once, so every per-point distance
# computation is a single vectorised expression instead of a Python loop
# over DataFrame rows.
_num_values = x_df_norm.iloc[:, :2].to_numpy(dtype=float)
_cat_values = x_df_norm.iloc[:, 2].to_numpy()
_labels = np.asarray(clusters_label)
_gamma = kproto.gamma
_cluster_masks = {_lab: _labels == _lab for _lab in _identical_cluster_labels}
_cluster_sizes = {_lab: int(_mask.sum()) for _lab, _mask in _cluster_masks.items()}

_silhouette_Index_arr = []
for _row in range(len(_labels)):
    # Mixed distance from this point to every point: Euclidean distance on
    # the numeric columns plus gamma times the categorical mismatch (0 or 1).
    _dist = (
        np.linalg.norm(_num_values - _num_values[_row], axis=1)
        + _gamma * (_cat_values != _cat_values[_row])
    )
    _own_label = _labels[_row]
    _own_size = _cluster_sizes[_own_label]

    # The silhouette is undefined for a point alone in its cluster or when
    # there is no other cluster to compare against; score such points as 0.
    if _own_size < 2 or len(_cluster_masks) < 2:
        _silhouette_Index_arr.append(0.0)
        continue

    # 1. a(i): mean distance to the OTHER members of the same cluster. The
    #    self-distance is 0, so summing over the whole cluster and dividing
    #    by (size - 1) excludes the point itself.
    ai = _dist[_cluster_masks[_own_label]].sum() / (_own_size - 1)

    # 2. b(i): smallest mean distance to the members of any other cluster.
    bi = min(
        _dist[_mask].mean()
        for _lab, _mask in _cluster_masks.items()
        if _lab != _own_label
    )

    # 3. s(i) = (b - a) / max(a, b); it is negative when a > b.
    _denominator = max(ai, bi)
    _silhouette_i = (bi - ai) / _denominator if _denominator > 0 else 0.0
    _silhouette_Index_arr.append(_silhouette_i)

silhouette_score = np.average(_silhouette_Index_arr)
print('_silhouette_Index = ' + str(silhouette_score))
你好!我重新实现了你的函数:用向量化的线性代数运算来计算相异度(距离),而不是使用大量的 for 循环,这样速度会快很多 :-)
def euclidean_dissim(a, b, **_):
    """Return the Euclidean distance from every row of ``a`` to point ``b``.

    Parameters:
        a: 2-D array-like of numeric rows, shape (n_rows, n_features).
        b: 1-D array-like, the single reference point (n_features,).
        **_: extra keyword arguments are accepted and ignored.

    Returns:
        1-D float array with one distance per row of ``a``.

    Raises:
        ValueError: if ``a`` or ``b`` contains NaN.
    """
    # Slices taken from a mixed-type table arrive with dtype=object, on which
    # np.isnan raises TypeError; convert to float before checking.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if np.isnan(a).any() or np.isnan(b).any():
        raise ValueError("Missing values detected in numerical columns.")
    return np.linalg.norm(a - b, axis=1)
def matching_dissim(a, b, **_):
    """Return the simple-matching dissimilarity between rows and a point.

    One argument is a 2-D matrix of categorical rows and the other a single
    1-D point; they may be passed in either order. For each row the result is
    the number of positions whose value differs from the point.

    Parameters:
        a, b: array-likes that broadcast against each other.
        **_: extra keyword arguments are accepted and ignored.

    Returns:
        1-D integer array with one mismatch count per row.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Compare with != rather than subtracting: subtraction is undefined for
    # string categories, and the comparison broadcasts regardless of which
    # argument is the matrix.
    return np.sum(np.atleast_2d(a != b), axis=1)
def calc_silhouette_proto(dataset, numerical_pos, cat_pos, kproto_model):
    """Return the mean silhouette score of a k-prototypes clustering.

    The distance between two samples is the Euclidean distance over the
    numerical columns plus ``kproto_model.gamma`` times the number of
    mismatching categorical columns.

    Parameters:
        dataset: DataFrame holding the feature columns and a
            ``'cluster_labels'`` column with each sample's cluster.
        numerical_pos: positions of the numerical columns, counted among the
            columns that remain after dropping ``'cluster_labels'``.
        cat_pos: positions of the categorical columns, counted the same way.
        kproto_model: object exposing a ``gamma`` attribute.

    Returns:
        The average of the per-sample silhouette values. Samples that are
        alone in their cluster, and all samples when there is only one
        cluster, contribute 0. NaN is returned for an empty dataset.

    Raises:
        ValueError: if a numerical column contains NaN.
    """
    labels = dataset['cluster_labels'].to_numpy()
    features = dataset.loc[:, dataset.columns != 'cluster_labels'].to_numpy()
    n_samples = features.shape[0]
    if n_samples == 0:
        return float('nan')

    # Split once into a float matrix and a categorical matrix; a mixed-type
    # frame converts to dtype=object, so the numeric part is cast explicitly.
    numeric = features[:, np.atleast_1d(numerical_pos)].astype(float)
    categorical = features[:, np.atleast_1d(cat_pos)]
    if np.isnan(numeric).any():
        raise ValueError("Missing values detected in numerical columns.")

    gamma = kproto_model.gamma
    cluster_masks = {label: labels == label for label in np.unique(labels)}
    cluster_sizes = {label: int(mask.sum()) for label, mask in cluster_masks.items()}

    scores = np.zeros(n_samples)
    for row in range(n_samples):
        own_label = labels[row]
        own_size = cluster_sizes[own_label]
        # Silhouette is undefined for singleton clusters and for a single
        # cluster overall; leave the score at 0 in both cases.
        if own_size < 2 or len(cluster_masks) < 2:
            continue

        # Distance from this sample to every sample, in one vectorised pass.
        dist = (
            np.linalg.norm(numeric - numeric[row], axis=1)
            + gamma * np.sum(categorical != categorical[row], axis=1)
        )

        # a(i): mean distance to the other members of the own cluster. The
        # self-distance is 0, so dividing the cluster sum by (size - 1)
        # leaves the sample itself out.
        a_i = dist[cluster_masks[own_label]].sum() / (own_size - 1)

        # b(i): smallest mean distance to the members of any other cluster.
        b_i = min(
            dist[mask].mean()
            for label, mask in cluster_masks.items()
            if label != own_label
        )

        # s(i) = (b - a) / max(a, b); negative when the sample sits closer
        # to another cluster than to its own.
        denominator = max(a_i, b_i)
        if denominator > 0:
            scores[row] = (b_i - a_i) / denominator

    return np.mean(scores)
下面是我目前正在使用的、可以正常运行的该函数的加速版本。
def silhouette_score_kproto(data, labels, categorical_indices, kproto_gamma):
    """Compute per-sample silhouette values for a k-prototypes clustering.

    The distance between two samples is the Euclidean distance over the
    numerical columns plus ``kproto_gamma`` times the number of mismatching
    categorical columns.

    Parameters:
        data (pd.DataFrame or 2-D array-like): feature columns only; the
            cluster labels are passed separately.
        labels (array-like): cluster label of each row of ``data``.
        categorical_indices (list): positions of the categorical columns in
            ``data``; every other column is treated as numerical.
        kproto_gamma (float): weight of the categorical mismatch count.

    Returns:
        list: one silhouette value per row, in row order. Rows that are alone
        in their cluster, and all rows when there is only one cluster, get 0.

    Raises:
        ValueError: if a numerical column contains NaN.
    """
    # tqdm only provides a progress bar; run without one if it is missing.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    features = np.asarray(data)
    labels = np.asarray(labels)
    n_samples, n_features = features.shape

    # The labels are kept out of the feature matrix so they can never be
    # mistaken for a numerical column.
    cat_cols = list(categorical_indices)
    num_cols = [col for col in range(n_features) if col not in cat_cols]
    # A mixed-type table converts to dtype=object; cast the numeric part.
    numeric = features[:, num_cols].astype(float)
    categorical = features[:, cat_cols]
    if np.isnan(numeric).any():
        raise ValueError("Missing values detected in numerical columns.")

    cluster_masks = {label: labels == label for label in np.unique(labels)}
    cluster_sizes = {label: int(mask.sum()) for label, mask in cluster_masks.items()}

    scores = np.zeros(n_samples)
    for row in progress(range(n_samples), total=n_samples):
        own_label = labels[row]
        own_size = cluster_sizes[own_label]
        # Silhouette is undefined for singleton clusters and for a single
        # cluster overall; leave the score at 0 in both cases.
        if own_size < 2 or len(cluster_masks) < 2:
            continue

        # Distance from this row to every row, in one vectorised pass.
        dist = (
            np.linalg.norm(numeric - numeric[row], axis=1)
            + kproto_gamma * np.sum(categorical != categorical[row], axis=1)
        )

        # a(i): mean distance to the other members of the own cluster. The
        # self-distance is 0, so dividing the cluster sum by (size - 1)
        # leaves the row itself out without copying the array.
        a_i = dist[cluster_masks[own_label]].sum() / (own_size - 1)

        # b(i): smallest mean distance to the members of any other cluster.
        b_i = min(
            dist[mask].mean()
            for label, mask in cluster_masks.items()
            if label != own_label
        )

        denominator = max(a_i, b_i)
        if denominator > 0:  # avoid division by zero
            scores[row] = (b_i - a_i) / denominator

    return scores.tolist()