在混合分类和数值数据的 k 原型聚类算法中运行轮廓分数计算速度很慢

Question

我正在使用 k-prototypes 库对混合的数值型和标称型（分类）数据进行聚类。根据 https://github.com/nicodv/kmodes/issues/46 ，为了计算 k-prototypes 的轮廓分数（silhouette score），我分别计算了分类数据的轮廓分数（基于汉明距离）和数值数据的轮廓分数（基于欧几里得距离）。但是我写的代码非常慢，计算 60000 条记录的轮廓分数需要 10 小时。我的笔记本电脑配置为 12 GB 内存和 Core i7 处理器。请问有什么办法可以提高代码的运行速度吗？

import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
# -------- load the raw records
df = pd.read_csv(r'C:\Users\data.csv')
# ------------- Min-max scale the numeric columns ---------------
# Each of R and F is mapped column-wise to (value - min) / (max - min).
x_df = df[['R', 'F']]
x_df_norm = (x_df - x_df.min(axis=0)) / (x_df.max(axis=0) - x_df.min(axis=0))
# Carry the categorical column over unscaled; it becomes column position 2.
x_df_norm['COType'] = df[['COType']]
def calc_euclian_dis(_s1, _s2):
    """Return the Euclidean (L2) length of the difference ``_s2 - _s1``.

    Both arguments must support subtraction with each other, e.g. two
    NumPy arrays of the same shape such as ``np.array((3, 5))``.
    """
    difference = _s2 - _s1
    return np.linalg.norm(difference)
def calc_simpleMatching_dis(_s1, _s2):
    """Simple-matching dissimilarity of two categorical values.

    Returns 1 when the two values differ and 0 when they are equal.
    """
    return 1 if _s1 != _s2 else 0
k = 3
# Fit k-prototypes for a single cluster count; column position 2 is categorical.
kproto = KPrototypes(n_clusters=k, init='Cao', verbose=2)
clusters_label = kproto.fit_predict(x_df_norm, categorical=[2])
# Distinct cluster labels, in order of first appearance.
_identical_cluster_labels = list(dict.fromkeys(clusters_label))
# Attach the cluster labels to the dataset (this becomes the last column).
x_df_norm['Cluster_label'] = clusters_label
# ------------- calculate _silhouette_Index -------------
# Distance between two records:
#   Euclidean distance of the two numeric columns
#   + kproto.gamma * (1 if the categorical column differs else 0)
# NOTE: this is still a Python-level loop over every pair of records
# (quadratic in the number of rows), which is why it is slow on large data.
_silhouette_Index_arr = []
for i in x_df_norm.itertuples():
    # itertuples() rows are (index, col 0, col 1, col 2, ..., Cluster_label).
    _ai_cluster_label = i[-1]
    _s1_nume_att = np.array((i[1], i[2]))
    _s1_cat_att = i[3]

    # 1. a(i): mean distance to the OTHER members of the point's own cluster.
    _samples_cluster = x_df_norm[x_df_norm['Cluster_label'] == _ai_cluster_label]
    _dist_array_ai = [
        calc_euclian_dis(_s1_nume_att, np.array((j[1], j[2])))
        + (kproto.gamma * calc_simpleMatching_dis(_s1_cat_att, j[3]))
        for j in _samples_cluster.itertuples()
        if j[0] != i[0]  # the point itself is not part of its own average
    ]

    # 2. b(i): smallest mean distance to the members of any other cluster.
    _bi_arr = []
    for ii in _identical_cluster_labels:
        if ii == _ai_cluster_label:
            continue
        _samples = x_df_norm[x_df_norm['Cluster_label'] == ii]
        _dist_array_bi = [
            calc_euclian_dis(_s1_nume_att, np.array((j[1], j[2])))
            + (kproto.gamma * calc_simpleMatching_dis(_s1_cat_att, j[3]))
            for j in _samples.itertuples()
        ]
        _bi_arr.append(np.average(_dist_array_bi))

    # 3. silhouette value s(i) = (b - a) / max(a, b).
    # It is positive when the point is closer to its own cluster and negative
    # when it is closer to another one. A point alone in its cluster, or a
    # clustering with no other cluster, has no defined a(i)/b(i): score it 0.
    if not _dist_array_ai or not _bi_arr:
        _silhouette_i = 0
    else:
        ai = np.average(_dist_array_ai)
        bi = min(_bi_arr)
        _scale = max(ai, bi)
        _silhouette_i = (bi - ai) / _scale if _scale > 0 else 0
    _silhouette_Index_arr.append(_silhouette_i)
silhouette_score = np.average(_silhouette_Index_arr)
print('_silhouette_Index = ' + str(silhouette_score))

Answer 1

嘿！我用向量化的线性代数运算来计算相异度（距离），取代了大量的 for 循环，重新实现了您的函数：速度快得多 :-)

def euclidean_dissim(a, b, **_):
    """Euclidean distance between every row of ``a`` and the point ``b``.

    Parameters
    ----------
    a : array-like, shape (n_rows, n_features)
        Matrix of numeric vectors, one per row.
    b : array-like, shape (n_features,)
        The single numeric point the rows are compared with.
    **_ :
        Extra keyword arguments are accepted and ignored.

    Returns
    -------
    numpy.ndarray, shape (n_rows,)
        Distance of each row of ``a`` to ``b``.

    Raises
    ------
    ValueError
        If ``a`` or ``b`` contains NaN.
    """
    # Slices of a mixed numeric/categorical table arrive with object dtype,
    # on which np.isnan raises TypeError; coerce to float first.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if np.isnan(a).any() or np.isnan(b).any():
        raise ValueError("Missing values detected in numerical columns.")
    return np.linalg.norm(a - b, axis=1)

def matching_dissim(a, b, **_):
    """Simple matching dissimilarity: count of categorical values that differ.

    Parameters
    ----------
    a : array-like, shape (n_rows, n_features)
        Matrix of categorical vectors, one per row.
    b : array-like, shape (n_features,)
        The single categorical point the rows are compared with.
    **_ :
        Extra keyword arguments are accepted and ignored.

    Returns
    -------
    numpy.ndarray, shape (n_rows,)
        For each row, the number of positions whose value differs from ``b``
        (0 means the row matches ``b`` everywhere).
    """
    # Compare with != instead of subtracting: subtraction is undefined for
    # string categories, and a direct mismatch count does not depend on which
    # of the two arguments is the matrix and which is the point.
    return np.sum(np.asarray(a) != np.asarray(b), axis=1)

def calc_silhouette_proto(dataset, numerical_pos, cat_pos, kproto_model):
    """Return the mean silhouette value of a k-prototypes clustering.

    The distance between two rows is the Euclidean distance over the
    numerical columns plus ``kproto_model.gamma`` times the number of
    categorical columns whose values differ.

    For every row, ``a`` is the mean distance to the other rows of its own
    cluster, ``b`` is the smallest mean distance to the rows of any other
    cluster, and the silhouette value is ``(b - a) / max(a, b)``. It is
    negative when the row is closer to another cluster than to its own.
    Rows that are alone in their cluster, and all rows when only one cluster
    exists, are scored 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus a ``'cluster_labels'`` column holding the
        cluster assigned to each row.
    numerical_pos : sequence of int
        Positions of the numerical columns, counted among the feature
        columns (i.e. with ``'cluster_labels'`` left out).
    cat_pos : sequence of int
        Positions of the categorical columns, same convention.
    kproto_model : object
        Object exposing the categorical weight as a ``gamma`` attribute.

    Returns
    -------
    float
        Average of the per-row silhouette values.

    Raises
    ------
    ValueError
        If a numerical column contains NaN.
    """
    # Pull everything out of the DataFrame once; the per-row loop below then
    # works on plain arrays instead of copying and filtering the frame.
    labels = dataset['cluster_labels'].to_numpy()
    features = dataset.loc[:, dataset.columns != 'cluster_labels']
    numeric = features.iloc[:, list(numerical_pos)].to_numpy(dtype=float)
    categorical = features.iloc[:, list(cat_pos)].to_numpy()
    if np.isnan(numeric).any():
        raise ValueError("Missing values detected in numerical columns.")

    gamma = kproto_model.gamma
    # One boolean membership mask per cluster (loop-invariant).
    cluster_masks = [(label, labels == label) for label in np.unique(labels)]

    # Rows without a defined silhouette keep the initial value 0.
    silhouette_index_arr = np.zeros(len(labels))
    for row in range(len(labels)):
        # Distance from this row to every row, in one vectorized pass.
        distances = (
            np.linalg.norm(numeric - numeric[row], axis=1)
            + gamma * np.sum(categorical != categorical[row], axis=1)
        )

        ai = None
        bi_arr = []
        for label, mask in cluster_masks:
            if label == labels[row]:
                # 1. a(i): own cluster, without the row itself.
                peers = mask.copy()
                peers[row] = False
                if peers.any():
                    ai = distances[peers].mean()
            else:
                # 2. b(i) candidates: mean distance to each other cluster.
                bi_arr.append(distances[mask].mean())

        if ai is None or not bi_arr:
            continue  # singleton cluster or no other cluster

        # 3. silhouette value; min over clusters gives the nearest one.
        bi = min(bi_arr)
        scale = max(ai, bi)
        if scale > 0:
            silhouette_index_arr[row] = (bi - ai) / scale

    return np.average(silhouette_index_arr)

Answer 2

这是我目前正在使用的该函数的可运行加速版本。（代码中用到了 `tqdm` 以及 `euclidean_dissim`、`matching_dissim` 两个相异度函数，使用前需要先导入或定义它们。）

def silhouette_score_kproto(data, labels, categorical_indices, kproto_gamma):
    """
    Calculate per-row silhouette values for a k-prototypes clustering.

    The dissimilarity between two rows is
    ``euclidean_dissim(numerical part) + kproto_gamma * matching_dissim(categorical part)``.
    For each row, ``a_i`` is its mean dissimilarity to the other rows of its
    own cluster, ``b_i`` is the smallest mean dissimilarity to the rows of any
    other cluster, and the silhouette value is ``(b_i - a_i) / max(a_i, b_i)``.

    Relies on ``tqdm``, ``euclidean_dissim`` and ``matching_dissim`` being
    available in the enclosing module.

    Parameters:
        data (pd.DataFrame): Feature columns only; the cluster labels are passed separately.
        labels (np.array): Cluster label of each row of ``data``, in row order.
        categorical_indices (list): Column positions in ``data`` holding categorical features;
            every other column is treated as numerical.
        kproto_gamma (float): Scaling factor for matching dissimilarity in k-prototypes algorithm.

    Returns:
        list: One silhouette value per row of ``data``, in row order. A row that is alone
            in its cluster, or any row when there is no other cluster, gets 0.
    """
    # Convert to NumPy once. The labels are kept in their own array so they
    # can never be picked up as a numerical feature column.
    features = data.to_numpy()
    labels = np.asarray(labels)
    n_rows = features.shape[0]

    # Separate numerical and categorical columns based on indices
    cat_cols = list(categorical_indices)
    num_cols = [col for col in range(features.shape[1]) if col not in cat_cols]
    # A mixed table converts to object dtype; the numerical part must be float.
    numeric = features[:, num_cols].astype(float)
    categorical = features[:, cat_cols]

    # One boolean membership mask per cluster (loop-invariant).
    cluster_masks = [(cluster, labels == cluster) for cluster in np.unique(labels)]

    values = []

    # Iterate through each data point in the dataset
    for row in tqdm(range(n_rows), total=n_rows):
        a_i = None
        b_i = None
        for cluster, mask in cluster_masks:
            own_cluster = cluster == labels[row]
            if own_cluster:
                # Leave the point itself out of its own cluster's average.
                mask = mask.copy()
                mask[row] = False
                if not mask.any():
                    continue

            # Helpers are called as (matrix of rows, single point).
            distance = euclidean_dissim(numeric[mask], numeric[row]) + \
                       kproto_gamma * matching_dissim(categorical[mask], categorical[row])
            mean_distance = np.mean(distance)

            if own_cluster:
                a_i = mean_distance
            elif b_i is None or mean_distance < b_i:
                # b_i is the nearest other cluster, not all other points pooled.
                b_i = mean_distance

        # Calculate silhouette score for the current point
        if a_i is None or b_i is None or max(a_i, b_i) == 0:
            s_i = 0  # Undefined (singleton / single cluster) or division by zero
        else:
            s_i = (b_i - a_i) / max(a_i, b_i)

        values.append(s_i)

    return values

在混合分类和数值数据的 k 原型聚类算法中运行轮廓分数计算速度很慢

问题描述投票：0回答：2

2个回答

最新问题

在混合分类和数值数据的 k 原型聚类算法中运行轮廓分数计算速度很慢

问题描述 投票：0回答：2

2个回答

最新问题

问题描述投票：0回答：2