Bayesian network [variable elimination]: memory crash using pandas merge and groupby


I am trying to speed up my functions and make them more memory-efficient for a variable elimination algorithm on a Bayesian network, but it still crashes once the dataframes get too large.

I created an example script to play with. You can adjust the column sizes etc. as indicated in the comments.

Please note: the preprocessing and heuristics that keep the table sizes from exploding are not included in the example.

import numpy as np
import pandas as pd
import itertools
import time

# Adjust amount of dataframes here:
how_many_dataframes = 20 

def make_array(how_many_dataframes):
    dic = {}
    for i in range(how_many_dataframes):

        # Adjust possible range of column sizes here
        column_size = np.random.choice([4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])

        combinations = list(itertools.product([0, 1], repeat=column_size))
        bool_columns = np.array(combinations, dtype=int)
        random_values = np.random.uniform(0, 0.5, size=(2**column_size, 1))
        array = np.concatenate((bool_columns, random_values), axis=1)
        letters = np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')) # Adjust possible factor keys here (a repeated key overwrites an earlier factor)
        unique_columns = set()
        while len(unique_columns) < column_size:
            unique_columns.add(''.join(np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), size=1)))
        df = pd.DataFrame(array, columns=list(unique_columns) + ['prob'])
        dic[letters] = df
        
    return dic

network = make_array(how_many_dataframes)


def merge_factors(network, elim_order):   
    print("###########################################")
    print(f"Elimination order: {elim_order}")
    print(f"Elimination order length: {len(elim_order)}")
    print("###########################################") 
    eliminate_node = elim_order[0] # note: this is the node to be eliminated
    factor_list = [] # list to store the relevant factors

    for network_key, network_df in network.items():  
        if elim_order[0] in network_df.columns:
            factor_list.append(network_key) # append the relevant factors to the list

    while len(factor_list) > 1: # as long as there are more than one relevant factors
        factor_1, factor_2 = factor_list[:2]
        merge_on = [col for col in network[factor_1].columns if col != "prob" and col in network[factor_2]] # merge on all columns except for the probability column
    
        network[factor_1].sort_values(by="prob", inplace=True)
        network[factor_2].sort_values(by="prob", inplace=True)

        start_time = time.time()
        print(f"Merging {factor_1} and {factor_2} on {merge_on}")
        print(f"{factor_1}: {network[factor_1].shape[0]} rows * {network[factor_1].shape[1]} columns")
        print(f"{factor_2}: {network[factor_2].shape[0]} rows * {network[factor_2].shape[1]} columns")
        print(f"Starting merge ...")
        network[factor_2] = pd.merge(network[factor_1], network[factor_2], on=merge_on, suffixes=('_f1', '_f2'), how='inner') # merge the two relevant factors
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Merging complete: {elapsed_time} seconds")

        network[factor_2]['prob'] = network[factor_2]['prob_f1'].mul(network[factor_2]['prob_f2']) # calculate the new probability
        print(f"Calculate new probability complete")
        print(f"Drop redundant columns ...")
        network[factor_2].drop(columns=['prob_f1', 'prob_f2'], inplace = True) # drop the old probability columns
        print(f"Drop redundant columns complete")
        #print(f"Merged frame: {network[factor_2]}")
        print(f"Deleting {factor_1} ...")
        del network[factor_1] # delete the factor that was merged
        print(f"Delete complete")

        factor_list.pop(0)
    
    elim_order.pop(0)
    return eliminate_node

def find_relevant_factor(network, node):
    for network_key, network_df in network.items():  
        if node in network_df.columns:
            relevant_factors = network_key
            return relevant_factors
    return None
        

def marginalize(node, relevant_factor_key, network):

    relevant_factor = network[relevant_factor_key]
    print(f"Marginalizing {node}: {relevant_factor.shape[0]} rows * {relevant_factor.shape[1]} columns ...")

    if len(relevant_factor.columns) > 2 and node in relevant_factor.columns:
        relevant_factor = relevant_factor.drop(columns=[node])
        #relevant_factor = relevant_factor.groupby(relevant_factor.columns[:-1].tolist(), as_index=False).agg({relevant_factor.columns[-1]: 'sum'})
        relevant_factor = relevant_factor.groupby(relevant_factor.columns[:-1].tolist())["prob"].sum().reset_index()
        print(f"Marginalization complete")
        #print(f"Marginalized frame: {relevant_factor}")
        network[relevant_factor_key] = relevant_factor
    return

# Adjust elimination order here:
elim_order = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M"]

while len(elim_order) > 0: # Main loop: Continues to multiply and marginalize factors until all variables have been eliminated
        eliminate_node = merge_factors(network, elim_order)
        relevant_factor = find_relevant_factor(network, eliminate_node)
        if relevant_factor is not None:
            start_time = time.time()
            marginalize(eliminate_node, find_relevant_factor(network, eliminate_node), network)
            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Elapsed time for marginalization: {elapsed_time} seconds")

Any suggestions for improving the performance of the merge/marginalize operations?

I have read about swiftly.apply but have not managed to implement it successfully so far. Switching to numpy for the operations seems clunky.

pandas performance memory-management large-data bayesian-networks
1 Answer

You have reached the limits of pandas performance. While it is possible to perform the operations faster by hand, doing so is quite difficult in Python, especially when the merge is done on many columns. One solution is to use Polars, which is generally much faster because it can use multiple cores. In this case, Polars appears to be 4-6x faster for most of the functions on my 6-core machine (pretty good). Note, however, that the Polars functions are not exactly the same as the pandas ones.

First, in make_array, you only need to replace pd with pl and add import polars as pl at the beginning of the Python script.
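For reference, here is a minimal sketch of what make_array could look like after that swap. It assumes a recent Polars release, where the DataFrame constructor takes the column names through the schema= keyword (older releases used columns= instead):

import itertools
import numpy as np
import polars as pl

def make_array(how_many_dataframes):
    dic = {}
    for i in range(how_many_dataframes):
        # Adjust possible range of column sizes here
        column_size = np.random.choice([4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])
        combinations = list(itertools.product([0, 1], repeat=column_size))
        bool_columns = np.array(combinations, dtype=int)
        random_values = np.random.uniform(0, 0.5, size=(2**column_size, 1))
        array = np.concatenate((bool_columns, random_values), axis=1)
        letters = np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
        unique_columns = set()
        while len(unique_columns) < column_size:
            unique_columns.add(''.join(np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), size=1)))
        # Only this line changes inside the loop: build a Polars frame instead of a pandas one
        df = pl.DataFrame(array, schema=list(unique_columns) + ['prob'])
        dic[letters] = df
    return dic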

Then, for the merge_factors function, the join, the column dropping and the sorting all need to be modified a bit. Here is an example:

def merge_factors(network, elim_order):   
    print("###########################################")
    print(f"Elimination order: {elim_order}")
    print(f"Elimination order length: {len(elim_order)}")
    print("###########################################") 
    eliminate_node = elim_order[0] # note: this is the node to be eliminated
    factor_list = [] # list to store the relevant factors

    for network_key, network_df in network.items():  
        if elim_order[0] in network_df.columns:
            factor_list.append(network_key) # append the relevant factors to the list

    while len(factor_list) > 1: # as long as there are more than one relevant factors
        factor_1, factor_2 = factor_list[:2]
        merge_on = [col for col in network[factor_1].columns if col != "prob" and col in network[factor_2].columns] # merge on all columns except for the probability column

        network[factor_1] = network[factor_1].sort("prob") # sort() returns a new frame (no in_place argument in current Polars)
        network[factor_2] = network[factor_2].sort("prob")

        start_time = time.time()
        print(f"Merging {factor_1} and {factor_2} on {merge_on}")
        print(f"{factor_1}: {network[factor_1].shape[0]} rows * {network[factor_1].shape[1]} columns")
        print(f"{factor_2}: {network[factor_2].shape[0]} rows * {network[factor_2].shape[1]} columns")
        print(f"Starting merge ...")
        print(factor_1, factor_2, merge_on)
        network[factor_2] = network[factor_1].join(network[factor_2], on=merge_on, suffix='_f2', how='inner').rename({'prob': 'prob_f1'}) # merge the two relevant factors
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Merging complete: {elapsed_time} seconds")

        network[factor_2] = network[factor_2].with_columns((pl.col('prob_f1') * pl.col('prob_f2')).alias('prob')) # calculate the new probability
        print(f"Calculate new probability complete")
        print(f"Drop redundant columns ...")
        network[factor_2].drop_in_place('prob_f1') # drop the old probability columns
        network[factor_2].drop_in_place('prob_f2') # drop the old probability columns
        print(f"Drop redundant columns complete")
        #print(f"Merged frame: {network[factor_2]}")
        print(f"Deleting {factor_1} ...")
        del network[factor_1] # delete the factor that was merged
        print(f"Delete complete")

        factor_list.pop(0)
    
    elim_order.pop(0)
    return eliminate_node

As for the marginalize function, here is the modified version:

def marginalize(node, relevant_factor_key, network):
    relevant_factor = network[relevant_factor_key]
    print(f"Marginalizing {node}: {relevant_factor.shape[0]} rows * {relevant_factor.shape[1]} columns ...")

    if len(relevant_factor.columns) > 2 and node in relevant_factor.columns:
        relevant_factor.drop_in_place(node)
        relevant_factor = relevant_factor.group_by(relevant_factor.columns[:-1]).agg(pl.col("prob").sum()) # sum out the eliminated variable
        print(f"Marginalization complete")
        network[relevant_factor_key] = relevant_factor
    return

The code is barely tested, but the behavior appears to be the same and the performance is clearly better. In fact, the whole script goes from 45.3 seconds down to 10.8 seconds, which means this new version is 4.2x faster on my 6-core machine (i5-9600KF CPU).

Note that a random seed needs to be set in order to compare the results (including the timings).
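For example, a minimal way to do that with the legacy NumPy RNG the script already uses (the seed value 0 is an arbitrary choice) is to seed it once at the top of the script, before the factors are generated:

import numpy as np

np.random.seed(0)  # fix the RNG so the pandas and Polars runs build identical factors
network = make_array(how_many_dataframes)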
