尝试加速我的函数并使它们对于贝叶斯网络上的变量消除算法具有更高的内存效率,但一旦数据帧变得太大,它仍然会崩溃。
我创建了一个示例脚本供你试验。您可以按照注释中的指示调整列大小等参数。
请注意:示例中未给出防止表大小爆炸的预处理和启发式方法。
import numpy as np
import pandas as pd
import itertools
import time
# Number of random factor tables to generate for the demo network.
# Adjust amount of dataframes here:
how_many_dataframes = 20
def make_array(how_many_dataframes):
    """Build a dict of random factor tables for the example Bayesian network.

    Each table enumerates every 0/1 assignment of ``column_size`` boolean
    columns (one assignment per row) plus a ``'prob'`` column of uniform
    random values in [0, 0.5).

    NOTE(review): dict keys are single random letters, so key collisions
    overwrite earlier tables — the dict may end up holding fewer than
    ``how_many_dataframes`` entries. Kept as-is from the original sample.

    :param how_many_dataframes: number of tables to attempt to create
    :return: dict mapping a random letter to a pandas DataFrame
    """
    letters_pool = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')  # Adjust possible column names here
    dic = {}
    for _ in range(how_many_dataframes):
        # Adjust possible range of column sizes here
        column_size = int(np.random.choice(np.arange(4, 17)))
        # All 2**column_size boolean assignments, one per row.
        bool_columns = np.array(list(itertools.product([0, 1], repeat=column_size)), dtype=int)
        random_values = np.random.uniform(0, 0.5, size=(2 ** column_size, 1))
        array = np.concatenate((bool_columns, random_values), axis=1)
        key = str(np.random.choice(letters_pool))
        # Draw distinct single-letter column names for this table.
        unique_columns = set()
        while len(unique_columns) < column_size:
            unique_columns.add(str(np.random.choice(letters_pool)))
        dic[key] = pd.DataFrame(array, columns=list(unique_columns) + ['prob'])
    return dic
# Build the demo network: a dict mapping a random letter key to a factor DataFrame.
network = make_array(how_many_dataframes)
def merge_factors(network, elim_order):
    """Multiply together all factors that mention the next node to eliminate.

    Takes the first node of ``elim_order`` (popping it off, i.e. mutating the
    caller's list), finds every factor whose columns contain that node, and
    repeatedly inner-merges pairs of them on their shared non-'prob' columns,
    multiplying the 'prob' columns row-wise. Each absorbed factor is removed
    from ``network``; the running product is stored under the key of the
    factor it was merged into.

    :param network: dict mapping factor name -> DataFrame with a 'prob' column;
        mutated in place
    :param elim_order: list of nodes still to eliminate; the head is consumed
    :return: the eliminated node (the popped head of ``elim_order``)
    """
    print("###########################################")
    print(f"Elimination order: {elim_order}")
    print(f"Elimination order length: {len(elim_order)}")
    print("###########################################")
    eliminate_node = elim_order[0]  # note: this is the node to be eliminated
    # Keys of every factor that mentions the node to eliminate.
    factor_list = [key for key, frame in network.items()
                   if eliminate_node in frame.columns]
    while len(factor_list) > 1:  # as long as there are more than one relevant factors
        factor_1, factor_2 = factor_list[:2]
        # Merge on all shared columns except for the probability column.
        merge_on = [col for col in network[factor_1].columns
                    if col != "prob" and col in network[factor_2].columns]
        start_time = time.time()
        print(f"Merging {factor_1} and {factor_2} on {merge_on}")
        print(f"{factor_1}: {network[factor_1].shape[0]} rows * {network[factor_1].shape[1]} columns")
        print(f"{factor_2}: {network[factor_2].shape[0]} rows * {network[factor_2].shape[1]} columns")
        print(f"Starting merge ...")
        # NOTE(review): the original sorted both frames by 'prob' before each
        # merge — a hash merge gains nothing from sorted input, so those
        # O(n log n) sorts were dropped.
        merged = pd.merge(network[factor_1], network[factor_2], on=merge_on,
                          suffixes=('_f1', '_f2'), how='inner')
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Merging complete: {elapsed_time} seconds")
        # Row-wise product of the two factors' probabilities.
        merged['prob'] = merged['prob_f1'].mul(merged['prob_f2'])
        print(f"Calculate new probability complete")
        print(f"Drop redundant columns ...")
        merged.drop(columns=['prob_f1', 'prob_f2'], inplace=True)
        print(f"Drop redundant columns complete")
        network[factor_2] = merged
        print(f"Deleting {factor_1} ...")
        del network[factor_1]  # factor_1 is fully absorbed into factor_2
        print(f"Delete complete")
        factor_list.pop(0)
    elim_order.pop(0)
    return eliminate_node
def find_relevant_factor(network, node):
    """Return the key of the first factor in ``network`` whose columns
    contain ``node``, or None when no factor mentions it."""
    return next(
        (key for key, frame in network.items() if node in frame.columns),
        None,
    )
def marginalize(node, relevant_factor_key, network):
    """Sum ``node`` out of the factor stored at ``relevant_factor_key``.

    Drops the node's column and sums 'prob' over every assignment of the
    remaining variable columns, storing the reduced factor back into
    ``network``. A factor whose only columns are the node and 'prob' is
    left untouched (summing it out would leave no variables).

    :param node: variable to eliminate
    :param relevant_factor_key: key in ``network`` of the factor to reduce
    :param network: dict of factor DataFrames; mutated in place
    :return: None
    """
    relevant_factor = network[relevant_factor_key]
    print(f"Marginalizing {node}: {relevant_factor.shape[0]} rows * {relevant_factor.shape[1]} columns ...")
    if len(relevant_factor.columns) > 2 and node in relevant_factor.columns:
        relevant_factor = relevant_factor.drop(columns=[node])
        # Group by name on every remaining variable column, instead of the
        # original positional columns[:-1], which assumed 'prob' is last.
        group_cols = [col for col in relevant_factor.columns if col != "prob"]
        relevant_factor = relevant_factor.groupby(group_cols)["prob"].sum().reset_index()
        print(f"Marginalization complete")
    network[relevant_factor_key] = relevant_factor
    return
# Adjust elimination order here:
elim_order = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M"]
# Main loop: multiply and marginalize factors until all variables have been
# eliminated. merge_factors pops one node off elim_order per iteration, so
# the loop terminates.
while elim_order:
    eliminate_node = merge_factors(network, elim_order)
    relevant_factor = find_relevant_factor(network, eliminate_node)
    if relevant_factor is not None:
        start_time = time.time()
        # Reuse the key found above instead of scanning the network a
        # second time (the original called find_relevant_factor twice).
        marginalize(eliminate_node, relevant_factor, network)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Elapsed time for marginalization: {elapsed_time} seconds")
对于提高合并/边缘化的操作性能有什么建议吗?
阅读了有关 swifter 库的 .apply 的内容,但到目前为止未能成功实现。切换到 numpy 进行操作似乎很笨重。
您已达到 Pandas 性能的极限。虽然以更快的方式手动执行操作是可能的,但在 Python 中却相当困难,尤其是在许多列上完成合并时。一种解决方案是使用 Polars,这通常要快得多,因为它可以使用多核。在这种情况下,Polars 在我的 6 核机器上的大多数功能上似乎快了 4-6 倍(相当不错)。但请注意,Polars 的函数与 Pandas 中的函数并不完全相同。
首先,在 `make_array` 中,您只需要将 `pd` 替换为 `pl`,并在 Python 脚本的开头加上 `import polars as pl`。
然后,对于 `merge_factors` 函数,连接、列删除和排序都需要稍微修改一下。这是一个例子:
def merge_factors(network, elim_order):
    """Polars version: multiply together all factors mentioning the next node.

    Same contract as the pandas version: pops the head of ``elim_order``
    (mutating the caller's list), inner-joins every factor containing that
    node on their shared non-'prob' columns, multiplies the probabilities,
    and stores the product under the last factor's key.

    NOTE(review): updated for the current Polars API — ``DataFrame.sort``
    no longer takes ``in_place=`` (it returns a new frame), and item
    assignment ``df['col'] = ...`` is unsupported; ``with_columns`` is
    used instead. The pointless pre-merge sorts by 'prob' were dropped
    (a join gains nothing from sorted input).
    """
    print("###########################################")
    print(f"Elimination order: {elim_order}")
    print(f"Elimination order length: {len(elim_order)}")
    print("###########################################")
    eliminate_node = elim_order[0]  # note: this is the node to be eliminated
    # Keys of every factor that mentions the node to eliminate.
    factor_list = [key for key, frame in network.items()
                   if eliminate_node in frame.columns]
    while len(factor_list) > 1:  # as long as there are more than one relevant factors
        factor_1, factor_2 = factor_list[:2]
        # Join on all shared columns except for the probability column.
        merge_on = [col for col in network[factor_1].columns
                    if col != "prob" and col in network[factor_2].columns]
        start_time = time.time()
        print(f"Merging {factor_1} and {factor_2} on {merge_on}")
        print(f"{factor_1}: {network[factor_1].shape[0]} rows * {network[factor_1].shape[1]} columns")
        print(f"{factor_2}: {network[factor_2].shape[0]} rows * {network[factor_2].shape[1]} columns")
        print(f"Starting merge ...")
        # Inner join: the right frame's clashing 'prob' gets the '_f2'
        # suffix; the left frame's 'prob' is renamed to 'prob_f1' after.
        merged = network[factor_1].join(network[factor_2], on=merge_on,
                                        suffix='_f2', how='inner').rename({'prob': 'prob_f1'})
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Merging complete: {elapsed_time} seconds")
        # Row-wise product of the two probability columns.
        merged = merged.with_columns(
            (merged['prob_f1'] * merged['prob_f2']).alias('prob')
        )
        print(f"Calculate new probability complete")
        print(f"Drop redundant columns ...")
        merged = merged.drop(['prob_f1', 'prob_f2'])
        print(f"Drop redundant columns complete")
        network[factor_2] = merged
        print(f"Deleting {factor_1} ...")
        del network[factor_1]  # factor_1 is fully absorbed into factor_2
        print(f"Delete complete")
        factor_list.pop(0)
    elim_order.pop(0)
    return eliminate_node
至于 `marginalize` 函数,这是修改后的版本:
def marginalize(node, relevant_factor_key, network):
    """Polars version: sum ``node`` out of the factor at ``relevant_factor_key``.

    NOTE(review): updated for the current Polars API — a Polars GroupBy
    cannot be indexed like pandas (``gb["prob"].sum()``), and ``groupby``
    is now ``group_by``. After dropping ``node``, 'prob' is the only
    non-key column left, so ``.sum()`` aggregates exactly it and keeps
    the column name 'prob' (no 'prob_sum' rename needed).
    """
    relevant_factor = network[relevant_factor_key]
    print(f"Marginalizing {node}: {relevant_factor.shape[0]} rows * {relevant_factor.shape[1]} columns ...")
    if len(relevant_factor.columns) > 2 and node in relevant_factor.columns:
        relevant_factor = relevant_factor.drop(node)
        # Group on every remaining variable column; only 'prob' is summed.
        group_cols = [col for col in relevant_factor.columns if col != "prob"]
        relevant_factor = relevant_factor.group_by(group_cols).sum()
        print(f"Marginalization complete")
    network[relevant_factor_key] = relevant_factor
    return
代码几乎没有经过测试,但行为似乎相同,并且性能明显提高。事实上,整个脚本的时间从 45.3 秒减少到 10.8 秒。这意味着这个新版本在我的 6 核机器(i5-9600KF CPU)上速度提高了 4.2 倍。
请注意,需要设置随机种子来比较结果(包括时序)。