假设我有一个数据框,其中包含特征 x1、x2、x3 和结果 y:
import pandas as pd
def crossing(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """Return the Cartesian product of the rows of two data frames.

    Every row of ``df1`` is paired with every row of ``df2``; the result
    keeps the row order of ``df1`` and, within each ``df1`` row, the row
    order of ``df2``.

    Uses pandas' native cross join instead of merging on a temporary
    constant ``key`` column, so an input column that happens to be named
    ``key`` is kept rather than being overwritten and then dropped.
    """
    return pd.merge(df1, df2, how='cross')
def crossing_many(*args):
    """Fold ``crossing`` over all positional arguments, left to right.

    With a single argument, that argument is returned as is; with no
    arguments, ``functools.reduce`` raises ``TypeError``.
    """
    import functools

    return functools.reduce(crossing, args)
# NumPy draws the random 0/1 outcome below; it is imported here because
# ``np`` is not otherwise brought into scope by this snippet.
import numpy as np

# Every combination of the three feature columns (3 * 3 * 3 rows), plus a
# randomly drawn binary outcome column ``y``.
df = crossing_many(
    pd.DataFrame({'x1': ['A', 'B', 'C']}),
    pd.DataFrame({'x2': ['X', 'Y', 'Z']}),
    pd.DataFrame({'x3': ['xxx', 'yyy', 'zzz']}),
).assign(y=lambda d: np.random.choice([0, 1], size=len(d)))
我可以用 bigtree 包非常简单地绘制一棵树:
from bigtree import dataframe_to_tree
def view_pydot(pdot):
    """Render ``pdot`` as a PNG and show it through IPython's ``display``.

    ``pdot`` must provide a ``create_png()`` method (presumably a pydot
    graph -- confirm against the caller).
    """
    from IPython.display import Image, display

    png_data = pdot.create_png()
    display(Image(png_data))
# ``tree_to_dot`` is needed for the final call but is not imported by the
# ``from bigtree import ...`` line of this snippet.
from bigtree import tree_to_dot

# Feature columns, in the order they become tree levels: every column of
# ``df`` except the outcome ``y``.
features = [col for col in df.columns if col != 'y']

tree = (
    df
    # Paths are joined as strings, so the numeric outcome is cast first.
    .assign(y=lambda d: d['y'].astype('str'))
    .assign(root='Everyone')
    # One "/"-separated path per row: root / features... / y.
    .assign(path=lambda d: d[['root'] + features + ['y']].agg('/'.join, axis=1))
    .pipe(dataframe_to_tree, path_col='path')
)
view_pydot(tree_to_dot(tree))
这棵树比实际需要的复杂得多。我想迭代地“合并”在各个层级上具有相同叶节点的分支/节点。例如,类似这样的结果:
基本上,我想创建尽可能简单的树,以便人们能够按如下方式使用它:IF x1=A AND x2=X THEN 1(即通过尽可能短的路径做出决定)。删除覆盖了该特征所有可能取值的节点(例如 xxx|yyy|zzz)也是有意义的。谢谢!
我能够使用 littletree 解决你的问题(我是该库的作者)。这段代码可能不是解决此问题的最高效方法,但它确实有效。
# df was created using code by OP
import itertools
from littletree import Node
# Convert df to tree
# NOTE(review): presumably each row of df becomes one root-to-leaf path
# (x1 / x2 / x3 / y) -- confirm against littletree's Node.from_rows docs.
tree = Node.from_rows(df, path_name=None)
# Label the root node.
tree.identifier = "Everyone"
# Export the tree before simplification (presumably writes an image file
# named 'before.png' -- confirm against littletree's to_image docs).
tree.to_image('before.png')
def simplify_tree(tree):
    """Return a copy of ``tree`` in which equivalent sibling subtrees are merged.

    Nodes are visited with ``order="post"``, so (assuming this means
    post-order) the children of a node have already been simplified when the
    node itself is processed. For every pair of siblings for which
    ``s1.compare(s2)`` is falsy (presumably "no differences" -- confirm
    against the littletree documentation), the second sibling is detached
    and its identifier is appended to the first one's, separated by ``"|"``.

    All changes are made on ``tree.copy()``, which is what gets returned.
    """
    tree = tree.copy()
    for node in tree.iter_tree(order="post"):
        children = list(node.children)
        # Siblings already folded into an earlier one. They are skipped so
        # that detached nodes are not compared, renamed or detached again.
        merged = set()
        for s1, s2 in itertools.combinations(children, 2):
            if id(s1) in merged or id(s2) in merged:
                continue
            if not s1.compare(s2):
                s1.identifier += "|" + s2.identifier
                s2.detach()
                merged.add(id(s2))
    return tree
# Simplify the tree and export the result under a separate file name so it
# can be compared with the 'before.png' export.
new_tree = simplify_tree(tree)
new_tree.to_image('after.png')
由于这个问题与简化布尔表达式有关,您可能还想研究一下卡诺图(Karnaugh map)。