我是 Python 的初学者,正在尝试不使用任何库来计算熵和信息增益。目前,计算结果的正确性比代码质量更重要。
我有一个数据框(DataFrame),并希望像下面这样为每个属性列出列表:
# One row per outlook category.
# NOTE(review): each pair presumably holds the two class counts (e.g. play /
# don't play) for that category -- confirm against the data set.
Outlook = [
    [4, 0],  # overcast
    [2, 3],  # sunny
    [3, 2],  # rain
]
# One row per temperature category.
# NOTE(review): each pair presumably holds the two class counts (e.g. play /
# don't play) for that category -- confirm against the data set.
temperature = [
    [2, 2],  # hot
    [3, 1],  # cool
    [4, 2],  # mild
]
这是我的数据:play golf 数据集(play golf data set)。
# Entropy
def entropy(Y):
    """
    Return the Shannon entropy, in bits, of the observations in Y.

    Every entry along the first axis of Y counts as one observation, so a
    2-D input is scored row by row (two rows are the same observation only
    if all of their columns match).

    Parameters
    ----------
    Y : array-like
        Observations, 1-D (one value each) or 2-D (one row each).
        Object-dtype input (for example arrays of Python strings) is
        supported as long as the individual values are hashable.

    Returns
    -------
    numpy.float64
        ``sum(-p * log2(p))`` over the distinct observations, where ``p`` is
        the relative frequency of each distinct observation.

    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
    """
    from collections import Counter

    samples = np.asanyarray(Y)
    if samples.dtype == object:
        # np.unique(..., axis=0) raises TypeError for object arrays, so
        # count hashable row tuples instead.
        if samples.size:
            rows = samples.reshape(len(samples), -1).tolist()
        else:
            rows = []
        count = np.array(list(Counter(map(tuple, rows)).values()))
    else:
        # Only the multiplicities matter; the unique values are discarded.
        _, count = np.unique(samples, return_counts=True, axis=0)
    prob = count / len(samples)
    return np.sum(-prob * np.log2(prob))
# Joint entropy
def jEntropy(Y, X):
    """
    Return the joint entropy H(Y, X) of two aligned sets of observations.

    Y and X are placed side by side with ``np.c_`` and the combined rows are
    passed to ``entropy``, so each (Y, X) pairing is scored as a single
    observation.

    Reference: https://en.wikipedia.org/wiki/Joint_entropy
    """
    return entropy(np.c_[Y, X])
# Conditional entropy
def cEntropy(Y, X):
    """
    Return the conditional entropy H(Y|X).

    Computed with the chain rule as the joint entropy minus the entropy of
    the conditioning variable:

        H(Y|X) = H(Y, X) - H(X)

    Reference: https://en.wikipedia.org/wiki/Conditional_entropy
    """
    joint = jEntropy(Y, X)
    marginal = entropy(X)
    return joint - marginal
# Information gain
def gain(Y, X):
    """
    Return the information gain I(Y;X) = H(Y) - H(Y|X).

    This is the entropy of Y minus the conditional entropy of Y given X, as
    computed by ``entropy`` and ``cEntropy``.

    Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition
    """
    before_split = entropy(Y)
    after_split = cEntropy(Y, X)
    return before_split - after_split