我有一个 CSV 文件,内容如下所示
page Page_Value
page1 12
page2 1
page3 2
page4 3
page5 10
page6 11
page7 13
page8 67
page9 70
# Convert the 'page' column to numeric codes so it can sit in the frame.
labelEncoder = LabelEncoder()
dataset_PV['page'] = labelEncoder.fit_transform(dataset_PV['page'])

# Find a suitable number of clusters with the elbow method.
from sklearn.cluster import KMeans
from sklearn import preprocessing

wcss = []
for n in range(1, 10):
    model = KMeans(n_clusters=n, init='k-means++', random_state=0)
    model.fit(dataset_PV)
    wcss.append(model.inertia_)

plt.figure(figsize=(15,8))
plt.plot(range(1,10), wcss, marker='o')
plt.title('Elbow graph')
plt.xlabel('Number of clusters')
plt.ylabel('within-cluster sums of squares (WCSS)')
plt.show()

# Fit the final model with the chosen cluster count.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
clusters = kmeans.fit_predict(dataset_PV)
dataset_PV['clusters'] = clusters
我想根据页面值创建页面组合,以获得最大的页面值。. 在这里,我使用了K-means,我已经将页面变量转换为数字。我不知道我是应该使用K-means还是对Page_value变量进行排序,然后对它们进行分组(不确定代码)。
输出的东西是这样的。
Cluster1 = page2,page3,page4
Cluster2 = page5,page6,page7,page12
Cluster3 = page7,page8,page9
谢谢你
你不需要先对页面进行排序。你有没有试过 OpenCV 的 K-means?希望对你有帮助:https://docs.opencv.org/master/d1/d5c/tutorial_py_kmeans_opencv.html 。
import numpy as np
import cv2 as cv

pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
# One column per sample: cv.kmeans expects float32 data of shape (n, 1).
x = np.array((12, 1, 2, 3, 10, 11, 13, 67, 70))
x = np.float32(x).reshape(-1, 1)

# Stop after 10 iterations or once centers move less than epsilon = 1.0.
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 10, 1.0)
flags = cv.KMEANS_RANDOM_CENTERS
K = 3

# Run K-means with 10 random restarts.
compactness, labels, centers = cv.kmeans(x, K, None, criteria, 10, flags)
labels = labels.flatten()

# Group page names by their assigned cluster label.
res = {cluster_id: [] for cluster_id in range(K)}
for idx, lab in enumerate(labels):
    res[lab].append(pages[idx])
print(res)
另一个使用Sklearn的解决方案。
from sklearn.cluster import KMeans
import numpy as np

pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
# Column vector of the page values, as float32.
x = np.array((12, 1, 2, 3, 10, 11, 13, 67, 70))
x = np.float32(x).reshape(-1, 1)

K = 3
km = KMeans(n_clusters=K)
labels = km.fit(x).predict(x)
labels = labels.flatten()

# Collect page names cluster by cluster.
res = {cluster_id: [] for cluster_id in range(K)}
for idx, lab in enumerate(labels):
    res[lab].append(pages[idx])
print(res)
你已经完成了大部分的工作,但页面的名称不应该包含在 KMeans 的计算中,否则就没有意义了。
即LabelEncoder是不必要的。
TL;DR
简单的答案你可以参考@Sơn Ninh。
如果你想把我的答案可视化,我的答案可能会帮助你。
我写了一个函数(label_encoding
)给你,你可以用它来获得有助于画图的id的映射。
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import textwrap
from io import StringIO
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from typing import Union, NamedTuple
from collections import defaultdict
def main(show_elbow: bool = False):
    """Cluster pages by Page_Value, print the groups, and plot them.

    Parameters
    ----------
    show_elbow : bool, optional
        When True, draw the elbow graph first to help pick a cluster count.
        Replaces the original dead `if not '...'` string toggle, which could
        never run. Defaults to False, preserving the original behavior.
    """
    df = import_file()
    # Cluster on Page_Value only — page names carry no numeric meaning.
    x = df.Page_Value.values.reshape(-1, 1)
    if show_elbow:
        elbow_find_n(x)
    suitable_n = 3  # read off the elbow graph

    # Fit the model on the values alone.
    kmeans = KMeans(n_clusters=suitable_n, init='k-means++', random_state=42)
    clusters = kmeans.fit_predict(x)

    # Encode page names only for plotting; keep a mapping back to the names.
    df.loc[:, ['page']], mapping_table = label_encoding(df.loc[:, ['page']])
    df = rebuild_df(df, clusters, mapping_table)  # 'page-id', 'page', 'clusters', 'Page_Value'
    print(df)

    # Output the "ClusterN = pageA,pageB" format that was asked for.
    dict_by_cluster_value = defaultdict(list)
    for cur_set in set(df['clusters']):
        print(f'Cluster{cur_set} = {",".join(df.page[df.clusters == cur_set])}')
        dict_by_cluster_value[cur_set].extend(df.page[df.clusters == cur_set].to_list())
    print(dict(dict_by_cluster_value))  # plain dict only so the print is tidy
    visualizing_the_clusters(kmeans, df)
class RGBColor(NamedTuple):
    """Palette of hex colors used to paint the clusters.

    ``get_tuple`` now yields the hex *values* (previously it yielded the
    attribute names, leaving every constant below dead), so they can be
    passed straight to matplotlib as ``c=``.
    """
    BLACK = '#000000'
    # AZURE = '#F0FFFF'
    OLIVE = '#808000'
    PINK = '#FFC0CB'
    # WHITE omitted: invisible on a white background.
    GOLD = '#FFD700'  # was 'FFD700' — missing '#' made it an invalid color spec
    BLUE = '#0000FF'
    GREEN = '#00FF00'
    RED = '#FF0000'
    YELLOW = '#FFFF00'
    ORANGE = '#FFA500'
    PURPLE = '#FF00FF'

    def get_tuple(self):
        """Yield the hex color values of all UPPERCASE palette constants."""
        return (
            getattr(self, attr_name)
            for attr_name in dir(self)
            if not attr_name.startswith('_') and attr_name.isupper()
        )
def label_encoding(label_col: Union[pd.DataFrame, np.ndarray], is_need_mapping_table=True) -> tuple:
    """Encode a label column to integer ids.

    USAGE:
        df.loc[:, ['col_xxx', ]], mapping_table = label_encoding(df.loc[:, ['col_xxx']])

    Returns the encoded array (reshaped to a column for DataFrame input) and,
    when requested, a dict mapping each id back to its original label.
    """
    is_frame = isinstance(label_col, pd.DataFrame)
    raw_values = label_col.values.ravel() if is_frame else label_col
    nda_rtn_value = LabelEncoder().fit_transform(raw_values)

    rtn_dict = dict()
    if is_need_mapping_table:
        # Pair each encoded id with the label it came from.
        originals = [row[0] for row in label_col.values] if is_frame else list(label_col)
        rtn_dict = dict(zip(nda_rtn_value, originals))

    if is_frame:
        # DataFrame callers assign the result back into a single column.
        nda_rtn_value = nda_rtn_value.reshape(-1, 1)
    return nda_rtn_value, rtn_dict
def import_file() -> pd.DataFrame:
    """Build the sample page/Page_Value table from an inline CSV string."""
    csv_text = textwrap.dedent(  # strip the common leading whitespace
        """\
        page,Page_Value
        page1,12
        page2,1
        page3,2
        page4,3
        page5,10
        page6,11
        page7,13
        page8,67
        page9,70
        """
    )
    return pd.read_csv(StringIO(csv_text), header=0)
def elbow_find_n(x):
    """Plot WCSS for k = 1..9 so a suitable cluster count can be read off."""
    cluster_counts = range(1, 10)
    wcss = [
        KMeans(n_clusters=k, init='k-means++', random_state=0).fit(x).inertia_
        for k in cluster_counts
    ]
    plt.figure(figsize=(15, 8))
    plt.plot(cluster_counts, wcss, marker='o')
    plt.title('Elbow graph')
    plt.xlabel('Number of Clusters')
    plt.ylabel('within-cluster sums of squares WCSS')
    plt.show()
def rebuild_df(df, clusters, mapping_table):
    """Attach cluster labels, restore page names from ids, reorder columns."""
    df['clusters'] = clusters
    df.rename(columns={'page': 'page-id'}, inplace=True)
    # Map each encoded id back to its original page name.
    df['page'] = df.apply(lambda row: mapping_table[row['page-id']], axis=1)
    return df.reindex(['page-id', 'page', 'clusters', 'Page_Value'], axis=1)
def visualizing_the_clusters(kmeans: KMeans, df: pd.DataFrame):
    """Scatter each cluster in its own color, plus all centroids at x = -1."""
    palette = RGBColor()
    for cluster_id, color in zip(set(df.clusters), palette.get_tuple()):
        mask = df.clusters == cluster_id
        plt.scatter(
            df[mask]['page-id'], df[mask]['Page_Value'],
            s=2, c=color,
            label=f'Cluster{cluster_id}: {kmeans.cluster_centers_[cluster_id][0]}',
        )
    n_cluster = len(kmeans.cluster_centers_)
    # Place the centroids at x = -1 so they sit left of the real points.
    plt.scatter(np.ones(n_cluster) * (-1), kmeans.cluster_centers_[:, 0],
                s=(50 / n_cluster), c='purple', label='Centroids')
    plt.title('Page and Page_Value')
    plt.xlabel('Page_ID')
    plt.ylabel('Page_Value')
    plt.legend(loc=(1.05, 0.5))  # legend outside the axes on the right
    plt.tight_layout()
    plt.show()
# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()
输出
page-id page clusters Page_Value
0 0 page1 0 12
1 1 page2 2 1
2 2 page3 2 2
3 3 page4 2 3
4 4 page5 0 10
5 5 page6 0 11
6 6 page7 0 13
7 7 page8 1 67
8 8 page9 1 70
Cluster0 = page1,page5,page6,page7
Cluster1 = page8,page9
Cluster2 = page2,page3,page4
{0: ['page1', 'page5', 'page6', 'page7'], 1: ['page8', 'page9'], 2: ['page2', 'page3', 'page4']}