我正在尝试开发一种用于生物信息学的 K-Means 算法,到目前为止我设法开发了一个但它没有任何 k 参数,我将 k 值设置为 2。这段代码工作正常但现在我需要改进此代码并添加 k 参数以更改群集值。我不想使用 sklearn。
这是我的第一个代码,默认 k=2。
``import random
from matplotlib import pyplot
def CreateRandomPoints(n=20, _range=(-10.0, 10.0)):
dataset = set()
for i in range(n):
dataset.add((random.uniform(_range[0], _range[1]), random.uniform(_range[0], _range[1])))
return dataset
def centroid(datapoints):
"""Takes a collection of datapoints and returns the centroid"""
x_values = [k[0] for k in datapoints]
y_values = [k[1] for k in datapoints]
sum_of_x = sum(x_values)
sum_of_y = sum(y_values)
mean_of_x = sum_of_x / len(x_values)
mean_of_y = sum_of_y / len(y_values)
return mean_of_x, mean_of_y
def SplitIntoGroups(dataset, slope, intercept):
_setTop = set()
_setBottom = set()
for dp in dataset:
if (slope * dp[0] + intercept > dp[1]):
_setTop.add(dp)
else:
_setBottom.add(dp)
return _setTop, _setBottom
`
def K2Means(dataset: set, show_plots=False):
for datapoint in dataset:
assert type(datapoint) == tuple, f"{datapoint} is not a tuple."
assert len(datapoint) == 2, f"{datapoint} does not have 2 values."
assert isinstance(datapoint[0], (float, int)) and isinstance(datapoint[1], (float, int)), f"{datapoint}"
setA, setB = set(), set()
x_values = [k[0] for k in dataset]
dataset_space_on_x_axis = (min(x_values), max(x_values))
# randomly distribute the values to two sets
setA.add(dataset.pop())
setB.add(dataset.pop())
for datapoint in dataset:
if random.random() > 0.5:
setA.add(datapoint)
else:
setB.add(datapoint)
for cycle in range(100):
if show_plots:
pyplot.scatter([k[0] for k in setA], [k[1] for k in setA])
pyplot.scatter([k[0] for k in setB], [k[1] for k in setB])
# pyplot.show()
setA_centroid = centroid(setA)
setB_centroid = centroid(setB)
center_of_centroid = centroid([setA_centroid, setB_centroid])
# Slope = (y1 - y2) / (x1 - x2)
slope = (setA_centroid[1] - setB_centroid[1]) / (setA_centroid[0] - setB_centroid[0])
# The separating line should be perpendicular to the line that passes through
# That means the slope is multiplied by -1
slope = -1 / slope
# The line should pass through the center_of_centroids
# yc = slope * xc + intercept ==> intercept = yc - slope * xc
intercept = center_of_centroid[1] - slope * center_of_centroid[0]
# our separation line is y=slope * x + intercept
if show_plots:
pyplot.scatter([setA_centroid[0]], [setA_centroid[1]], marker="x")
pyplot.scatter([setB_centroid[0]], [setB_centroid[1]], marker="x")
x_values = list(range(int(dataset_space_on_x_axis[0]), int(dataset_space_on_x_axis[1])))
y_values = [slope * x + intercept for x in x_values]
pyplot.plot(x_values, y_values, "r--")
pyplot.title(f"Cycle Counter: {cycle}" , color="brown")
pyplot.show()
# check if finished
_temp_setA, _temp_setB = SplitIntoGroups(setA.union(setB), slope, intercept)
if (setA == _temp_setA):
print("K2Means Finished")
print("set 1", _temp_setA)
print("set 2", _temp_setB)
return _temp_setA, _temp_setB
setA, setB = _temp_setA, _temp_setB
if __name__ == "__main__":
pass
ds = CreateRandomPoints(n=100)
ds.update(CreateRandomPoints(n=50, _range=(10, 20)))
K2Means(ds, show_plots=True)
`
这是我的第二个代码,我想添加一个 k 参数,但是我做不到。
import random
from matplotlib import pyplot
def CreateRandomPoints(n=20, _range=(-10.0, 10.0)):
dataset = []
for i in range(n):
dataset.append((random.uniform(_range[0], _range[1]), random.uniform(_range[0], _range[1])))
return dataset
def centroid(datapoints):
"""Takes a collection of datapoints and returns the centroid"""
x_values = [k[0] for k in datapoints]
y_values = [k[1] for k in datapoints]
sum_of_x = sum(x_values)
sum_of_y = sum(y_values)
mean_of_x = sum_of_x / len(x_values)
mean_of_y = sum_of_y / len(y_values)
return mean_of_x, mean_of_y
def SplitIntoGroups(dataset, slope, intercept):
_setTop =set()
_setBottom =set()
for dp in dataset:
if (slope * dp[0] + intercept > dp[1]):
_setTop.add(dp)
else:
_setBottom.add(dp)
return _setTop, _setBottom
def KMeans(dataset: list, k=2, show_plots=False):
assert k > 1, "k must be greater than 1"
for datapoint in dataset:
assert type(datapoint) == tuple, f"{datapoint} is not a tuple."
assert len(datapoint) == 2, f"{datapoint} does not have 2 values."
assert isinstance(datapoint[0], (float, int)) and isinstance(datapoint[1], (float, int)), f"{datapoint}"
sets = [set() for _ in range(k)] #girilen k değeri kadar set oluşturabilmek için
x_values = [k[0] for k in dataset]
dataset_space_on_x_axis = (min(x_values), max(x_values))
for dp in dataset:
random_set_index = random.randint(0, k - 1)
sets[random_set_index].add(dp)
for cycle in range(100):
if show_plots:
for i, _set in enumerate(sets):
pyplot.scatter([k[0] for k in _set], [k[1] for k in _set], label=f"Set {i + 1}")
pyplot.legend()
pyplot.title(f"Cycle: {cycle + 1}")
pyplot.show()
set_centroids = [centroid(_set) for _set in sets]
center_of_centroids = centroid(list(set_centroids))
for i, _set in enumerate(sets):
slope = (set_centroids[i][1] - center_of_centroids[1]) / (set_centroids[i][0] - center_of_centroids[0])
slope = -1 / slope
intercept = center_of_centroids[1] - slope * center_of_centroids[0]
setA, setB = SplitIntoGroups(_set, slope, intercept)
sets[i] = setA
if len(setB) > 0:
sets[i] = setB
#tamamlandıysa döngüyü sonlandır
if len(sets[i]) == 0:
print("KMeans Tamamlandı")
for i, _set in enumerate(sets):
print(f"Set {i + 1}: {_set}")
break
set_centroids = [centroid(_set) for _set in sets]
center_of_centroids = centroid(set_centroids)
for i, _set in enumerate(sets):
slope = (set_centroids[i][1] - center_of_centroids[1]) / (set_centroids[i][0] - center_of_centroids[0])
print(f"i={i}, set_centroids={set_centroids}, center_of_centroids={center_of_centroids}")
print(f"set_centroids length={len(set_centroids)}, center_of_centroids length={len(center_of_centroids)}")
slope = -1 / slope
intercept = center_of_centroids[1] - slope * center_of_centroids[0]
setA, setB = SplitIntoGroups(_set, slope, intercept)
sets[i] = setA
if len(setB) > 0:
sets[i]=setB
if show_plots:
for i, _set in enumerate(sets):
pyplot.scatter([k[0] for k in _set], [k[1] for k in _set], label=f"Set {i + 1}")
pyplot.legend()
pyplot.title(f"Final Clustering with k={k}")
#pyplot.show()
return sets
if __name__ == "__main__":
ds = CreateRandomPoints(n=100)
ds.extend(CreateRandomPoints(n=50, _range=(10, 20)))
KMeans(ds, k=2, show_plots=True)
pyplot.show()