Python K-Means 算法不能正常工作

问题描述 投票:0回答:0

我正在尝试开发一种用于生物信息学的 K-Means 算法,到目前为止我设法开发了一个但它没有任何 k 参数,我将 k 值设置为 2。这段代码工作正常但现在我需要改进此代码并添加 k 参数以更改群集值。我不想使用 sklearn。

这是我的第一个代码,默认 k=2。

``import random
from matplotlib import pyplot


def CreateRandomPoints(n=20, _range=(-10.0, 10.0)):
    dataset = set()
    for i in range(n):
        dataset.add((random.uniform(_range[0], _range[1]), random.uniform(_range[0], _range[1])))
    return dataset


def centroid(datapoints):
    """Takes a collection of datapoints and returns the centroid"""
    x_values = [k[0] for k in datapoints]
    y_values = [k[1] for k in datapoints]
    sum_of_x = sum(x_values)
    sum_of_y = sum(y_values)
    mean_of_x = sum_of_x / len(x_values)
    mean_of_y = sum_of_y / len(y_values)
    return mean_of_x, mean_of_y


def SplitIntoGroups(dataset, slope, intercept):
    _setTop = set()
    _setBottom = set()

    for dp in dataset:
        if (slope * dp[0] + intercept > dp[1]):
            _setTop.add(dp)
        else:
            _setBottom.add(dp)

    return _setTop, _setBottom
`

def K2Means(dataset: set, show_plots=False):
    for datapoint in dataset:
        assert type(datapoint) == tuple, f"{datapoint} is not a tuple."
        assert len(datapoint) == 2, f"{datapoint} does not have 2 values."
        assert isinstance(datapoint[0], (float, int)) and isinstance(datapoint[1], (float, int)), f"{datapoint}"

    setA, setB = set(), set()

    x_values = [k[0] for k in dataset]
    dataset_space_on_x_axis = (min(x_values), max(x_values))

    # randomly distribute the values to two sets
    setA.add(dataset.pop())
    setB.add(dataset.pop())

    for datapoint in dataset:
        if random.random() > 0.5:
            setA.add(datapoint)
        else:
            setB.add(datapoint)

    for cycle in range(100):

        if show_plots:
            pyplot.scatter([k[0] for k in setA], [k[1] for k in setA])
            pyplot.scatter([k[0] for k in setB], [k[1] for k in setB])
            # pyplot.show()

        setA_centroid = centroid(setA)
        setB_centroid = centroid(setB)
        center_of_centroid = centroid([setA_centroid, setB_centroid])

        # Slope = (y1 - y2) / (x1 - x2)
        slope = (setA_centroid[1] - setB_centroid[1]) / (setA_centroid[0] - setB_centroid[0])

        # The separating line should be perpendicular to the line that passes through
        # That means the slope is multiplied by -1
        slope = -1 / slope

        # The line should pass through the center_of_centroids
        # yc = slope * xc + intercept ==> intercept = yc - slope * xc
        intercept = center_of_centroid[1] - slope * center_of_centroid[0]
        # our separation line is y=slope * x + intercept

        if show_plots:
            pyplot.scatter([setA_centroid[0]], [setA_centroid[1]], marker="x")
            pyplot.scatter([setB_centroid[0]], [setB_centroid[1]], marker="x")
            x_values = list(range(int(dataset_space_on_x_axis[0]), int(dataset_space_on_x_axis[1])))
            y_values = [slope * x + intercept for x in x_values]

            pyplot.plot(x_values, y_values, "r--")
            pyplot.title(f"Cycle Counter: {cycle}" , color="brown")


            pyplot.show()

        # check if finished
        _temp_setA, _temp_setB = SplitIntoGroups(setA.union(setB), slope, intercept)
        if (setA == _temp_setA):
            print("K2Means Finished")
            print("set 1", _temp_setA)
            print("set 2", _temp_setB)
            return _temp_setA, _temp_setB
        setA, setB = _temp_setA, _temp_setB


if __name__ == "__main__":
    pass
    ds = CreateRandomPoints(n=100)
    ds.update(CreateRandomPoints(n=50, _range=(10, 20)))
    K2Means(ds, show_plots=True)

`

这是我的第二个代码,我想添加一个 k 参数,但是我做不到。

import random
from matplotlib import pyplot

def CreateRandomPoints(n=20, _range=(-10.0, 10.0)):
    dataset = []
    for i in range(n):
        dataset.append((random.uniform(_range[0], _range[1]), random.uniform(_range[0], _range[1])))
    return dataset

def centroid(datapoints):
    """Takes a collection of datapoints and returns the centroid"""
    x_values = [k[0] for k in datapoints]
    y_values = [k[1] for k in datapoints]
    sum_of_x = sum(x_values)
    sum_of_y = sum(y_values)
    mean_of_x = sum_of_x / len(x_values)
    mean_of_y = sum_of_y / len(y_values)
    return mean_of_x, mean_of_y


def SplitIntoGroups(dataset, slope, intercept):
    _setTop =set()
    _setBottom =set()

    for dp in dataset:
        if (slope * dp[0] + intercept > dp[1]):
            _setTop.add(dp)
        else:
            _setBottom.add(dp)

    return _setTop, _setBottom


def KMeans(dataset: list, k=2, show_plots=False):
    assert k > 1, "k must be greater than 1"

    for datapoint in dataset:
        assert type(datapoint) == tuple, f"{datapoint} is not a tuple."
        assert len(datapoint) == 2, f"{datapoint} does not have 2 values."
        assert isinstance(datapoint[0], (float, int)) and isinstance(datapoint[1], (float, int)), f"{datapoint}"

    sets = [set() for _ in range(k)] #girilen k değeri kadar set oluşturabilmek için

    x_values = [k[0] for k in dataset]
    dataset_space_on_x_axis = (min(x_values), max(x_values))

    for dp in dataset:
        random_set_index = random.randint(0, k - 1)
        sets[random_set_index].add(dp)

    for cycle in range(100):

        if show_plots:
            for i, _set in enumerate(sets):
                pyplot.scatter([k[0] for k in _set], [k[1] for k in _set], label=f"Set {i + 1}")
            pyplot.legend()
            pyplot.title(f"Cycle: {cycle + 1}")
            pyplot.show()

        set_centroids = [centroid(_set) for _set in sets]
        center_of_centroids = centroid(list(set_centroids))

        for i, _set in enumerate(sets):
            slope = (set_centroids[i][1] - center_of_centroids[1]) / (set_centroids[i][0] - center_of_centroids[0])
            slope = -1 / slope
            intercept = center_of_centroids[1] - slope * center_of_centroids[0]

            setA, setB = SplitIntoGroups(_set, slope, intercept)
            sets[i] = setA

            if len(setB) > 0:
                sets[i] = setB


                #tamamlandıysa döngüyü sonlandır
                if len(sets[i]) == 0:
                    print("KMeans Tamamlandı")
                    for i, _set in enumerate(sets):
                        print(f"Set {i + 1}: {_set}")
                    break

            set_centroids = [centroid(_set) for _set in sets]
            center_of_centroids = centroid(set_centroids)

            for i, _set in enumerate(sets):
                slope = (set_centroids[i][1] - center_of_centroids[1]) / (set_centroids[i][0] - center_of_centroids[0])
                print(f"i={i}, set_centroids={set_centroids}, center_of_centroids={center_of_centroids}")
                print(f"set_centroids length={len(set_centroids)}, center_of_centroids length={len(center_of_centroids)}")
                slope = -1 / slope
                intercept = center_of_centroids[1] - slope * center_of_centroids[0]

                setA, setB = SplitIntoGroups(_set, slope, intercept)
                sets[i] = setA
                if len(setB) > 0:
                    sets[i]=setB

        if show_plots:
            for i, _set in enumerate(sets):
                pyplot.scatter([k[0] for k in _set], [k[1] for k in _set], label=f"Set {i + 1}")
            pyplot.legend()
            pyplot.title(f"Final Clustering with k={k}")
            #pyplot.show()

    return sets


if __name__ == "__main__":
    ds = CreateRandomPoints(n=100)
    ds.extend(CreateRandomPoints(n=50, _range=(10, 20)))

    KMeans(ds, k=2, show_plots=True)
    pyplot.show()

python bioinformatics k-means
© www.soinside.com 2019 - 2024. All rights reserved.