我一直在尝试用 Python 实现我自己的梯度下降算法,但无法获得适合数据的输出。
class GradientDescent:
    """Fit a simple linear regression model (y = b + m*x) by gradient descent.

    The parameter vector is ``[b, m]``: index 0 is the y-intercept and
    index 1 is the gradient (slope). Partial derivatives of the cost are
    estimated numerically with a forward difference quotient.
    """

    def __init__(self,
                 feature_data: list[float],
                 response_data: list[float],
                 initial_vector: list[float],
                 learning_rate: float = 0.001,
                 iterations: int = 20) -> None:
        """Initialise the GradientDescent class.

        Args:
            feature_data (list[float]): Feature value for each data point.
            response_data (list[float]): Actual response value for each data point.
            initial_vector (list[float]): Starting point [y-intercept, gradient]
                for the descent.
            learning_rate (float): Factor determining how much to adjust the
                vector at each step.
            iterations (int): Number of times to run the gradient descent step.
        """
        self.feature_data = feature_data
        self.response_data = response_data
        self.initial_vector = initial_vector
        self.learning_rate = learning_rate
        self.iterations = iterations

    def _partial_difference_quotient(self,
                                     vector: list[float],
                                     index: int,
                                     small_change: float = 0.0001) -> float:
        """Estimate d(cost)/d(vector[index]) with a forward difference.

        Args:
            vector (list[float]): Current model parameters [intercept, gradient].
            index (int): Index of the element to perturb.
            small_change (float): Perturbation applied to that element.

        Returns:
            float: The estimated partial derivative of the cost.
        """
        perturbed = vector.copy()
        perturbed[index] += small_change
        return (self._cost(perturbed) - self._cost(vector)) / small_change

    def _estimate_gradient(self, vector: list[float]) -> list[float]:
        """Estimate the gradient of the cost function at ``vector``.

        Args:
            vector (list[float]): Current model parameters [intercept, gradient].

        Returns:
            list[float]: One partial-derivative estimate per element of ``vector``.
        """
        return [self._partial_difference_quotient(vector, i)
                for i in range(len(vector))]

    def _update_vector(self, vector: list[float]) -> list[float]:
        """Take one gradient descent step.

        Args:
            vector (list[float]): Current model parameters [intercept, gradient].

        Returns:
            list[float]: A NEW list moved one step against the gradient.
                The input vector is not mutated (the original implementation
                updated it in place, which also corrupted the aliased
                ``initial_vector`` attribute).
        """
        direction = self._estimate_gradient(vector)
        return [value - self.learning_rate * slope
                for value, slope in zip(vector, direction)]

    def _cost(self, vector: list[float]) -> float:
        """Calculate the cost (halved mean squared error) of the model.

        Prediction for data point i is ``vector[0] + vector[1] * feature[i]``.

        Args:
            vector (list[float]): Model parameters [y-intercept, gradient].

        Returns:
            float: Sum of squared errors divided by ``2 * n`` — the usual
                gradient-descent cost so the factor 2 cancels on differentiation.
        """
        cost = 0.0
        for x, y in zip(self.feature_data, self.response_data):
            error = y - vector[0] - vector[1] * x
            cost += error * error
        return cost / (2 * len(self.response_data))

    def compute_gradient_descent(self) -> tuple[list[float], list[float]]:
        """Perform gradient descent to optimise the model parameters.

        Returns:
            tuple[list[float], list[float]]: The optimised vector and the list
                of cost values, starting with the cost of the initial vector
                (so its length is ``iterations + 1``).
        """
        # Copy so the caller's initial_vector is never mutated and repeated
        # calls always start from the same point. The original assigned the
        # attribute directly, then mutated it in place every iteration.
        next_vector = self.initial_vector.copy()
        cost_values = [self._cost(next_vector)]
        for _ in range(self.iterations):
            next_vector = self._update_vector(next_vector)
            cost_values.append(self._cost(next_vector))
        return next_vector, cost_values
为了测试代码,我随机选择起始向量来尝试最小化落在局部最小值的机会:
def main(runs: int = 50, **kwargs: float | int) -> None:
    """Run gradient descent from several random starting vectors and report
    the run that achieved the lowest final cost.

    Args:
        runs: Number of random restarts.
        **kwargs: Optional ``learning_rate`` (float) and ``iterations`` (int)
            forwarded to GradientDescent.
    """
    # Unpack keyword arguments
    learning_rate = kwargs.get("learning_rate", 0.0001)
    iterations = kwargs.get("iterations", 50)
    # float('inf') guarantees the very first run is always recorded,
    # so the "best" variables cannot be read while unbound.
    lowest_cost_value = float("inf")
    lowest_cost_vector = None
    lowest_cost_starting_vector = None
    lowest_cost_color = None
    # Perform gradient descent for each starting vector and draw the output line on the plot
    for i in range(runs):
        starting_vector = [randint(0, 1500) / 100, randint(0, 1500) / 100]
        gradient_descent = GradientDescent(
            feature_data=data["Exam Score"],
            response_data=data["Study Hours"],
            initial_vector=starting_vector.copy(),
            learning_rate=learning_rate,
            iterations=iterations
        )
        latest_vector, cost_values = gradient_descent.compute_gradient_descent()
        # Compare the FINAL cost: it is the cost of latest_vector. The
        # original compared min(cost_values), which can belong to an earlier
        # iteration and therefore mismatch the vector being recorded.
        final_cost = cost_values[-1]
        if final_cost < lowest_cost_value:
            lowest_cost_value = final_cost
            lowest_cost_vector = latest_vector
            lowest_cost_starting_vector = starting_vector
            # Capture the winning run's colour here; the original read the
            # loop variable after the loop, always naming the LAST run.
            lowest_cost_color = line_colors[i % len(line_colors)]
    print(f"Lowest Cost: Value={lowest_cost_value}, Vector={lowest_cost_vector}, Starting Vector={lowest_cost_starting_vector}, Color={lowest_cost_color}")
输出:
Lowest Cost: Value=1.3919698325898155, Vector=[0.3304394780995701, 0.07838640040339392], Starting Vector=[0.36, 1.92]
我绘制了每次运行的结果,它似乎不正确(粗体红线是我对正确输出的手动估计,而底部的粗体黑线是算法以最低成本进行的最佳尝试): 然而成本函数似乎还不错,最好的成本是 1.39,最差的成本是 16.23:
所有运行的输出似乎都收敛到 (0.015, 0.084) 附近(即接近 (0, 0)),这有什么特殊原因吗?算法输出的解都不能代表数据,而且似乎成本值更高的运行反而产生更准确的梯度,所以我真的不确定我在这里做错了什么。
我的期望是,产生最低成本值的起始向量的输出将是最准确的并且最适合数据。然而,该算法的输出似乎根本无法准确拟合数据,具有一系列梯度但 y 轴截距不正确。
我尝试过调整学习率和随机化初始向量以及手动选择初始向量。我见过关于数据标准化的内容,但我不确定这是否有必要,因为考试分数的值范围仅为 1-100,学习时间的值范围为 1-10。
感谢您的阅读。