练习说明如下:
我的做法:分别预测每张图片中圆形星星和方形星星的数量,再比较这两个预测列表,但得到的 accuracy_score 只有 0.715。
import sklearn
import sklearn.linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import numpy as np
# Load the training archive. np.load on an .npz returns a lazy archive that
# loads an array each time it is indexed, so read each array once and reuse it.
data = np.load('stars_train_X_y.npz')
X = data['X_train']  # np.ndarray of size (8000, 4096), flattened image (64x64) of the night sky.
y = data['y_train']  # np.ndarray of size (8000, 2), number of circular stars and number of square stars

# Split the dataset into 80% train, 10% test, 10% validation: hold out 20%
# first, then cut that hold-out in half. The fixed seed keeps the split
# reproducible between runs.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=0, shuffle=True
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=0, shuffle=True
)
def shapeList(inp, elm):
    """Return one column of a sequence of rows as a list.

    Args:
        inp: Iterable of indexable rows, e.g. the label array whose rows are
            (circular star count, square star count).
        elm: Index to pick from every row (for the label array: 0 selects the
            circular count, 1 the square count).

    Returns:
        A list holding ``row[elm]`` for every row of ``inp``, in order.
    """
    return [row[elm] for row in inp]
# Split the two-column labels into per-shape targets:
# column 0 is the circular star count, column 1 the square star count.
cLstTrain, sLstTrain = (shapeList(y_train, column) for column in (0, 1))
cLstTest, sLstTest = (shapeList(y_test, column) for column in (0, 1))
def predict(trainX_, trainY_, testX_, testY_):
    """Fit a logistic-regression classifier and report its quality on a test set.

    LogisticRegression is a classifier, so the targets are treated as class
    labels: every distinct star count is its own class.

    Args:
        trainX_: Feature rows used for fitting.
        trainY_: One target label per row of ``trainX_``.
        testX_: Feature rows to predict.
        testY_: True labels for ``testX_``; used only for the printed metrics.

    Returns:
        The labels predicted for ``testX_``, as returned by ``model.predict``.

    Side effects:
        Prints the mean squared error and the accuracy of the predictions.
    """
    model = sklearn.linear_model.LogisticRegression(random_state=0, max_iter=1000)
    model.fit(trainX_, trainY_)
    pred = model.predict(testX_)

    mse = mean_squared_error(testY_, pred)
    acc = accuracy_score(testY_, pred)
    print(f"MSE : {mse}")
    print(f"ACC : {acc}")
    return pred
# Train one classifier per star shape and predict that shape's count
# for every image in the test split.
circularPredictions = predict(
    trainX_=X_train, trainY_=cLstTrain, testX_=X_test, testY_=cLstTest
)
squarePredictions = predict(
    trainX_=X_train, trainY_=sLstTrain, testX_=X_test, testY_=sLstTest
)
def compare(CircPred, SqrePred):
    """Label each picture by whether it has more circular than square stars.

    Args:
        CircPred: Circular star count per picture.
        SqrePred: Square star count per picture; indexable and aligned with
            ``CircPred``.

    Returns:
        A list with one single-element list per picture: ``[1]`` when the
        circular count is strictly greater than the square count, ``[0]``
        otherwise (ties included).

    Raises:
        IndexError: If ``SqrePred`` has fewer entries than ``CircPred``.
    """
    # SqrePred is indexed rather than zipped so that a too-short SqrePred
    # still fails loudly instead of silently truncating the result.
    return [
        [1 if circ > SqrePred[i] else 0]
        for i, circ in enumerate(CircPred)
    ]
# Reduce ground truth and predictions to the same binary label
# (1 = more circular than square stars), then score how often they agree.
testValues = compare(cLstTest, sLstTest)
predictions = compare(circularPredictions, squarePredictions)
print(accuracy_score(y_true=testValues, y_pred=predictions))
为了提高准确性,我尝试了以下方法:
网格搜索
我用过:
# Hyper-parameter search over the regularisation type and strength.
from sklearn.model_selection import GridSearchCV

# Same estimator configuration as the one built inside predict().
model = sklearn.linear_model.LogisticRegression(random_state=0, max_iter=1000)
paramGrid = {
    # The default 'lbfgs' solver rejects penalty='l1', so every l1 candidate
    # would fail to fit; 'saga' supports both l1 and l2.
    'solver': ['saga'],
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10, 100],
}
model_search = GridSearchCV(model, paramGrid, cv=5)
这样准确率略有提升,达到 0.72,但运行时间从 2 分钟增加到了 18 分钟。
我很确定解决办法是先对数据进行预处理。我已经尝试了以下技术:
我在这方面还是新手,你能建议一些合适的预处理技术吗?或者有没有其他方法可以提高准确率?
提前致谢:D