目录文件:
D:\Development\AllDataSet\Data\2020\ER ['50K_PreProcessed.csv']
D:\Development\AllDataSet\Data\2020\PMV ['50K_PreProcessed.csv']
D:\Development\AllDataSet\Data\2020\PNY ['50K_PreProcessed.csv']
D:\Development\AllDataSet\Data\2021\ER ['50K_PreProcessed.csv']
D:\Development\AllDataSet\Data\2021\PMV ['50K_PreProcessed.csv']
D:\Development\AllDataSet\Data\2021\PNY ['50K_PreProcessed.csv']
D:\Development\AllDataSet\Data\2022_2023\ER ['50K_PreProcessed.csv']
D:\Development\AllDataSet\Data\2022_2023\PMV ['50K_PreProcessed.csv']
D:\Development\AllDataSet\Data\2022_2023\PNY ['50K_PreProcessed.csv']
我有以下代码
# NOTE(review): the indentation of this snippet was lost, so the nesting of the
# statements below cannot be determined from the text alone. The comments
# describe each statement; remarks about loop structure are assumptions.
# Excel workbook that receives the selected feature names (xlsxwriter engine).
writer = pd.ExcelWriter('Allpros.xlsx', engine='xlsxwriter')
# num = 1
# Each entry pairs a display label with an estimator instance.
classifiers = [
['ExtraTreesClassifier :', ExtraTreeClassifier(min_samples_split=2, random_state = 2)],
['LGBMClassifier : ', LGBMClassifier(n_estimators = 400, max_depth=15,learning_rate=1)],
['XGB :', XGBClassifier(tree_method = "hist", random_state= 2, learning_rate= 1)],
]
# Walk the directory tree rooted at `filepath` (defined outside this snippet).
for root,dirs,files in os.walk(filepath):
for i in files:
# Only files whose name ends with 'PreProcessed.csv' are processed.
if i.endswith('PreProcessed.csv'):
uio = f'{root}/{i}'
# rsplit on backslashes keeps the last two path components of the directory:
# element [1] is used as the year and element [2] as the strategy.
year = root.rsplit('\\',2, )[1]
strategy = root.rsplit('\\',2, )[2]
print(root,files)
# Read only the columns listed in `col` (defined outside this snippet) and
# only the first 500 rows.
df = pd.read_csv(uio,usecols = col, nrows = 500)
# Cast every column to float64. This loop reuses the name `i` from the file loop.
for i in df.columns:
df[i] = df[i].astype('float64')
# 'WinTrade' is the target; all remaining columns are the features.
X = df.drop(['WinTrade'], axis=1)
Y = df.WinTrade
# 70/30 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=32, stratify= Y)
predictions_df = []
# Fit each classifier and record Cohen's kappa as a percentage rounded to one decimal.
for name,classifier in classifiers:
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
kappa = round(cohen_kappa_score(y_test, predictions) * 100, 1)
predictions_df.append([name,kappa])
# The list of [label, kappa] pairs is rebound to a DataFrame under the same name.
predictions_df = pd.DataFrame(predictions_df, columns=['Algo','Kappa',])
display(predictions_df.sort_values('Kappa', ascending=False))
# `num` is the start column passed to to_excel below.
# NOTE(review): if this assignment sits inside the per-file loop, every file
# starts again at column 1 and writes over earlier output - confirm against
# the original indentation.
num = 1
for name ,classifier in classifiers:
# Keep only the two labels with the highest Kappa; the ranking is recomputed
# on every iteration.
if name in predictions_df.sort_values('Kappa', ascending=False)['Algo'][:2].to_list():
print(name,year,strategy)
Model = classifier
# The selected classifier is fitted a second time on the same training data.
Model.fit(X_train, y_train)
importances = Model.feature_importances_
# Feature importances indexed by column name, largest first.
weights = pd.Series(importances, index=X.columns.values).sort_values(ascending=False)
print(weights.index[:5].to_list())
# Write the first three feature names (first column only) under a header of the
# form '<strategy>_<year>', starting at row 1 and column `num`.
weights.reset_index().rename(columns= {'index': str(strategy)+ '_'+ str(year)}).iloc[:3,:1].to_excel(writer,startcol = num, startrow = 1, index = False,)
# Advance by two columns for the next write.
num = num+2
# Both save() and close() are called on the writer.
# NOTE(review): presumably only one finalising call is needed, and its position
# relative to the file loop matters - confirm against the pandas ExcelWriter docs.
writer.save()
writer.close()
这段代码会遍历每个文件并运行上面列出的模型,然后把每个文件中表现最好的 2 个模型的前 3 个特征写入同一个 Excel 文件。
我期望的输出是在一个 Excel 文件中得到每个年份、每个文件的特征,但我只得到了第一个文件的输出。我知道是某处的循环写错了,但找不出问题在哪里。请帮忙。
我认为你没有得到预期输出的原因就在你收到的警告中:
FutureWarning: save 不是公共 API 的一部分,使用它可能产生意想不到的结果,并将在未来的版本中删除
你可以使用 pd.ExcelWriter 的上下文管理器
和 pathlib 来使你的代码更清晰、更简短。下面是一个示例,帮助你理解总体思路。请根据需要进行调整;如果没有帮助,请告诉我 ;):
# Candidate models, each paired with the label used in the Kappa ranking.
classifiers = [
    ['ExtraTreesClassifier:', ExtraTreeClassifier(min_samples_split=2, random_state=2)],
    ['LGBMClassifier:', LGBMClassifier(n_estimators=400, max_depth=15, learning_rate=1)],
    ['XGB:', XGBClassifier(tree_method="hist", random_state=2, learning_rate=1)],
]

# Columns to load from every CSV: the features plus the 'WinTrade' target.
cols_to_use = ['column1', 'column2', 'column3', 'column4', 'column5', 'WinTrade']

# The context manager finalises the workbook once, after every file is written.
with pd.ExcelWriter('Allpros.xlsx') as writer:
    # Start column for the next block of feature names; it is initialised once,
    # so it keeps advancing across all files.
    start_col = 1

    for filepath in Path('tmp/').rglob('*.csv'):
        # .../<year>/<strategy>/<file>.csv -> the two directories above the file.
        year, strategy = filepath.parts[-3:-1]

        df = pd.read_csv(filepath, usecols=cols_to_use, nrows=500, dtype="float64")
        X = df.drop(['WinTrade'], axis=1)
        Y = df["WinTrade"]
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.3, random_state=32, stratify=Y
        )

        # Score every model with Cohen's kappa (percentage, one decimal).
        scores = []
        for name, classifier in classifiers:
            classifier.fit(X_train, y_train)
            predictions = classifier.predict(X_test)
            kappa = round(cohen_kappa_score(y_test, predictions) * 100, 1)
            scores.append([name, kappa])

        predictions_df = pd.DataFrame(scores, columns=['Algo', 'Kappa'])
        predictions_df = predictions_df.sort_values(by=['Kappa'], ascending=False)

        # Labels of the two best-scoring models for this file.
        top_two = predictions_df['Algo'].tolist()[:2]
        col_name = f'{strategy}_{year}'

        for name, classifier in classifiers:
            if name not in top_two:
                continue

            classifier.fit(X_train, y_train)
            importances = classifier.feature_importances_
            weights = pd.Series(importances, index=X.columns).sort_values(ascending=False)

            # Keep only the names of the three most important features, placed
            # under a '<strategy>_<year>' header.
            top_features = weights.reset_index().rename(columns={'index': col_name})
            top_features = top_features.iloc[:3, :1]
            top_features.to_excel(writer, startrow=1, startcol=start_col, index=False)

            # Leave one empty column between consecutive blocks.
            start_col += 2
输出(电子表格):
这里是使用的目录/数据/示例:
.. # <- the script above was run from here
┗━━ tmp
┗━━ Development
┗━━ AllDataSet
┗━━ Data
┣━━ 2020
┃ ┣━━ ER
┃ ┃ ┗━━ 50K_PreProcessed.csv
┃ ┣━━ PMV
┃ ┃ ┗━━ 50K_PreProcessed.csv
┃ ┗━━ PNY
┃ ┗━━ 50K_PreProcessed.csv
┣━━ 2021
┃ ┣━━ ER
┃ ┃ ┗━━ 50K_PreProcessed.csv
┃ ┣━━ PMV
┃ ┃ ┗━━ 50K_PreProcessed.csv
┃ ┗━━ PNY
┃ ┗━━ 50K_PreProcessed.csv
┗━━ 2022_2023
┣━━ ER
┃ ┗━━ 50K_PreProcessed.csv
┣━━ PMV
┃ ┗━━ 50K_PreProcessed.csv
┗━━ PNY
┗━━ 50K_PreProcessed.csv