Random forest optimization

Problem description

I have this code, but it has to iterate over more than 2000 columns and 3000 rows. Can someone help me avoid the for loop? The data is the classic data you get from Yahoo Finance, so it contains Adj Close, Close, High, Low and Volume; to keep the models comparable, I downloaded the data once and saved it to the file "data full 10".
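For context, the file was saved with a two-level column header (field on the first level, ticker on the second), which is why it is read back with index_col=[0], header=[0,1]. Something roughly like the following reproduces that layout; the tickers and dates here are just placeholders, not the ones I actually used:

import pandas as pd
from pandas_datareader import data as pdr

# Placeholder tickers and date range, only to illustrate the column layout
# (newer pandas_datareader versions may need yfinance's pdr_override() for the Yahoo endpoint)
tickers = ["AAPL", "MSFT", "GOOG"]
raw = pdr.get_data_yahoo(tickers, start="2015-01-01", end="2023-01-01")

# Columns come back as a MultiIndex: level 0 = field (Adj Close, Close, High, Low, Open, Volume),
# level 1 = ticker
raw.to_csv("data 10 full.csv")
check = pd.read_csv("data 10 full.csv", index_col=[0], header=[0,1])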

Here is my code. Can someone help me write a function that avoids the loop? Thanks for any suggestions.

import pandas as pd
from pandas_datareader import data as pdr
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from datetime import datetime, timedelta

# Load data
data = pd.read_csv("data 10 full.csv", index_col=[0], header=[0,1])

# Get number of stocks
num_stock = len(data['Adj Close'].columns)

# Define window size
window_size = 60

# Define grid for hyperparameter tuning
grid_rf = {
    'n_estimators': [500],  
    'max_depth': [5,10,15,20,25,30],  
    'min_samples_split': [2,5,10,15,20,25,30], 
    'min_samples_leaf': [1,5,10,15,20,25,30]
}

# Initialize predictions array
predictions = []

# Define a function to perform the random search and return the trained model
def train_model(x_train, y_train):
    # Drop rows that are not present in both datasets
    idx = x_train.index.intersection(y_train.index)
    x_train = x_train.loc[idx]
    y_train = y_train.loc[idx]

    # Fit randomized search cross validation
    model = RandomForestRegressor() 
    rscv = RandomizedSearchCV(estimator=model, param_distributions=grid_rf, cv=3, n_jobs=-1, verbose=2, n_iter=40)
    x_train = x_train.fillna(0)
    y_train = y_train.fillna(0)
    rscv_fit = rscv.fit(x_train, y_train)
    best_parameters = rscv_fit.best_params_

    # Train model
    model = RandomForestRegressor(n_estimators=best_parameters['n_estimators'], 
                                   min_samples_split=best_parameters['min_samples_split'], 
                                   min_samples_leaf=best_parameters['min_samples_leaf'], 
                                   max_depth=best_parameters['max_depth'])
    model = model.fit(x_train, y_train)
    return model


# Define a function to make predictions on a single window
def predict_window(window, model):
    x_test = window.iloc[-1,:-num_stock].fillna(0)
    prediction = model.predict(x_test.values.reshape(1, -1))
    return prediction[0]

# Use rolling to iterate over windows
models = []
for i in range(window_size+1, len(data)):
    # Get current window
    window = data.iloc[i-window_size-1:i]

    # Define training and testing data
    x_train = window.iloc[:-1,:-num_stock].fillna(0)
    y_train = window.iloc[:-1,num_stock:].fillna(0).shift(-1).dropna()

    # Train model and store it
    model = train_model(x_train, y_train)
    models.append(model)

    # Make predictions and append to list
    prediction = predict_window(window, model)
    predictions.append(prediction)

# Convert predictions to dataframe and set index
predictions_df = pd.DataFrame(predictions, columns=data.columns.get_level_values(0))
predictions_df.set_index(data.index[window_size+1:], inplace=True)

# Calculate root mean squared error
rmse = np.sqrt(mean_squared_error(data.iloc[window_size+1:, :num_stock], predictions_df))
print('RMSE: ', rmse)

# Plot actual and predicted data
import matplotlib.pyplot as plt
plt.plot(data.index, data.iloc[:, :num_stock])
plt.plot(predictions_df.index, predictions_df)
plt.legend(['Actual', 'Predicted'])
plt.show()
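For reference, one direction I have been considering (just a sketch; the helper name fit_window is a placeholder): run the randomized search only once on the first window and reuse best_params_ for all later windows, so each iteration fits a single forest instead of the 40 x 3 cross-validation fits done by RandomizedSearchCV:

# Sketch only: tune hyperparameters once on the first window, then reuse them
first_window = data.iloc[:window_size+1]  # same slice as the first window in the loop
x0 = first_window.iloc[:-1, :-num_stock].fillna(0)
y0 = first_window.iloc[:-1, num_stock:].fillna(0).shift(-1).dropna()
idx0 = x0.index.intersection(y0.index)

search = RandomizedSearchCV(RandomForestRegressor(), grid_rf, cv=3, n_iter=40, n_jobs=-1)
search.fit(x0.loc[idx0], y0.loc[idx0])
best_params = search.best_params_  # reused for every later window

def fit_window(x_train, y_train):
    # One plain fit per window instead of a full randomized search
    model = RandomForestRegressor(**best_params, n_jobs=-1)
    return model.fit(x_train, y_train)

Is something along these lines reasonable, or is there a way to avoid the per-window loop entirely?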
Tags: for-loop, machine-learning, random-forest, prediction