Python: How can we parallelize a Python program to take advantage of a GPU server?

Problem description (votes: 0, answers: 1)

In our lab, we have a compute server with an NVIDIA Tesla K80 GPU accelerator and the following specifications:

Intel(R) Xeon(R) CPU E5-2670 v3 @ 2.30GHz, 48 CPU processors, 12 CPU cores, 128GB RAM,
running under 64-bit Linux.

I am running the following code, which vertically appends several dataframes into a single one and then fits a RandomForestRegressor inside a GridSearchCV. The two sample datasets I am considering can be found at this link:
import sys
import glob
import os
import pandas as pd
import math
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from math import sqrt


df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('', "cubic*.csv"))), ignore_index=True)
#df = pd.read_csv('cubic31.csv')

for i in range(1,3):
    df['X_t'+str(i)] = df['X'].shift(i)

print(df)

df.dropna(inplace=True)

# note: this X is immediately overwritten by the assignment below
X = (pd.DataFrame({'X_%d' % i: df['X'].shift(i) for i in range(3)}).apply(np.nan_to_num, axis=0).values)

X = df.drop('Y', axis=1)
y = df['Y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)

X_train = X_train.drop('time', axis=1)
X_test = X_test.drop('time', axis=1)

# Fit models with grid search (cv=5, not too low) and use the best model
parameters = {'n_estimators': [10,30,100,500,1000]}
clf_rf = RandomForestRegressor(random_state=1)
clf = GridSearchCV(clf_rf, parameters, cv=5, scoring='neg_mean_squared_error')
model = clf.fit(X_train, y_train)
model.cv_results_['params'][model.best_index_]
math.sqrt(model.best_score_*-1)
#####
print()
print(model.cv_results_)  # grid_scores_ was removed in scikit-learn 0.20; cv_results_ replaces it

print(math.sqrt(model.best_score_ * -1))

#reg = RandomForestRegressor(criterion='mse')
clf_rf.fit(X_train,y_train)
modelPrediction = clf_rf.predict(X_test)
print(modelPrediction)

print("Number of predictions:",len(modelPrediction))

meanSquaredError=mean_squared_error(y_test, modelPrediction)
print("Mean Square Error (MSE):", meanSquaredError)
rootMeanSquaredError = sqrt(meanSquaredError)
print("Root-Mean-Square Error (RMSE):", rootMeanSquaredError)


####### to add the trendline
fig, ax = plt.subplots()
#df.plot(x='time', y='Y', ax=ax)
ax.plot(df['time'].values, df['Y'].values)


fig, ax = plt.subplots()
index_values=range(0,len(y_test))

y_test.sort_index(inplace=True)
X_test.sort_index(inplace=True)

modelPred_test = clf_rf.predict(X_test)
ax.plot(pd.Series(index_values), y_test.values)


PlotInOne=pd.DataFrame(pd.concat([pd.Series(modelPred_test), pd.Series(y_test.values)], axis=1))

plt.figure()
PlotInOne.plot()
plt.legend(loc='best')

When I run this program on a huge dataset (about 2 million rows), the GridSearchCV takes more than 3 days to finish. I am therefore wondering whether Python threads can make use of multiple CPUs. How can we make this (or any other Python) program utilize multiple CPUs so that it completes the task in a shorter time? Thanks for any tips!

python python-3.x gpu multi-gpu tesla
1 Answer
0 votes

You may want to first try using n_jobs=-1 in your existing GridSearchCV code: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
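For example, a minimal sketch of that change, reusing the parameter grid from the question:

parameters = {'n_estimators': [10, 30, 100, 500, 1000]}
clf_rf = RandomForestRegressor(random_state=1)
# n_jobs=-1 asks scikit-learn to use all available CPU cores; the
# 5 folds x 5 parameter settings = 25 fit/score tasks are then
# dispatched to worker processes in parallel.
clf = GridSearchCV(clf_rf, parameters, cv=5,
                   scoring='neg_mean_squared_error', n_jobs=-1)
model = clf.fit(X_train, y_train)

RandomForestRegressor also accepts its own n_jobs parameter to build trees in parallel, but setting it together with a parallel grid search can oversubscribe the cores, so parallelizing at the GridSearchCV level is usually enough.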

As suggested, multiprocessing lets you use more CPUs to reduce processing time. The code below is an example of how to change your code to incorporate multiprocessing:

import sys
import glob
import os
import pandas as pd
import math
import multiprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from math import sqrt

def train_random_forest(X_train, y_train):
    clf_rf = RandomForestRegressor(random_state=1)
    parameters = {'n_estimators': [10, 30, 100, 500, 1000]}
    clf = GridSearchCV(clf_rf, parameters, cv=5, scoring='neg_mean_squared_error')
    model = clf.fit(X_train, y_train)
    return model

def main():
    df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('', "cubic*.csv"))), ignore_index=True)
    # df = pd.read_csv('cubic31.csv')

    for i in range(1, 3):
        df['X_t' + str(i)] = df['X'].shift(i)

    print(df)

    df.dropna(inplace=True)

    # note: this X is immediately overwritten by the assignment below
    X = (pd.DataFrame({'X_%d' % i: df['X'].shift(i) for i in range(3)}).apply(np.nan_to_num, axis=0).values)

    X = df.drop('Y', axis=1)
    y = df['Y']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)

    X_train = X_train.drop('time', axis=1)
    X_test = X_test.drop('time', axis=1)

    # Note: pool.apply blocks and runs train_random_forest in a single
    # worker process, so the grid search is not itself spread across the
    # pool; combine this with n_jobs=-1 in GridSearchCV (see above) for
    # real parallelism.
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

    results = pool.apply(train_random_forest, (X_train, y_train))
    modelPrediction = results.predict(X_test)

    pool.close()
    pool.join()

    print("Number of predictions:", len(modelPrediction))

    meanSquaredError = mean_squared_error(y_test, modelPrediction)
    print("Mean Square Error (MSE):", meanSquaredError)
    rootMeanSquaredError = sqrt(meanSquaredError)
    print("Root-Mean-Square Error (RMSE):", rootMeanSquaredError)

    fig, ax = plt.subplots()
    # df.plot(x='time', y='Y', ax=ax)
    ax.plot(df['time'].values, df['Y'].values)

    fig, ax = plt.subplots()
    index_values = range(0, len(y_test))

    y_test.sort_index(inplace=True)
    X_test.sort_index(inplace=True)

    modelPred_test = results.predict(X_test)
    ax.plot(pd.Series(index_values), y_test.values)

    PlotInOne = pd.DataFrame(pd.concat([pd.Series(modelPred_test), pd.Series(y_test.values)], axis=1))

    plt.figure()
    PlotInOne.plot()
    plt.legend(loc='best')

if __name__ == "__main__":
    main()
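As noted in the comment above, pool.apply dispatches a single blocking task to one worker process, so by itself it will not spread the grid search across CPUs; it mainly shows the multiprocessing plumbing. As a sketch of genuinely concurrent work with the same Pool API, the hypothetical helper train_one below (not part of the original code) fits one forest per candidate n_estimators value via starmap; the dummy data is only there to keep the example self-contained:

import multiprocessing

import numpy as np
from sklearn.ensemble import RandomForestRegressor

def train_one(n_estimators, X_train, y_train):
    # Each call runs in its own worker process, so the five fits
    # proceed concurrently instead of one after another.
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=1)
    model.fit(X_train, y_train)
    return n_estimators, model

if __name__ == "__main__":
    # Dummy data to keep the sketch self-contained; substitute the
    # real X_train / y_train from the question.
    rng = np.random.RandomState(0)
    X_train, y_train = rng.rand(1000, 3), rng.rand(1000)

    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        fitted = dict(pool.starmap(
            train_one,
            [(n, X_train, y_train) for n in [10, 30, 100, 500, 1000]]))
    print(sorted(fitted))

Bear in mind that each worker receives its own pickled copy of the training data, so on a 2-million-row dataset this multiplies memory use by the pool size; GridSearchCV with n_jobs=-1 (backed by joblib, which can memory-map large NumPy arrays between processes) is usually the better first step.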