我有一个包含100000行的汽车数据集。其中有一列是“制造商”,它对输出结果很重要,因此我不能删除它。我应该如何处理这个问题?
很难判断您具体想做什么。首先,10万条记录并不算庞大。您提到的是类别型数据,所以我认为您说的是分类问题,因此下面的示例代码将以分类为重点。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from pylab import rcParams
import seaborn as sb
import scipy
from scipy.stats import spearmanr
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import train_test_split
import sklearn.metrics as sm
# Load the mtcars sample dataset straight from the web.
url = 'https://python-graph-gallery.com/wp-content/uploads/mtcars.csv'
df = pd.read_csv(url)

# Check for nulls. Printed explicitly: outside a notebook cell a bare
# expression's value is silently discarded.
print(df.isnull().sum())

# Features are the columns at positions 2 and 10; the target is the column
# at position 1.
data = df.iloc[:, [2, 10]].values
# Selecting with a list keeps `y` as an (n, 1) column vector. Its shape is
# left untouched because later code reuses `y`; it is flattened only where
# scikit-learn expects a 1-D target.
y = df.iloc[:, [1]].values

# classification
# The snippet originally called LogReg.predict(x) without ever defining
# `LogReg` or `x`, which raises NameError. The missing fit step is restored
# here.
# NOTE(review): standardising the features with `scale` is an assumption
# based on the file importing it without using it elsewhere - confirm this
# is the intended preprocessing.
x = scale(data)
LogReg = LogisticRegression()
LogReg.fit(x, y.ravel())
y_predict = LogReg.predict(x)

# The report below is computed on the same rows the model was fitted on, so
# it describes training fit rather than held-out performance.
from sklearn.metrics import classification_report
report = classification_report(y.ravel(), y_predict)
print(report)
# Result:
              precision    recall  f1-score   support

           4       0.91      0.91      0.91        11
           6       1.00      0.43      0.60         7
           8       0.78      1.00      0.88        14

    accuracy                           0.84        32
   macro avg       0.90      0.78      0.79        32
weighted avg       0.87      0.84      0.83        32
# continuing...
from sklearn.linear_model import LinearRegression

# Hold out a quarter of the rows for testing and keep the rest for training.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.25,
    train_size=0.75,
)

# fit() returns the estimator itself, so construction and fitting can be
# chained while still binding the fitted model to `lm`.
lm = LinearRegression().fit(X_train, y_train)
predictions = lm.predict(X_test)

# Plot true targets (x axis) against the model's predictions (y axis).
plt.scatter(x=y_test, y=predictions)
# find Best pipeline:
# Configure the TPOT classifier search: a population of 40 pipelines, a
# five-minute time budget, and verbosity level 2.
tpot = TPOTClassifier(
    population_size=40,
    max_time_mins=5,
    verbosity=2,
)
# Run the search on the training split.
tpot.fit(X_train, y_train)
# Final Result:
Best pipeline: RandomForestClassifier(SGDClassifier(FastICA(input_matrix, tol=0.6000000000000001), alpha=0.01, eta0=1.0, fit_intercept=True, l1_ratio=0.75, learning_rate=invscaling, loss=modified_huber, penalty=elasticnet, power_t=0.1), bootstrap=False, criterion=entropy, max_features=0.45, min_samples_leaf=1, min_samples_split=3, n_estimators=100)
Out[128]:
TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
disable_update_check=False, early_stop=None, generations=100,
max_eval_time_mins=5, max_time_mins=5, memory=None,
mutation_rate=0.9, n_jobs=1, offspring_size=None,
periodic_checkpoint_folder=None, population_size=40,
random_state=None, scoring=None, subsample=1.0, template=None,
use_dask=False, verbosity=2, warm_start=False)