我被要求使用泰坦尼克号数据集编写代码并执行以下任务:
我编写了以下代码,但该代码不起作用。我认为我在编写代码时犯了错误。混淆矩阵根本不准确。你能帮我解决这些问题吗?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df=pd.read_csv("titanic_train")
df.describe()
(df['Survived'].value_counts()) / len(df) * 100
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap = 'viridis')
sns.countplot(x='Survived',data=df,hue = 'Sex')
sns.countplot(x='Survived',data=df,hue = 'Pclass')
sns.heatmap(df.corr(),cmap='coolwarm')
df['Age'].dropna().plot.hist(bins=30)
df.groupby('Survived', group_keys=False).apply(lambda x: x.sample(frac=0.3)) #Create stratified sampling test set
df["Survived"].value_counts(normalize=True)*100
df.isnull().sum()
missing_values=df.isna().any()
print("Columns with missing values: \n{0}".format(missing_values[missing_values==True].index.tolist()))
categoricals=df.nunique().sort_values(ascending=True)
print("Categorical Variables in df data: \n{0}".format(categoricals))
def clean_data(df):
df.drop(['Cabin'], axis=1, inplace=True)
df.drop(['Embarked', 'Fare', 'Ticket', 'Name'], axis=1, inplace=True)
return df
def impute_age(cols):
Age=cols[0]
Pclass=cols[1]
if(pd.isnull(Age)):
if(Pclass==1):
return 37
elif(Pclass==2):
return 29
else:
return 24
else:
return Age
df["Age"]=df[["Age","Pclass"]].apply(impute_age,axis=1)
X=df[['Age','Sex','Pclass']]
from sklearn.model_selection import train_test_split
df_train, df_test, df_train, df_test = train_test_split(df, df,stratify=y,test_size=0.3)
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.35, random_state=1, stratify=y)
X=df[['Pclass','Sex','Age']]
y=df['Survived']
from sklearn.model_selection import train_test_split
y_train.value_counts(normalize=True)*100
Y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.35, stratify=y, random_state = 12)
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors =10)
KNN.fit(X_train,y_train)
y_prediction = KNN.predict(X_test)
from sklearn.metrics import confusion_matrix, classification_report
[[177 15]
[ 65 55]]
print(classification_report(y_test,y_prediction))
精确召回 f1-score 支持
0 0.73 0.92 0.82 192
1 0.79 0.46 0.58 120
accuracy 0.74 312
宏观平均 0.76 0.69 0.70 312 加权平均 0.75 0.74 0.72 312
当 k 值增加时,错误率显着增加。
更改以下内容
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df=pd.read_csv("titanic_train") #<-- change this to incliude the file extansion like df=pd.read_csv("titanic_train.csv") assuming the dataset is in CSV format.
df.describe()
(df['Survived'].value_counts()) / len(df) * 100
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap = 'viridis')
sns.countplot(x='Survived',data=df,hue = 'Sex')
sns.countplot(x='Survived',data=df,hue = 'Pclass')
sns.heatmap(df.corr(),cmap='coolwarm')
df['Age'].dropna().plot.hist(bins=30)
df.groupby('Survived', group_keys=False).apply(lambda x: x.sample(frac=0.3)) #Create stratified sampling test set
df["Survived"].value_counts(normalize=True)*100
df.isnull().sum()
missing_values=df.isna().any()
print("Columns with missing values: \n{0}".format(missing_values[missing_values==True].index.tolist()))
categoricals=df.nunique().sort_values(ascending=True)
print("Categorical Variables in df data: \n{0}".format(categoricals))
def clean_data(df):
df.drop(['Cabin'], axis=1, inplace=True)
df.drop(['Embarked', 'Fare', 'Ticket', 'Name'], axis=1, inplace=True)
return df
def impute_age(cols):
Age=cols[0]
Pclass=cols[1]
if(pd.isnull(Age)):
if(Pclass==1):
return 37
elif(Pclass==2):
return 29
else:
return 24
else:
return Age
df["Age"]=df[["Age","Pclass"]].apply(impute_age,axis=1)
X=df[['Age','Sex','Pclass']]
from sklearn.model_selection import train_test_split
df_train, df_test, df_train, df_test = train_test_split(df, df,stratify=y,test_size=0.3) #<-- df_train and df_test is assinged twice and used incorrect #variables. it should be df_train, df_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3).
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.35, random_state=1, stratify=y)
X=df[['Pclass','Sex','Age']] #<--X=df[['Age','Sex','Pclass']] is duplicated. removed it.
y=df['Survived']
from sklearn.model_selection import train_test_split
y_train.value_counts(normalize=True)*100
Y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.35, stratify=y, random_state = 12)
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors =10)
KNN.fit(X_train,y_train)
y_prediction = KNN.predict(X_test)
from sklearn.metrics import confusion_matrix, classification_report
[[177 15] #<-- assing it correctely like confusion_matrix = [[177, 15], [65, 55]]
[ 65 55]]
print(classification_report(y_test,y_prediction))
尝试一下并告诉我们