Machine Learning
Titanic Survivor Classification Prediction
J.H_DA
2022. 4. 13. 15:28
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
titanic_df = pd.read_csv('./datasets/titanic_train.csv', encoding='utf-8')
titanic_df.head(3)
Out[46]:
  | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S
In [47]:
### Data info
print('\n ### train data info ### \n')
print(titanic_df.info())
### train data info ###
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
In [48]:
titanic_df.describe()
Out[48]:
      | PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000
mean  | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208
std   | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429
min   | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000
25%   | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400
50%   | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200
75%   | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000
max   | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200
In [49]:
titanic_df.isnull().sum()
Out[49]:
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
In [50]:
titanic_df['Age'].fillna(titanic_df['Age'].mean(), inplace=True)
titanic_df['Cabin'].fillna('N', inplace=True)
titanic_df['Embarked'].fillna('N', inplace=True)
print('Number of null values in the dataset:', titanic_df.isnull().sum().sum())
Number of null values in the dataset: 0
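Mean imputation is what this notebook sticks with; as a hedged alternative (not used below), the median is less sensitive to outliers in right-skewed columns such as Age and Fare. A minimal sketch, with titanic_med as a hypothetical scratch name:
# Median-based variant of the imputation above (sketch only; the rest of
# this notebook keeps the mean-imputed titanic_df).
titanic_med = pd.read_csv('./datasets/titanic_train.csv', encoding='utf-8')
titanic_med['Age'] = titanic_med['Age'].fillna(titanic_med['Age'].median())
titanic_med['Cabin'] = titanic_med['Cabin'].fillna('N')
titanic_med['Embarked'] = titanic_med['Embarked'].fillna('N')
print('Remaining nulls:', titanic_med.isnull().sum().sum())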
In [51]:
titanic_df.isnull().sum()
Out[51]:
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 0
Embarked 0
dtype: int64
In [52]:
print('Sex value distribution: \n', titanic_df['Sex'].value_counts())
print('\n Cabin value distribution: \n', titanic_df['Cabin'].value_counts())
print('\n Embarked value distribution: \n', titanic_df['Embarked'].value_counts())
Sex value distribution:
male 577
female 314
Name: Sex, dtype: int64
Cabin value distribution:
N 687
C23 C25 C27 4
G6 4
B96 B98 4
C22 C26 3
...
E34 1
C7 1
C54 1
E36 1
C148 1
Name: Cabin, Length: 148, dtype: int64
Embarked value distribution:
S 644
C 168
Q 77
N 2
Name: Embarked, dtype: int64
In [53]:
titanic_df['Cabin'] = titanic_df['Cabin'].str[:1]
print(titanic_df['Cabin'].head())
0 N
1 C
2 N
3 C
4 N
Name: Cabin, dtype: object
In [54]:
titanic_df['Cabin'].value_counts()
Out[54]:
N 687
C 59
B 47
D 33
E 32
A 15
F 13
G 4
T 1
Name: Cabin, dtype: int64
In [55]:
titanic_df.groupby(['Sex', 'Survived'])['Survived'].count()
Out[55]:
Sex Survived
female 0 81
1 233
male 0 468
1 109
Name: Survived, dtype: int64
In [56]:
sns.barplot(x="Sex", y="Survived", data=titanic_df)
plt.show()
sns.barplot(x='Pclass', y="Survived", hue="Sex", data=titanic_df)
[bar plot: survival rate by Sex]
Out[56]:
<AxesSubplot:xlabel='Pclass', ylabel='Survived'>
[bar plot: survival rate by Pclass, split by Sex]
In [57]:
def get_category(age):
    cat = ''
    if age <= -1: cat = "Unknown"
    elif age <= 5: cat = "Baby"
    elif age <= 12: cat = "Child"
    elif age <= 18: cat = "Teenager"
    elif age <= 25: cat = "Student"
    elif age <= 35: cat = "Young Adult"
    elif age <= 60: cat = "Adult"
    else: cat = "Elderly"
    return cat

plt.figure(figsize=(10, 6))
group_names = ["Unknown", "Baby", "Child", "Teenager", "Student", "Young Adult", "Adult", "Elderly"]
titanic_df['Age_cat'] = titanic_df['Age'].apply(lambda x: get_category(x))
sns.barplot(x='Age_cat', y='Survived', hue='Sex', data=titanic_df, order=group_names)
titanic_df.drop('Age_cat', axis=1, inplace=True)
[bar plot: survival rate by age category, split by Sex]
In [58]:
from sklearn import preprocessing
def encode_features(dataDF):
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(dataDF[feature])
        dataDF[feature] = le.transform(dataDF[feature])
    return dataDF

titanic_df = encode_features(titanic_df)
titanic_df.head()
Out[58]:
  | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | 7 | 3
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 38.0 | 1 | 0 | PC 17599 | 71.2833 | 2 | 0
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | 7 | 3
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 35.0 | 1 | 0 | 113803 | 53.1000 | 2 | 3
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 35.0 | 0 | 0 | 373450 | 8.0500 | 7 | 3
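Note that LabelEncoder assigns arbitrary integer codes (here Cabin 'N' became 7), which implies an ordering that does not really exist. Tree-based models are largely insensitive to this, but for linear models one-hot encoding is often preferred. A minimal sketch with pd.get_dummies on a fresh copy of the raw columns (not used in the rest of this notebook):
# One-hot alternative to LabelEncoder: each category becomes its own 0/1 column.
raw_df = pd.read_csv('./datasets/titanic_train.csv', encoding='utf-8')
onehot = pd.get_dummies(raw_df[['Sex', 'Embarked']])
print(onehot.head(3))  # columns like Sex_female, Sex_male, Embarked_C, ...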
In [59]:
from sklearn.preprocessing import LabelEncoder
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df
In [15]:
# Reload the original data and extract the feature dataset and label dataset
titanic_df = pd.read_csv('./datasets/titanic_train.csv', encoding='utf-8')
y_titanic_df = titanic_df['Survived']
X_titanic_df=titanic_df.drop('Survived', axis=1)
X_titanic_df = transform_features(X_titanic_df)
In [16]:
### Train logistic regression, random forest, and decision tree models
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# scikit-learn classifiers for decision tree, random forest, and logistic regression
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression()
# DecisionTreeClassifier train/predict/evaluate
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print('DecisionTreeClassifier accuracy: {0:.4f}'.format(accuracy_score(y_test, dt_pred)))
# RandomForestClassifier train/predict/evaluate
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print('RandomForestClassifier accuracy: {0:.4f}'.format(accuracy_score(y_test, rf_pred)))
# LogisticRegression train/predict/evaluate
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print('LogisticRegression accuracy: {0:.4f}'.format(accuracy_score(y_test, lr_pred)))
DecisionTreeClassifier accuracy: 0.7877
RandomForestClassifier accuracy: 0.8547
LogisticRegression accuracy: 0.8492
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
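The warning above suggests either raising max_iter or scaling the inputs. A minimal sketch doing both with a Pipeline (an addition of this write-up, not part of the original run; the resulting accuracy may differ slightly from the 0.8492 above):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize features and raise the iteration cap so lbfgs can converge.
lr_pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lr_pipe.fit(X_train, y_train)
print('Scaled LogisticRegression accuracy: {0:.4f}'.format(accuracy_score(y_test, lr_pipe.predict(X_test))))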
DecisionTree with 5 features picked from the Titanic data
In [77]:
titanic_df = pd.read_csv('./datasets/titanic_train.csv', encoding='utf-8')
y_titanic_df = titanic_df['Survived']
X_titanic_df=titanic_df.drop('Survived', axis=1)
X_titanic_df = transform_features(X_titanic_df)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size = 0.2, random_state=11)
# Create DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=156)
# Train DecisionTreeClassifier
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print('Decision tree prediction accuracy: {0:.4f}'.format(accuracy))
Decision tree prediction accuracy: 0.8045
In [80]:
from sklearn.model_selection import GridSearchCV
params = {
    'max_depth': [6, 8, 10, 12, 16, 20, 24]
}
grid_cv = GridSearchCV(dt_clf, param_grid=params, scoring='accuracy', cv=5, verbose=1)
grid_cv.fit(X_train, y_train)
print('GridSearchCV best mean accuracy: {:.4f}'.format(grid_cv.best_score_))
print('GridSearchCV best hyperparameters: ', grid_cv.best_params_)
Fitting 5 folds for each of 7 candidates, totalling 35 fits
GridSearchCV best mean accuracy: 0.7992
GridSearchCV best hyperparameters: {'max_depth': 6}
In [81]:
cv_results_df = pd.DataFrame(grid_cv.cv_results_)
cv_results_df[['param_max_depth', 'mean_test_score']]
Out[81]:
  | param_max_depth | mean_test_score
0 | 6 | 0.799202
1 | 8 | 0.775347
2 | 10 | 0.773958
3 | 12 | 0.779553
4 | 16 | 0.768275
5 | 20 | 0.761243
6 | 24 | 0.761243
In [82]:
# Measure performance per max_depth on a held-out test set instead of GridSearch
max_depths = [6, 8, 10, 12, 16, 20, 24]
for depth in max_depths:
    dt_clf = DecisionTreeClassifier(max_depth=depth, random_state=156)
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print('max_depth = {0} accuracy: {1:.4f}'.format(depth, accuracy))
max_depth = 6 accuracy: 0.8547
max_depth = 8 accuracy: 0.8156
max_depth = 10 accuracy: 0.8380
max_depth = 12 accuracy: 0.7989
max_depth = 16 accuracy: 0.7989
max_depth = 20 accuracy: 0.8045
max_depth = 24 accuracy: 0.8045
In [83]:
params = {
    'max_depth': [6, 8, 10, 12, 16, 20, 24],
    'min_samples_split': [16, 24]
}
grid_cv = GridSearchCV(dt_clf, param_grid=params, scoring='accuracy', cv=5, verbose=1)
grid_cv.fit(X_train, y_train)
print('GridSearchCV best mean accuracy: {:.4f}'.format(grid_cv.best_score_))
print('GridSearchCV best hyperparameters: ', grid_cv.best_params_)
# Create a DataFrame from the GridSearchCV object's cv_results_ attribute
scores_df = pd.DataFrame(grid_cv.cv_results_)
Fitting 5 folds for each of 14 candidates, totalling 70 fits
GridSearchCV best mean accuracy: 0.7992
GridSearchCV best hyperparameters: {'max_depth': 6, 'min_samples_split': 16}
In [84]:
best_df_clf = grid_cv.best_estimator_
pred1 = best_df_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred1)
print('Decision Tree prediction accuracy: {0:.4f}'.format(accuracy))
Decision Tree prediction accuracy: 0.8547
In [85]:
import seaborn as sns
feature_importance_values = best_df_clf.feature_importances_
# Convert to a Series for sorting by importance and easy plotting
feature_importances = pd.Series(feature_importance_values, index=X_train.columns)
# Sort the Series by importance value
feature_top10 = feature_importances.sort_values(ascending=False)[:10]
plt.figure(figsize=[8, 6])
plt.title('Feature Importances Top 10')
sns.barplot(x=feature_top10, y=feature_top10.index)
plt.show()
[bar plot: feature importances top 10]
In [86]:
titanic_lr = LogisticRegression()
titanic_lr.fit(X_train, y_train)
pred_all = titanic_lr.predict(X_test)
print(np.round(accuracy_score(y_test, pred_all),3))
0.849
In [87]:
X_train_feat = X_train[["Pclass", "Sex", "Age", "Fare", "Cabin"]]
X_test_feat = X_test[["Pclass", "Sex", "Age", "Fare", "Cabin"]]
In [90]:
titanic_lr_feat = LogisticRegression()
titanic_lr_feat.fit(X_train_feat, y_train)
pred_feat = titanic_lr_feat.predict(X_test_feat)
print(np.round(accuracy_score(y_test, pred_feat),3))
0.838
K-Fold validation
In [17]:
### k-fold validation
from sklearn.model_selection import KFold

def exec_kfold(clf, folds=5):
    kfold = KFold(n_splits=folds)
    scores = []
    # Run KFold cross-validation
    for iter_count, (train_index, test_index) in enumerate(kfold.split(X_titanic_df)):
        # Use the indices returned by kfold.split() to extract train and validation data
        X_train, X_test = X_titanic_df.values[train_index], X_titanic_df.values[test_index]
        y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]
        # Train the classifier and compute prediction accuracy
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
        print(f"Cross-validation {iter_count} accuracy: {accuracy:.4f}")
    # Mean accuracy across the 5 folds
    mean_score = np.mean(scores)
    print(f"Mean accuracy: {mean_score:.4f}")

# Call exec_kfold
exec_kfold(dt_clf, folds=5)
Cross-validation 0 accuracy: 0.7542
Cross-validation 1 accuracy: 0.7809
Cross-validation 2 accuracy: 0.7865
Cross-validation 3 accuracy: 0.7697
Cross-validation 4 accuracy: 0.8202
Mean accuracy: 0.7823
In [18]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(dt_clf, X_titanic_df, y_titanic_df, cv=5)
for iter_count, accuracy in enumerate(scores):
    print("Cross-validation {0} accuracy: {1:.4f}".format(iter_count, accuracy))
print("Mean accuracy: {0:.4f}".format(np.mean(scores)))
Cross-validation 0 accuracy: 0.7430
Cross-validation 1 accuracy: 0.7753
Cross-validation 2 accuracy: 0.7921
Cross-validation 3 accuracy: 0.7865
Cross-validation 4 accuracy: 0.8427
Mean accuracy: 0.7879
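These scores differ slightly from the plain KFold run above because, for classifiers, cross_val_score defaults to stratified folds. A quick sketch making that default explicit, which should reproduce the cv=5 numbers:
from sklearn.model_selection import StratifiedKFold

# cross_val_score(cv=5) on a classifier implicitly uses StratifiedKFold.
skf = StratifiedKFold(n_splits=5)
scores = cross_val_score(dt_clf, X_titanic_df, y_titanic_df, cv=skf)
print('Mean accuracy: {0:.4f}'.format(np.mean(scores)))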
In [19]:
from sklearn.model_selection import GridSearchCV
### Set the hyperparameters as a dictionary
# Note: min_samples_split=1 is invalid (it must be an integer >= 2), which is
# what triggers the FitFailedWarning shown below; those fits are scored as nan.
parameters = {'max_depth': [2, 3, 5, 10], 'min_samples_split': [1, 5, 8]}
# Evaluate the param_grid hyperparameters over 5 cross-validation folds.
### refit=True is the default; when True, the estimator is refit with the best parameters.
grid_dclf = GridSearchCV(dt_clf, param_grid=parameters, scoring='accuracy', cv=5)
# Train/evaluate the param_grid hyperparameters sequentially on the train data.
grid_dclf.fit(X_train, y_train)
print('GridSearchCV best parameters:', grid_dclf.best_params_)
print('GridSearchCV best accuracy: {0:.4f}'.format(grid_dclf.best_score_))
# GridSearchCV's refit returns the already-trained estimator
best_dclf = grid_dclf.best_estimator_
# best_estimator_ is already trained with the best hyperparameters; predict and evaluate.
dpredictions = best_dclf.predict(X_test)
accuracy = accuracy_score(y_test, dpredictions)
print('DecisionTreeClassifier accuracy on the test set: {0:.4f}'.format(accuracy))
GridSearchCV best parameters: {'max_depth': 3, 'min_samples_split': 5}
GridSearchCV best accuracy: 0.7992
DecisionTreeClassifier accuracy on the test set: 0.8715
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
super().fit(
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 250, in fit
raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:969: UserWarning: One or more of the test scores are non-finite: [ nan 0.75844578 0.75844578 nan 0.79917266 0.79917266
nan 0.78520634 0.78940215 nan 0.78379789 0.78665419]
warnings.warn(
Machine Learning Evaluation
In [20]:
# Create MyDummyClassifier, inheriting from BaseEstimator
import numpy as np
from sklearn.base import BaseEstimator

class MyDummyClassifier(BaseEstimator):
    # The fit() method learns nothing
    def fit(self, X, y=None):
        pass
    # predict() simply predicts 0 if the Sex feature is 1, and 1 otherwise
    def predict(self, X):
        pred = np.zeros((X.shape[0], 1))
        for i in range(X.shape[0]):
            if X['Sex'].iloc[i] == 1:
                pred[i] = 0
            else:
                pred[i] = 1
        return pred
In [21]:
from sklearn.preprocessing import LabelEncoder
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df
In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Reload the original data and extract the feature dataset and label dataset
titanic_df = pd.read_csv('./datasets/titanic_train.csv', encoding='utf-8')
y_titanic_df = titanic_df['Survived']
X_titanic_df=titanic_df.drop('Survived', axis=1)
X_titanic_df = transform_features(X_titanic_df)
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size = 0.2, random_state=0)
# 위에서 생성한 Dummy Classifier를 이용하여 학습/예측/평가 수행.
myclf = MyDummyClassifier()
myclf.fit(X_train ,y_train)
mypredictions = myclf.predict(X_test)
print('Dummy Classifier accuracy: {0:.4f}'.format(accuracy_score(y_test, mypredictions)))
Dummy Classifier accuracy: 0.7877
In [23]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
class MyFakeClassifier(BaseEstimator):
    def fit(self, X, y):
        pass
    # Return all zeros, sized to the incoming X dataset
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)

# Load the MNIST digits data with scikit-learn's built-in load_digits()
digits = load_digits()
# If the digit is 7, True -> 1 via astype(int); otherwise False -> 0.
y = (digits.target == 7).astype(int)
X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state=11)
# Check the imbalanced label distribution.
print('Label test set shape:', y_test.shape)
print('Distribution of labels 0 and 1 in the test set')
print(pd.Series(y_test).value_counts())
# Train/predict/evaluate with the fake classifier
fakeclf = MyFakeClassifier()
fakeclf.fit(X_train, y_train)
fakepred = fakeclf.predict(X_test)
print('Even predicting everything as 0, accuracy is: {:.3f}'.format(accuracy_score(y_test, fakepred)))
Label test set shape: (450,)
Distribution of labels 0 and 1 in the test set
0 405
1 45
dtype: int64
Even predicting everything as 0, accuracy is: 0.900
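scikit-learn ships an equivalent baseline out of the box; a sketch with sklearn.dummy.DummyClassifier (strategy='most_frequent' always predicts the majority class, here 0, so it should match MyFakeClassifier's 0.900):
from sklearn.dummy import DummyClassifier

# Built-in majority-class baseline.
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train, y_train)
print('DummyClassifier accuracy: {0:.3f}'.format(dummy_clf.score(X_test, y_test)))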
In [24]:
digits.target == 7
Out[24]:
array([False, False, False, ..., False, False, False])
In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

def get_clf_eval(y_test, pred):
    # confusion_matrix returns [[TN, FP], [FN, TP]] for binary labels
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    print('Confusion matrix')
    print(confusion)
    print('Accuracy: {0:.4f}, Precision: {1:.4f}, Recall: {2:.4f}'.format(accuracy, precision, recall))
In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Reload the original data and extract the feature dataset and label dataset
titanic_df = pd.read_csv('./datasets/titanic_train.csv', encoding='utf-8')
y_titanic_df = titanic_df['Survived']
X_titanic_df=titanic_df.drop('Survived', axis=1)
X_titanic_df = transform_features(X_titanic_df)
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size = 0.2, random_state=11)
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test, pred)
Confusion matrix
[[104 14]
 [ 13 48]]
Accuracy: 0.8492, Precision: 0.7742, Recall: 0.7869
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
In [27]:
pred_proba = lr_clf.predict_proba(X_test)
pred = lr_clf.predict(X_test)
print('predict_proba() result shape: {0}'.format(pred_proba.shape))
print('First 3 samples from the pred_proba array \n:', pred_proba[:3])
# Concatenate the predicted-probability array with the predicted-class array to view both at once
pred_proba_result = np.concatenate([pred_proba, pred.reshape(-1, 1)], axis=1)
print('The class with the larger of the two probabilities is the prediction \n', pred_proba_result[:3])
predict_proba() result shape: (179, 2)
First 3 samples from the pred_proba array
: [[0.46208799 0.53791201]
[0.87861831 0.12138169]
[0.87710729 0.12289271]]
The class with the larger of the two probabilities is the prediction
[[0.46208799 0.53791201 1. ]
[0.87861831 0.12138169 0. ]
[0.87710729 0.12289271 0. ]]
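In other words, for a binary problem predict() just picks the predict_proba() column with the larger probability, i.e. applies a 0.5 threshold. A one-line sanity check:
# predict() should agree with taking the higher-probability class.
print(np.array_equal(pred, np.argmax(pred_proba, axis=1)))  # expected: True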
In [28]:
from sklearn.preprocessing import Binarizer
X = [[1, -1, 2],
     [2, 0, 0],
     [0, 1.1, 1.2]]
# Values less than or equal to threshold become 0; values greater become 1
binarizer = Binarizer(threshold=1.1)
print(binarizer.fit_transform(X))
[[0. 0. 1.]
[1. 0. 0.]
[0. 0. 1.]]
In [29]:
from sklearn.preprocessing import Binarizer
# Binarizer threshold setting, i.e. the classification decision threshold.
custom_threshold = 0.5
# Extract only the second column of predict_proba(), i.e. the positive-class column, and apply Binarizer
pred_proba_1 = pred_proba[:, 1].reshape(-1, 1)
binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1)
custom_predict = binarizer.transform(pred_proba_1)
get_clf_eval(y_test, custom_predict)
Confusion matrix
[[104 14]
 [ 13 48]]
Accuracy: 0.8492, Precision: 0.7742, Recall: 0.7869
In [30]:
# Set the Binarizer threshold to 0.4, i.e. lower the decision threshold from 0.5 to 0.4
custom_threshold = 0.4
pred_proba_1 = pred_proba[:, 1].reshape(-1, 1)
binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1)
custom_predict = binarizer.transform(pred_proba_1)
get_clf_eval(y_test, custom_predict)
Confusion matrix
[[98 20]
 [10 51]]
Accuracy: 0.8324, Precision: 0.7183, Recall: 0.8361
In [31]:
# Store all thresholds to test in a list.
thresholds = [0.4, 0.45, 0.50, 0.55, 0.60]

def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    # Iterate over the values in the thresholds list and evaluate at each one.
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('Threshold:', custom_threshold)
        get_clf_eval(y_test, custom_predict)

get_eval_by_threshold(y_test, pred_proba[:, 1].reshape(-1, 1), thresholds)
Threshold: 0.4
Confusion matrix
[[98 20]
 [10 51]]
Accuracy: 0.8324, Precision: 0.7183, Recall: 0.8361
Threshold: 0.45
Confusion matrix
[[103 15]
 [ 12 49]]
Accuracy: 0.8492, Precision: 0.7656, Recall: 0.8033
Threshold: 0.5
Confusion matrix
[[104 14]
 [ 13 48]]
Accuracy: 0.8492, Precision: 0.7742, Recall: 0.7869
Threshold: 0.55
Confusion matrix
[[109 9]
 [ 15 46]]
Accuracy: 0.8659, Precision: 0.8364, Recall: 0.7541
Threshold: 0.6
Confusion matrix
[[112 6]
 [ 16 45]]
Accuracy: 0.8771, Precision: 0.8824, Recall: 0.7377
In [32]:
from sklearn.metrics import precision_recall_curve
# Extract the predicted probabilities for label 1
pred_proba_class1 = lr_clf.predict_proba(X_test)[:, 1]
# Pass the actual labels and the label-1 probabilities to precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_class1)
print('Shape of the returned decision-threshold array:', thresholds.shape)
print('Shape of the returned precisions array:', precisions.shape)
print('Shape of the returned recalls array:', recalls.shape)
print("thresholds 5 sample: ", thresholds[:5])
print("precisions 5 sample: ", precisions[:5])
print("recalls 5 sample: ", recalls[:5])
# The returned threshold array has 143 rows, so sample 10 of them at a step of 15
thr_index = np.arange(0, thresholds.shape[0], 15)
print('10 indices for sampling the threshold array:', thr_index)
print('10 sampled thresholds: ', np.round(thresholds[thr_index], 2))
# Precision and recall at the thresholds sampled in steps of 15
print('Precision per sampled threshold:', np.round(precisions[thr_index], 3))
print('Recall per sampled threshold: ', np.round(recalls[thr_index], 3))
Shape of the returned decision-threshold array: (143,)
Shape of the returned precisions array: (144,)
Shape of the returned recalls array: (144,)
thresholds 5 sample: [0.10397374 0.10397598 0.1040012 0.10786877 0.10894799]
precisions 5 sample: [0.38853503 0.38461538 0.38709677 0.38961039 0.38562092]
recalls 5 sample: [1. 0.98360656 0.98360656 0.98360656 0.96721311]
10 indices for sampling the threshold array: [ 0 15 30 45 60 75 90 105 120 135]
10 sampled thresholds: [0.1 0.12 0.14 0.19 0.28 0.4 0.56 0.67 0.82 0.95]
Precision per sampled threshold: [0.389 0.44 0.466 0.539 0.647 0.729 0.836 0.949 0.958 1. ]
Recall per sampled threshold: [1. 0.967 0.902 0.902 0.902 0.836 0.754 0.607 0.377 0.148]
In [33]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
def precision_recall_curve_plot(y_test, pred_proba_c1):
    # Extract the threshold ndarray and the precision/recall ndarrays at those thresholds.
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_c1)
    # Plot precision and recall against the threshold on the X axis; precision as a dashed line
    plt.figure(figsize=(8, 6))
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary], label='recall')
    # Scale the threshold X axis in units of 0.1
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1), 2))
    # X/Y axis labels, legend, and grid
    plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
    plt.legend(); plt.grid()
    plt.show()

precision_recall_curve_plot(y_test, lr_clf.predict_proba(X_test)[:, 1])
[line plot: precision and recall vs. decision threshold]
F1-Score
In [34]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, pred)
print('F1 score: {0:.4f}'.format(f1))
F1 score: 0.7805
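F1 is the harmonic mean of precision and recall, so recomputing it from the 0.7742 precision and 0.7869 recall reported earlier should reproduce the 0.7805 above:
# F1 = 2 * (precision * recall) / (precision + recall)
p = precision_score(y_test, pred)
r = recall_score(y_test, pred)
print('Manual F1: {0:.4f}'.format(2 * p * r / (p + r)))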
In [35]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    # Add F1 score
    f1 = f1_score(y_test, pred)
    print('Confusion matrix')
    print(confusion)
    # Add F1 score to the printout
    print('Accuracy: {0:.4f}, Precision: {1:.4f}, Recall: {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1))

thresholds = [0.4, 0.45, 0.50, 0.55, 0.60]
pred_proba = lr_clf.predict_proba(X_test)
get_eval_by_threshold(y_test, pred_proba[:, 1].reshape(-1, 1), thresholds)
Threshold: 0.4
Confusion matrix
[[98 20]
 [10 51]]
Accuracy: 0.8324, Precision: 0.7183, Recall: 0.8361, F1: 0.7727
Threshold: 0.45
Confusion matrix
[[103 15]
 [ 12 49]]
Accuracy: 0.8492, Precision: 0.7656, Recall: 0.8033, F1: 0.7840
Threshold: 0.5
Confusion matrix
[[104 14]
 [ 13 48]]
Accuracy: 0.8492, Precision: 0.7742, Recall: 0.7869, F1: 0.7805
Threshold: 0.55
Confusion matrix
[[109 9]
 [ 15 46]]
Accuracy: 0.8659, Precision: 0.8364, Recall: 0.7541, F1: 0.7931
Threshold: 0.6
Confusion matrix
[[112 6]
 [ 16 45]]
Accuracy: 0.8771, Precision: 0.8824, Recall: 0.7377, F1: 0.8036
In [36]:
from sklearn.metrics import roc_curve
# Extract the predicted probabilities for label 1
pred_proba_class1 = lr_clf.predict_proba(X_test)[:, 1]
fprs, tprs, thresholds = roc_curve(y_test, pred_proba_class1)
# Sample the returned threshold array in steps of 5.
# thresholds[0] is arbitrarily set to max(probability)+1; start np.arange at 1 to skip it
thr_index = np.arange(1, thresholds.shape[0], 5)
print('Indices for sampling the threshold array:', thr_index)
print('Thresholds sampled at those indices:', np.round(thresholds[thr_index], 2))
# FPR and TPR at the thresholds sampled in steps of 5
print('FPR per sampled threshold:', np.round(fprs[thr_index], 3))
print('TPR per sampled threshold: ', np.round(tprs[thr_index], 3))
Indices for sampling the threshold array: [ 1 6 11 16 21 26 31 36 41 46 51]
Thresholds sampled at those indices: [0.97 0.65 0.63 0.56 0.45 0.4 0.35 0.15 0.13 0.11 0.11]
FPR per sampled threshold: [0. 0.017 0.034 0.076 0.127 0.169 0.203 0.466 0.585 0.686 0.797]
TPR per sampled threshold: [0.033 0.639 0.721 0.754 0.803 0.836 0.885 0.902 0.934 0.967 0.984]
In [37]:
def roc_curve_plot(y_test, pred_proba_c1):
    # Get FPR and TPR values per threshold.
    fprs, tprs, thresholds = roc_curve(y_test, pred_proba_c1)
    # Plot the ROC curve.
    plt.plot(fprs, tprs, label="ROC")
    # Draw the diagonal reference line.
    plt.plot([0, 1], [0, 1], 'k--', label="Random")
    # Scale the FPR X axis in units of 0.1 and set the axis labels
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1), 2))
    plt.xlim(0, 1); plt.ylim(0, 1)
    plt.xlabel('FPR(1-Specificity)'); plt.ylabel('TPR(Recall)')
    plt.legend()
    plt.show()

roc_curve_plot(y_test, lr_clf.predict_proba(X_test)[:, 1])
[line plot: ROC curve with random-baseline diagonal]
In [38]:
from sklearn.metrics import roc_auc_score
# pred = lr_clf.predict(X_test)
# roc_score = roc_auc_score(y_test, pred)
pred_proba = lr_clf.predict_proba(X_test)[:, 1]
roc_score = roc_auc_score(y_test, pred_proba)
print('ROC AUC value: {0:.4f}'.format(roc_score))
ROC AUC value: 0.9024
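Since AUC is literally the area under the ROC curve, integrating the fprs/tprs arrays returned by roc_curve should agree with roc_auc_score; a quick check with the trapezoidal rule:
# Numerical area under the ROC curve; should match the 0.9024 above.
fprs, tprs, _ = roc_curve(y_test, pred_proba)
print('AUC via trapezoidal rule: {0:.4f}'.format(np.trapz(tprs, fprs)))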