본문 바로가기
머신러닝 딥러닝

0914 titanic_keras

by 대금부는개발자 2021. 9. 14.
SMALL

import numpy as np

import pandas as pd

import tensorflow as tf

from sklearn.model_selection import train_test_split

 

# Outlier handling and normalization are left as a later exercise (per original author).
# Start from domain analysis of the Titanic dataset.

# Raw Data Loading
df = pd.read_csv('/content/drive/MyDrive/9월 14일/titatic/train.csv', sep=',')

# display(df)

# Drop columns not used as features (IDs, free text, sparse Cabin, Fare).
# FIX: the original list was missing commas, which concatenated all the
# strings into one bogus column name and would raise a KeyError.
train_df = df.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'],
                   axis=1, inplace=False)

# display(train_df)

# Encode Sex as a numeric category: male -> 0, female -> 1.
sex_mapping = {'male': 0, 'female': 1}
train_df['Sex'] = train_df['Sex'].map(sex_mapping)

# Combine sibling/spouse and parent/child counts into a single Family feature.
train_df['Family'] = train_df['SibSp'] + train_df['Parch']
train_df.drop(['SibSp', 'Parch'], axis=1, inplace=True)

# Embarked contains NaN; replace it before use.
# The author fills with 'Q' as the "mode".
# NOTE(review): the actual mode of Embarked in the Titanic train set is 'S' —
# confirm whether 'Q' was intentional.
train_df['Embarked'] = train_df['Embarked'].fillna('Q')

embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
train_df['Embarked'] = train_df['Embarked'].map(embarked_mapping)

# Age contains NaN; replace with the overall mean age.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())

# display(train_df)

# Binning: convert numerical Age into categorical buckets
# (child / teen / adult / senior).
train_df.loc[train_df['Age'] < 8, 'Age'] = 0
train_df.loc[(train_df['Age'] >= 8) & (train_df['Age'] < 20), 'Age'] = 1
train_df.loc[(train_df['Age'] >= 20) & (train_df['Age'] < 60), 'Age'] = 2
train_df.loc[train_df['Age'] >= 60, 'Age'] = 3

display(train_df)

 

 

import tensorflow as tf

from tensorflow.keras.models import Sequential #  Sequential은 왼쪽에서 오른쪽으로 순차적 모델 진행

from tensorflow.keras.layers import Flatten,Dense    

# Flatten : 데이터를 input layer 안에 있는 node들이 받아줘요.

# Dense : Fully Connected Layer, 자기 앞에 있는 layer의 node와 뒤에 있는 layer의 node들이 완전하게 데이터를 주고받는 형태

# 다음 layer node한테 연결되어 데이터를 전달해요. 그런 node들만 들어있는 layer

 

# optimizers : 어떤 알고리즘을 이용해 weight와 bias를 계산하느냐?

from tensorflow.keras.optimizers import SGD 

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import  train_test_split # 데이터를 분할시킬 때 가장 좋은 것은 label의 비율 데로

from sklearn.linear_model import LogisticRegression   # logistic 구현

 

 

# sklearn Logistic Regression implementation.
# Split 70/30; stratify on the label so both splits keep the class ratio.
# FIX: the original call was missing the commas after test_size and
# random_state, which is a SyntaxError.
train_x_data, valid_x_data, train_t_data, valid_t_data = \
    train_test_split(train_df.drop('Survived', axis=1, inplace=False),
                     train_df['Survived'],
                     test_size=0.3,
                     random_state=0,                  # seed for reproducibility
                     stratify=train_df['Survived'])   # keep label ratio in both splits

# Normalization: fit the MinMax scaler on the training split only,
# then apply the same transform to both splits.
scaler = MinMaxScaler()
scaler.fit(train_x_data)
train_x_data_norm = scaler.transform(train_x_data)
valid_x_data_norm = scaler.transform(valid_x_data)

# Binary classification (logistic regression) on the normalized data.
model = LogisticRegression()
model.fit(train_x_data_norm, train_t_data)
score = model.score(valid_x_data_norm, valid_t_data)  # accuracy on validation split

print('sklearn score:{}'.format(score))
# sklearn score:0.8059701492537313

# train_x_data_norm.shape
# (623, 5)

 

# TensorFlow / Keras implementation of the same logistic regression.

keras_model = Sequential()
# A single Dense node with sigmoid activation *is* logistic regression.
# Equivalent two-layer form (Flatten input layer + Dense output):
# keras_model.add(Flatten(input_shape=(train_x_data_norm.shape[1],)))
# keras_model.add(Dense(1, activation='sigmoid'))
# Merged into one layer; input_shape is a tuple of (feature count,).
keras_model.add(Dense(1, activation='sigmoid',
                      input_shape=(train_x_data_norm.shape[1],)))

# Configure training: SGD optimizer, binary cross-entropy loss
# (linear regression used MSE; binary classification uses this instead).
# FIX: the original was missing the comma after the loss argument.
keras_model.compile(optimizer=SGD(learning_rate=1e-2),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

# FIX: the original fit() call was missing the comma after verbose=0
# and the closing parenthesis.
keras_model.fit(train_x_data_norm, train_t_data,
                epochs=1000,
                verbose=0)  # 0: silent, 1: progress bar, 2: one line per epoch

# evaluate() returns [loss, accuracy].
keras_result = keras_model.evaluate(valid_x_data_norm, valid_t_data)
print('keras score:{}'.format(keras_result))
# keras score:[0.44274958968162537, 0.8022388219833374]  (loss, accuracy)


9/9 [==============================] - 0s 2ms/step - loss: 0.4398 - accuracy: 0.8097 keras score:[0.43982094526290894, 0.8097015023231506]

LIST

'머신러닝 딥러닝' 카테고리의 다른 글

0915 Neural Networks, XOR  (0) 2021.09.15
0914 MNIST_ keras  (0) 2021.09.14
0910 k-nearest Neighbor (KNN) 'K - 최근접 이웃'  (0) 2021.09.10
0910 Regression 정리  (0) 2021.09.10
0909 Tensor flow 2.x, keras in tensor 2.x  (0) 2021.09.09

댓글