Trying out "deep-forest"

I tried out

leopiney/deep-forest

— the implementation introduced in the Qiita article 「【Deep neural networkの代替モデルを標榜】DeepForestモデルのpython & R言語 実装コード事例」 — on the same data as in my earlier posts

一日一Python:kaggleのチュートリアル(タイタニックデータから生存予測)と同じ事をやってみる

一日一Python:kaggleのチュートリアル(タイタニックデータから生存予測)と同じ事をやってみるその3

(survival prediction on the Titanic data, following the Kaggle tutorial). The MGCForest class imported below comes from deep_forest.py in that repository.

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
# GridSearchCV now lives in sklearn.model_selection (sklearn.grid_search was removed).
from sklearn.model_selection import train_test_split, GridSearchCV
from deep_forest import MGCForest  # deep_forest.py from leopiney/deep-forest
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

titanic = sns.load_dataset('titanic')
data_train, data_test = train_test_split(titanic)

# One-hot encode the categorical columns and cast the boolean ones to 0/1.
data_train2 = pd.get_dummies(data_train)
data_train2['adult_male'] = data_train2['adult_male'].astype(int)
data_train2['alone'] = data_train2['alone'].astype(int)
data_train3 = data_train2.dropna()
# 'alive' is just 'survived' as a string, so its dummies would leak the target.
data_train4 = data_train3.drop(['alive_no', 'alive_yes'], axis=1)

data_test2 = pd.get_dummies(data_test)
data_test2['adult_male'] = data_test2['adult_male'].astype(int)
data_test2['alone'] = data_test2['alone'].astype(int)
data_test3 = data_test2.dropna()
data_test4 = data_test3.drop(['alive_no', 'alive_yes'], axis=1)
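One caveat worth flagging in this preparation: pd.get_dummies is applied to the train and test splits independently, so a categorical level present in only one split produces mismatched columns. A minimal guard (my addition, not in the original post) is to align the test frame to the training columns:

# Align test columns to the training frame: dummy columns missing from
# the test split are added as all-zero, unexpected extras are dropped.
data_test4 = data_test4.reindex(columns=data_train4.columns, fill_value=0)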

XGBoost (default parameters):

xgb_model = xgb.XGBClassifier()
# .loc[:, 'pclass':] takes every column from 'pclass' to the end,
# i.e. all features except the 'survived' target in the first column.
xgb_model.fit(data_train4.loc[:, 'pclass':], data_train4.loc[:, 'survived'])
xgb_pred_train = xgb_model.predict(data_train4.loc[:, 'pclass':])
xgb_pred_test = xgb_model.predict(data_test4.loc[:, 'pclass':])
print(classification_report(data_train4.loc[:, 'survived'], xgb_pred_train))
print(classification_report(data_test4.loc[:, 'survived'], xgb_pred_test))
             precision    recall  f1-score   support

          0       0.87      0.94      0.91       325
          1       0.90      0.80      0.84       215

avg / total       0.88      0.88      0.88       540

             precision    recall  f1-score   support

          0       0.85      0.94      0.89        99
          1       0.91      0.77      0.83        75

avg / total       0.87      0.87      0.87       174

Random forest (default parameters):

clf = RandomForestClassifier()
clf.fit(data_train4.loc[:, 'pclass':], data_train4.loc[:, 'survived'])
pred_train = clf.predict(data_train4.loc[:, 'pclass':])
pred_test = clf.predict(data_test4.loc[:, 'pclass':])
print(classification_report(data_train4.loc[:, 'survived'], pred_train))
print(classification_report(data_test4.loc[:, 'survived'], pred_test))
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       325
          1       0.99      0.95      0.97       215

avg / total       0.98      0.98      0.98       540

             precision    recall  f1-score   support

          0       0.80      0.86      0.83        99
          1       0.79      0.72      0.76        75

avg / total       0.80      0.80      0.80       174


Random forest (tuned with grid search):

# Search n_estimators over powers of two from 2 to 16384, with 3-fold CV.
params = {'n_estimators': [2**i for i in range(1, 15)], 'n_jobs': [-1]}
cv = GridSearchCV(clf, params, cv=3)
cv.fit(data_train4.loc[:, 'pclass':], data_train4.loc[:, 'survived'])
pred_train2 = cv.predict(data_train4.loc[:, 'pclass':])
pred_test2 = cv.predict(data_test4.loc[:, 'pclass':])
print(classification_report(data_train4.loc[:, 'survived'], pred_train2))
print(classification_report(data_test4.loc[:, 'survived'], pred_test2))
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       325
          1       0.99      0.99      0.99       215

avg / total       0.99      0.99      0.99       540

             precision    recall  f1-score   support

          0       0.83      0.91      0.87        99
          1       0.86      0.76      0.81        75

avg / total       0.85      0.84      0.84       174
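As an aside, the fitted searcher exposes which setting won the cross-validation; one might check it like this:

# Hyper-parameters selected by the 3-fold cross-validation.
print(cv.best_params_)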

MGCForest didn't work with the data as prepared above, so I modified the dataset slightly.

(I removed the three features with the lowest contribution; this has almost no effect on the results, as the XGBoost re-run below shows. Dropping three of the 28 feature columns leaves exactly 25, so each sample can be reshaped into the 5×5 grid used for multi-grained scanning. One way such low-importance features might be identified is sketched below.)
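The post doesn't show how the three low-contribution columns were found; a plausible approach (my sketch, not necessarily what was done here) is to rank the columns by the fitted random forest's importance scores:

# Rank the dummy-encoded features by the fitted random forest's
# importance scores; the bottom entries are candidates for removal.
importances = pd.Series(
    clf.feature_importances_,
    index=data_train4.loc[:, 'pclass':].columns,
).sort_values()
print(importances.head(3))  # the three least important features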

data_train5 = data_train4.drop(['embark_town_Queenstown',
                                'embarked_Q',
                                'deck_F'], axis=1)
data_test5 = data_test4.drop(['embark_town_Queenstown',
                              'embarked_Q',
                              'deck_F'], axis=1)

# 25 features per sample, reshaped into a 5x5 grid for multi-grained scanning.
X_train = data_train5.loc[:, 'pclass':].to_numpy()
X_train = X_train.reshape(len(X_train), 5, 5)
y_train = data_train5.loc[:, 'survived'].to_numpy()
X_test = data_test5.loc[:, 'pclass':].to_numpy()
X_test = X_test.reshape(len(X_test), 5, 5)
y_test = data_test5.loc[:, 'survived'].to_numpy()
print('X_train:', X_train.shape, X_train.dtype)
print('y_train:', y_train.shape, y_train.dtype)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

# Re-run XGBoost on the reduced feature set: the scores below match the
# earlier run, confirming the three dropped columns barely mattered.
xgb_model.fit(data_train5.loc[:, 'pclass':], data_train5.loc[:, 'survived'])
xgb_pred_train = xgb_model.predict(data_train5.loc[:, 'pclass':])
xgb_pred_test = xgb_model.predict(data_test5.loc[:, 'pclass':])
print(classification_report(data_train5.loc[:, 'survived'], xgb_pred_train))
print(classification_report(data_test5.loc[:, 'survived'], xgb_pred_test))
             precision    recall  f1-score   support

          0       0.87      0.94      0.91       325
          1       0.90      0.80      0.84       215

avg / total       0.88      0.88      0.88       540

             precision    recall  f1-score   support

          0       0.85      0.94      0.89        99
          1       0.91      0.77      0.83        75

avg / total       0.87      0.87      0.87       174


Deep forest (MGCForest):

mgc_forest = MGCForest(
    estimators_config={
        # Forests used by the multi-grained scanning ('mgs') step.
        'mgs': [{
            'estimator_class': ExtraTreesClassifier,
            'estimator_params': {
                'n_estimators': 30,
                'min_samples_split': 21,
                'n_jobs': -1,
            }
        }, {
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 30,
                'min_samples_split': 21,
                'n_jobs': -1,
            }
        }],
        # Forests stacked in each layer of the cascade.
        'cascade': [{
            'estimator_class': ExtraTreesClassifier,
            'estimator_params': {
                'n_estimators': 1000,
                'min_samples_split': 11,
                'max_features': 1,
                'n_jobs': -1,
            }
        }, {
            'estimator_class': ExtraTreesClassifier,
            'estimator_params': {
                'n_estimators': 1000,
                'min_samples_split': 11,
                'max_features': 'sqrt',
                'n_jobs': -1,
            }
        }, {
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 1000,
                'min_samples_split': 11,
                'max_features': 1,
                'n_jobs': -1,
            }
        }, {
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 1000,
                'min_samples_split': 11,
                'max_features': 'sqrt',
                'n_jobs': -1,
            }
        }]
    },
    # Scanning granularities for the multi-grained scanning step.
    stride_ratios=[1.0 / 4, 1.0 / 9, 1.0 / 16],
)
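To see roughly what the scanning step does with the 5×5 inputs: multi-grained scanning slides windows of several sizes over each sample, turning every window into a small instance for the 'mgs' forests, whose class-probability outputs then become the features for the cascade. The snippet below is a conceptual sketch of the window extraction only (my illustration, not the library's actual code):

import numpy as np

def sliding_windows(sample, window):
    """Return every window x window patch of a square 2-D sample, flattened."""
    size = sample.shape[0]
    return np.array([
        sample[i:i + window, j:j + window].ravel()
        for i in range(size - window + 1)
        for j in range(size - window + 1)
    ])

sample = np.arange(25).reshape(5, 5)  # one 5x5 "image" like our samples
patches = sliding_windows(sample, 2)  # scan with a 2x2 window
print(patches.shape)                  # (16, 4): 16 windows, 4 features each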

mgc_forest.fit(X_train, y_train)
y_train_pred = mgc_forest.predict(X_train)
y_pred = mgc_forest.predict(X_test)

print('Prediction shape:', y_pred.shape)
print(
    'Accuracy:', accuracy_score(y_test, y_pred),
    'F1 score:', f1_score(y_test, y_pred, average='weighted')
)

print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_pred))

Prediction shape: (174,)

Accuracy: 0.816091954023 F1 score: 0.814482475705

             precision    recall  f1-score   support

          0       0.83      0.93      0.88       325
          1       0.86      0.71      0.78       215

avg / total       0.84      0.84      0.84       540

             precision    recall  f1-score   support

          0       0.81      0.88      0.84        99
          1       0.82      0.73      0.77        75

avg / total       0.82      0.82      0.81       174
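Pulling the test-set averages together: XGBoost reaches an average f1 of 0.87, the default random forest 0.80, the grid-searched random forest 0.84, and the deep forest 0.81. On this small tabular dataset, the deep forest lands between the default and the tuned random forests, and behind XGBoost.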

