
Survival Prediction: Kaggle Titanic with Logistic Regression

Author: scpy | Published 2019-01-08 17:37

    Contents

    • test.csv
    • train.csv
    • titanic.py

    Dataset

    https://www.kaggle.com/c/titanic/data
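
    Before running titanic.py, it is worth checking which columns contain missing
    values, since most of the preprocessing below exists to fill them in. A minimal
    sketch, assuming train.csv and test.csv from the Kaggle page above have been
    downloaded into the working directory:

    import pandas as pd

    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')

    # Count missing values per column (typically Age, Cabin and Embarked in train.csv;
    # the test set also has one missing Fare)
    print(train.isnull().sum())
    print(test.isnull().sum())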

    titanic.py

    #!/usr/bin/python
    # -*- encoding:utf-8 -*-
    
    import xgboost as xgb
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.ensemble import RandomForestClassifier
    import pandas as pd
    import csv
    
    
    def show_accuracy(a, b, tip):
        acc = a.ravel() == b.ravel()
        acc_rate = 100 * float(acc.sum()) / a.size
        # print('%s accuracy: %.3f%%' % (tip, acc_rate))
        return acc_rate
    
    
    def load_data(file_name, is_train):
        data = pd.read_csv(file_name)  # path to the data file
        # print (data.describe())
    
        # Sex: map to 0 (female) / 1 (male)
        data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)
    
        # Fill missing Fare values with the median fare of the passenger's class
        if len(data.Fare[data.Fare.isnull()]) > 0:
            fare = np.zeros(3)
            for f in range(0, 3):
                fare[f] = data[data.Pclass == f + 1]['Fare'].dropna().median()
            for f in range(0, 3):  # loop 0 to 2
                data.loc[(data.Fare.isnull()) & (data.Pclass == f + 1), 'Fare'] = fare[f]
    
        # Age (alternative): replace missing values with the mean
        # mean_age = data['Age'].dropna().mean()
        # data.loc[(data.Age.isnull()), 'Age'] = mean_age
        if is_train:
            # Age: predict missing values with a random forest regressor
            print('Predicting missing Age with a random forest: --start--')
            data_for_age = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
            age_exist = data_for_age.loc[(data.Age.notnull())]  # rows whose Age is known
            age_null = data_for_age.loc[(data.Age.isnull())]
            # print(age_exist)
            x = age_exist.values[:, 1:]
            y = age_exist.values[:, 0]
            rfr = RandomForestRegressor(n_estimators=1000)
            rfr.fit(x, y)
            age_hat = rfr.predict(age_null.values[:, 1:])
            # print(age_hat)
            data.loc[(data.Age.isnull()), 'Age'] = age_hat
            print('Predicting missing Age with a random forest: --over--')
        else:
            print('Predicting missing Age with a random forest (test set): --start--')
            data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
            age_exist = data_for_age.loc[(data.Age.notnull())]  # rows whose Age is known
            age_null = data_for_age.loc[(data.Age.isnull())]
            # print(age_exist)
            x = age_exist.values[:, 1:]
            y = age_exist.values[:, 0]
            rfr = RandomForestRegressor(n_estimators=1000)
            rfr.fit(x, y)
            age_hat = rfr.predict(age_null.values[:, 1:])
            # print(age_hat)
            data.loc[(data.Age.isnull()), 'Age'] = age_hat
            print('Predicting missing Age with a random forest (test set): --over--')
    
        # Port of embarkation
        data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'  # fill missing ports with 'S', the most common value
        # data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2, 'U': 0}).astype(int)
        # print(data['Embarked'])
        embarked_data = pd.get_dummies(data.Embarked)
        # print(embarked_data)
        # embarked_data = embarked_data.rename(columns={'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown', 'U': 'UnknownCity'})
        embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))
        data = pd.concat([data, embarked_data], axis=1)
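        # The concatenation above adds one 0/1 indicator column per port of embarkation,
        # e.g. Embarked_C, Embarked_Q and Embarked_S (the exact names depend on the values
        # present in the file); these columns are used as features further below.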
        print(data.describe())
        data.to_csv('New_Data.csv')
    
        x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
        # x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
        x = np.array(x)

        # Replicate the samples 5 times (simple duplication of the data set);
        # note that this happens before the train/test split in __main__.
        x = np.tile(x, (5, 1))

        if is_train:
            y = np.array(data['Survived'])
            y = np.tile(y, (5,))
            return x, y
        return x, data['PassengerId']
    
    
    def write_result(c, c_type):
        file_name = 'test.csv'
        x, passenger_id = load_data(file_name, False)

        if c_type == 3:  # the XGBoost booster expects a DMatrix
            x = xgb.DMatrix(x)
        y = c.predict(x)
        # Threshold predicted probabilities at 0.5 (a no-op for classifiers that already return 0/1)
        y[y > 0.5] = 1
        y[~(y > 0.5)] = 0

        predictions_file = open("Prediction_%d.csv" % c_type, "w", newline='')
        open_file_object = csv.writer(predictions_file)
        open_file_object.writerow(["PassengerId", "Survived"])
        open_file_object.writerows(zip(passenger_id, y.astype(int)))
        predictions_file.close()
    
    
    if __name__ == "__main__":
        x, y = load_data('train.csv', True)
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=1)
    
        lr = LogisticRegression(penalty='l2')
        lr.fit(x_train, y_train)
        y_hat = lr.predict(x_test)
        lr_rate = show_accuracy(y_hat, y_test, 'Logistic Regression')
        # write_result(lr, 1)
    
        rfc = RandomForestClassifier(n_estimators=100)
        rfc.fit(x_train, y_train)
        y_hat = rfc.predict(x_test)
        rfc_rate = show_accuracy(y_hat, y_test, 'Random Forest')
        # write_result(rfc, 2)
    
        # XGBoost
        data_train = xgb.DMatrix(x_train, label=y_train)
        data_test = xgb.DMatrix(x_test, label=y_test)
        watch_list = [(data_test, 'eval'), (data_train, 'train')]
        param = {'max_depth': 3, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}
        # 'subsample': 1, 'alpha': 0, 'lambda': 0, 'min_child_weight': 1}
        bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list)
        y_hat = bst.predict(data_test)
        # write_result(bst, 3)
        # Threshold the predicted probabilities at 0.5
        y_hat[y_hat > 0.5] = 1
        y_hat[~(y_hat > 0.5)] = 0
        xgb_rate = show_accuracy(y_hat, y_test, 'XGBoost')
    
        print('Logistic Regression: %.3f%%' % lr_rate)
        print('Random Forest: %.3f%%' % rfc_rate)
        print('XGBoost: %.3f%%' % xgb_rate)
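
    Running titanic.py trains all three models on one half of the (replicated)
    training data and prints each model's accuracy on the other half. To also
    produce Kaggle submission files, uncomment the write_result calls in the
    main block, e.g.:

        write_result(lr, 1)   # writes Prediction_1.csv from the logistic regression model
        write_result(bst, 3)  # writes Prediction_3.csv from the XGBoost model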
    
    

    Accuracy

    Logistic Regression: 78.770%
    Random Forest: 97.935%
    XGBoost: 85.772%
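
    A caveat on these numbers: load_data replicates every sample five times with
    np.tile before train_test_split is called, so identical copies of the same
    passengers end up on both sides of the split. This mostly benefits the random
    forest, which can memorize duplicated rows, and is the likely reason its score
    is so much higher than the other two. A minimal sketch of a leakage-free
    comparison (not part of the original script) would split before any replication,
    e.g. inside the same main block:

    x, y = load_data('train.csv', True)
    n = len(x) // 5  # keep only the first copy of the replicated rows
    x_train, x_test, y_train, y_test = train_test_split(x[:n], y[:n], test_size=0.5, random_state=1)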

    Repost

    Original author: lovesoft5
