美文网首页
生存预测 : kaggle titanic 泰坦尼克号 逻辑回归

生存预测 : kaggle titanic 泰坦尼克号 逻辑回归

作者: scpy | 来源:发表于2019-01-08 18:05 被阅读0次

目录

  • test.csv
  • train.csv
  • titanic.py

数据集

https://www.kaggle.com/c/titanic/data

titanic.py

import tensorflow as tf
import pandas as pd

# Paths to the Kaggle Titanic CSVs (https://www.kaggle.com/c/titanic/data).
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
# Training hyperparameters.
LEARNING_RATE = 0.01 # 0.01
EPOCH_NUM = 15 # 15
BATCH_SIZE = 100 # 100
# Directory where TensorBoard event files are written.
LOGS_PATH = 'tensorboard_logs'
# Column glossary for the Titanic dataset (a bare string statement used as a
# comment by the original author; kept verbatim).
'''
PassengerId => 乘客ID
Pclass => 乘客等级(1/2/3等舱位)
Name => 乘客姓名
Sex => 性别
Age => 年龄
SibSp => 堂兄弟/妹个数
Parch => 父母与小孩个数
Ticket => 船票信息
Fare => 票价
Cabin => 客舱
Embarked => 登船港口
'''

def preprocess_data(path, is_test=False):
    """Load a Titanic CSV and return a cleaned, min-max-normalised DataFrame.

    :param path: path to a Kaggle Titanic CSV (must have a PassengerId column)
    :param is_test: when True, fill missing values with 0 instead of dropping
        rows (the Kaggle submission needs a prediction for every test row)
    :return: DataFrame indexed by PassengerId; feature columns normalised via
        (x - mean) / (max - min); the Survived column, when present, is kept
        as raw 0/1 labels
    """
    data = pd.read_csv(path, index_col='PassengerId')
    # Free-text / high-cardinality columns are not used by the model.
    data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
    if is_test:
        # Every test row must survive preprocessing, so fill NaNs with 0.
        data = data.replace([None], [0])
    else:
        # Training rows with missing Age/Embarked are simply dropped.
        data = data[pd.notnull(data['Age'])]
        data = data[pd.notnull(data['Embarked'])]
    # Encode the categorical columns as small integers.
    data.replace(["female", "male"], [0, 1], inplace=True)
    data.replace(["Q", "C", "S"], [0, 1, 2], inplace=True)
    if "Survived" in data:
        data = data[pd.notnull(data['Survived'])]
    # BUG FIX: normalise only the feature columns.  The original code also
    # normalised the Survived labels into shifted floats; downstream
    # pd.get_dummies only recovered two classes in the right order by accident.
    # NOTE(review): the test set is normalised with its own statistics rather
    # than the training statistics — kept as-is to preserve the pipeline.
    if "Survived" in data:
        labels = data['Survived']
        data = data.drop(columns=['Survived'])
    else:
        labels = None
    data_norm = (data - data.mean()) / (data.max() - data.min())
    if labels is not None:
        data_norm.insert(0, 'Survived', labels)
    return data_norm


def next_batch(df, i=None):
    """Return one batch of *df* as numpy arrays.

    :param df: pandas DataFrame (features, optionally with a Survived column)
    :param i: batch index; None means "the whole frame as a single batch"
    :return: (x, y) tuple when the frame has a Survived column (y is the
        one-hot encoding of the labels), otherwise just the feature array x
    """
    if i is None:
        start, end = 0, df.shape[0]
    else:
        start = BATCH_SIZE * i
        end = BATCH_SIZE * (i + 1)
    # Copy so that popping the label column never mutates the caller's frame.
    result = df[start:end].copy()
    if "Survived" in result:
        # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0;
        # to_numpy() is the supported replacement.
        batch_ys = pd.get_dummies(result.pop('Survived').values).to_numpy()
        batch_xs = result.to_numpy()
        return batch_xs, batch_ys
    return result.to_numpy()


def split_dataset(df, test_part=None):
    """Split *df* into a training part and a held-out test part.

    :param test_part: fraction (0..1) of rows to hold out; defaults to 0.15
    :param df: pandas DataFrame to split
    :return: (training DataFrame, test DataFrame) — the first rows of *df*
        become the test set, the remainder the training set
    """
    fraction = 0.15 if test_part is None else test_part
    cutoff = int(df.shape[0] * fraction)
    return df[cutoff:], df[:cutoff]


# --- Build the computation graph (placeholders, variables, ops). ---
# NOTE(review): this is the TensorFlow 1.x graph/session API (tf.placeholder,
# tf.train.GradientDescentOptimizer); it does not run on TF 2.x without
# tf.compat.v1 and disabling v2 behavior.
dataset = preprocess_data(TRAIN_PATH)

# Hold part of the training CSV out for accuracy evaluation below.
training_dataset, test_narray = split_dataset(dataset)

# 7 input features per passenger, 2 output classes (died / survived).
x = tf.placeholder(tf.float32, [None, 7], name='InputData')
y = tf.placeholder(tf.float32, [None, 2], name='TargetData')

# Logistic-regression parameters, initialised to zero.
W = tf.Variable(tf.zeros([7, 2]), name='Weights')
b = tf.Variable(tf.zeros([2]), name='Bias')

with tf.name_scope('Model'):  # name_scope groups the ops for TensorBoard
    pred = tf.nn.softmax(tf.matmul(x, W) + b)

with tf.name_scope('Loss'):
    # Cross-entropy; the 1e-10 epsilon guards against log(0).
    cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred + 1e-10), reduction_indices=1))

with tf.name_scope('GDS'):
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(cost)

with tf.name_scope('Accuracy'):
    # Fraction of rows where the argmax prediction matches the one-hot label.
    acc = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))

init = tf.global_variables_initializer()

# Scalar summaries for TensorBoard.
tf.summary.scalar("loss", cost)
tf.summary.scalar("accuracy", acc)
merged_summary = tf.summary.merge_all()

with tf.Session() as sess:
    sess.run(init)

    # TensorBoard writer for the loss/accuracy summaries built above.
    log_writer = tf.summary.FileWriter(LOGS_PATH, graph=tf.get_default_graph())
    training_dataset_size = training_dataset.shape[0]
    for epoch in range(EPOCH_NUM):
        avg_cost = 0.
        # Integer division: rows beyond the last full batch are skipped.
        total_batch = int(training_dataset_size / BATCH_SIZE)

        for i in range(total_batch):
            batch_xs, batch_ys = next_batch(training_dataset, i)
            # One gradient step; also fetch the loss and the merged summaries.
            _, c, summary = sess.run([optimizer, cost, merged_summary], feed_dict={x: batch_xs, y: batch_ys})
            log_writer.add_summary(summary, epoch * total_batch + i)
            avg_cost += c / total_batch

        print("Epoch:", '%d' % (epoch + 1), "cost=", "{0}".format(avg_cost))

    # Evaluate on the held-out split of the training CSV.
    test_x, test_y = next_batch(test_narray)
    print("Accuracy:", acc.eval({x: test_x, y: test_y}))

    # Predict the Kaggle test set and write the submission file.
    test_df = preprocess_data(TEST_PATH, is_test=True)
    indexes = test_df.index.values
    test_narray = next_batch(test_df)
    feed_dict = {x: test_narray}
    predict_proba = pred.eval(feed_dict)
    # NOTE(review): tf.argmax's `dimension` kwarg is deprecated in favour of
    # `axis` — left as-is to keep the code byte-identical.
    predictions = tf.argmax(predict_proba, dimension=1).eval()

    with open("gender_submission.csv", "w") as f:
        f.write("PassengerId,Survived\n")
        for index, prediction in zip(indexes, predictions):
            f.write("{0},{1}\n".format(index, prediction))

准确率

Accuracy: 0.8113208

转载

原文作者 : no one

相关文章

网友评论

      本文标题:生存预测 : kaggle titanic 泰坦尼克号 逻辑回归

      本文链接:https://www.haomeiwen.com/subject/gupbrqtx.html