import matplotlib as mpl
import matplotlib. pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
# Show every column when printing a DataFrame.
pd.set_option('display.max_columns', None)
# Show every row when printing a DataFrame.
pd.set_option('display.max_rows', None)
# Widen the per-cell display width to 100 characters (the default is 50).
# The fully qualified option name is used: abbreviated names are resolved by
# pattern matching and can become ambiguous if pandas adds similar options.
pd.set_option('display.max_colwidth', 100)
# print(tf.__version__)
# print(sys. version_info)
# for module in np, pd, sklearn, tf, keras:
# print(module.__name__, module.__version__)
# Locations of the training and evaluation CSV files.
train_file = "./data1/train.csv"
eval_file = "./data1/eval.csv"

# Load both splits (training first, then evaluation) into DataFrames.
train_df, eval_df = (
    pd.read_csv(csv_path) for csv_path in (train_file, eval_file)
)

# Detach the 'survived' label column from each feature frame.
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')

# Optional data exploration, left disabled:
# print(train_df)
# print(eval_df.head())
# print(y_train.head())
# print(y_eval.head())
# train_df.age.hist(bins=20)  # histogram of ages split into 20 bins
# plt.show()
# Passenger counts per sex as a horizontal bar chart
# (kind='barh' is horizontal, kind='bar' is vertical):
# train_df["sex"].value_counts().plot(kind='barh')
# plt.show()
# Passenger counts per class:
# train_df['class'].value_counts().plot(kind='barh')
# plt.show()

# Re-attach the labels to the features, group the rows by sex and take the
# mean of 'survived' per group (the survival rate per sex, assuming the label
# is 0/1).
labelled_train_df = pd.concat([train_df, y_train], axis=1)
sex_Prob = labelled_train_df.groupby('sex')['survived'].mean()
# Draw the per-sex result as a horizontal bar chart.
sex_Prob.plot(kind="barh")
# Feature columns describe how each raw input is turned into model input:
# discrete features are one-hot encoded, and continuous features can be
# bucketized into discrete ones when needed.
# Split the nine input features into categorical and numeric groups.
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []
for categorical_column in categorical_columns:
    # The vocabulary is every distinct value observed in the training data.
    vocab = train_df[categorical_column].unique()
    # indicator_column wraps the vocabulary column as a one-hot encoding.
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)
        )
    )
# Continuous features are used directly as float32 values.
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(numeric_column, dtype=tf.float32)
    )
def make_dataset(data_df, label_df, epochs=10, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        data_df: Feature table (x). It is converted with ``dict(data_df)``,
            so each column becomes one named feature.
        label_df: Labels (y), sliced in step with the rows of ``data_df``.
        epochs: Number of times the data is repeated.
        shuffle: Whether to shuffle the elements (buffer size 10000) before
            repeating and batching.
        batch_size: Number of elements per batch.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    # Repeat happens before batching, so a batch may span two repetitions
    # and only the very last batch can be smaller than batch_size.
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset
# A small batch size keeps the printed sample output short.
train_dataset = make_dataset(train_df, y_train, batch_size=5)

# Show what keras.layers.DenseFeatures produces for individual columns.
for x, y in train_dataset.take(1):
    # Index 7 is 'age': the seven categorical columns are appended first
    # (indices 0-6), followed by the numeric columns.
    age_column = feature_columns[7]
    # Index 0 is 'sex', the first categorical column.
    gender_column = feature_columns[0]
    print(keras.layers.DenseFeatures(age_column)(x).numpy())
    print(keras.layers.DenseFeatures(gender_column)(x).numpy())

# Show the dense representation of all feature columns for one batch.
for x, y in train_dataset.take(1):
    print(keras.layers.DenseFeatures(feature_columns)(x).numpy())
# Classifier: feature columns -> two hidden layers of 100 ReLU units ->
# 2-way softmax output.
model = keras.models.Sequential([
    keras.layers.DenseFeatures(feature_columns),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(2, activation='softmax'),
])
# The sparse loss is paired with the 2-unit softmax output, so the labels are
# expected to be integer class indices rather than one-hot vectors.
# The metric name must be exactly 'accuracy' (no surrounding whitespace), and
# `learning_rate` replaces the deprecated `lr` keyword.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.SGD(learning_rate=0.01),
    metrics=['accuracy'],
)
# Training configuration shared by the datasets and by model.fit.
TRAIN_BATCH_SIZE = 32
TRAIN_EPOCHS = 100

train_dataset = make_dataset(
    train_df, y_train, epochs=TRAIN_EPOCHS, batch_size=TRAIN_BATCH_SIZE)
eval_dataset = make_dataset(
    eval_df, y_eval, epochs=1, shuffle=False, batch_size=TRAIN_BATCH_SIZE)

# Derive the step counts from the data instead of hard-coding them.
# Using floor(rows / batch_size) steps per epoch guarantees that
# epochs * steps never exceeds the number of batches the repeated dataset
# can supply, so training does not stop early with "ran out of data".
# max(1, ...) keeps the step count valid for inputs smaller than one batch.
history = model.fit(
    train_dataset,
    validation_data=eval_dataset,
    steps_per_epoch=max(1, len(train_df) // TRAIN_BATCH_SIZE),
    validation_steps=max(1, len(eval_df) // TRAIN_BATCH_SIZE),
    epochs=TRAIN_EPOCHS)
# (The "reader comments" section of the original web page followed here; it is not part of the script.)