大纲
- 代码实践
- 待解决的问题
1. 代码实践
本文尝试了tensorflow自带的BoostedTreesClassifier API。
import argparse
import json
import os
import shutil
import sys
import time

import numpy as np
import pandas as pd

# pylint: disable=wrong-import-order
from absl import app as absl_app
from absl import flags
from six.moves import urllib

import tensorflow as tf
搭建特征
class DataPreprocesser(object):
    """Builds tf.feature_column definitions from a training CSV.

    Vocabularies for the categorical features are extracted once from the
    training data at construction time and reused by build_model_columns().
    """

    def __init__(self, train_file, categorical_columns=None):
        """Load the training data and collect categorical vocabularies.

        Args:
            train_file: path to the training CSV consumed by data_preprocess().
            categorical_columns: iterable of categorical feature names.
                Defaults to the module-level `cols` list so existing callers
                keep working unchanged.
        """
        self.train_file = train_file
        # Backward compatible: fall back to the module-level `cols` global.
        self.categorical_columns = (
            cols if categorical_columns is None else categorical_columns)
        self._categorical_column_vocab()

    def _categorical_column_vocab(self):
        """Scan the training data and record the distinct non-null values of
        every categorical feature in self.categorical_vocabs."""
        self.train_data, labels = data_preprocess(self.train_file, train=True)
        self.categorical_vocabs = {}
        for feature in self.categorical_columns:
            vocab = list(self.train_data[feature].dropna().unique())
            # Print a small sample of each vocabulary for sanity checking.
            print(feature, vocab[:10])
            self.categorical_vocabs[feature] = vocab

    def build_model_columns(self):
        """Return the list of feature columns used by the estimator.

        BoostedTreesClassifier in TF 1.12 only supports indicator and
        bucketized columns (see the notes at the end of this file), so:
        categorical features become indicator columns; the single numeric
        feature with known bucket boundaries ("house_month_ctr") becomes a
        bucketized column; every other numeric feature is intentionally
        skipped via the `continue` below.
        """
        feature_columns = []
        for feature in real_data_columns:
            if feature in self.categorical_columns:
                _column = tf.feature_column.categorical_column_with_vocabulary_list(
                    feature, self.categorical_vocabs[feature])
                feature_columns.append(tf.feature_column.indicator_column(_column))
            else:
                _column = tf.feature_column.numeric_column(feature)
                if feature == "house_month_ctr":
                    # Boundaries 0.00 .. 24.99 in 0.01 steps.
                    _bucket_ranges = [i / 100.0 for i in range(2500)]
                else:
                    # No bucket boundaries defined for this feature -- drop it.
                    continue
                feature_columns.append(
                    tf.feature_column.bucketized_column(_column, _bucket_ranges))
        return feature_columns
搭建模型
这里用了tf.estimator.LinearClassifier做baseline
def build_estimator(model_dir, model_type, model_column_fn, inter_op, intra_op):
    """Build an estimator appropriate for the given model type.

    Args:
        model_dir: checkpoint directory; wiped and recreated on every call so
            each run starts from scratch.
        model_type: 'lr' selects LinearClassifier; any other value selects
            BoostedTreesClassifier.
        model_column_fn: zero-argument callable returning the feature columns.
        inter_op: inter-op parallelism thread count (0 lets TF decide).
        intra_op: intra-op parallelism thread count (0 lets TF decide).

    Returns:
        A configured tf.estimator.Estimator instance.
    """
    feature_columns = model_column_fn()
    # Start from a clean model directory so stale checkpoints never leak in.
    if os.path.isdir(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)
    # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
    # trains faster than GPU for this model.
    run_config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(
            device_count={'GPU': 0},
            inter_op_parallelism_threads=inter_op,
            intra_op_parallelism_threads=intra_op))
    print("model_type: %s" % model_type)
    if model_type == 'lr':
        # Earlier experiments also tried GradientDescent and Ftrl at lr=0.01;
        # Adam at 0.001 is what's kept.
        return tf.estimator.LinearClassifier(
            model_dir=model_dir,
            feature_columns=feature_columns,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
            config=run_config)
    return tf.estimator.BoostedTreesClassifier(
        feature_columns=feature_columns,
        n_batches_per_layer=50,  # roughly 0.5 * NUM_EXAMPLES / BATCH_SIZE
        model_dir=model_dir,
        n_classes=2,
        n_trees=100,
        max_depth=6,
        learning_rate=0.1,
        l1_regularization=0.,
        l2_regularization=0.,
        config=run_config)
准备数据
def data_preprocess(data_file, train=False):
    """Load a feature CSV, normalize missing values, and split off labels.

    Args:
        data_file: CSV path; columns are named by the module-level
            `all_feature_columns` list.
        train: when True, drop rows with fewer than 25 non-null values
            (too sparse to train on).

    Returns:
        (data, labels): the feature DataFrame (label and query_id removed)
        and the integer label Series.
    """
    data = pd.read_csv(data_file, names=all_feature_columns, low_memory=False)
    data.sort_values("query_id", inplace=True)
    # Upstream encodes "missing" as -1 (numeric or string); normalize it.
    # NOTE(review): pd.NaT is the datetime missing marker -- it works as a
    # generic null here, but np.nan would be more conventional; confirm
    # before changing, as dtype promotion could differ.
    data.replace(-1, pd.NaT, inplace=True)
    data.replace("-1", pd.NaT, inplace=True)
    if train:
        data.dropna(thresh=25, inplace=True)
    data = data[real_data_columns]
    labels = data["label"].astype(int)
    del data['label']  # what remains after these deletes is the feature matrix
    del data["query_id"]
    for col in cols:
        # BUGFIX: `np.str` was removed in NumPy 1.20+ and raises
        # AttributeError; string columns carry the `object` dtype in pandas,
        # so test against that instead before casting.
        if data[col].dtype != object:
            data[col] = data[col].astype("str")
    print("data shape:", data.shape)
    return data, labels
def input_fn(data_file, num_epochs, shuffle, batch_size, train=False):
    """Create a pandas-backed input function for tf.estimator train/eval.

    Args:
        data_file: CSV path passed through to data_preprocess().
        num_epochs: number of passes over the data (None = forever).
        shuffle: whether to shuffle examples.
        batch_size: examples per batch.
        train: forwarded to data_preprocess() (enables sparse-row dropping).

    Returns:
        A zero-argument input function suitable for Estimator.train/evaluate.
    """
    data, labels = data_preprocess(data_file, train)
    # (Removed an unused `data_columns` local that the original computed.)
    return tf.estimator.inputs.pandas_input_fn(
        x=data,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=5)
训练模型
def run_loop(train_input_fn, eval_input_fn, model_column_fn,
             build_estimator_fn, flags_obj):
    """Build the model, then alternate training and evaluation.

    Runs train_epochs // epochs_between_evals cycles; each cycle trains on
    `train_input_fn`, evaluates on `eval_input_fn`, and prints the metrics
    in sorted key order.
    """
    model = build_estimator_fn(
        model_dir=flags_obj.model_dir, model_type=flags_obj.model_type,
        model_column_fn=model_column_fn,
        inter_op=0,
        intra_op=0)
    # Train and evaluate the model every `flags.epochs_between_evals` epochs.
    n_cycles = flags_obj.train_epochs // flags_obj.epochs_between_evals
    for _ in range(n_cycles):
        print("============================")
        model.train(input_fn=train_input_fn)
        print("============================")
        eval_metrics = model.evaluate(input_fn=eval_input_fn)
        for metric_name in sorted(eval_metrics):
            print("%s: %s" % (metric_name, eval_metrics[metric_name]))
程序入口
def main(_):
    """Entry point: build feature columns from the training day's data, then
    train and evaluate per the parsed FLAGS."""
    train_file = "./data_dir/20190316_data.csv"
    test_file = "./data_dir/20190317_data_eval.csv"
    flags_obj = FLAGS
    # BUGFIX: DataPreprocesser and input_fn are defined in THIS module; the
    # original referenced them through an undefined `data_preprocesser`
    # module name, which raises NameError at runtime.
    dp = DataPreprocesser(train_file)
    # NOTE(review): batch_size=32 is hard-coded here even though a
    # --batch_size flag (default 256) is parsed; confirm intent before
    # wiring the flag through.
    run_loop(
        train_input_fn=input_fn(train_file, num_epochs=2, shuffle=True,
                                batch_size=32),
        eval_input_fn=input_fn(test_file, 1, False, batch_size=32),
        model_column_fn=dp.build_model_columns,
        build_estimator_fn=build_estimator,
        flags_obj=flags_obj)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Allow --flag=true/--flag=false style parsing for bool-typed arguments.
    parser.register("type", "bool", lambda v: v.lower() == "true")
    parser.add_argument(
        "--model_dir",
        type=str,
        default="./model_dir",
        help="Base directory for output models."
    )
    parser.add_argument(
        "--model_type",
        type=str,
        default="lr",
        help="Valid model types: {'lr', 'boosted_tree'}."
    )
    parser.add_argument(
        "--train_epochs",
        type=int,
        default=2,
        help="Number of training epochs."
    )
    parser.add_argument(
        "--epochs_between_evals",
        type=int,
        default=2,
        help="Number of eval epochs."
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="batch_size."
    )
    parser.add_argument(
        "--train_data",
        type=str,
        default="",
        help="Path to the training data."
    )
    parser.add_argument(
        "--test_data",
        type=str,
        default="",
        help="Path to the test data."
    )
    # Keep unrecognized flags and pass them through to tf.app.run so TF's own
    # flags still work.
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
2. 待解决的问题
feature_column
目前用的是 TensorFlow 1.12.0 版本,tf.estimator.BoostedTreesClassifier 只支持 tf.feature_column.indicator_column 和 tf.feature_column.bucketized_column,因此特征处理不够灵活,模型效果下降。
训练速度
当数据量增大后,性能特别慢。
网友评论