tf.estimator.BoostedTreesClassifier

Author: 张虾米试错 | Published 2019-04-01 21:29

Outline

  1. Code walkthrough
  2. Open issues

1. Code walkthrough

This post tries out the BoostedTreesClassifier API that ships with TensorFlow.

import os
import sys
import shutil
import argparse
import pandas as pd
import numpy as np
import time
import json

# pylint: disable=wrong-import-order
from absl import app as absl_app
from absl import flags
from six.moves import urllib
import tensorflow as tf

Building the feature columns

class DataPreprocesser(object):
    def __init__(self, train_file):
        self.train_file = train_file
        # `cols` is a module-level list of the categorical feature names.
        self.categorical_columns = cols
        self._categorical_column_vocab()

    def _categorical_column_vocab(self):
        # Build a vocabulary list for every categorical feature from the training data.
        self.train_data, labels = data_preprocess(self.train_file, train=True)
        self.categorical_vocabs = {}
        for feature in self.categorical_columns:
            vocab = list(self.train_data[feature].dropna().unique())
            print(feature, vocab[:10])
            self.categorical_vocabs[feature] = vocab

    def build_model_columns(self):
        feature_columns = []
        for feature in real_data_columns:
            if feature in self.categorical_columns:
                # Categorical features: vocabulary list wrapped in an indicator column.
                _column = tf.feature_column.categorical_column_with_vocabulary_list(feature, self.categorical_vocabs[feature])
                feature_columns.append(tf.feature_column.indicator_column(_column))
            else:
                # Numeric features: only house_month_ctr is bucketized; the rest are skipped.
                _column = tf.feature_column.numeric_column(feature)
                if feature == "house_month_ctr":
                    _bucket_ranges = [i / 100.0 for i in range(2500)]
                else:
                    continue
                feature_columns.append(tf.feature_column.bucketized_column(_column, _bucket_ranges))
        return feature_columns
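
The column lists referenced above (all_feature_columns, real_data_columns, cols) are module-level globals that the post never shows; the dropna(thresh=25) call later in the post implies the raw CSV has dozens of columns. A minimal sketch of what they might look like, with hypothetical feature names (only house_month_ctr appears in the original code):

# Hypothetical column definitions -- the real lists are not shown in the post.
all_feature_columns = ["query_id", "label", "city", "house_type", "house_month_ctr"]  # every raw CSV column, in file order
real_data_columns = ["query_id", "label", "city", "house_type", "house_month_ctr"]    # subset kept for modeling
cols = ["city", "house_type"]                                                         # the categorical features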

Building the model

tf.estimator.LinearClassifier is used here as a baseline.

def build_estimator(model_dir, model_type, model_column_fn, inter_op, intra_op):
  """Build an estimator appropriate for the given model type."""
  feature_columns = model_column_fn()
  if os.path.isdir(model_dir):
      shutil.rmtree(model_dir)
  os.makedirs(model_dir)

  # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
  # trains faster than GPU for this model.
  run_config = tf.estimator.RunConfig().replace(
      session_config=tf.ConfigProto(device_count={'GPU': 0},
                                    inter_op_parallelism_threads=inter_op,
                                    intra_op_parallelism_threads=intra_op))
  print ("model_type: %s" % model_type)
  if model_type == 'lr':
    return tf.estimator.LinearClassifier(
        model_dir=model_dir,
        feature_columns=feature_columns,
        #optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.01),
        #optimizer=tf.train.FtrlOptimizer(learning_rate=0.01),
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
        config=run_config)
  else:
    return tf.estimator.BoostedTreesClassifier(
        feature_columns=feature_columns,
        n_batches_per_layer=50, #0.5 * NUM_EXAMPLES / BATCH_SIZE
        model_dir=model_dir,
        n_classes=2,
        n_trees=100,
        max_depth=6,
        learning_rate=0.1,
        l1_regularization=0.,
        l2_regularization=0.,
        config=run_config)
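
The value n_batches_per_layer=50 follows the rule of thumb in the inline comment (roughly half an epoch of batches per tree layer). A quick sketch of that arithmetic, with an assumed training-set size:

# Only the formula comes from the comment above; NUM_EXAMPLES is an assumed value.
NUM_EXAMPLES = 25600   # hypothetical number of training rows
BATCH_SIZE = 256       # the script's default --batch_size
n_batches_per_layer = int(0.5 * NUM_EXAMPLES / BATCH_SIZE)   # = 50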

Preparing the data

def data_preprocess(data_file, train=False):
    data = pd.read_csv(data_file, names=all_feature_columns, low_memory=False)
    data.sort_values("query_id", inplace=True)
    # -1 (numeric or string) marks missing values in the raw data; normalize to NaN.
    data.replace(-1, np.nan, inplace=True)
    data.replace("-1", np.nan, inplace=True)
    if train:
        # Keep only rows with at least 25 non-missing columns.
        data.dropna(thresh=25, inplace=True)
    data = data[real_data_columns]
    labels = data["label"].astype(int)
    del data["label"]  # after dropping the label, what remains is the feature set
    del data["query_id"]
    for col in cols:
        # Categorical columns must be strings for the vocabulary-list feature columns.
        if data[col].dtype != np.str:
            data[col] = data[col].astype("str")
    print("data shape:", data.shape)
    return data, labels

def input_fn(data_file, num_epochs, shuffle, batch_size, train=False):
    data, labels = data_preprocess(data_file, train)
    data_columns = data.columns.values.tolist()
    return tf.estimator.inputs.pandas_input_fn(
            x=data,
            y=labels,
            batch_size=batch_size,
            num_epochs=num_epochs,
            shuffle=shuffle,
            num_threads=5)
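
The callable returned by tf.estimator.inputs.pandas_input_fn builds a queue-based pipeline, so one easy sanity check is to pull a single small batch by hand. A sketch, assuming the training CSV path hard-coded in main() below:

# Sketch: fetch one batch to verify the feature dict and labels look sensible.
check_fn = input_fn("./data_dir/20190316_data.csv", num_epochs=1, shuffle=False, batch_size=4, train=False)
features, labels = check_fn()
with tf.train.MonitoredSession() as sess:   # starts the queue runners pandas_input_fn registers
    f, l = sess.run([features, labels])
    print({k: v[:2] for k, v in f.items()}, l[:2])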

Training the model

def run_loop(train_input_fn, eval_input_fn, model_column_fn,
        build_estimator_fn, flags_obj):
  model = build_estimator_fn(
      model_dir=flags_obj.model_dir, model_type=flags_obj.model_type,
      model_column_fn=model_column_fn,
      inter_op=0,
      intra_op=0)

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    print ("============================")
    model.train(input_fn=train_input_fn)
    print ("============================")

    results = model.evaluate(input_fn=eval_input_fn)

    for key in sorted(results):
      print("%s: %s" % (key, results[key]))

Entry point

def main(_):
  train_file = "./data_dir/20190316_data.csv"
  test_file = "./data_dir/20190317_data_eval.csv"
  flags_obj = FLAGS

  # DataPreprocesser and input_fn are the ones defined above.
  dp = DataPreprocesser(train_file)
  run_loop(train_input_fn=input_fn(train_file, num_epochs=2, shuffle=True, batch_size=32),
          eval_input_fn=input_fn(test_file, 1, False, batch_size=32),
          model_column_fn=dp.build_model_columns,
          build_estimator_fn=build_estimator,
          flags_obj=flags_obj)


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.register("type", "bool", lambda v: v.lower() == "true")
  parser.add_argument(
      "--model_dir",
      type=str,
      default="./model_dir",
      help="Base directory for output models."
  )
  parser.add_argument(
      "--model_type",
      type=str,
      default="lr",
      help="Valid model types: {'lr', 'boosted_tree'}."
  )
  parser.add_argument(
      "--train_epochs",
      type=int,
      default=2,
      help="Number of training epochs."
  )
  parser.add_argument(
      "--epochs_between_evals",
      type=int,
      default=2,
      help="Number of eval epochs."
  )
  parser.add_argument(
      "--batch_size",
      type=int,
      default=256,
      help="batch_size."
  )
  parser.add_argument(
      "--train_data",
      type=str,
      default="",
      help="Path to the training data."
  )
  parser.add_argument(
      "--test_data",
      type=str,
      default="",
      help="Path to the test data."
  )
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

2. Open issues

feature_column

This uses TensorFlow 1.12.0, where tf.estimator.BoostedTreesClassifier only supports tf.feature_column.indicator_column and tf.feature_column.bucketized_column. Feature handling is therefore inflexible, and model quality suffers.
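
Concretely, a numeric signal has to be wrapped in bucketized_column and a categorical signal in indicator_column before the 1.12 estimator will accept it. A small sketch with hypothetical feature names:

# Hypothetical feature names; only these two wrapper types are accepted by the 1.12 estimator.
price = tf.feature_column.numeric_column("price")
price_buckets = tf.feature_column.bucketized_column(price, boundaries=[10.0, 50.0, 100.0])

city = tf.feature_column.categorical_column_with_vocabulary_list("city", ["beijing", "shanghai"])
city_onehot = tf.feature_column.indicator_column(city)

tree_columns = [price_buckets, city_onehot]   # what BoostedTreesClassifier can consume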

Training speed

Once the data volume grows, training becomes extremely slow.
