tf.estimator.BoostedTreesClassifier

Author: 张虾米试错 | Published 2019-04-01 21:29

    Outline

    1. Code walkthrough
    2. Open issues

    1. Code walkthrough

    This post tries out TensorFlow's built-in BoostedTreesClassifier API.
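
    Before the full pipeline, here is a minimal, self-contained sketch of how the estimator is used end to end on synthetic data (the feature name "x", the boundaries, and the sizes are made up for illustration; the real feature engineering follows below):

    # Toy end-to-end run of BoostedTreesClassifier (TF 1.x estimator API).
    import numpy as np
    import tensorflow as tf

    x = np.random.rand(1000).astype(np.float32)   # one synthetic numeric feature
    y = (x > 0.5).astype(np.int32)                # binary label

    # The estimator only accepts bucketized/indicator columns, so bucketize the
    # numeric feature with explicit boundaries.
    fc = tf.feature_column.bucketized_column(
        tf.feature_column.numeric_column("x"),
        boundaries=[i / 10.0 for i in range(1, 10)])

    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": x}, y=y, batch_size=100, num_epochs=None, shuffle=True)
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": x}, y=y, batch_size=100, num_epochs=1, shuffle=False)

    model = tf.estimator.BoostedTreesClassifier(
        feature_columns=[fc],
        n_batches_per_layer=5,   # roughly 0.5 * NUM_EXAMPLES / BATCH_SIZE
        n_trees=10,
        max_depth=3)

    model.train(input_fn=train_input_fn, max_steps=100)
    print(model.evaluate(input_fn=eval_input_fn))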

    import os
    import sys
    import shutil
    import argparse
    import pandas as pd
    import numpy as np
    import time
    import json

    # pylint: disable=wrong-import-order
    from absl import app as absl_app
    from absl import flags
    from six.moves import urllib
    import tensorflow as tf
    

    Building the features

    class DataPreprocesser(object):
        def __init__(self, train_file):
            self.train_file = train_file
            # `cols` is a module-level list of the categorical feature names.
            self.categorical_columns = cols
            self._categorical_column_vocab()

        def _categorical_column_vocab(self):
            # Build a vocabulary for each categorical column from the training data.
            self.train_data, labels = data_preprocess(self.train_file, train=True)
            self.categorical_vocabs = {}
            for feature in self.categorical_columns:
                vocab = list(self.train_data[feature].dropna().unique())
                print(feature, vocab[:10])
                self.categorical_vocabs[feature] = vocab

        def build_model_columns(self):
            # BoostedTreesClassifier only accepts indicator/bucketized columns:
            # categorical features are one-hot encoded via indicator_column,
            # numeric features are bucketized with explicit boundaries.
            feature_columns = []
            for feature in real_data_columns:
                if feature in self.categorical_columns:
                    _column = tf.feature_column.categorical_column_with_vocabulary_list(feature, self.categorical_vocabs[feature])
                    feature_columns.append(tf.feature_column.indicator_column(_column))
                else:
                    _column = tf.feature_column.numeric_column(feature)
                    if feature == "house_month_ctr":
                        _bucket_ranges = [i / 100.0 for i in range(2500)]
                    else:
                        # Numeric features without known bucket boundaries are skipped.
                        continue
                    feature_columns.append(tf.feature_column.bucketized_column(_column, _bucket_ranges))
            return feature_columns
    

    Building the model

    A tf.estimator.LinearClassifier is used here as the baseline.

    def build_estimator(model_dir, model_type, model_column_fn, inter_op, intra_op):
      """Build an estimator appropriate for the given model type."""
      feature_columns = model_column_fn()
      if os.path.isdir(model_dir):
          shutil.rmtree(model_dir)
      os.makedirs(model_dir)
    
      # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
      # trains faster than GPU for this model.
      run_config = tf.estimator.RunConfig().replace(
          session_config=tf.ConfigProto(device_count={'GPU': 0},
                                        inter_op_parallelism_threads=inter_op,
                                        intra_op_parallelism_threads=intra_op))
      print ("model_type: %s" % model_type)
      if model_type == 'lr':
        return tf.estimator.LinearClassifier(
            model_dir=model_dir,
            feature_columns=feature_columns,
            #optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.01),
            #optimizer=tf.train.FtrlOptimizer(learning_rate=0.01),
            optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
            config=run_config)
      else:
        return tf.estimator.BoostedTreesClassifier(
            feature_columns=feature_columns,
            n_batches_per_layer=50, #0.5 * NUM_EXAMPLES / BATCH_SIZE
            model_dir=model_dir,
            n_classes=2,
            n_trees=100,
            max_depth=6,
            learning_rate=0.1,
            l1_regularization=0.,
            l2_regularization=0.,
            config=run_config)
    

    Preparing the data

    def data_preprocess(data_file, train=False):
        data = pd.read_csv(data_file, names=all_feature_columns, low_memory=False)
        data.sort_values("query_id", inplace=True)
        # -1 marks a missing value in the raw data; map it to NaN.
        data.replace(-1, np.nan, inplace=True)
        data.replace("-1", np.nan, inplace=True)
        if train:
            # Keep only training rows with at least 25 non-missing fields.
            data.dropna(thresh=25, inplace=True)
        data = data[real_data_columns]
        labels = data["label"].astype(int)
        del data['label']  # once the label is removed, what remains are the features
        del data["query_id"]
        # Categorical columns must be strings for the vocabulary feature columns.
        for col in cols:
            data[col] = data[col].astype(str)
        print("data shape:", data.shape)
        return data, labels
    
    def input_fn(data_file, num_epochs, shuffle, batch_size, train=False):
        data, labels = data_preprocess(data_file, train)
        data_columns = data.columns.values.tolist()
        return tf.estimator.inputs.pandas_input_fn(
                x=data,
                y=labels,
                batch_size=batch_size,
                num_epochs=num_epochs,
                shuffle=shuffle,
                num_threads=5)
    

    Training the model

    def run_loop(train_input_fn, eval_input_fn, model_column_fn,
            build_estimator_fn, flags_obj):
      model = build_estimator_fn(
          model_dir=flags_obj.model_dir, model_type=flags_obj.model_type,
          model_column_fn=model_column_fn,
          inter_op=0,
          intra_op=0)
    
      # Train and evaluate the model every `flags.epochs_between_evals` epochs.
      for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
        print ("============================")
        model.train(input_fn=train_input_fn)
        print ("============================")
    
        results = model.evaluate(input_fn=eval_input_fn)
    
        for key in sorted(results):
          print("%s: %s" % (key, results[key]))
    

    Entry point

    def main(_):
      train_file = "./data_dir/20190316_data.csv"
      test_file = "./data_dir/20190317_data_eval.csv"
      flags_obj = FLAGS

      # DataPreprocesser and input_fn are the ones defined above.
      dp = DataPreprocesser(train_file)
      run_loop(train_input_fn=input_fn(train_file, num_epochs=2, shuffle=True, batch_size=32),
              eval_input_fn=input_fn(test_file, 1, False, batch_size=32),
              model_column_fn=dp.build_model_columns,
              build_estimator_fn=build_estimator,
              flags_obj=flags_obj)
    
    
    if __name__ == '__main__':
      parser = argparse.ArgumentParser()
      parser.register("type", "bool", lambda v: v.lower() == "true")
      parser.add_argument(
          "--model_dir",
          type=str,
          default="./model_dir",
          help="Base directory for output models."
      )
      parser.add_argument(
          "--model_type",
          type=str,
          default="lr",
          help="Valid model types: {'lr', 'boosted_tree'}."
      )
      parser.add_argument(
          "--train_epochs",
          type=int,
          default=2,
          help="Number of training epochs."
      )
      parser.add_argument(
          "--epochs_between_evals",
          type=int,
          default=2,
          help="Number of eval epochs."
      )
      parser.add_argument(
          "--batch_size",
          type=int,
          default=256,
          help="batch_size."
      )
      parser.add_argument(
          "--train_data",
          type=str,
          default="",
          help="Path to the training data."
      )
      parser.add_argument(
          "--test_data",
          type=str,
          default="",
          help="Path to the test data."
      )
      FLAGS, unparsed = parser.parse_known_args()
      tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
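
    Assuming the script is saved as train.py (a made-up name here), the model type is selected via the --model_type flag defined above, e.g.:

    python train.py --model_type=boosted_tree --model_dir=./model_dir --train_epochs=2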
    

    2. Open issues

    feature_column

    TensorFlow 1.12.0 is used here. In this version, tf.estimator.BoostedTreesClassifier only supports tf.feature_column.indicator_column and tf.feature_column.bucketized_column, which makes feature handling inflexible and hurts model quality.
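
    A small sketch of the constraint (the column names here are illustrative, not from the real schema):

    # Supported: categorical vocab columns wrapped in indicator_column,
    # numeric columns wrapped in bucketized_column with explicit boundaries.
    city = tf.feature_column.categorical_column_with_vocabulary_list(
        "city", ["beijing", "shanghai", "shenzhen"])
    city_onehot = tf.feature_column.indicator_column(city)

    ctr = tf.feature_column.numeric_column("house_month_ctr")
    ctr_bucketized = tf.feature_column.bucketized_column(
        ctr, boundaries=[i / 100.0 for i in range(1, 100)])

    # Not supported in 1.12: passing a raw numeric_column (or an embedding_column)
    # directly is rejected with an error, so continuous features have to be
    # discretized up front.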

    Training speed

    Once the dataset gets large, training becomes very slow.
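
    Part of the cost comes from how n_batches_per_layer scales with the data: the comment in build_estimator sizes it as 0.5 * NUM_EXAMPLES / BATCH_SIZE, so each tree layer reads roughly half the dataset through the input_fn. A rough back-of-the-envelope (the dataset size here is a made-up example):

    NUM_EXAMPLES = 5000000   # assumed dataset size, for illustration only
    BATCH_SIZE = 256
    n_batches_per_layer = int(0.5 * NUM_EXAMPLES / BATCH_SIZE)   # ~9765 batches per tree layer
    # With n_trees=100 and max_depth=6, the input pipeline is consumed on the
    # order of n_trees * max_depth * n_batches_per_layer batches in total,
    # so wall-clock time grows quickly as the data grows.
    print(n_batches_per_layer)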
