美文网首页
updater_basemaker-inl.h

updater_basemaker-inl.h

作者: 世间五彩我执纯白 | 来源:发表于2016-07-28 17:54 被阅读0次
    class BaseMaker: public TreeUpdater {
        TrainParam param; //训练参数
        vector<int> qexpand; //queue of nodes to be expanded
        vector<int> node2workindex; //map active node to its working index offset in qexpand, can be -1, which means the node is node actively expanding
        vector<int> position; //position of each instance in the tree, can be negative, which means this position is no longer expanding, see also Decode/EncodePosition
    
        struct FMetaHelper { //采集feature的元数据
          vector<float> fminmax; //大小为2*num_feature,偶数位存某个feature最大值
          int Type(int fid); //返回指定feature的数据类型,0为空,1为binary,2为real
          float MaxValue(int fid); //返回指定feature的最大值
          SampleCol(float p, vector<int> *p_findex); //根据概率p,从所有的feature中随机sample出p*num_feature个,存在p_findex中
        }
    
        //helper for row-based data
        int NextLevel(RowBatch &inst, RegTree &tree, int nid); //nid是树中Node的编号,inst是一条sample,根据Node上的split信息返回左树还是右树
        int get_nthread(); //返回线程数
    
        //gpair是一阶和二阶导数,fmat是数据
        void InitData(vector<gpair> &gpair,DMatrix &fmat,RegTree &tree) {
          //setup position,position的大小等于gpair的size
          //fmat.info.root_index是每个sample的当前index,用它来初始化position
          //mark delete for the deleted datas
          if (gpair[i].hess < 0.0f) position[i] = ~position[i]; //如果二阶导数为负,position设为负
          //mark subsample,如果要对数据进行sample,根据伯努利分布采样,没采样到的position设为负
          // expand query,将树中待处理节点放入队列,可以并行处理
          for (int i = 0; i < tree.param.num_roots; ++i) {
            qexpand.push_back(i);
          }
          this->UpdateNode2WorkIndex(tree); //更新Tree node到queue index的映射
        }
        
        //update queue expand add in new leaves,遍历queue中对应的tree node,如果不是leaf,则将左树和右树加入queue
        void UpdateQueueExpand(RegTree &tree);
    
        //return decoded position,rindex是数据的index
        int DecodePosition(int ridx) {
          pid = position[ridx]; //tree中node的位置
          return pid < 0 ? ~pid : pid;  //如果pid为负,表示此node不再expand
        }
        //encode the encoded position value for ridx,nid是正整数,如果当前position是负,则设为~nid
        void SetEncodePosition(int ridx, int nid);
    
        //this is helper function uses column based data structure to reset the positions,根据树结构,把新的数据放在树中
        //nodes: the set of nodes that contains the split to be used
        //p_fmat: feature matrix needed for tree construction
        void ResetPositionCol(vector<int> &nodes,DMatrix *p_fmat,RegTree &tree) {
          SetNonDefaultPositionCol(nodes, p_fmat, tree);
          SetDefaultPostion(p_fmat, tree);
        }
        //helper function to set the non-leaf positions to default direction. 把不能分到leaf上的数据(比如missing value)分配到default的方向
        void SetDefaultPostion(DMatrix *p_fmat, RegTree &tree);
    
        //helper function uses column based data structure to CORRECT the positions of non-default directions that WAS set to default before calling this function.
        //batch: The column batch
        //sorted_split_set: The set of index that contains split solutions.
        void CorrectNonDefaultPositionByBatch(ColBatch& batch,vector<int> &sorted_split_set,RegTree &tree);
        
        //helper function uses column based data structure,从给定的nodes中获取split的feature id的集合
        //nodes: the set of nodes that contains the split to be used
        //out_split_set: The split index set
        void GetSplitSet(vector<int> &nodes,RegTree &tree, vector<unsigned>* out_split_set);
    
        //helper function uses column based data structure,update all positions into nondefault branch, if any, ignore the default branch
        void SetNonDefaultPositionCol(vector<int> &nodes,DMatrix *p_fmat,RegTree &tree) {
    
        //helper function to get statistics from a tree
        void GetNodeStats(vector<bst_gpair> &gpair, DMatrix &fmat, RegTree &tree, vector<vector<TStats>> *p_thread_temp, vector<TStats> *p_node_stats)
    
        //common helper data structure to build sketch,构建sketch
        struct SketchEntry {
          //total sum of amount to be met
          double sum_total;
          //statistics used in the sketch
          double rmin, wmin;
          //last seen feature value
          float last_fvalue;
          //current size of sketch
          double next_goal;
          //pointer to the sketch to put things in
          WXQuantileSketch<float, float> *sketch;
          
          // initialize the space
          void Init(max_size);
    
          //push a new element to sketch
          //fvalue: feature value, comes in sorted ascending order
          //w: weight
          void Push(float fvalue, float w, unsigned max_size);
          //更新node2workerindex
          void UpdateNode2WorkIndex(RegTree &tree) {
        }
    }
    
    
    

    相关文章

      网友评论

          本文标题:updater_basemaker-inl.h

          本文链接:https://www.haomeiwen.com/subject/xtsajttx.html