// BaseMaker: a TreeUpdater subclass that holds the shared state (training parameters,
// expansion queue, per-instance positions) and the helper routines listed below.
// NOTE(review): this listing reads as an abridged outline rather than compilable code --
// most bodies are elided, some declarations lack types or terminators, and the braces
// near the end are unbalanced. Treat the comments as reading notes.
class BaseMaker: public TreeUpdater {
TrainParam param; //training parameters
vector<int> qexpand; //queue of nodes to be expanded
vector<int> node2workindex; //map active node to its working index offset in qexpand, can be -1, which means the node is not actively expanding
vector<int> position; //position of each instance in the tree, can be negative, which means this position is no longer expanding, see also Decode/EncodePosition
struct FMetaHelper { //collects per-feature metadata
vector<float> fminmax; //size is 2*num_feature; the original note says even slots hold a feature's max value -- NOTE(review): confirm the slot layout against the implementation
int Type(int fid); //returns the data type of the given feature: 0 = empty, 1 = binary, 2 = real
float MaxValue(int fid); //returns the maximum value of the given feature
SampleCol(float p, vector<int> *p_findex); //given probability p, randomly samples p*num_feature of all features and stores them in p_findex
}
//helper for row-based data
int NextLevel(RowBatch &inst, RegTree &tree, int nid); //nid is the id of a node in the tree, inst is one sample; returns the left or right child according to the split info on that node
int get_nthread(); //returns the number of threads
//gpair holds the first- and second-order gradients, fmat is the data
void InitData(vector<gpair> &gpair,DMatrix &fmat,RegTree &tree) {
//setup position; position has the same size as gpair
//fmat.info.root_index is the current index of each sample and is used to initialize position
//mark delete for the deleted datas
//NOTE(review): the loop over i that encloses the next line is elided in this outline
if (gpair[i].hess < 0.0f) position[i] = ~position[i]; //if the second-order gradient is negative, flip position to its negative (~) encoding
//mark subsample: if the data is to be subsampled, sample by a Bernoulli distribution; rows not sampled get a negative position
// expand query: put the tree nodes waiting to be processed into the queue; they can be processed in parallel
for (int i = 0; i < tree.param.num_roots; ++i) {
qexpand.push_back(i);
}
this->UpdateNode2WorkIndex(tree); //refresh the mapping from tree node to queue index
}
//update queue expand add in new leaves: walk the tree nodes in the queue and, for each one that is not a leaf, add its left and right children to the queue
void UpdateQueueExpand(RegTree &tree);
//return decoded position; ridx is the index of the data row
int DecodePosition(int ridx) {
pid = position[ridx]; //position of the node in the tree
return pid < 0 ? ~pid : pid; //a negative pid means this node is no longer expanding; ~pid recovers the node id
}
//encode the encoded position value for ridx; nid is a positive integer, and if the current position is negative it is stored as ~nid
void SetEncodePosition(int ridx, int nid);
//this is helper function uses column based data structure to reset the positions, i.e. place the data in the tree according to the tree structure
//nodes: the set of nodes that contains the split to be used
//p_fmat: feature matrix needed for tree construction
void ResetPositionCol(vector<int> &nodes,DMatrix *p_fmat,RegTree &tree) {
SetNonDefaultPositionCol(nodes, p_fmat, tree);
SetDefaultPostion(p_fmat, tree);
}
//helper function to set the non-leaf positions to default direction: data that cannot be routed to a leaf (e.g. missing values) is sent in the default direction
void SetDefaultPostion(DMatrix *p_fmat, RegTree &tree);
//helper function uses column based data structure to CORRECT the positions of non-default directions that WAS set to default before calling this function.
//batch: The column batch
//sorted_split_set: The set of index that contains split solutions.
void CorrectNonDefaultPositionByBatch(ColBatch& batch,vector<int> &sorted_split_set,RegTree &tree);
//helper function uses column based data structure: collects the set of split feature ids from the given nodes
//nodes: the set of nodes that contains the split to be used
//out_split_set: The split index set
void GetSplitSet(vector<int> &nodes,RegTree &tree, vector<unsigned>* out_split_set);
//helper function uses column based data structure,update all positions into nondefault branch, if any, ignore the default branch
//NOTE(review): the body and closing brace of this function are elided in this outline
void SetNonDefaultPositionCol(vector<int> &nodes,DMatrix *p_fmat,RegTree &tree) {
//helper function to get statistics from a tree
void GetNodeStats(vector<bst_gpair> &gpair, DMatrix &fmat, RegTree &tree, vector<vector<TStats>> *p_thread_temp, vector<TStats> *p_node_stats)
//common helper data structure to build sketch
struct SketchEntry {
//total sum of amount to be met
double sum_total;
//statistics used in the sketch
double rmin, wmin;
//last seen feature value
float last_fvalue;
//current size of sketch
double next_goal;
//pointer to the sketch to put things in
WXQuantileSketch<float, float> *sketch;
// initialize the space
void Init(max_size);
//push a new element to sketch
//fvalue: feature value, comes in sorted ascending order
//w: weight
void Push(float fvalue, float w, unsigned max_size);
//update node2workindex
void UpdateNode2WorkIndex(RegTree &tree) {
}
}
网友评论