美文网首页
CUDA yolov5后处理

CUDA yolov5后处理

作者: leon_tly | 来源:发表于2023-12-28 10:00 被阅读0次

    yolov5模型输出结构

    batch_size * box_num * (5 + num_classes)


    image.png

    表示batch_size为1, 框的数量为25200,类别为33

    cuda 后处理

    1. 计算IoU
    // Intersection-over-union of two axis-aligned boxes, each given as
    // (left, top, right, bottom). Returns 0 when the boxes do not overlap
    // (intersection area is exactly zero), which also avoids dividing by a
    // zero union for degenerate inputs.
    __device__ float box_iou(
            float aleft, float atop, float aright, float abottom, 
            float bleft, float btop, float bright, float bbottom
        )
    {
        // Extent of the overlapping region, clamped at zero on each axis.
        const float overlap_w = max(min(aright, bright) - max(aleft, bleft), 0.0f);
        const float overlap_h = max(min(abottom, bbottom) - max(atop, btop), 0.0f);
        const float overlap   = overlap_w * overlap_h;
        if (overlap == 0.0f) {
            return 0.0f;
        }

        // Individual box areas; negative extents are clamped to zero.
        const float area_a = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
        const float area_b = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);

        // union = A + B - intersection
        return overlap / (area_a + area_b - overlap);
    }
    
    1. 还原原图坐标
      还原到原图坐标时,需要用到预处理做letter-box时warpaffine所用仿射矩阵的逆矩阵(即下文的invert_affine_matrix)
    // Applies a 2x3 affine transform, stored row-major in matrix[0..5], to the
    // point (x, y) and writes the transformed point to *ox / *oy.
    // x and y are taken by value, so callers may pass ox/oy that point at the
    // same variables they passed as x/y.
    __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy){
        const float* row_x = matrix;      // coefficients producing the output x
        *ox = row_x[0] * x + row_x[1] * y + row_x[2];

        const float* row_y = matrix + 3;  // coefficients producing the output y
        *oy = row_y[0] * x + row_y[1] * y + row_y[2];
    }
    
    1. 解码yolo输出
      predict 为模型输出的指针
      num_bboxes 模型框的数量
      num_classes 模型识别类别数量
      threshold 各个分类的阈值
      min_threshold 最小阈值
      invert_affine_matrix 还原原图坐标矩阵
      parray 存储识别结果数组,其中第一个元素为识别结果的数量,其余每7个元素保存一个识别结果,依次为left,top,right,bottom,confidence,class_id,keep。其中keep表示最后是否保留该结果,在nms时使用
      max_objects 最大保存的结果数量。
    // Decodes raw YOLOv5 predictions, one candidate box per thread.
    //
    // Launch layout: 1D grid of 1D blocks; the thread with global index i
    // handles prediction i and exits if i >= num_bboxes.
    //
    // predict              num_bboxes records of (5 + num_classes) floats:
    //                      cx, cy, width, height, objectness, class scores...
    // threshold            per-class confidence thresholds, indexed by label
    // min_threshold        candidates whose objectness is below this are dropped
    // invert_affine_matrix 6 floats (2x3 affine) applied to the box corners
    // parray               output: parray[0] is a counter incremented with
    //                      atomicAdd, followed by 7-float records
    //                      (left, top, right, bottom, confidence, label, keep)
    // max_objects          maximum number of records written; the counter is
    //                      incremented before this check, so it may exceed
    //                      max_objects and readers have to clamp it
    __global__ void decode_kernel(
        float* predict, 
        int num_bboxes,
        int num_classes,
        float* threshold,
        float  min_threshold,
        float* invert_affine_matrix,
        float* parray,
        int max_objects)
    {
        const int box_id = blockDim.x * blockIdx.x + threadIdx.x;
        if (box_id >= num_bboxes) {
            return;
        }

        float* record = predict + (5 + num_classes) * box_id;
        const float objectness = record[4];
        if (objectness < min_threshold) {
            return;
        }

        // Arg-max over the class scores; the first maximum wins on ties.
        float* scores    = record + 5;
        float confidence = scores[0];
        int   label      = 0;
        for (int c = 1; c < num_classes; ++c) {
            const float score = scores[c];
            if (score > confidence) {
                confidence = score;
                label      = c;
            }
        }

        // Final confidence is class score times objectness.
        confidence *= objectness;
        if (confidence < threshold[label]) {
            return;
        }

        // Reserve an output slot. The counter is a float, so atomicAdd returns
        // the previous count as a float, which is converted to an index.
        const int slot = static_cast<int>(atomicAdd(parray, 1.0f));
        if (slot >= max_objects) {
            return;
        }

        // Center/size -> corner representation.
        const float cx     = record[0];
        const float cy     = record[1];
        const float width  = record[2];
        const float height = record[3];
        float left   = cx - width * 0.5f;
        float top    = cy - height * 0.5f;
        float right  = cx + width * 0.5f;
        float bottom = cy + height * 0.5f;

        // Transform both corners with the supplied affine matrix.
        affine_project(invert_affine_matrix, left,  top,    &left,  &top);
        affine_project(invert_affine_matrix, right, bottom, &right, &bottom);

        float* out = parray + 1 + slot * 7;
        out[0] = left;
        out[1] = top;
        out[2] = right;
        out[3] = bottom;
        out[4] = confidence;
        out[5] = static_cast<float>(label);
        out[6] = 1.0f;  // keep flag: 1 = keep, may be cleared to 0 later
    }
    
    1. fast nms
    // Parallel non-maximum suppression, one decoded box per thread.
    //
    // Launch layout: 1D grid of 1D blocks; the thread with global index i
    // handles box i and exits if i >= min(bboxes[0], max_objects).
    //
    // bboxes        bboxes[0] holds the box count, followed by 7-float records
    //               (left, top, right, bottom, confidence, label, keep)
    // max_objects   upper bound applied to the stored count
    // nms_threshold a box is suppressed when its IoU with a stronger box of
    //               the same label is strictly greater than this value
    //
    // Each thread writes only the keep flag of its own box.
    __global__ void fast_nms_kernel(
        float* bboxes, 
        int max_objects, 
        float nms_threshold)
    {
        const int self_id = (blockDim.x * blockIdx.x + threadIdx.x);
        const int total   = min((int)*bboxes, max_objects);
        if (self_id >= total) {
            return;
        }

        float* self = bboxes + 1 + self_id * 7;
        const float self_left   = self[0];
        const float self_top    = self[1];
        const float self_right  = self[2];
        const float self_bottom = self[3];
        const float self_conf   = self[4];
        const float self_label  = self[5];

        for (int other_id = 0; other_id < total; ++other_id)
        {
            // Never compare a box with itself.
            if (other_id == self_id) {
                continue;
            }

            const float* other = bboxes + 1 + other_id * 7;

            // Boxes of different classes do not suppress each other.
            if (self_label != other[5]) {
                continue;
            }

            // Only a box whose confidence is at least as high can suppress this one.
            const float other_conf = other[4];
            if (!(other_conf >= self_conf)) {
                continue;
            }

            // Tie-break on equal confidence: the lower index wins, so an
            // earlier box cannot suppress a later one with the same score.
            if (other_conf == self_conf && other_id < self_id) {
                continue;
            }

            const float iou = box_iou(
                self_left, self_top, self_right, self_bottom,
                other[0],  other[1], other[2],   other[3]
            );
            if (iou > nms_threshold) {
                self[6] = 0;  // 1=keep, 0=ignore
                return;
            }
        }
    }
    
    1. decode函数
    // Runs YOLOv5 post-processing on the GPU: uploads the raw predictions,
    // decodes candidate boxes (decode_kernel), suppresses overlapping boxes
    // (fast_nms_kernel), and returns the boxes whose keep flag is still set.
    //
    // preidct              host pointer to num_bboxes * (num_classes + 5) floats
    // invert_affine_matrix host pointer to 6 floats (2x3 affine matrix)
    // confidence_threshold host pointer to num_classes per-class thresholds
    // nms_threshold        IoU threshold forwarded to fast_nms_kernel
    // num_classes          number of class scores per prediction
    // num_bboxes           number of predictions
    // num_box_element      kept for interface compatibility; the kernels
    //                      hard-code 7 floats per decoded box, so the host side
    //                      uses that same record size (see kBoxElements)
    // max_objects          maximum number of decoded boxes kept
    //
    // Returns an empty array when num_bboxes, num_classes or max_objects is
    // not positive.
    object::BoxArray CudaDecode::decode(
        float* preidct, 
        float* invert_affine_matrix,
        const float* confidence_threshold,
        float nms_threshold,
        int num_classes,
        int num_bboxes,
        int num_box_element,
        int max_objects)
    {
        // Record layout written by decode_kernel and read by fast_nms_kernel:
        // left, top, right, bottom, confidence, label, keep. Both kernels use a
        // fixed stride of 7, so allocating or parsing with any other stride
        // would overflow the buffer or mis-read the results.
        constexpr int kBoxElements = 7;
        (void)num_box_element;

        const float min_confidence_threshold = 0.01f;
        object::BoxArray result;

        // Nothing to decode; this also prevents a zero block size (and the
        // resulting division by zero) in the launch configuration below.
        if (num_bboxes <= 0 || num_classes <= 0 || max_objects <= 0)
            return result;

        // Compute byte counts in size_t so large inputs do not overflow int.
        const size_t predict_bytes =
            static_cast<size_t>(num_bboxes) * static_cast<size_t>(num_classes + 5) * sizeof(float);
        const size_t output_bytes =
            sizeof(float) + static_cast<size_t>(max_objects) * kBoxElements * sizeof(float);

        cudaStream_t stream = nullptr;
        checkRuntime(cudaStreamCreate(&stream));

        float* device_confidence_threshold = nullptr;
        checkRuntime(cudaMalloc(&device_confidence_threshold, sizeof(float) * num_classes));

        float* device_invert_affine_matrix = nullptr;
        checkRuntime(cudaMalloc(&device_invert_affine_matrix, sizeof(float) * 6));

        float* predict_device = nullptr;
        float* output_device = nullptr;
        float* output_host = nullptr;
        checkRuntime(cudaMalloc(&predict_device, predict_bytes));
        checkRuntime(cudaMalloc(&output_device, output_bytes));
        // In practice the allocation was observed to return the same memory
        // block on successive calls, so it is zeroed after allocation to clear
        // the previous results; otherwise the box counter keeps accumulating.
        checkRuntime(cudaMemsetAsync(output_device, 0, output_bytes, stream));
        checkRuntime(cudaMallocHost(&output_host, output_bytes));
        checkRuntime(cudaMemcpyAsync(predict_device, preidct, predict_bytes, cudaMemcpyHostToDevice, stream));
        checkRuntime(cudaMemcpyAsync(device_invert_affine_matrix, invert_affine_matrix, 6 * sizeof(float), cudaMemcpyHostToDevice, stream));
        checkRuntime(cudaMemcpyAsync(device_confidence_threshold, confidence_threshold, num_classes * sizeof(float), cudaMemcpyHostToDevice, stream));

        // One thread per prediction.
        int block = num_bboxes > 256 ? 256 : num_bboxes;
        int grid = (num_bboxes + block - 1) / block;
        decode_kernel<<<grid, block, 0, stream>>>(
            predict_device, num_bboxes, num_classes, 
            device_confidence_threshold, min_confidence_threshold,
            device_invert_affine_matrix, output_device, max_objects);
        // A kernel launch returns no status; fetch launch-configuration errors explicitly.
        checkRuntime(cudaGetLastError());

        // One thread per potential output box. Work submitted to the same
        // stream executes in order, so no host synchronization is needed
        // between the two kernels or before the copy back.
        block = max_objects > 256 ? 256 : max_objects;
        grid = (max_objects + block - 1) / block;
        fast_nms_kernel<<<grid, block, 0, stream>>>(output_device, max_objects, nms_threshold);
        checkRuntime(cudaGetLastError());

        checkRuntime(cudaMemcpyAsync(output_host, output_device, 
            output_bytes, 
            cudaMemcpyDeviceToHost, stream
        ));
        // The host reads output_host next, so wait for the stream to drain.
        checkRuntime(cudaStreamSynchronize(stream));

        // The device counter can exceed max_objects; clamp before reading records.
        int num_boxes = std::min((int)output_host[0], max_objects);
        for(int i = 0; i < num_boxes; ++i)
        {
            float* ptr = output_host + 1 + kBoxElements * i;
            int keep_flag = (int)ptr[6];
            if(keep_flag != 0)
            {
                // left, top, right, bottom, confidence, label
                result.emplace_back(
                    ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], (int)ptr[5]
                );
            }
        }
        checkRuntime(cudaStreamDestroy(stream));
        checkRuntime(cudaFree(device_confidence_threshold));
        checkRuntime(cudaFree(device_invert_affine_matrix));
        checkRuntime(cudaFree(predict_device));
        checkRuntime(cudaFree(output_device));
        checkRuntime(cudaFreeHost(output_host));
        return result;
    }
    

    相关文章

      网友评论

          本文标题:CUDA yolov5后处理

          本文链接:https://www.haomeiwen.com/subject/asmrndtx.html