美文网首页
OnnxRuntime部署SwinTransformer

OnnxRuntime部署SwinTransformer

作者: RunningJiang | 来源:发表于2021-12-16 14:52 被阅读0次

一、概述

实测SwinTransformer真的是涨点神器,刷榜秘籍,用SwinTransformer作为模型主干网络来微调下游任务对比ResNet50保守能够带来2~5个点的提升,当然模型参数量是大了点。测试了下基于OnnxRuntime cpu模式和gpu(非TensorRT)模式下的速度。对于大部分图片识别类任务,这个速度也是可以接受的。

模式 硬件 输入 平均速度
cpu Intel(R) Xeon(R) W-2102 CPU @ 2.90GHz 224*224 360ms
gpu Nvidia Tesla T4 224*224 10ms

二、环境

三、模型转onnx

去掉训练时候的分类头,只提取 timm 版本的 SwinTransformer 主干特征。
转出来的模型可以在此处下载
链接:https://pan.baidu.com/s/1oKUrPxPtYUFGVXJ2SiBP3g
提取码:czfj

import timm
import torch.nn as nn
import torch
class ft_net_swin_extract(nn.Module):
    """Feature extractor built on timm's ``swin_base_patch4_window7_224``.

    The classification head is replaced with an empty ``nn.Sequential`` so
    that ``forward`` returns only the backbone features produced by
    ``forward_features``.

    Args:
        class_num: kept for checkpoint/caller compatibility; unused here.
        droprate:  kept for caller compatibility; unused here.
        stride:    kept for caller compatibility; unused here.
        circle:    accepted for caller compatibility — the export script
                   calls ``ft_net_swin_extract(class_num=751, circle=True)``,
                   which raised TypeError with the original signature; the
                   flag is unused here.
    """

    def __init__(self, class_num, droprate=0.5, stride=2, circle=False):
        super(ft_net_swin_extract, self).__init__()
        model_ft = timm.create_model('swin_base_patch4_window7_224', pretrained=True)
        # avg pooling to global pooling
        #model_ft.avgpool = nn.AdaptiveAvgPool2d((1,1))
        # Drop the classification head; only backbone features are needed.
        model_ft.head = nn.Sequential()  # save memory
        self.model = model_ft

    def forward(self, x):
        # Return the backbone features (no classification head applied).
        x = self.model.forward_features(x)
        return x
from model import ft_net_swin_extract
import numpy as np
from torchvision import models,transforms,datasets
import cv2
import onnx
import onnxruntime


# Export script: preprocess one image, run the PyTorch model once, then
# export it to ONNX and sanity-check the exported graph.
data_transforms = transforms.Compose([
    transforms.ToTensor(),  # HWC uint8 -> CHW float in [0, 1]
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])  # ImageNet stats
image_file = "000_000_gan0002_c3s1_136308_04.jpg"
input = cv2.imread(image_file)  # NOTE: cv2 loads BGR; no BGR->RGB swap is performed
img_h, img_w, _ = input.shape
resize_input = cv2.resize(input, (224, 224))
image = data_transforms(resize_input)
image = image.unsqueeze(0)  # add batch dim -> (1, 3, 224, 224)
print(image.shape)

# BUG FIX: the original passed circle=True, which ft_net_swin_extract's
# __init__ does not accept -> TypeError.
model = ft_net_swin_extract(class_num=751)
model_path = "/home/nemo/DeepLearning/Person_reID_baseline_pytorch/MarketOut/best.pth"
# map_location='cpu' so the export also works on CPU-only machines;
# strict=False because the training checkpoint contains classifier-head
# weights that this feature-extraction model (head removed) no longer has.
model.load_state_dict(torch.load(model_path, map_location='cpu'), strict=False)
# BUG FIX: the original did `model.classifier.classifier = nn.Sequential()`,
# but this model has no `classifier` attribute (the head was already
# replaced in __init__), so that line raised AttributeError.
model.eval()

torch_out = model(image)
# Export the model
torch.onnx.export(model,                     # model being run
                  image,                         # model input (or a tuple for multiple inputs)
                  "swin-transform.onnx",     # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=12,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names = ['input'],   # the model's input names
                  output_names = ['output'], # the model's output names
                  )

# Sanity-check that the exported graph is well-formed.
onnx_model = onnx.load("swin-transform.onnx")
onnx.checker.check_model(onnx_model)

四、编写onnxruntime 推理代码
代码地址
https://gitee.com/running_jiang/swintransformer-onnxruntime.git
https://github.com/runningJ/swintransformer-onnxruntime.git
欢迎star,拒绝白嫖。

cpu版本

#include <iostream>
#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn.hpp>
#include <onnxruntime_cxx_api.h>
#include <algorithm>
#include <numeric>
#include <chrono>

using namespace std;
using namespace cv;
using namespace Ort;


/// Product of all elements of v — e.g. the element count of a tensor
/// given its shape vector.
///
/// BUG FIX: the original seeded std::accumulate with the int literal 1,
/// which made the whole accumulation int-typed and truncated products of
/// int64_t shapes. Seeding with T{1} keeps the arithmetic in T.
template <typename T>
T vectorProduct(const std::vector<T>& v)
{
    return std::accumulate(v.begin(), v.end(), T{1}, std::multiplies<T>());
}

int main(int argc,char**argv)
{
    if (argc != 3)
    {
        cerr<<"usage "<< argv[0] <<" image_path model_path"<<endl;
        return 0;
    }
    cv::Mat image = imread(argv[1]);
    if(image.empty())
    {
        cerr <<"input image has problem "<< argv[1]<<endl;
        return 0;
    }

    string model_path = argv[2];

    Env env;
    SessionOptions options{nullptr};
    Session session(env, model_path.c_str(),options);

    size_t numInputNodes = session.GetInputCount();
    size_t numOutputNodes = session.GetOutputCount();

    std::cout << "Number of Input Nodes: " << numInputNodes << std::endl;
    std::cout << "Number of Output Nodes: " << numOutputNodes << std::endl;

    AllocatorWithDefaultOptions allocator;
    const char* inputName = session.GetInputName(0, allocator);
    std::cout << "Input Name: " << inputName << std::endl;
    TypeInfo inputTypeInfo = session.GetInputTypeInfo(0);
    auto inputTensorInfo = inputTypeInfo.GetTensorTypeAndShapeInfo();
    ONNXTensorElementDataType inputType = inputTensorInfo.GetElementType();
    std::vector<int64_t> inputDims = inputTensorInfo.GetShape();
    std::cout << "Input Dimensions: ";
    for(int i = 0; i < inputDims.size(); ++i)
    {
        cout<< inputDims[i]<<" ";
    }
    cout <<endl;
    cout <<"-----------------------------------------"<<endl;
    const char* outputName = session.GetOutputName(0, allocator);
    cout << "Output Name: " << outputName << std::endl;
    TypeInfo outputTypeInfo = session.GetOutputTypeInfo(0);
    auto outputTensorInfo = outputTypeInfo.GetTensorTypeAndShapeInfo();
    ONNXTensorElementDataType outputType = outputTensorInfo.GetElementType();
    std::vector<int64_t> outputDims = outputTensorInfo.GetShape();
    std::cout << "Output Dimensions: ";
    for(int i = 0; i < outputDims.size(); ++i)
    {
        cout<< outputDims[i]<<" ";
    }
    cout <<endl;

    //data preprocess
    cv::Mat resizedImageBGR, resizedImageRGB, resizedImage, preprocessedImage;
    cv::resize(image, resizedImageBGR,cv::Size(inputDims.at(2), inputDims.at(3)));
    resizedImageRGB = resizedImageBGR;
    //cv::cvtColor(resizedImageBGR, resizedImageRGB,cv::ColorConversionCodes::COLOR_BGR2RGB);
    resizedImageRGB.convertTo(resizedImage, CV_32F, 1.0 / 255);
    cv::Mat channels[3];
    cv::split(resizedImage, channels);
    channels[0] = (channels[0] - 0.485) / 0.229;
    channels[1] = (channels[1] - 0.456) / 0.224;
    channels[2] = (channels[2] - 0.406) / 0.225;
    cv::merge(channels, 3, resizedImage);
    cv::dnn::blobFromImage(resizedImage, preprocessedImage);

    size_t inputTensorSize = vectorProduct(inputDims);
    std::vector<float> inputTensorValues(inputTensorSize);
    inputTensorValues.assign(preprocessedImage.begin<float>(),
                             preprocessedImage.end<float>());

    size_t outputTensorSize = vectorProduct(outputDims);
    std::vector<float> outputTensorValues(outputTensorSize);

    std::vector<const char*> inputNames{inputName};
    std::vector<const char*> outputNames{outputName};
    
    std::vector<Value> inputTensors;
    std::vector<Value> outputTensors;

    MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(
        OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);

    inputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, inputTensorValues.data(), inputTensorSize, inputDims.data(),
        inputDims.size()));

    outputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, outputTensorValues.data(), outputTensorSize,
        outputDims.data(), outputDims.size()));
    for(int i = 0; i < 100; ++i)
    {
        auto s_t=std::chrono::steady_clock::now();
        session.Run(Ort::RunOptions{nullptr}, inputNames.data(),
                    inputTensors.data(), 1, outputNames.data(),
                    outputTensors.data(), 1);
        auto e_t=std::chrono::steady_clock::now();
        double dr_s=std::chrono::duration<double,std::milli>(e_t-s_t).count();
        cout <<"runing inference cost time "<< dr_s <<"ms"<<endl;
    }

     for(int j = 0; j < 10; ++j)
     {
         cout << outputTensorValues.at(j)<<endl;
    }
    return 0;
}

cuda 版本

#include <iostream>
#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn.hpp>
#include <onnxruntime_cxx_api.h>
#include <algorithm>
#include <numeric>
#include <chrono>

using namespace std;
using namespace cv;
using namespace Ort;

/// Product of all elements of v — e.g. the element count of a tensor
/// given its shape vector.
///
/// BUG FIX: the original seeded std::accumulate with the int literal 1,
/// which made the whole accumulation int-typed and truncated products of
/// int64_t shapes. Seeding with T{1} keeps the arithmetic in T.
template <typename T>
T vectorProduct(const std::vector<T>& v)
{
    return std::accumulate(v.begin(), v.end(), T{1}, std::multiplies<T>());
}

/// Same pipeline as the CPU version but registers the CUDA execution
/// provider (device 0): load model, preprocess one image, run 100 timed
/// inferences, print the first 10 output values.
int main(int argc, char** argv)
{
    if (argc != 3)
    {
        cerr << "usage " << argv[0] << " image_path model_path" << endl;
        return 1;  // was 0: signal failure to the shell
    }
    cv::Mat image = imread(argv[1]);
    if (image.empty())
    {
        cerr << "input image has problem " << argv[1] << endl;
        return 1;  // was 0: signal failure to the shell
    }
    string model_path = argv[2];

    Env env(ORT_LOGGING_LEVEL_WARNING, "Default");
    Session session{nullptr};  // deferred init: built after options are ready
    SessionOptions session_options;
    // Register the CUDA execution provider on device 0. The input/output
    // buffers below still live in CPU memory; onnxruntime handles copies.
    OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);
    session = Ort::Session(env, model_path.c_str(), session_options);

    size_t numInputNodes = session.GetInputCount();
    size_t numOutputNodes = session.GetOutputCount();

    std::cout << "Number of Input Nodes: " << numInputNodes << std::endl;
    std::cout << "Number of Output Nodes: " << numOutputNodes << std::endl;

    AllocatorWithDefaultOptions allocator;
    const char* inputName = session.GetInputName(0, allocator);
    std::cout << "Input Name: " << inputName << std::endl;
    TypeInfo inputTypeInfo = session.GetInputTypeInfo(0);
    auto inputTensorInfo = inputTypeInfo.GetTensorTypeAndShapeInfo();
    ONNXTensorElementDataType inputType = inputTensorInfo.GetElementType();
    std::vector<int64_t> inputDims = inputTensorInfo.GetShape();
    std::cout << "Input Dimensions: ";
    for (size_t i = 0; i < inputDims.size(); ++i)  // size_t: no signed/unsigned mix
    {
        cout << inputDims[i] << " ";
    }
    cout << endl;

    cout << "-----------------------------------------" << endl;
    const char* outputName = session.GetOutputName(0, allocator);
    cout << "Output Name: " << outputName << std::endl;
    TypeInfo outputTypeInfo = session.GetOutputTypeInfo(0);
    auto outputTensorInfo = outputTypeInfo.GetTensorTypeAndShapeInfo();
    ONNXTensorElementDataType outputType = outputTensorInfo.GetElementType();
    std::vector<int64_t> outputDims = outputTensorInfo.GetShape();
    std::cout << "Output Dimensions: ";
    for (size_t i = 0; i < outputDims.size(); ++i)
    {
        cout << outputDims[i] << " ";
    }
    cout << endl;

    // A dynamic dimension is reported as -1; pin it to 1 so the element
    // counts and the shapes handed to CreateTensor below are valid.
    for (auto& d : inputDims)  { if (d < 0) d = 1; }
    for (auto& d : outputDims) { if (d < 0) d = 1; }

    // Data preprocess: resize to the model input size, scale to [0,1],
    // then apply per-channel ImageNet mean/std normalization.
    cv::Mat resizedImageBGR, resizedImageRGB, resizedImage, preprocessedImage;
    // BUG FIX: cv::Size takes (width, height) while the NCHW dims are
    // (N, C, H, W) -> width = dims[3], height = dims[2]. The original
    // swapped them (harmless only for square inputs like 224x224).
    cv::resize(image, resizedImageBGR, cv::Size(inputDims.at(3), inputDims.at(2)));
    resizedImageRGB = resizedImageBGR;
    // NOTE(review): BGR->RGB conversion is deliberately disabled, matching
    // the Python export pipeline which also feeds BGR data — confirm.
    //cv::cvtColor(resizedImageBGR, resizedImageRGB,cv::ColorConversionCodes::COLOR_BGR2RGB);
    resizedImageRGB.convertTo(resizedImage, CV_32F, 1.0 / 255);
    cv::Mat channels[3];
    cv::split(resizedImage, channels);
    channels[0] = (channels[0] - 0.485) / 0.229;
    channels[1] = (channels[1] - 0.456) / 0.224;
    channels[2] = (channels[2] - 0.406) / 0.225;
    cv::merge(channels, 3, resizedImage);
    // HWC -> NCHW blob, contiguous float data for the input tensor.
    cv::dnn::blobFromImage(resizedImage, preprocessedImage);

    size_t inputTensorSize = vectorProduct(inputDims);
    std::vector<float> inputTensorValues(inputTensorSize);
    inputTensorValues.assign(preprocessedImage.begin<float>(),
                             preprocessedImage.end<float>());
    size_t outputTensorSize = vectorProduct(outputDims);
    std::vector<float> outputTensorValues(outputTensorSize);

    std::vector<const char*> inputNames{inputName};
    std::vector<const char*> outputNames{outputName};

    std::vector<Value> inputTensors;
    std::vector<Value> outputTensors;

    // Tensors wrap the CPU vectors above; no copy is made here.
    MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(
        OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);

    inputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, inputTensorValues.data(), inputTensorSize, inputDims.data(),
        inputDims.size()));

    outputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, outputTensorValues.data(), outputTensorSize,
        outputDims.data(), outputDims.size()));

    // 100 timed runs; the first iteration typically includes CUDA warm-up.
    for (int i = 0; i < 100; ++i)
    {
        auto s_t = std::chrono::steady_clock::now();
        session.Run(Ort::RunOptions{nullptr}, inputNames.data(),
                    inputTensors.data(), 1, outputNames.data(),
                    outputTensors.data(), 1);
        auto e_t = std::chrono::steady_clock::now();
        double dr_s = std::chrono::duration<double, std::milli>(e_t - s_t).count();
        cout << "runing inference cost time " << dr_s << "ms" << endl;
    }

    // Print the first few output values as a sanity check.
    for (int j = 0; j < 10; ++j)
    {
        cout << outputTensorValues.at(j) << endl;
    }
    return 0;
}

相关文章

网友评论

      本文标题:OnnxRuntime部署SwinTransformer

      本文链接:https://www.haomeiwen.com/subject/pxfvfrtx.html