一、前言
本文主要介绍如何基于ESP32的开发板通过microTVM进行一个卷积算子的调优。
二、microTVM
microTVM是TVM 编译器的扩展,它能够使TVM应用于微控制器,提供了在设备上运行 TVM RPC 服务以完成自动调优的方法,同时也提供了一套最小化 C 语言的runtime,使得裸机边缘设备可以独立完成模型推理。
-
基于TVM RPC服务
需要host端与设备端同时参与,由host端与设备端通过串口或USB等进行连接通信,host端将交叉编译完的固件程序烧录到设备端,该固件程序包括了TVM编译完成的模型设备端代码,TVM C runtime,设备的初始化操作以及TVM RPC server。而host端负责GraphExecutor实例的创建,它会通过串口或USB等物理连接发送RPC命令到设备端进行模型推理。
基于TVM RPC服务 -
独立运行
只需要设备端参与,与基于RPC服务的区别是GraphExecutor实例是由设备自己独立完成。
独立运行
三、在ESP32上运行microTVM进行autotune
1、Zephyr安装与配置
1.1 配置zephyr sdk
Zephyr sdk的release地址在:https://github.com/zephyrproject-rtos/sdk-ng
需要下载0.14.1版本(撰写本文时的最新版本),该版本提供了esp相关的toolchain;如果改用west espressif进行安装,会遇到newlibc的编译问题。
cd ~/
wget https://github.com/zephyrproject-rtos/sdk-ng/releases/download/v0.14.1/zephyr-sdk-0.14.1_linux-x86_64.tar.gz
tar -xvf zephyr-sdk-0.14.1_linux-x86_64.tar.gz
cd zephyr-sdk-0.14.1
./setup.sh
. environment-setup-x86_64-pokysdk-linux
我们需要的toolchain目录为 xtensa-espressif_esp32_zephyr-elf
1.2 安装依赖
这个按照Zephyr官方文档进行:
sudo apt install --no-install-recommends git cmake \
ninja-build gperf ccache dfu-util device-tree-compiler wget \
python3-dev python3-pip python3-setuptools python3-tk \
python3-wheel xz-utils file make gcc gcc-multilib \
g++-multilib libsdl2-dev
pip3 install --user -U west
echo 'export PATH=~/.local/bin:"$PATH"' >> ~/.bashrc
source ~/.bashrc
zephyr对cmake版本有要求,如果需要升级,可以执行:
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | sudo apt-key add -
sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ focal main'
sudo apt update
sudo apt install cmake
1.3 初始化zephyr工程
west init ~/zephyrproject
cd ~/zephyrproject
west update
west zephyr-export
1.4 ESP32 newlibc支持
cd ~/zephyrproject/zephyr
git remote add upstream https://github.com/sylvioalves/zephyr.git
git fetch upstream
git checkout upstream/feature/newlibc_cpp_support
west update
虽然这个feature还没有合入主分支,但如果不加这个支持,编译时不出意外会遇到如下错误:
In file included from ~/zephyrproject/zephyr/lib/posix/pthread_common.c:10:
~/zephyrproject/zephyr/include/posix/time.h:90:15: error: static declaration of 'clock_gettime' follows non-static declaration
__syscall int clock_gettime(clockid_t clock_id, struct timespec *ts);
^~~~~~~~~~~~~
In file included from ~/zephyrproject/zephyr/include/posix/time.h:12,
from ~/zephyrproject/zephyr/lib/posix/pthread_common.c:10:
/home/zgs/.espressif/tools/zephyr/xtensa-esp32-elf/xtensa-esp32-elf/sys-include/time.h:187:5: note: previous declaration of 'clock_gettime' was here
int clock_gettime (clockid_t clock_id, struct timespec *tp);
^~~~~~~~~~~~~
In file included from ~/zephyrproject/zephyr/lib/posix/pthread_common.c:10:
~/zephyrproject/zephyr/include/posix/time.h:94:5: error: conflicting types for 'timer_create'
int timer_create(clockid_t clockId, struct sigevent *evp, timer_t *timerid);
....
make[2]: *** [zephyr/lib/posix/CMakeFiles/lib__posix.dir/build.make:76: zephyr/lib/posix/CMakeFiles/lib__posix.dir/pthread_common.c.obj] Error 1
make[1]: *** [CMakeFiles/Makefile2:2950: zephyr/lib/posix/CMakeFiles/lib__posix.dir/all] Error 2
make[1]: *** Waiting for unfinished jobs....
1.5 导出环境变量
export ZEPHYR_BASE="${HOME}/zephyrproject/zephyr"
export ZEPHYR_TOOLCHAIN_VARIANT="espressif"
export ESPRESSIF_TOOLCHAIN_PATH="${HOME}/zephyr-sdk-0.14.1/xtensa-espressif_esp32_zephyr-elf"
export PATH=$PATH:$ESPRESSIF_TOOLCHAIN_PATH/bin
1.6 修改toolchain名称
下载的zephyr-sdk 0.14.1中的toolchain名称跟esp32所使用的默认名称不一致,需要修改编译脚本的默认值:
diff --git a/cmake/toolchain/espressif/target.cmake b/cmake/toolchain/espressif/target.cmake
index 5245bf9d08..f677bc9024 100644
--- a/cmake/toolchain/espressif/target.cmake
+++ b/cmake/toolchain/espressif/target.cmake
@@ -8,7 +8,7 @@ set(COMPILER gcc)
set(LINKER ld)
set(BINTOOLS gnu)
-set(CROSS_COMPILE_TARGET_xtensa_esp32 xtensa-esp32-elf)
+set(CROSS_COMPILE_TARGET_xtensa_esp32 xtensa-espressif_esp32_zephyr-elf)
set(CROSS_COMPILE_TARGET_xtensa_esp32s2 xtensa-esp32s2-elf)
set(CROSS_COMPILE_TARGET_riscv_esp32c3 riscv32-esp-elf)
2、TVM配置
2.1 使能 microTVM 编译
修改 config.cmake
set(USE_MICRO ON)
重新编译tvm。
2.2 增加esp32支持
在 ~/github/tvm/apps/microtvm/zephyr/template_project/boards.json 增加:
diff --git a/apps/microtvm/zephyr/template_project/boards.json b/apps/microtvm/zephyr/template_project/boards.json
index aae764a82..19a80397a 100644
--- a/apps/microtvm/zephyr/template_project/boards.json
+++ b/apps/microtvm/zephyr/template_project/boards.json
@@ -95,5 +95,13 @@
"fpu": true,
"vid_hex": "0483",
"pid_hex": "374b"
+ },
+ "esp32": {
+ "board": "esp32",
+ "model": "esp32",
+ "is_qemu": false,
+ "fpu": true,
+ "vid_hex": "",
+ "pid_hex": ""
}
}
2.3 增加esp32 flash runner串口获取方式
在~/github/tvm/apps/microtvm/zephyr/template_project/microtvm_api_server.py增加:
diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
index 059e76048..7e7b6e888 100644
--- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py
+++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py
@@ -669,6 +669,10 @@ class ZephyrSerialTransport:
def _find_stm32cubeprogrammer_serial_port(cls, options):
return generic_find_serial_port()
+ @classmethod
+ def _find_esp32_serial_port(cls, options):
+ return generic_find_serial_port()
+
@classmethod
def _find_serial_port(cls, options):
flash_runner = _get_flash_runner()
@@ -685,6 +689,9 @@ class ZephyrSerialTransport:
if flash_runner == "stm32cubeprogrammer":
return cls._find_stm32cubeprogrammer_serial_port(options)
+ if flash_runner == "esp32":
+ return cls._find_esp32_serial_port(options)
+
raise RuntimeError(f"Don't know how to deduce serial port for flash runner {flash_runner}")
def __init__(self, options):
2.4 修改host_driven内存分配及头文件依赖
主要修改两个地方:
- zephyr从2.6.0开始power/reboot.h改成了sys/reboot.h
- 缩小tvm_heap分配的内存,否则最终的编译会出现region `dram0_1_seg' overflowed by xxxxx bytes的错误。
diff --git a/apps/microtvm/zephyr/template_project/src/host_driven/main.c b/apps/microtvm/zephyr/template_project/src/host_driven/main.c
index 44d656028..463f7e0d1 100644
--- a/apps/microtvm/zephyr/template_project/src/host_driven/main.c
+++ b/apps/microtvm/zephyr/template_project/src/host_driven/main.c
@@ -33,7 +33,7 @@
#include <drivers/uart.h>
#include <fatal.h>
#include <kernel.h>
-#include <power/reboot.h>
+#include <sys/reboot.h>
#include <random/rand32.h>
#include <stdio.h>
#include <sys/printk.h>
@@ -42,6 +42,7 @@
#include <tvm/runtime/crt/microtvm_rpc_server.h>
#include <unistd.h>
#include <zephyr.h>
+#include <string.h>
#ifdef CONFIG_ARCH_POSIX
#include "posix_board_if.h"
@@ -130,7 +131,7 @@ tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
}
// Heap for use by TVMPlatformMemoryAllocate.
-K_HEAP_DEFINE(tvm_heap, 216 * 1024);
+K_HEAP_DEFINE(tvm_heap, 50 * 1024);
// Called by TVM to allocate memory.
tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
按这样改完后,编译最终得到的内存区使用情况如下,可以看到dram0_1_seg的使用率已经到了96.22%:
内存区使用情况
3、算子调优
3.1 按microtvm_autotune的例程编写应用
import os
import json
import numpy as np
import pathlib
import shutil
import tvm
from tvm.relay.backend import Runtime
# Board name used for the build; the TVM_MICRO_BOARD environment variable
# overrides the "esp32" default.
BOARD = os.environ.get("TVM_MICRO_BOARD", "esp32")
def create_module():
    """Build a Relay module holding a single conv2d plus random weights.

    Returns:
        A ``(relay_mod, params)`` pair: the IRModule created from the conv2d
        function after the ``InferType`` pass, and a dict mapping ``"weight"``
        to a random float32 array of shape (6, 3, 5, 5).
    """
    input_shape = (1, 3, 10, 10)
    kernel_shape = (6, 3, 5, 5)
    relay = tvm.relay

    # Symbolic float32 inputs of the convolution.
    data = relay.var("data", relay.TensorType(input_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(kernel_shape, "float32"))

    # The Relay convolution operator under test.
    conv = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )

    # Wrap the expression in a Function, build a module from it and run
    # type inference over that module.
    func = relay.Function([data, weight], conv)
    inferred_mod = relay.transform.InferType()(tvm.IRModule.from_expr(func))

    # Random values stand in for trained weights.
    random_weight = np.random.rand(*kernel_shape).astype("float32")
    return inferred_mod, {"weight": random_weight}
def config_target():
    """Return the CRT runtime and the micro target for the selected board.

    The target model string is read from the ``BOARD`` entry of the Zephyr
    template project's ``boards.json``.

    Returns:
        A ``(runtime, target)`` pair.
    """
    crt_runtime = Runtime("crt", {"system-lib": True})

    template_dir = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr"))
    with open(template_dir / "boards.json") as boards_fp:
        board_table = json.load(boards_fp)

    model_name = board_table[BOARD]["model"]
    micro_target = tvm.target.target.micro(model_name)
    return crt_runtime, micro_target
# Script flow:
#   1. build the Relay module and pick the runtime/target,
#   2. extract AutoTVM tasks and tune them on the board,
#   3. build, flash and run the untuned program,
#   4. build, flash and run the program compiled with the tuning log.
relay_mod, params = create_module()
runtime, target = config_target()

# Pass configuration shared by task extraction and both builds;
# vectorization is disabled through "tir.disable_vectorize".
pass_context = tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True})
with pass_context:
    tasks = tvm.autotvm.task.extract_from_program(relay_mod["main"], {}, target)
assert len(tasks) > 0

# Path.home() avoids the TypeError that `os.getenv("HOME") + ...` raises when
# HOME is unset. An exported, non-empty ZEPHYR_BASE takes precedence; otherwise
# the default checkout location ~/zephyrproject/zephyr is used.
home_dir = pathlib.Path.home()
zephyr_base = os.getenv("ZEPHYR_BASE") or str(home_dir / "zephyrproject" / "zephyr")

# Options handed to the Zephyr template project for every generated project.
project_options = {
    "zephyr_board": BOARD,
    "west_cmd": "west",
    "verbose": False,
    "project_type": "host_driven",
    "zephyr_base": zephyr_base,
}


def _build_flash_and_run(lowered_module, project_dir, banner):
    """Generate a Zephyr project for a built module, flash it and run it once.

    Args:
        lowered_module: Result of ``tvm.relay.build`` to deploy.
        project_dir: Directory for the generated project; any existing
            directory at this path is removed first.
        banner: Line printed right before the module is run.
    """
    if os.path.exists(project_dir):
        shutil.rmtree(project_dir)
    project = tvm.micro.generate_project(
        str(tvm.micro.get_microtvm_template_projects("zephyr")),
        lowered_module,
        project_dir,
        dict(project_options),  # pass a copy so the shared dict stays untouched
    )
    project.build()
    project.flash()
    with tvm.micro.Session(project.transport()) as session:
        debug_module = tvm.micro.create_local_debug_executor(
            lowered_module.get_graph_json(), session.get_system_lib(), session.device
        )
        debug_module.set_input(**lowered_module.get_params())
        print(banner)
        debug_module.run()
        # Drop the executor reference while the session is still open.
        del debug_module


module_loader = tvm.micro.AutoTvmModuleLoader(
    template_project_dir=pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")),
    project_options=dict(project_options),
)
builder = tvm.autotvm.LocalBuilder(
    n_parallel=1,
    build_kwargs={"build_option": {"tir.disable_vectorize": True}},
    do_fork=False,
    build_func=tvm.micro.autotvm_build_func,
    runtime=runtime,
)
runner = tvm.autotvm.LocalRunner(number=1, repeat=1, timeout=100, module_loader=module_loader)
measure_option = tvm.autotvm.measure_option(builder=builder, runner=runner)

# ----------------run tune-----------------
autotune_log_file = pathlib.Path("microtvm_autotune.log.txt")
# Start from a clean log so results of earlier runs are not mixed in.
if autotune_log_file.exists():
    autotune_log_file.unlink()

num_trials = 10
for task in tasks:
    tuner = tvm.autotvm.tuner.GATuner(task)
    tuner.tune(
        n_trial=num_trials,
        measure_option=measure_option,
        callbacks=[
            tvm.autotvm.callback.log_to_file(str(autotune_log_file)),
            tvm.autotvm.callback.progress_bar(num_trials, si_prefix="M"),
        ],
        si_prefix="M",
    )

# ------------------timing untuned program-----------------
with pass_context:
    lowered = tvm.relay.build(relay_mod, target=target, runtime=runtime, params=params)
_build_flash_and_run(
    lowered,
    str(home_dir / "microtvm_esp32" / "untuned"),
    "########## Build without Autotuning ##########",
)

# ------------------timing tuned program-----------------
# Only the build runs under apply_history_best, which is given the tuning log.
with tvm.autotvm.apply_history_best(str(autotune_log_file)):
    with pass_context:
        lowered_tuned = tvm.relay.build(relay_mod, target=target, runtime=runtime, params=params)
_build_flash_and_run(
    lowered_tuned,
    str(home_dir / "microtvm_esp32" / "tuned"),
    "########## Build with Autotuning ##########",
)
3.2 执行autotune
autotune过程中的打印,可以看到autotune的一个完整流程是每次都会重新编译然后flash到设备,再运行推理:
autotune过程
autotune完成后的打印:
autotune完成后
同时会得到调优参数结果文件microtvm_autotune.log.txt:
{"input": ["c -keys=cpu -link-params=0 -model=esp32", "conv2d_NCHWc.x86", [["TENSOR", [1, 3, 10, 10], "float32"], ["TENSOR", [6, 3, 5, 5], "float32"], [1, 1], [2, 2, 2, 2], [1, 1], "NCHW", "NCHW", "float32"], {}], "config": {"index": 47, "code_hash": null, "entity": [["tile_ic", "sp", [-1, 3]], ["tile_oc", "sp", [-1, 6]], ["tile_ow", "sp", [-1, 10]], ["unroll_kw", "ot", true]]}, "result": [[0.002898416], 0, 40.33066153526306, 1650420190.1888804], "version": 0.2, "tvm_version": "0.9.dev0"}
...
{"input": ["c -keys=cpu -link-params=0 -model=esp32", "conv2d_NCHWc.x86", [["TENSOR", [1, 3, 10, 10], "float32"], ["TENSOR", [6, 3, 5, 5], "float32"], [1, 1], [2, 2, 2, 2], [1, 1], "NCHW", "NCHW", "float32"], {}], "config": {"index": 55, "code_hash": null, "entity": [["tile_ic", "sp", [-1, 3]], ["tile_oc", "sp", [-1, 6]], ["tile_ow", "sp", [-1, 1]], ["unroll_kw", "ot", false]]}, "result": [[0.003652516], 0, 43.680758237838745, 1650420592.7613897], "version": 0.2, "tvm_version": "0.9.dev0"}
3.3 结果对比
-
未调优结果
未调优结果 -
调优结果
调优结果
四、总结
本文对microTVM进行了简单的介绍,并通过一个实例详细说明了如何在ESP32开发板上通过microTVM调优卷积算子。
网友评论