美文网首页
tvm在CPU上优化GEMM结果

tvm在CPU上优化GEMM结果

作者: peteyuan | 来源:发表于2018-10-26 19:19 被阅读112次

    本文展示如何只添加18行代码,在矩阵乘法上获得200+倍的加速(200+倍是官方教程给出的数字;按本文最后一组实测结果,Baseline 2.635383 对 Opt6 0.018374,加速约140倍)。

    通常,CPU上的计算密集型任务有2个优化点:

    • 提高内存访问的缓存命中率
    • SIMD指令加速

    对于gemm的优化手段已有现成的总结,基本都可以在《How to optimize GEMM》这篇文档中找到。

    tvm已经实现了其中的一些优化方法,但由于tvm本身的限制,还有一些方法没有实现。

    本文逐步优化,不断提升程序性能。首先用没有优化的code和numpy运行结果做对比,如下:

    以下是在我自己的容器里运行的结果:

    Numpy running time: 0.004862
    Baseline: 2.646903
    

    原始ir如下:

    produce C {
      for (x, 0, 1024) {
        for (y, 0, 1024) {
          C[((x*1024) + y)] = 0.000000f
          for (k, 0, 1024) {
            C[((x*1024) + y)] = (C[((x*1024) + y)] + (A[((x*1024) + k)]*B[(y + (k*1024))]))
          }
        }
      }
    }
    

    1 blocking

    使用blocking(分块)技术可以显著提升缓存命中率:计算被划分成32x32的小块逐块进行,一个块内用到的数据量较小,并且会被反复访问,因此更容易留在缓存中。

    结果如下:

    Numpy running time: 0.004732
    Baseline: 2.892019
    Opt1: 0.701635
    

    优化后ir

    produce C {
      for (x.outer, 0, 32) {
        for (y.outer, 0, 32) {
          for (x.inner.init, 0, 32) {
            for (y.inner.init, 0, 32) {
              C[(((((x.outer*1024) + y.outer) + (x.inner.init*32))*32) + y.inner.init)] = 0.000000f
            }
          }
          for (k.outer, 0, 256) {
            for (k.inner, 0, 4) {
              for (x.inner, 0, 32) {
                for (y.inner, 0, 32) {
                  C[(((((x.outer*1024) + y.outer) + (x.inner*32))*32) + y.inner)] = (C[(((((x.outer*1024) + y.outer) + (x.inner*32))*32) + y.inner)] + (A[(((((x.outer*8192) + k.outer)*4) + k.inner) + (x.inner*1024))]*B[((((y.outer + (k.outer*128)) + (k.inner*32))*32) + y.inner)]))
                }
              }
            }
          }
        }
      }
    }
    

    2 Vectorization

    向量化:最内层循环y.inner访问的内存是连续的,因此可以把这层循环整体替换为一次处理32个元素的向量操作(即下面ir中的ramp(..., 1, 32)),从而利用SIMD指令。
    结果:

    Numpy running time: 0.004964
    Baseline: 2.884543
    Opt1: 0.713341
    Opt2: 0.331218
    

    优化后ir

    produce C {
      for (x.outer, 0, 32) {
        for (y.outer, 0, 32) {
          for (x.inner.init, 0, 32) {
            C[ramp(((((x.outer*1024) + y.outer) + (x.inner.init*32))*32), 1, 32)] = x32(0.000000f)
          }
          for (k.outer, 0, 256) {
            for (k.inner, 0, 4) {
              for (x.inner, 0, 32) {
                C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] = (C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer)*4) + k.inner) + (x.inner*1024))])*B[ramp((((y.outer + (k.outer*128)) + (k.inner*32))*32), 1, 32)]))
              }
            }
          }
        }
      }
    }
    

    3 Loop Permutation

    交换k.inner与x.inner两层循环的顺序(x.inner移到外层,k.inner移到最内层),结果:

    Numpy running time: 0.005203
    Baseline: 2.646298
    Opt1: 0.691242
    Opt2: 0.330293
    Opt3: 0.147917
    

    优化后ir:

    produce C {
      for (x.outer, 0, 32) {
        for (y.outer, 0, 32) {
          for (x.inner.init, 0, 32) {
            C[ramp(((((x.outer*1024) + y.outer) + (x.inner.init*32))*32), 1, 32)] = x32(0.000000f)
          }
          for (k.outer, 0, 256) {
            for (x.inner, 0, 32) {
              for (k.inner, 0, 4) {
                C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] = (C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.inner*256))*4) + k.inner)])*B[ramp((((y.outer + (k.outer*128)) + (k.inner*32))*32), 1, 32)]))
              }
            }
          }
        }
      }
    }
    

    4 Array Packing

    先把B重新排布到packedB中,使计算时对B的访问按k方向变为连续访问,结果:

    Numpy running time: 0.005159
    Baseline: 2.884619
    Opt1: 0.693074
    Opt2: 0.332173
    Opt3: 0.149278
    Opt4: 0.233195
    

    这一步优化连续跑了两次,性能都不升反降:Opt4(0.233195)比Opt3(0.149278)更慢。

    优化后ir:

    
    // attr [packedB] storage_scope = "global"
    allocate packedB[float32x32 * 32 * 1024 * 1]
    produce packedB {
      parallel (x, 0, 32) {
        for (y, 0, 1024) {
          packedB[ramp((((x*1024) + y)*32), 1, 32)] = B[ramp(((x + (y*32))*32), 1, 32)]
        }
      }
    }
    produce C {
      for (x.outer, 0, 32) {
        for (y.outer, 0, 32) {
          for (x.inner.init, 0, 32) {
            C[ramp(((((x.outer*1024) + y.outer) + (x.inner.init*32))*32), 1, 32)] = x32(0.000000f)
          }
          for (k.outer, 0, 256) {
            for (x.inner, 0, 32) {
              for (k.inner, 0, 4) {
                C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] = (C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.inner*256))*4) + k.inner)])*packedB[ramp((((((y.outer*256) + k.outer)*4) + k.inner)*32), 1, 32)]))
              }
            }
          }
        }
      }
    }
    

    5 Write cache for blocks

    先把每个32x32块的结果累加到临时缓冲区C.global中,算完后再一次性写回C(同时k.inner循环被展开),结果:

    Numpy running time: 0.005358
    Baseline: 2.654734
    Opt1: 0.689408
    Opt2: 0.329072
    Opt3: 0.148742
    Opt4: 0.231431
    Opt5: 0.211086
    

    优化后ir:

    // attr [packedB] storage_scope = "global"
    allocate packedB[float32x32 * 32 * 1024 * 1]
    // attr [C.global] storage_scope = "global"
    allocate C.global[float32 * 32 * 32]
    produce packedB {
      parallel (x, 0, 32) {
        for (y, 0, 1024) {
          packedB[ramp((((x*1024) + y)*32), 1, 32)] = B[ramp(((x + (y*32))*32), 1, 32)]
        }
      }
    }
    produce C {
      for (x.outer, 0, 32) {
        for (y.outer, 0, 32) {
          produce C.global {
            for (x.c.init, 0, 32) {
              C.global[ramp((x.c.init*32), 1, 32)] = x32(0.000000f)
            }
            for (k.outer, 0, 256) {
              for (x.c, 0, 32) {
                C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[((((x.outer*8192) + k.outer) + (x.c*256))*4)])*packedB[ramp((((y.outer*256) + k.outer)*128), 1, 32)]))
                C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 1)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 32), 1, 32)]))
                C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 2)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 64), 1, 32)]))
                C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 3)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 96), 1, 32)]))
              }
            }
          }
          for (x.inner, 0, 32) {
            for (y.inner, 0, 32) {
              C[(((((x.outer*1024) + y.outer) + (x.inner*32))*32) + y.inner)] = C.global[((x.inner*32) + y.inner)]
            }
          }
        }
      }
    }
    

    6 Parallel

    把最外层的x.outer循环并行化(C.global的分配也随之移到并行循环内部),结果:

    Numpy running time: 0.005989
    Baseline: 2.635383
    Opt1: 0.691006
    Opt2: 0.328837
    Opt3: 0.149464
    Opt4: 0.233010
    Opt5: 0.213697
    Opt6: 0.018374
    

    优化后ir:

    // attr [packedB] storage_scope = "global"
    allocate packedB[float32x32 * 32 * 1024 * 1]
    produce packedB {
      parallel (x, 0, 32) {
        for (y, 0, 1024) {
          packedB[ramp((((x*1024) + y)*32), 1, 32)] = B[ramp(((x + (y*32))*32), 1, 32)]
        }
      }
    }
    produce C {
      parallel (x.outer, 0, 32) {
        // attr [C.global] storage_scope = "global"
        allocate C.global[float32 * 32 * 32]
        for (y.outer, 0, 32) {
          produce C.global {
            for (x.c.init, 0, 32) {
              C.global[ramp((x.c.init*32), 1, 32)] = x32(0.000000f)
            }
            for (k.outer, 0, 256) {
              for (x.c, 0, 32) {
                C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[((((x.outer*8192) + k.outer) + (x.c*256))*4)])*packedB[ramp((((y.outer*256) + k.outer)*128), 1, 32)]))
                C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 1)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 32), 1, 32)]))
                C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 2)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 64), 1, 32)]))
                C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 3)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 96), 1, 32)]))
              }
            }
          }
          for (x.inner, 0, 32) {
            for (y.inner, 0, 32) {
              C[(((((x.outer*1024) + y.outer) + (x.inner*32))*32) + y.inner)] = C.global[((x.inner*32) + y.inner)]
            }
          }
        }
      }
    }
    
    

    参考: https://docs.tvm.ai/tutorials/optimize/opt_gemm.html

    相关文章

      网友评论

          本文标题:tvm在CPU上优化GEMM结果

          本文链接:https://www.haomeiwen.com/subject/kypltqtx.html