Using nsys, the CUDA Performance Profiling Tool

Installing the nsys command-line tool

This tool is normally installed together with the CUDA Toolkit. After installation, you can check the profiling environment with:

@└────> # nsys status -e
Timestamp counter supported: Yes

CPU Profiling Environment Check
Root privilege: disabled
Linux Kernel Paranoid Level = 4
Linux Distribution = Ubuntu
Linux Kernel Version = 5.15.0-105-generic: OK
Linux perf_event_open syscall available: Fail
Sampling trigger event available: Fail
Intel(c) Last Branch Record support: Not Available
CPU Profiling Environment (process-tree): Fail
CPU Profiling Environment (system-wide): Fail

See the product documentation at https://docs.nvidia.com/nsight-systems for more information,
including information on how to set the Linux Kernel Paranoid Level.
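
The CPU-sampling failures above come from the paranoid level of 4. If CPU sampling is needed, the kernel.perf_event_paranoid sysctl can usually be lowered; the commands below are a minimal sketch (the target value of 2 is an assumption based on the Nsight Systems documentation, and root privileges are required):

# check the current value
cat /proc/sys/kernel/perf_event_paranoid
# lower it for the current boot; add the setting to /etc/sysctl.conf to make it persistent
sudo sysctl -w kernel.perf_event_paranoid=2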

Once nsys is working, you can use it to inspect kernel performance.
As an example, consider the following CUDA code:

#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>
#include <sys/time.h>

#define THREAD_PER_BLOCK 256

// baseline
__global__ void reduce0(float* d_in, float* d_out) {
__shared__ float sdata[THREAD_PER_BLOCK];

// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = d_in[i];
__syncthreads();

// do reduction in shared mem
for (unsigned int s = 1; s < blockDim.x; s *= 2) {
if (tid % (2 * s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}

// write result for this block to global mem
if (tid == 0) {
d_out[blockIdx.x] = sdata[0];
}
}

// bank conflict
__global__ void reduce1(float* d_in, float* d_out) {
__shared__ float sdata[THREAD_PER_BLOCK];

// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = d_in[i];
__syncthreads();

// do reduction in shared mem
for (unsigned int s = 1; s < blockDim.x; s *= 2) {
int index = 2 * s * tid;
if (index < blockDim.x) {
sdata[index] += sdata[index + s];
}
__syncthreads();
}

// write result for this block to global mem
if (tid == 0) {
d_out[blockIdx.x] = sdata[0];
}
}

bool check(float* out, float* res, int n) {
for (int i = 0; i < n; i++) {
if (out[i] != res[i]) {
return false;
}
}
return true;
}

int main() {
const int N = 32 * 1024 * 1024;
float* a = (float*)malloc(N * sizeof(float));
float* d_a;
cudaMalloc((void**)&d_a, N * sizeof(float));

int block_num = N / THREAD_PER_BLOCK;
float* out = (float*)malloc((N / THREAD_PER_BLOCK) * sizeof(float));
float* d_out;
cudaMalloc((void**)&d_out, (N / THREAD_PER_BLOCK) * sizeof(float));
float* res = (float*)malloc((N / THREAD_PER_BLOCK) * sizeof(float));

for (int i = 0; i < N; i++) {
a[i] = 1;
}

for (int i = 0; i < block_num; i++) {
float cur = 0;
for (int j = 0; j < THREAD_PER_BLOCK; j++) {
cur += a[i * THREAD_PER_BLOCK + j];
}
res[i] = cur;
}

cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);

dim3 Grid(N / THREAD_PER_BLOCK, 1);
dim3 Block(THREAD_PER_BLOCK, 1);

reduce0<<<Grid, Block>>>(d_a, d_out);
cudaMemcpy(out, d_out, block_num * sizeof(float), cudaMemcpyDeviceToHost);
if (check(out, res, block_num)) {
printf("the ans is right\n");
} else {
printf("the ans is wrong\n");
for (int i = 0; i < block_num; i++) {
printf("%lf ", out[i]);
}
printf("\n");
}

reduce1<<<Grid, Block>>>(d_a, d_out);
cudaMemcpy(out, d_out, block_num * sizeof(float), cudaMemcpyDeviceToHost);
if (check(out, res, block_num)) {
printf("the ans is right\n");
} else {
printf("the ans is wrong\n");
for (int i = 0; i < block_num; i++) {
printf("%lf ", out[i]);
}
printf("\n");
}

cudaFree(d_a);
cudaFree(d_out);
}

The code above is a reduce_sum example with two kernels of different performance, reduce0 and reduce1. We can use the command-line tool to compare the two kernels.
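
For reference, the a.out binary profiled below can be built with nvcc; this is a minimal sketch that assumes the source file is named reduce.cu:

nvcc -O2 reduce.cu -o a.out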

# A transitional option that reuses the old nvprof-style invocation.
@└────> # nsys nvprof ./a.out

# The invocation supported going forward.
@└────> # nsys profile --stats=true ./a.out
...
[4/8] Executing 'osrtsum' stats report

Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------- ------------- --------- ------------- ------------- ----------------------
49.8 6,560,704,940 41 160,017,193.7 100,127,248.0 2,946 3,656,490,088 561,147,066.6 poll
40.5 5,331,726,596 1,591 3,351,179.5 34,054.0 1,200 405,442,288 30,016,785.5 ioctl
4.7 614,593,088 60 10,243,218.1 10,295,123.0 1,532,148 14,425,405 2,880,533.4 waitpid
2.6 341,072,143 60 5,684,535.7 5,470,557.0 576,832 10,285,481 1,599,344.7 fork
2.4 312,589,312 113 2,766,277.1 14,265.0 4,724 267,144,344 25,109,089.4 open64
0.0 5,684,320 144 39,474.4 11,704.5 1,108 3,826,795 317,952.4 fopen
0.0 2,034,643 38 53,543.2 10,234.5 3,945 1,236,239 198,417.6 mmap64
0.0 607,662 10 60,766.2 56,248.0 42,036 112,182 19,745.1 sem_timedwait
0.0 405,925 123 3,300.2 2,289.0 1,006 77,642 7,014.0 fclose
0.0 391,343 4 97,835.8 80,675.0 58,047 171,946 53,486.5 pthread_create
0.0 154,726 19 8,143.5 4,985.0 1,004 47,713 10,904.9 mmap
0.0 82,936 1 82,936.0 82,936.0 82,936 82,936 0.0 pthread_cond_wait
0.0 78,328 8 9,791.0 4,877.5 2,074 40,677 12,827.6 munmap
0.0 71,358 7 10,194.0 9,879.0 3,968 14,551 3,520.2 open
0.0 51,719 3 17,239.7 13,793.0 3,372 34,554 15,874.2 fread
0.0 42,644 29 1,470.5 1,295.0 1,000 5,372 801.5 fcntl
0.0 41,205 1 41,205.0 41,205.0 41,205 41,205 0.0 fgets
0.0 36,755 15 2,450.3 2,072.0 1,083 7,000 1,497.0 read
0.0 33,701 12 2,808.4 2,360.0 1,384 6,014 1,224.1 write
0.0 30,013 3 10,004.3 11,976.0 5,640 12,397 3,785.5 pipe2
0.0 26,410 2 13,205.0 13,205.0 10,057 16,353 4,451.9 socket
0.0 12,546 2 6,273.0 6,273.0 5,777 6,769 701.4 fwrite
0.0 10,654 2 5,327.0 5,327.0 4,160 6,494 1,650.4 pthread_cond_broadcast
0.0 10,412 1 10,412.0 10,412.0 10,412 10,412 0.0 pthread_mutex_trylock
0.0 9,158 1 9,158.0 9,158.0 9,158 9,158 0.0 connect
0.0 5,650 1 5,650.0 5,650.0 5,650 5,650 0.0 bind
0.0 3,085 1 3,085.0 3,085.0 3,085 3,085 0.0 listen

[5/8] Executing 'cudaapisum' stats report

Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------- ------------- ----------- ------------- ------------- ----------------------
77.1 1,961,601,754 2 980,800,877.0 980,800,877.0 674,626,715 1,286,975,039 432,995,652.3 cudaMalloc
22.2 564,147,432 2 282,073,716.0 282,073,716.0 275,230,367 288,917,065 9,677,957.0 cudaFree
0.7 17,105,648 3 5,701,882.7 657,298.0 434,348 16,014,002 8,931,253.0 cudaMemcpy
0.0 415,480 2 207,740.0 207,740.0 42,497 372,983 233,688.9 cudaLaunchKernel
0.0 1,368 1 1,368.0 1,368.0 1,368 1,368 0.0 cuModuleGetLoadingMode

[6/8] Executing 'gpukernsum' stats report

Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) GridXYZ BlockXYZ Name
-------- --------------- --------- --------- --------- -------- -------- ----------- ---------------- -------------- -------------------------
62.5 556,575 1 556,575.0 556,575.0 556,575 556,575 0.0 131072 1 1 256 1 1 reduce0(float *, float *)
37.5 334,111 1 334,111.0 334,111.0 334,111 334,111 0.0 131072 1 1 256 1 1 reduce1(float *, float *)

[7/8] Executing 'gpumemtimesum' stats report

Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation
-------- --------------- ----- ------------ ------------ ---------- ---------- ----------- ------------------
99.7 15,820,690 1 15,820,690.0 15,820,690.0 15,820,690 15,820,690 0.0 [CUDA memcpy HtoD]
0.3 46,176 2 23,088.0 23,088.0 23,040 23,136 67.9 [CUDA memcpy DtoH]

[8/8] Executing 'gpumemsizesum' stats report

Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation
---------- ----- -------- -------- -------- -------- ----------- ------------------
134.218 1 134.218 134.218 134.218 134.218 0.000 [CUDA memcpy HtoD]
1.049 2 0.524 0.524 0.524 0.524 0.000 [CUDA memcpy DtoH]

As the output shows, report [4/8] is osrtsum (OS Runtime Summary), which focuses on OS-level performance data. If, however, you specify the trace options explicitly on the command line:

@└────> # nsys profile --stats=true --trace=cuda,nvtx,cudnn,cublas ./a.out
...
[4/7] Executing 'cudaapisum' stats report

Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------- ------------- -------- ----------- ------------- ----------------------
94.6 282,480,873 2 141,240,436.5 141,240,436.5 269,662 282,211,211 199,362,781.2 cudaMalloc
5.2 15,504,039 3 5,168,013.0 826,866.0 541,742 14,135,431 7,767,320.2 cudaMemcpy
0.1 392,989 2 196,494.5 196,494.5 96,115 296,874 141,958.1 cudaFree
0.1 318,607 2 159,303.5 159,303.5 28,009 290,598 185,678.5 cudaLaunchKernel
0.0 1,184 1 1,184.0 1,184.0 1,184 1,184 0.0 cuModuleGetLoadingMode

then, because the explicit --trace list no longer includes osrt, the OS runtime summary disappears and the first summary report, now [4/7], is cudaapisum (CUDA API Summary), which focuses on CUDA API-level timing.
Beyond that, the gpukernsum report shows each kernel's execution time: reduce1 outperforms reduce0, and the report also lists each kernel's instance count together with its minimum, maximum, and average execution times.
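
As a rough sanity check on those numbers (an estimate, not something nsys reports directly): each kernel reads the full 134.218 MB input once, so dividing that size by the kernel time gives an effective memory throughput of roughly 134,217,728 B / 556,575 ns ≈ 241 GB/s for reduce0 and 134,217,728 B / 334,111 ns ≈ 402 GB/s for reduce1, consistent with reduce1's indexing scheme avoiding the divergent branches of reduce0.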

If the operator is bound via pybind and invoked from Python, you can use torch's APIs to profile only that operator.

import torch
# warmup
for i in range(10):
    _ = torch.matmul(x, y)  # x and y are matrices; their setup is omitted here

# start profiling
torch.cuda.cudart().cudaProfilerStart()

### benchmarking
for i in range(100):  # run 100 iterations
    torch.cuda.nvtx.range_push("your_ops_name")
    _ = your_ops(x, y)
    torch.cuda.nvtx.range_pop()
torch.cuda.synchronize()

# stop profiling
torch.cuda.cudart().cudaProfilerStop()
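
Since this script only marks the region between cudaProfilerStart and cudaProfilerStop, nsys has to be told to restrict capture to that range. A minimal sketch of the invocation (your_script.py is a placeholder, and flag behavior may vary slightly across nsys versions):

nsys profile --stats=true --trace=cuda,nvtx --capture-range=cudaProfilerApi python your_script.py

The NVTX ranges pushed around your_ops then appear as named ranges on the timeline, which makes the operator easy to locate.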

Installing the Nsight Systems GUI

After the command-line analysis finishes, a report file ending in .nsys-rep is generated. You can download this file and drop it into the Nsight Systems GUI, where you can see more detailed data as well as the program's execution timeline.
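
The same report can also be re-summarized from the command line, or opened in the GUI directly from a terminal. A minimal sketch (report1.nsys-rep is a placeholder file name, and nsys-ui requires a local desktop session):

nsys stats report1.nsys-rep
nsys-ui report1.nsys-rep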
(Figure 1: timeline view) The timeline shows how long functions run on the CUDA hardware and on the CPU side. The lower rows show when each kernel is launched, while the device-side rows above show when the kernels actually execute, matching the times reported by gpukernsum.
(Figure 2: statistics view) The GUI also exposes more detailed statistics, consistent with the command-line results.