Infinite repetitive thinking for this case:
#52
by
zhaocc1106
- opened
case:
// Transposes a float matrix using a diagonally remapped block order.
//
// Each thread copies one element:
//   y[global_row * col + global_col] = x[global_col * row + global_row]
//
// Block remapping: the hardware block index (blockIdx.x, blockIdx.y) is not
// used directly as a tile coordinate. Instead
//   block_y = blockIdx.x
//   block_x = (blockIdx.x + blockIdx.y) % gridDim.x
// so that, for a fixed blockIdx.x, successive blockIdx.y values walk through
// the tile columns starting from the diagonal tile and wrapping around.
// NOTE(review): this looks like the "diagonal block reordering" from NVIDIA's
// matrix-transpose optimization paper (meant to avoid partition camping) --
// confirm against the original author's intent.
//
// Preconditions:
//   * Only valid for square matrices (row == col), as stated by the original
//     author; the index formula above mixes `row` and `col` as strides.
//   * Expects a 2D grid and 2D block. Because both tile coordinates are
//     derived modulo gridDim.x, every tile is visited exactly once only when
//     gridDim.y == gridDim.x.
//   * The grid must cover the matrix: gridDim.x * blockDim.x >= col and
//     gridDim.x * blockDim.y >= row. Out-of-range threads are masked by the
//     bounds check below.
//
// Memory access pattern (from the index math): threads adjacent in
// threadIdx.x write adjacent elements of `y`, but read `x` with a stride of
// `row` elements.
//
// Parameters:
//   x    input matrix (device pointer), read only
//   y    output matrix (device pointer), written
//   row  number of rows
//   col  number of columns
__global__ void mat_transpose_f32_diagonal2d_kernel(
    float *x, float *y, int row, int col) {
  // Remap the launch-order block index onto a diagonal tile coordinate.
  const int block_y = blockIdx.x;
  const int block_x = (blockIdx.x + blockIdx.y) % gridDim.x;

  const int global_col = threadIdx.x + blockDim.x * block_x;
  const int global_row = threadIdx.y + blockDim.y * block_y;

  if (global_col < col && global_row < row) {
    // Widen before multiplying so the flat index cannot overflow a 32-bit
    // int when row * col exceeds INT_MAX.
    const size_t dst = static_cast<size_t>(global_row) * col + global_col;
    const size_t src = static_cast<size_t>(global_col) * row + global_row;
    y[dst] = x[src];
  }
}
讲解一下这个核函数