Commit
Fix bug where a launch failure occurs when grid.z is larger than 65535.
liuliu committed Jan 18, 2025
1 parent 3fec0ed commit f3a29ea
Showing 2 changed files with 117 additions and 18 deletions.
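
Background for the fix: CUDA caps gridDim.y and gridDim.z at 65535 (only gridDim.x may be larger), so any launch whose z extent exceeds that limit fails at launch time. The diff below splits the z extent into chunks of at most 0xffff and issues one launch per chunk, passing the chunk's starting plane into the kernel. The following is a minimal sketch of that pattern only, not the library code; the kernel scale_slice and the helper launch_chunked are illustrative names that do not exist in ccv.

#include <cuda_runtime.h>

__global__ void scale_slice(const int z_start, const int dim_z, const int dim_y, const int dim_x, float* const data)
{
	// Recover the logical z plane by adding the chunk offset, as the patched kernel does.
	const int z = blockIdx.z * blockDim.z + threadIdx.z + z_start;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	if (z >= dim_z || y >= dim_y || x >= dim_x)
		return;
	data[((size_t)z * dim_y + y) * dim_x + x] *= 2.f;
}

static void launch_chunked(float* const data, const int dim_z, const int dim_y, const int dim_x, cudaStream_t stream)
{
	const dim3 threads(64, 8, 1);
	// Ceiling division: one launch per chunk of at most 0xffff (65535) z planes.
	for (int i = 0; i < (dim_z + 0xfffe) / 0xffff; i++)
	{
		const int z_start = i * 0xffff;
		const int z_count = dim_z - z_start < 0xffff ? dim_z - z_start : 0xffff;
		scale_slice<<<dim3((dim_x + 63) / 64, (dim_y + 7) / 8, z_count), threads, 0, stream>>>(z_start, dim_z, dim_y, dim_x, data);
	}
}
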
45 changes: 27 additions & 18 deletions lib/nnc/cmd/blas/gpu/ccv_nnc_cmul_gpu_ref.cu
@@ -23,9 +23,9 @@ __global__ void _ccv_nnc_cmul_kernel(const size_t count, const NUM1* const a, co
 }
 
 template<typename NUM1, typename NUM2, typename NUM3>
-__global__ void _ccv_nnc_cmul_kernel_4d_0(const int astride2, const int astride1, const int astride0, const int bstride2, const int bstride1, const int bstride0, const int cstride2, const int cstride1, const int cstride0, const int dim2, const int dim1, const int dim0, const NUM1* const a, const NUM2* const b, NUM3* const c)
+__global__ void _ccv_nnc_cmul_kernel_4d_0(const int z_start, const int astride2, const int astride1, const int astride0, const int bstride2, const int bstride1, const int bstride0, const int cstride2, const int cstride1, const int cstride0, const int dim2, const int dim1, const int dim0, const NUM1* const a, const NUM2* const b, NUM3* const c)
 {
-	const int z = blockIdx.z * blockDim.z + threadIdx.z;
+	const int z = blockIdx.z * blockDim.z + threadIdx.z + z_start;
 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
 	if (y >= dim1 || x >= dim0)
@@ -159,26 +159,35 @@ static int _ccv_nnc_cmul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint
 	assert(nd <= 4);
 	if (nd == 4)
 	{
-		if (a->info.datatype == CCV_32F && c->info.datatype == CCV_32F)
+		// Make sure when we launch, the grid.z won't exceed the limit which is 65535.
+		for (i = 0; i < (cdim[2] * cdim[3] + 0xfffe) / 0xffff; i++)
 		{
-			_ccv_nnc_cmul_kernel_4d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, cdim[2] * cdim[3]), dim3(64, 8, 1), 0, stream>>>(astride[3], astride[2], astride[1], bstride[3], bstride[2], bstride[1], cstride[3], cstride[2], cstride[1], cdim[2], cdim[1], cdim[0] / 2, a->data.f32, b->data.f32, c->data.f32);
-		} else if (a->info.datatype == CCV_32F && c->info.datatype == CCV_16F) {
-			_ccv_nnc_cmul_kernel_4d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, cdim[2] * cdim[3]), dim3(64, 8, 1), 0, stream>>>(astride[3], astride[2], astride[1], bstride[3], bstride[2], bstride[1], cstride[3], cstride[2], cstride[1], cdim[2], cdim[1], cdim[0] / 2, a->data.f32, b->data.f32, (__half*)c->data.f16);
-		} else if (a->info.datatype == CCV_16F && c->info.datatype == CCV_32F) {
-			_ccv_nnc_cmul_kernel_4d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, cdim[2] * cdim[3]), dim3(64, 8, 1), 0, stream>>>(astride[3], astride[2], astride[1], bstride[3], bstride[2], bstride[1], cstride[3], cstride[2], cstride[1], cdim[2], cdim[1], cdim[0] / 2, (__half*)a->data.f16, (__half*)b->data.f16, c->data.f32);
-		} else if (a->info.datatype == CCV_16F && c->info.datatype == CCV_16F) {
-			_ccv_nnc_cmul_kernel_4d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, cdim[2] * cdim[3]), dim3(64, 8, 1), 0, stream>>>(astride[3], astride[2], astride[1], bstride[3], bstride[2], bstride[1], cstride[3], cstride[2], cstride[1], cdim[2], cdim[1], cdim[0] / 2, (__half*)a->data.f16, (__half*)b->data.f16, (__half*)c->data.f16);
+			const int z_start = i * 0xffff;
+			if (a->info.datatype == CCV_32F && c->info.datatype == CCV_32F)
+			{
+				_ccv_nnc_cmul_kernel_4d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, ccv_min(cdim[2] * cdim[3] - z_start, 0xffff)), dim3(64, 8, 1), 0, stream>>>(z_start, astride[3], astride[2], astride[1], bstride[3], bstride[2], bstride[1], cstride[3], cstride[2], cstride[1], cdim[2], cdim[1], cdim[0] / 2, a->data.f32, b->data.f32, c->data.f32);
+			} else if (a->info.datatype == CCV_32F && c->info.datatype == CCV_16F) {
+				_ccv_nnc_cmul_kernel_4d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, ccv_min(cdim[2] * cdim[3] - z_start, 0xffff)), dim3(64, 8, 1), 0, stream>>>(z_start, astride[3], astride[2], astride[1], bstride[3], bstride[2], bstride[1], cstride[3], cstride[2], cstride[1], cdim[2], cdim[1], cdim[0] / 2, a->data.f32, b->data.f32, (__half*)c->data.f16);
+			} else if (a->info.datatype == CCV_16F && c->info.datatype == CCV_32F) {
+				_ccv_nnc_cmul_kernel_4d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, ccv_min(cdim[2] * cdim[3] - z_start, 0xffff)), dim3(64, 8, 1), 0, stream>>>(z_start, astride[3], astride[2], astride[1], bstride[3], bstride[2], bstride[1], cstride[3], cstride[2], cstride[1], cdim[2], cdim[1], cdim[0] / 2, (__half*)a->data.f16, (__half*)b->data.f16, c->data.f32);
+			} else if (a->info.datatype == CCV_16F && c->info.datatype == CCV_16F) {
+				_ccv_nnc_cmul_kernel_4d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, ccv_min(cdim[2] * cdim[3] - z_start, 0xffff)), dim3(64, 8, 1), 0, stream>>>(z_start, astride[3], astride[2], astride[1], bstride[3], bstride[2], bstride[1], cstride[3], cstride[2], cstride[1], cdim[2], cdim[1], cdim[0] / 2, (__half*)a->data.f16, (__half*)b->data.f16, (__half*)c->data.f16);
+			}
 		}
 	} else if (nd == 3) {
-		if (a->info.datatype == CCV_32F && c->info.datatype == CCV_32F)
+		// Make sure when we launch, the grid.z won't exceed the limit which is 65535.
+		for (i = 0; i < (cdim[2] + 0xfffe) / 0xffff; i++)
 		{
-			_ccv_nnc_cmul_kernel_3d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, cdim[2]), dim3(64, 8, 1), 0, stream>>>(astride[2], astride[1], bstride[2], bstride[1], cstride[2], cstride[1], cdim[1], cdim[0] / 2, a->data.f32, b->data.f32, c->data.f32);
-		} else if (a->info.datatype == CCV_32F && c->info.datatype == CCV_16F) {
-			_ccv_nnc_cmul_kernel_3d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, cdim[2]), dim3(64, 8, 1), 0, stream>>>(astride[2], astride[1], bstride[2], bstride[1], cstride[2], cstride[1], cdim[1], cdim[0] / 2, a->data.f32, b->data.f32, (__half*)c->data.f16);
-		} else if (a->info.datatype == CCV_16F && c->info.datatype == CCV_32F) {
-			_ccv_nnc_cmul_kernel_3d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, cdim[2]), dim3(64, 8, 1), 0, stream>>>(astride[2], astride[1], bstride[2], bstride[1], cstride[2], cstride[1], cdim[1], cdim[0] / 2, (__half*)a->data.f16, (__half*)b->data.f16, c->data.f32);
-		} else if (a->info.datatype == CCV_16F && c->info.datatype == CCV_16F) {
-			_ccv_nnc_cmul_kernel_3d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, cdim[2]), dim3(64, 8, 1), 0, stream>>>(astride[2], astride[1], bstride[2], bstride[1], cstride[2], cstride[1], cdim[1], cdim[0] / 2, (__half*)a->data.f16, (__half*)b->data.f16, (__half*)c->data.f16);
+			if (a->info.datatype == CCV_32F && c->info.datatype == CCV_32F)
+			{
+				_ccv_nnc_cmul_kernel_3d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, ccv_min(cdim[2] - i * 0xffff, 0xffff)), dim3(64, 8, 1), 0, stream>>>(astride[2], astride[1], bstride[2], bstride[1], cstride[2], cstride[1], cdim[1], cdim[0] / 2, a->data.f32 + 0xffff * i * astride[2], b->data.f32 + 0xffff * i * bstride[2], c->data.f32 + 0xffff * i * cstride[2]);
+			} else if (a->info.datatype == CCV_32F && c->info.datatype == CCV_16F) {
+				_ccv_nnc_cmul_kernel_3d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, ccv_min(cdim[2] - i * 0xffff, 0xffff)), dim3(64, 8, 1), 0, stream>>>(astride[2], astride[1], bstride[2], bstride[1], cstride[2], cstride[1], cdim[1], cdim[0] / 2, a->data.f32 + 0xffff * i * astride[2], b->data.f32 + 0xffff * i * bstride[2], (__half*)c->data.f16 + 0xffff * i * cstride[2]);
+			} else if (a->info.datatype == CCV_16F && c->info.datatype == CCV_32F) {
+				_ccv_nnc_cmul_kernel_3d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, ccv_min(cdim[2] - i * 0xffff, 0xffff)), dim3(64, 8, 1), 0, stream>>>(astride[2], astride[1], bstride[2], bstride[1], cstride[2], cstride[1], cdim[1], cdim[0] / 2, (__half*)a->data.f16 + 0xffff * i * astride[2], (__half*)b->data.f16 + 0xffff * i * bstride[2], c->data.f32 + 0xffff * i * cstride[2]);
+			} else if (a->info.datatype == CCV_16F && c->info.datatype == CCV_16F) {
+				_ccv_nnc_cmul_kernel_3d_0<<<dim3((cdim[0] / 2 + 63) / 64, (cdim[1] + 7) / 8, ccv_min(cdim[2] - i * 0xffff, 0xffff)), dim3(64, 8, 1), 0, stream>>>(astride[2], astride[1], bstride[2], bstride[1], cstride[2], cstride[1], cdim[1], cdim[0] / 2, (__half*)a->data.f16 + 0xffff * i * astride[2], (__half*)b->data.f16 + 0xffff * i * bstride[2], (__half*)c->data.f16 + 0xffff * i * cstride[2]);
+			}
 		}
 	} else if (nd == 2) {
 		assert(adim[0] == bdim[0] && adim[0] == cdim[0]);
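
A note on the arithmetic above, based only on this diff: (n + 0xfffe) / 0xffff is a ceiling division by 65535, so the loop runs once per chunk, and each launch covers ccv_min(n - z_start, 0xffff) planes. The 4-D path passes z_start into the kernel (its grid.z is cdim[2] * cdim[3], so the z index spans two outer dimensions and a plain pointer offset would not be enough), while the 3-D path simply advances the a, b, and c pointers by 0xffff * i times the corresponding outer stride. The standalone C check below works through the chunk sizes for the 70000-long sequence used by the new test; it is illustrative only, not library code.

#include <stdio.h>

int main(void)
{
	const int total_z = 70000; /* the sequence length used by the new test below */
	const int chunks = (total_z + 0xfffe) / 0xffff; /* ceil(70000 / 65535) = 2 */
	int i;
	for (i = 0; i < chunks; i++)
	{
		const int z_start = i * 0xffff;
		const int z_count = total_z - z_start < 0xffff ? total_z - z_start : 0xffff;
		printf("launch %d: z_start = %d, grid.z = %d\n", i, z_start, z_count); /* 65535, then 4465 */
	}
	return 0;
}
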
90 changes: 90 additions & 0 deletions test/int/nnc/cublas.tests.c
@@ -3191,6 +3191,96 @@ TEST_CASE("cmul in float, broadcast semantics")
ccv_nnc_symbolic_graph_free(symbolic_graph);
}

TEST_CASE("cmul in float, broadcast semantics with longer than 65535 sequence")
{
GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_MPS));
ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 70000, 8, 16), "a");
ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 70000, 1, 16), "b");
ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 70000, 8, 16), "c");
ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_FORWARD(), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "cmul");
ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
ccv_nnc_graph_t* graph = 0;
ccv_nnc_tensor_arena_t* tensor_arena = 0;
ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 70000, 8, 16), 0);
ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 70000, 1, 16), 0);
dsfmt_t dsfmt;
dsfmt_init_gen_rand(&dsfmt, 0);
int i;
for (i = 0; i < 1 * 70000 * 8 * 16; i++)
x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
for (i = 0; i < 1 * 70000 * 1 * 16; i++)
y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(b_tensor), 0);
ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 70000, 8, 16), 0);
ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(z_tensor), 0);
ccv_nnc_tensor_t* const tz = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 70000, 8, 16), 0);
ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(tz), 0);
REQUIRE_TENSOR_EQ(tz, z_tensor, "cmul from GPU should match from CPU");
ccv_nnc_tensor_free(x_tensor);
ccv_nnc_tensor_free(y_tensor);
ccv_nnc_tensor_free(z_tensor);
ccv_nnc_tensor_free(tz);
ccv_nnc_graph_free(graph);
ccv_nnc_tensor_arena_free(tensor_arena);
ccv_nnc_graph_exec_arena_free(graph_exec_arena);
ccv_nnc_symbolic_graph_free(symbolic_graph);
}

TEST_CASE("cmul in float, broadcast semantics with longer than 65535 sequence and more than 1 batch size")
{
GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_MPS));
ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 2, 40000, 8, 16), "a");
ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 1, 40000, 1, 16), "b");
ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, GPU_TENSOR_NCHW(000, 32F, 2, 40000, 8, 16), "c");
ccv_nnc_graph_exec_symbol_new(symbolic_graph, CMD_CMUL_FORWARD(), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "cmul");
ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
SYMBOLIC_GRAPH_GEN(symbolic_graph, CCV_NNC_LONG_DOT_GRAPH);
ccv_nnc_graph_t* graph = 0;
ccv_nnc_tensor_arena_t* tensor_arena = 0;
ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
GRAPH_GEN(graph, CCV_NNC_LONG_DOT_GRAPH);
ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2, 40000, 8, 16), 0);
ccv_nnc_tensor_t* const y_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 40000, 1, 16), 0);
dsfmt_t dsfmt;
dsfmt_init_gen_rand(&dsfmt, 0);
int i;
for (i = 0; i < 2 * 40000 * 8 * 16; i++)
x_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
for (i = 0; i < 1 * 40000 * 1 * 16; i++)
y_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor), TENSOR_LIST(a_tensor), 0);
ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(y_tensor), TENSOR_LIST(b_tensor), 0);
ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
ccv_nnc_tensor_t* const z_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2, 40000, 8, 16), 0);
ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c_tensor), TENSOR_LIST(z_tensor), 0);
ccv_nnc_tensor_t* const tz = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2, 40000, 8, 16), 0);
ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x_tensor, y_tensor), TENSOR_LIST(tz), 0);
REQUIRE_TENSOR_EQ(tz, z_tensor, "cmul from GPU should match from CPU");
ccv_nnc_tensor_free(x_tensor);
ccv_nnc_tensor_free(y_tensor);
ccv_nnc_tensor_free(z_tensor);
ccv_nnc_tensor_free(tz);
ccv_nnc_graph_free(graph);
ccv_nnc_tensor_arena_free(tensor_arena);
ccv_nnc_graph_exec_arena_free(graph_exec_arena);
ccv_nnc_symbolic_graph_free(symbolic_graph);
}

TEST_CASE("cmul gradient in float")
{
GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_GPU_REF) || ccv_nnc_cmd_ok(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_MPS));
