From da950dc656fb28ca638713b28aa65c2449823025 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Fri, 29 Oct 2021 16:06:09 +0300 Subject: [PATCH 1/2] Optimize compilation time for the common case --- cub/device/dispatch/dispatch_merge_sort.cuh | 93 ++++++++++++++------- 1 file changed, 61 insertions(+), 32 deletions(-) diff --git a/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/device/dispatch/dispatch_merge_sort.cuh index 244b9a504c..42cdffb496 100644 --- a/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/device/dispatch/dispatch_merge_sort.cuh @@ -27,10 +27,10 @@ #pragma once -#include "../../util_math.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" -#include "../../agent/agent_merge_sort.cuh" +#include +#include +#include +#include #include #include @@ -245,7 +245,8 @@ template + typename ValueT, + bool AgentFitsIntoDefaultShmemSize> struct BlockSortLauncher { int num_tiles; @@ -309,12 +310,15 @@ struct BlockSortLauncher template CUB_RUNTIME_FUNCTION __forceinline__ void launch_impl() const { + constexpr bool use_vshmem = (AgentFitsIntoDefaultShmemSize == false) && + UseVShmem; + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, ActivePolicyT::MergeSortPolicy::BLOCK_THREADS, - block_sort_shmem_size, + use_vshmem ? 0 : block_sort_shmem_size, stream) - .doit(DeviceMergeSortBlockSortKernel + typename ValueT, + bool AgentFitsIntoDefaultShmemSize> struct MergeLauncher { int num_tiles; @@ -402,12 +407,15 @@ struct MergeLauncher CUB_RUNTIME_FUNCTION __forceinline__ void launch_impl(bool ping, OffsetT target_merged_tiles_number) const { + constexpr bool use_vshmem = (AgentFitsIntoDefaultShmemSize == false) && + UseVShmem; + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, ActivePolicyT::MergeSortPolicy::BLOCK_THREADS, - merge_shmem_size, + use_vshmem ? 0 : merge_shmem_size, stream) - .doit(DeviceMergeSortMergeKernel(BlockSortAgentT::SHARED_MEMORY_SIZE); + constexpr bool block_sort_fits_into_default_shmem = + block_sort_shmem_size < default_shared_memory_size; - const auto merge_shmem_size = + constexpr auto merge_shmem_size = static_cast(MergeAgentT::SHARED_MEMORY_SIZE); + constexpr bool merge_fits_into_default_shmem = merge_shmem_size < + default_shared_memory_size; + constexpr bool runtime_shmem_size_check_is_required = + !(merge_fits_into_default_shmem && block_sort_fits_into_default_shmem); const auto merge_partitions_size = static_cast(1 + num_tiles) * sizeof(OffsetT); @@ -570,10 +577,32 @@ struct DispatchMergeSort : SelectedPolicy const auto temporary_values_storage_size = static_cast(num_items * sizeof(ValueT)) * !KEYS_ONLY; - const auto virtual_shared_memory_size = - vshmem_size(static_cast(max_shmem), - (cub::max)(block_sort_shmem_size, merge_shmem_size), - static_cast(num_tiles)); + std::size_t virtual_shared_memory_size = 0; + bool block_sort_requires_vshmem = false; + bool merge_requires_vshmem = false; + + if (runtime_shmem_size_check_is_required) + { + int max_shmem = 0; + if (CubDebug( + error = cudaDeviceGetAttribute(&max_shmem, + cudaDevAttrMaxSharedMemoryPerBlock, + device_ordinal))) + { + break; + } + + block_sort_requires_vshmem = block_sort_shmem_size > + static_cast(max_shmem); + merge_requires_vshmem = merge_shmem_size > + static_cast(max_shmem); + + virtual_shared_memory_size = + vshmem_size(static_cast(max_shmem), + (cub::max)(block_sort_shmem_size, merge_shmem_size), + static_cast(num_tiles)); + } + void *allocations[4] = {nullptr, nullptr, nullptr, nullptr}; std::size_t allocation_sizes[4] = {merge_partitions_size, @@ -633,11 +662,10 @@ struct DispatchMergeSort : SelectedPolicy ActivePolicyT, CompareOpT, KeyT, - ValueT> + ValueT, + block_sort_fits_into_default_shmem> block_sort_launcher(static_cast(num_tiles), - virtual_shared_memory_size > 0 - ? 0 - : block_sort_shmem_size, + block_sort_shmem_size, ping, d_input_keys, d_input_items, @@ -648,7 +676,7 @@ struct DispatchMergeSort : SelectedPolicy stream, keys_buffer, items_buffer, - vshmem_ptr); + block_sort_requires_vshmem ? vshmem_ptr : nullptr); block_sort_launcher.launch(); @@ -679,9 +707,10 @@ struct DispatchMergeSort : SelectedPolicy ActivePolicyT, CompareOpT, KeyT, - ValueT> + ValueT, + merge_fits_into_default_shmem> merge_launcher(static_cast(num_tiles), - virtual_shared_memory_size > 0 ? 0 : merge_shmem_size, + merge_shmem_size, d_output_keys, d_output_items, num_items, @@ -690,7 +719,7 @@ struct DispatchMergeSort : SelectedPolicy stream, keys_buffer, items_buffer, - vshmem_ptr); + merge_requires_vshmem ? vshmem_ptr : nullptr); for (int pass = 0; pass < num_passes; ++pass, ping = !ping) { From ea92d9554aa115060335965d17412d6e66fcf935 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sat, 30 Oct 2021 16:00:00 +0300 Subject: [PATCH 2/2] Document merge sort vshmem specializations --- cub/device/dispatch/dispatch_merge_sort.cuh | 85 +++++++++++++++------ 1 file changed, 62 insertions(+), 23 deletions(-) diff --git a/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/device/dispatch/dispatch_merge_sort.cuh index 42cdffb496..d9b375ea2d 100644 --- a/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/device/dispatch/dispatch_merge_sort.cuh @@ -183,7 +183,7 @@ DeviceMergeSortMergeKernel(bool ping, agent.Process(); } -/****************************************************************************** +/******************************************************************************* * Policy ******************************************************************************/ @@ -192,9 +192,9 @@ struct DeviceMergeSortPolicy { using KeyT = typename std::iterator_traits::value_type; - //------------------------------------------------------------------------------ + //---------------------------------------------------------------------------- // Architecture-specific tuning policies - //------------------------------------------------------------------------------ + //---------------------------------------------------------------------------- struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { @@ -448,28 +448,45 @@ struct DispatchMergeSort : SelectedPolicy using KeyT = typename std::iterator_traits::value_type; using ValueT = typename std::iterator_traits::value_type; - // Whether or not there are values to be trucked along with keys + /// Whether or not there are values to be trucked along with keys static constexpr bool KEYS_ONLY = Equals::VALUE; - //------------------------------------------------------------------------------ // Problem state - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - std::size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeyInputIteratorT d_input_keys; ///< [in] Pointer to the input sequence of unsorted input keys - ValueInputIteratorT d_input_items;///< [in] Pointer to the input sequence of unsorted input values - KeyIteratorT d_output_keys; ///< [out] Pointer to the output sequence of sorted input keys - ValueIteratorT d_output_items; ///< [out] Pointer to the output sequence of sorted input values - OffsetT num_items; ///< [in] Number of items to sort - CompareOpT compare_op; ///< [in] Comparison function object which returns true if the first argument is ordered before the second - cudaStream_t stream; ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - int ptx_version; - //------------------------------------------------------------------------------ - // Constructor - //------------------------------------------------------------------------------ + /// Device-accessible allocation of temporary storage. When NULL, the required + /// allocation size is written to \p temp_storage_bytes and no work is done. + void *d_temp_storage; + + /// Reference to size in bytes of \p d_temp_storage allocation + std::size_t &temp_storage_bytes; + + /// Pointer to the input sequence of unsorted input keys + KeyInputIteratorT d_input_keys; + + /// Pointer to the input sequence of unsorted input values + ValueInputIteratorT d_input_items; + + /// Pointer to the output sequence of sorted input keys + KeyIteratorT d_output_keys; + + /// Pointer to the output sequence of sorted input values + ValueIteratorT d_output_items; + + /// Number of items to sort + OffsetT num_items; + + /// Comparison function object which returns true if the first argument is + /// ordered before the second + CompareOpT compare_op; + + /// CUDA stream to launch kernels within. Default is stream0. + cudaStream_t stream; + + /// Whether or not to synchronize the stream after every kernel launch to + /// check for errors. Also causes launch configurations to be printed to the + /// console. Default is \p false. + bool debug_synchronous; + int ptx_version; CUB_RUNTIME_FUNCTION __forceinline__ std::size_t vshmem_size(std::size_t max_shmem, @@ -486,7 +503,7 @@ struct DispatchMergeSort : SelectedPolicy } } - /// Constructor + // Constructor CUB_RUNTIME_FUNCTION __forceinline__ DispatchMergeSort(void *d_temp_storage, std::size_t &temp_storage_bytes, @@ -512,7 +529,7 @@ struct DispatchMergeSort : SelectedPolicy , ptx_version(ptx_version) {} - /// Invocation + // Invocation template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { @@ -555,6 +572,28 @@ struct DispatchMergeSort : SelectedPolicy const auto tile_size = MergePolicyT::ITEMS_PER_TILE; const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size); + /** + * Merge sort supports large types, which can lead to excessive shared + * memory size requirements. In these cases, merge sort allocates virtual + * shared memory that resides in global memory: + * ``` + * extern __shared__ char shmem[]; + * typename AgentT::TempStorage &storage = + * *reinterpret_cast( + * UseVShmem ? vshmem + vshmem_offset : shmem); + * ``` + * Having `UseVShmem` as a runtime variable leads to the generation of + * generic loads and stores, which causes a slowdown. Therefore, + * `UseVShmem` has to be known at compilation time. + * In the generic case, available shared memory size is queried at runtime + * to check if kernels requirements are satisfied. Since the query result + * is not known at compile-time, merge sort kernels are specialized for + * both cases. + * To address increased compilation time, the dispatch layer checks + * whether kernels requirements fit into default shared memory + * size (48KB). In this case, there's no need for virtual shared + * memory specialization. + */ constexpr std::size_t default_shared_memory_size = 48 * 1024; constexpr auto block_sort_shmem_size = static_cast(BlockSortAgentT::SHARED_MEMORY_SIZE);