diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh index 6962a89890..f91ef31d83 100644 --- a/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/device/dispatch/dispatch_histogram.cuh @@ -41,6 +41,7 @@ #include "../../agent/agent_histogram.cuh" #include "../../util_debug.cuh" #include "../../util_device.cuh" +#include "../../util_math.cuh" #include "../../thread/thread_search.cuh" #include "../../grid/grid_queue.cuh" #include "../../config.cuh" @@ -518,7 +519,7 @@ struct DipatchHistogram // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread; - int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile; + int tiles_per_row = static_cast<int>(cub::DivideAndRoundUp(num_row_pixels, pixels_per_tile)); int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row); int blocks_per_col = (blocks_per_row > 0) ?
int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 8bb6872ccd..41eb1d2bcd 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -48,6 +48,7 @@ #include "../../util_type.cuh" #include "../../util_debug.cuh" #include "../../util_device.cuh" +#include "../../util_math.cuh" #include @@ -1261,10 +1262,9 @@ struct DispatchRadixSort : // parts handle inputs with >=2**30 elements, due to the way lookback works // for testing purposes, one part is <= 2**28 elements const int PART_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS; - int num_passes = (end_bit - begin_bit + RADIX_BITS - 1) / RADIX_BITS; - int num_parts = ((long long)num_items + PART_SIZE - 1) / PART_SIZE; - OffsetT max_num_blocks = (CUB_MIN(num_items, PART_SIZE) + ONESWEEP_TILE_ITEMS - 1) / - ONESWEEP_TILE_ITEMS; + int num_passes = cub::DivideAndRoundUp(end_bit - begin_bit, RADIX_BITS); + int num_parts = static_cast<int>(cub::DivideAndRoundUp(num_items, PART_SIZE)); + OffsetT max_num_blocks = cub::DivideAndRoundUp(CUB_MIN(num_items, PART_SIZE), ONESWEEP_TILE_ITEMS); size_t value_size = KEYS_ONLY ? 0 : sizeof(ValueT); size_t allocation_sizes[] = @@ -1341,7 +1341,7 @@ struct DispatchRadixSort : for (int part = 0; part < num_parts; ++part) { int part_num_items = CUB_MIN(num_items - part * PART_SIZE, PART_SIZE); - int num_blocks = (part_num_items + ONESWEEP_TILE_ITEMS - 1) / ONESWEEP_TILE_ITEMS; + int num_blocks = cub::DivideAndRoundUp(part_num_items, ONESWEEP_TILE_ITEMS); if (CubDebug(error = cudaMemsetAsync( d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), stream))) break; @@ -1466,7 +1466,7 @@ struct DispatchRadixSort : // Pass planning.
Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size int num_bits = end_bit - begin_bit; - int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits; + int num_passes = cub::DivideAndRoundUp(num_bits, pass_config.radix_bits); bool is_num_passes_odd = num_passes & 1; int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits; int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh index 09b531e081..b22fb78323 100644 --- a/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -43,6 +43,7 @@ #include "../../thread/thread_operators.cuh" #include "../../grid/grid_queue.cuh" #include "../../util_device.cuh" +#include "../../util_math.cuh" #include @@ -309,7 +310,7 @@ struct DispatchReduceByKey // Number of input tiles int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; + int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; @@ -329,7 +330,7 @@ struct DispatchReduceByKey if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; // Log init_kernel configuration - int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke init_kernel to initialize tile descriptors diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh index c4b11038b5..25bdb7abc1 100644
--- a/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/device/dispatch/dispatch_rle.cuh @@ -43,6 +43,7 @@ #include "../../thread/thread_operators.cuh" #include "../../grid/grid_queue.cuh" #include "../../util_device.cuh" +#include "../../util_math.cuh" #include @@ -291,7 +292,7 @@ struct DeviceRleDispatch // Number of input tiles int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; + int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; @@ -311,7 +312,7 @@ struct DeviceRleDispatch if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; // Log device_scan_init_kernel configuration - int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors @@ -346,7 +347,7 @@ struct DeviceRleDispatch // Get grid size for scanning tiles dim3 scan_grid_size; scan_grid_size.z = 1; - scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log device_rle_sweep_kernel configuration diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh index 197f4898da..c0c6d5992a 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh @@ -43,6 +43,7 @@ #include "../../config.cuh" #include "../../util_debug.cuh" #include "../../util_device.cuh" +#include "../../util_math.cuh" #include @@ -296,7 +297,7 @@ struct
DispatchScan: // Number of input tiles int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD; - int num_tiles = static_cast<int>((num_items + tile_size - 1) / tile_size); + int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; @@ -320,7 +321,7 @@ struct DispatchScan: if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; // Log init_kernel configuration - int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke init_kernel to initialize tile descriptors diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh index a1d8c453f0..ebb6b5b98c 100644 --- a/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/device/dispatch/dispatch_select_if.cuh @@ -43,6 +43,7 @@ #include "../../thread/thread_operators.cuh" #include "../../grid/grid_queue.cuh" #include "../../util_device.cuh" +#include "../../util_math.cuh" #include @@ -297,7 +298,7 @@ struct DispatchSelectIf // Number of input tiles int tile_size = select_if_config.block_threads * select_if_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; + int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; @@ -317,7 +318,7 @@ struct DispatchSelectIf if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; // Log scan_init_kernel configuration - int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); if
(debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke scan_init_kernel to initialize tile descriptors @@ -352,7 +353,7 @@ struct DispatchSelectIf // Get grid size for scanning tiles dim3 scan_grid_size; scan_grid_size.z = 1; - scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log select_if_kernel configuration diff --git a/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/device/dispatch/dispatch_spmv_orig.cuh index a5095daf19..e821ff23e3 100644 --- a/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -43,6 +43,7 @@ #include "../../util_type.cuh" #include "../../util_debug.cuh" #include "../../util_device.cuh" +#include "../../util_math.cuh" #include "../../thread/thread_search.cuh" #include "../../grid/grid_queue.cuh" #include "../../config.cuh" @@ -510,8 +511,8 @@ struct DispatchSpmv } // Get search/init grid dims - int degen_col_kernel_block_size = INIT_KERNEL_THREADS; - int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size; + int degen_col_kernel_block_size = INIT_KERNEL_THREADS; + int degen_col_kernel_grid_size = cub::DivideAndRoundUp(spmv_params.num_rows, degen_col_kernel_block_size); if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); @@ -552,8 +553,8 @@ struct DispatchSpmv int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; // Number of tiles for kernels - int num_merge_tiles = (num_merge_items + merge_tile_size - 1) / merge_tile_size; - int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size; + int 
num_merge_tiles = cub::DivideAndRoundUp(num_merge_items, merge_tile_size); + int num_segment_fixup_tiles = cub::DivideAndRoundUp(num_merge_tiles, segment_fixup_tile_size); // Get SM occupancy for kernels int spmv_sm_occupancy; @@ -571,12 +572,12 @@ struct DispatchSpmv // Get grid dimensions dim3 spmv_grid_size( CUB_MIN(num_merge_tiles, max_dim_x), - (num_merge_tiles + max_dim_x - 1) / max_dim_x, + cub::DivideAndRoundUp(num_merge_tiles, max_dim_x), 1); dim3 segment_fixup_grid_size( CUB_MIN(num_segment_fixup_tiles, max_dim_x), - (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x, + cub::DivideAndRoundUp(num_segment_fixup_tiles, max_dim_x), 1); // Get the temporary storage allocation requirements @@ -604,7 +605,7 @@ struct DispatchSpmv // Get search/init grid dims int search_block_size = INIT_KERNEL_THREADS; - int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size; + int search_grid_size = cub::DivideAndRoundUp(num_merge_tiles + 1, search_block_size); #if CUB_INCLUDE_HOST_CODE if (CUB_IS_HOST_CODE) diff --git a/cub/grid/grid_even_share.cuh b/cub/grid/grid_even_share.cuh index f6eaf7e90b..fcbf370c89 100644 --- a/cub/grid/grid_even_share.cuh +++ b/cub/grid/grid_even_share.cuh @@ -37,6 +37,7 @@ #include "../config.cuh" #include "../util_namespace.cuh" #include "../util_macro.cuh" +#include "../util_math.cuh" #include "../util_type.cuh" #include "grid_mapping.cuh" @@ -129,7 +130,7 @@ public: this->block_offset = num_items_; // Initialize past-the-end this->block_end = num_items_; // Initialize past-the-end this->num_items = num_items_; - this->total_tiles = (num_items_ + tile_items - 1) / tile_items; + this->total_tiles = cub::DivideAndRoundUp(num_items_, tile_items); this->grid_size = CUB_MIN(static_cast<int>(total_tiles), max_grid_size); OffsetT avg_tiles_per_block = total_tiles / grid_size; // leftover grains go to big blocks diff --git a/cub/util_math.cuh b/cub/util_math.cuh new file mode 100644 index 0000000000..21bf843e12 --- 
/dev/null +++ b/cub/util_math.cuh @@ -0,0 +1,75 @@ +/****************************************************************************** + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Define helper math functions. 
+ */ + +#pragma once + +#include <type_traits> + +#include "util_namespace.cuh" + +// Optional outer namespace(s) +CUB_NS_PREFIX + +// CUB namespace +namespace cub +{ + +namespace detail +{ + +template <typename T> +using is_integral_or_enum = + std::integral_constant<bool, std::is_integral<T>::value || std::is_enum<T>::value>; + +} + +/** + * Divide n by d, round up if any remainder, and return the result. + * + * Effectively performs `(n + d - 1) / d`, but is robust against the case where + * `(n + d - 1)` would overflow. + */ +template <typename NumeratorT, typename DenominatorT> +__host__ __device__ __forceinline__ constexpr NumeratorT +DivideAndRoundUp(NumeratorT n, DenominatorT d) +{ + static_assert(cub::detail::is_integral_or_enum<NumeratorT>::value && + cub::detail::is_integral_or_enum<DenominatorT>::value, + "DivideAndRoundUp is only intended for integral types."); + + // Static cast to undo integral promotion. + return static_cast<NumeratorT>(n / d + (n % d != 0 ? 1 : 0)); +} + +} // namespace cub +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/test/test_device_radix_sort.cu b/test/test_device_radix_sort.cu index 98ab16ed6b..1bc5b13581 100644 --- a/test/test_device_radix_sort.cu +++ b/test/test_device_radix_sort.cu @@ -42,6 +42,7 @@ #endif #include +#include <cub/util_math.cuh> #include #include @@ -1035,8 +1036,10 @@ void TestSizes( int max_items, int max_segments) { - for (int num_items = max_items; num_items > 1; num_items = (num_items + 32 - 1) / 32) - { + for (int num_items = max_items; + num_items > 1; + num_items = cub::DivideAndRoundUp(num_items, 32)) + { TestSegments(h_keys, num_items, max_segments); } TestSegments(h_keys, 1, max_segments); diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu index 2e8109114a..49e6faea97 100644 --- a/test/test_device_reduce.cu +++ b/test/test_device_reduce.cu @@ -41,6 +41,7 @@ #include #include +#include <cub/util_math.cuh> #include #include #include @@ -1085,7 +1086,7 @@ void TestByBackend( // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment int max_items_per_segment = 128000; - for (int num_segments =
(num_items + max_items_per_segment - 1) / max_items_per_segment; + for (int num_segments = cub::DivideAndRoundUp(num_items, max_items_per_segment); num_segments < max_segments; num_segments = (num_segments * 32) + 1) { diff --git a/test/test_util.h b/test/test_util.h index e90016d3ec..98d10e786f 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -54,6 +54,7 @@ #include "cub/util_device.cuh" #include "cub/util_type.cuh" #include "cub/util_macro.cuh" +#include "cub/util_math.cuh" #include "cub/iterator/discard_output_iterator.cuh" /****************************************************************************** @@ -1560,7 +1561,7 @@ void InitializeSegments( if (num_segments <= 0) return; - unsigned int expected_segment_length = (num_items + num_segments - 1) / num_segments; + unsigned int expected_segment_length = cub::DivideAndRoundUp(num_items, num_segments); int offset = 0; for (int i = 0; i < num_segments; ++i) {