Finding Maximum Value and Index with CUDA warp-reduction
When working with large arrays on the GPU, a common task is to find the maximum value along with its index. Doing this efficiently requires a reduction pattern that works across threads, warps, and blocks.
Approach
We represent the result as a simple struct:
struct ArgMaxResult {
float value;
uint32_t index;
};Each thread keeps a local maximum while walking through its assigned strike in the array (fixed grid size during kernel call):
constexpr float FLT_LOWEST = std::numeric_limits<float>::lowest();
// [...]
ArgMaxResult local{.value = FLT_LOWEST, .index = 0};
uint32_t stride = blockDim.x * gridDim.x;
uint32_t startIdx = threadIdx.x + blockDim.x * blockIdx.x;
for (uint32_t idx = startIdx; idx < nElements; idx += stride) {
float v = d_inputArray[idx];
if (v > local.value) local = {v, idx};
}This step collapses as much work as possible into registers before communicating with other threads. This also allows us to use a fixed grid size independently of the array length which is beneficial later to avoid recursive reductions.
Warp level reduction
Each warp reduces its threads’ local maxima using warp shuffle primitives:
__device__ __inline__ ArgMaxResult WarpReduce(ArgMaxResult local) {
for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
float otherValue = __shfl_down_sync(0xFFFFFFFF, local.value, offset);
uint32_t otherIndex = __shfl_down_sync(0xFFFFFFFF, local.index, offset);
if (otherValue > local.value) {
local.index = otherIndex;
local.value = otherValue;
}
}
return local;
}- Each warp ends up with a single “winning” (value, index) in the first lane.
- Tie-breaking can be added by comparing indices for equal values.
Block level reduction
Warp leaders write their results to shared memory, then a single warp reduces these to produce a block-wide maximum.
if (lid == 0) s_argMax[wid] = local;
__syncthreads();
if (wid == 0) {
local = (lid < nw) ? s_argMax[lid] : ArgMaxResult{.value = FLT_LOWEST, .index = 0};
local = WarpReduce(local);
if (lid == 0) d_blockWiseResult[blockIdx.x] = local;
}So the full kernel for this first round looks like this:
__global__ void ArgMaxKernel(ArgMaxResult *__restrict__ d_blockWiseResult,
const float *__restrict__ d_inputArray,
const uint32_t nElements) {
ArgMaxResult local{.value = FLT_LOWEST, .index = 0};
const uint32_t idxStart = threadIdx.x + blockIdx.x * blockDim.x;
const uint32_t stride = blockDim.x * gridDim.x;
// first, each thread walks over the vector with the stride to collapse as
// much as possible into the registers
for (uint32_t idx = idxStart; idx < nElements; idx += stride) {
const float value = d_inputArray[idx];
if (value > local.value) {
local = {.value = value, .index = idx};
}
}
// now each warp does a reduction of the most relevant element into lid == 0
const int lid = threadIdx.x % WARP_SIZE;
const int wid = threadIdx.x / WARP_SIZE;
const int nw = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
extern __shared__ ArgMaxResult s_argMax[];
local = WarpReduce(local);
if (lid == 0) {
s_argMax[wid] = local;
}
__syncthreads();
if (wid == 0) {
local =
(lid < nw) ? s_argMax[lid] : ArgMaxResult{.value = FLT_LOWEST, .index = 0};
local = WarpReduce(local);
if (lid == 0) {
d_blockWiseResult[blockIdx.x] = local;
}
}
}Final reduction
After all blocks have produced their maxima, a second kernel reduces the block-wise results into a single global maximum and its index.
- This two-level reduction avoids global memory contention.
- Using a struct ensures that the value and index are stored together, keeping memory accesses coalesced and code readable.
// launched with grid size == 1
__global__ void FindMaxOfMax(ArgMaxResult *__restrict__ d_blockWiseResult,
const uint32_t nElements) {
const uint32_t idxStart = threadIdx.x;
const uint32_t stride = blockDim.x;
ArgMaxResult local{.value = FLT_MIN, .index = 0};
for (uint32_t idx = idxStart; idx < nElements; idx += stride) {
ArgMaxResult curr = d_blockWiseResult[idx];
if (curr.value > local.value)
local = curr;
}
// now each warp does a reduction of the most relevant element into lid == 0
const int lid = threadIdx.x % WARP_SIZE;
const int wid = threadIdx.x / WARP_SIZE;
const int nw = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
extern __shared__ ArgMaxResult s_argMax[];
local = WarpReduce(local);
if (lid == 0) {
s_argMax[wid] = local;
}
__syncthreads();
if (wid == 0) {
local =
(lid < nw) ? s_argMax[lid] : ArgMaxResult{.value = FLT_MIN, .index = 0};
local = WarpReduce(local);
if (lid == 0) {
d_blockWiseResult[0] = local;
}
}
}The full code including boilerplate can be found at the bottom of the page.
Results
- Tested against host-side computation for arrays up to 32 million elements.
- The GPU result matches the CPU result exactly.
- Memory-efficient: minimal shared memory and no extra temporary arrays.
- Fully low-level: demonstrates warp primitives, shared memory, and reduction patterns without relying on high-level libraries.
- Is memory bandwidth limited when inspected using NVIDIA NSight Compute (90% of maximum VRAM rate which is 391 Gb of used RTX3070)
Minimal working example
Full code snippet
// retrieve the maximum value and index from an array
#include <cstdint>
#include <iostream>
#include <limits>
#include <stdexcept>
constexpr float FLT_LOWEST = -std::numeric_limits<float>::lowest();
constexpr int WARP_SIZE = 32;
#define CUDA_CHECK_LOG() \
{ \
const cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
std::cout << "A CUDA error was caught in line " \
<< std::to_string(__LINE__) << ": " << cudaGetErrorString(err) \
<< std::endl; \
std::abort(); \
} \
}
#define CUDA_SAFE_CALL_LOG(func) \
{ \
const cudaError_t err = func; \
if (err != cudaSuccess) { \
std::cout << "An CUDA error occured in line " \
<< std::to_string(__LINE__) << ": " << cudaGetErrorString(err) \
<< std::endl; \
std::abort(); \
} \
}
struct ArgMaxResult {
float value;
uint32_t index;
};
__device__ __inline__ ArgMaxResult WarpReduce(ArgMaxResult local) {
for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
float otherValue = __shfl_down_sync(0xFFFFFFFF, local.value, offset);
uint32_t otherIndex = __shfl_down_sync(0xFFFFFFFF, local.index, offset);
if (otherValue > local.value) {
local.index = otherIndex;
local.value = otherValue;
}
}
return local;
}
__global__ void ArgMaxKernel(ArgMaxResult *__restrict__ d_blockWiseResult,
const float *__restrict__ d_inputArray,
const uint32_t nElements) {
ArgMaxResult local{.value = FLT_LOWEST, .index = 0};
const uint32_t idxStart = threadIdx.x + blockIdx.x * blockDim.x;
const uint32_t stride = blockDim.x * gridDim.x;
// first, each thread walks over the vector with the stride to collapse as
// much as possible into the registers
for (uint32_t idx = idxStart; idx < nElements; idx += stride) {
const float value = d_inputArray[idx];
if (value > local.value) {
local = {.value = value, .index = idx};
}
}
// now each warp does a reduction of the most relevant element into lid == 0
const int lid = threadIdx.x % WARP_SIZE;
const int wid = threadIdx.x / WARP_SIZE;
const int nw = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
extern __shared__ ArgMaxResult s_argMax[];
local = WarpReduce(local);
if (lid == 0) {
s_argMax[wid] = local;
}
__syncthreads();
if (wid == 0) {
local = (lid < nw) ? s_argMax[lid]
: ArgMaxResult{.value = FLT_LOWEST, .index = 0};
local = WarpReduce(local);
if (lid == 0) {
d_blockWiseResult[blockIdx.x] = local;
}
}
__syncthreads();
}
__global__ void FindMaxOfMax(ArgMaxResult *__restrict__ d_blockWiseResult,
const uint32_t nElements) {
const uint32_t idxStart = threadIdx.x;
const uint32_t stride = blockDim.x;
ArgMaxResult local{.value = FLT_LOWEST, .index = 0};
for (uint32_t idx = idxStart; idx < nElements; idx += stride) {
ArgMaxResult curr = d_blockWiseResult[idx];
if (curr.value > local.value)
local = curr;
}
// now each warp does a reduction of the most relevant element into lid == 0
const int lid = threadIdx.x % WARP_SIZE;
const int wid = threadIdx.x / WARP_SIZE;
const int nw = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
extern __shared__ ArgMaxResult s_argMax[];
local = WarpReduce(local);
if (lid == 0) {
s_argMax[wid] = local;
}
__syncthreads();
if (wid == 0) {
local = (lid < nw) ? s_argMax[lid]
: ArgMaxResult{.value = FLT_LOWEST, .index = 0};
local = WarpReduce(local);
if (lid == 0) {
d_blockWiseResult[0] = local;
}
}
}
int main() {
std::cout << "### argmax ###" << std::endl;
constexpr uint32_t nElements = 1024 * 1024 * 32;
float *d_data;
CUDA_SAFE_CALL_LOG(cudaMalloc(&d_data, nElements * sizeof(float)));
float *h_data;
CUDA_SAFE_CALL_LOG(cudaMallocHost(&h_data, nElements * sizeof(float)));
float maxValue = FLT_LOWEST;
uint32_t index = 0;
for (uint32_t iElement = 0; iElement < nElements; iElement++) {
const float value = -static_cast<float>(iElement % 255) * 10.0f +
static_cast<float>(iElement / 1024);
h_data[iElement] = value;
if (value > maxValue) {
maxValue = value;
index = iElement;
}
}
CUDA_SAFE_CALL_LOG(cudaMemcpy(d_data, h_data, nElements * sizeof(float),
cudaMemcpyHostToDevice));
cudaDeviceProp props;
CUDA_SAFE_CALL_LOG(cudaGetDeviceProperties(&props, 0));
if (props.warpSize != WARP_SIZE) {
throw std::runtime_error("invalid warp size (must be 32)");
}
// each block performs a two level warp reduction. while the ideal size would
// be
constexpr int blockSize = WARP_SIZE * WARP_SIZE;
if (blockSize % WARP_SIZE != 0) {
throw std::runtime_error("block size should remain multiple of warp size");
}
const int gridSize = props.multiProcessorCount;
ArgMaxResult *d_blockMax;
CUDA_SAFE_CALL_LOG(cudaMalloc(&d_blockMax, sizeof(ArgMaxResult) * gridSize));
{
const int nSharedBytes = (blockSize / WARP_SIZE) * sizeof(ArgMaxResult);
std::cout << "gs: " << gridSize << ", bs: " << blockSize
<< ", sm: " << nSharedBytes << std::endl;
ArgMaxKernel<<<gridSize, blockSize, nSharedBytes>>>(d_blockMax, d_data,
nElements);
CUDA_CHECK_LOG();
CUDA_SAFE_CALL_LOG(cudaDeviceSynchronize());
}
{
// todo here we potentially do too much work at the moment
const int blockSizeReduction = std::min(
blockSize, ((gridSize + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE);
const int nSharedBytes =
(blockSizeReduction / WARP_SIZE) * sizeof(ArgMaxResult);
FindMaxOfMax<<<1, blockSizeReduction, nSharedBytes>>>(d_blockMax, gridSize);
CUDA_CHECK_LOG();
CUDA_SAFE_CALL_LOG(cudaDeviceSynchronize());
}
// the result is now in the first element
ArgMaxResult *h_res;
CUDA_SAFE_CALL_LOG(cudaMallocHost(&h_res, sizeof(ArgMaxResult)));
CUDA_SAFE_CALL_LOG(cudaMemcpy(h_res, d_blockMax, sizeof(ArgMaxResult),
cudaMemcpyDeviceToHost));
if (h_res[0].value == maxValue && index == h_res[0].index) {
std::cout << "looks good" << std::endl;
} else {
std::cout << "results differ" << std::endl;
}
CUDA_SAFE_CALL_LOG(cudaFree(d_data));
CUDA_SAFE_CALL_LOG(cudaFreeHost(h_data));
CUDA_SAFE_CALL_LOG(cudaFreeHost(h_res));
CUDA_SAFE_CALL_LOG(cudaFree(d_blockMax));
return 0;
}Can be compiled and run using nvcc <filename>.cu -o argmax-test && ./argmax-test.