Skip to content

Finding Maximum Value and Index with CUDA warp-reduction

When working with large arrays on the GPU, a common task is to find the maximum value along with its index. Doing this efficiently requires a reduction pattern that works across threads, warps, and blocks.

Approach

We represent the result as a simple struct:

c++
struct ArgMaxResult {
  float value;
  uint32_t index;
};

Each thread keeps a local maximum while walking through its assigned strike in the array (fixed grid size during kernel call):

c++
constexpr float FLT_LOWEST = std::numeric_limits<float>::lowest();
// [...]
ArgMaxResult local{.value = FLT_LOWEST, .index = 0};
uint32_t stride = blockDim.x * gridDim.x;
uint32_t startIdx = threadIdx.x + blockDim.x * blockIdx.x;
for (uint32_t idx = startIdx; idx < nElements; idx += stride) {
    float v = d_inputArray[idx];
    if (v > local.value) local = {v, idx};
}

This step collapses as much work as possible into registers before communicating with other threads. This also allows us to use a fixed grid size independently of the array length which is beneficial later to avoid recursive reductions.

Warp level reduction

Each warp reduces its threads’ local maxima using warp shuffle primitives:

c++
__device__ __inline__ ArgMaxResult WarpReduce(ArgMaxResult local) {
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    float otherValue = __shfl_down_sync(0xFFFFFFFF, local.value, offset);
    uint32_t otherIndex = __shfl_down_sync(0xFFFFFFFF, local.index, offset);
    if (otherValue > local.value) {
      local.index = otherIndex;
      local.value = otherValue;
    }
  }
  return local;
}
  • Each warp ends up with a single “winning” (value, index) in the first lane.
  • Tie-breaking can be added by comparing indices for equal values.

Block level reduction

Warp leaders write their results to shared memory, then a single warp reduces these to produce a block-wide maximum.

c++
if (lid == 0) s_argMax[wid] = local;
__syncthreads();
if (wid == 0) {
    local = (lid < nw) ? s_argMax[lid] : ArgMaxResult{.value = FLT_LOWEST, .index = 0};
    local = WarpReduce(local);
    if (lid == 0) d_blockWiseResult[blockIdx.x] = local;
}

So the full kernel for this first round looks like this:

c++
__global__ void ArgMaxKernel(ArgMaxResult *__restrict__ d_blockWiseResult,
                             const float *__restrict__ d_inputArray,
                             const uint32_t nElements) {

  ArgMaxResult local{.value = FLT_LOWEST, .index = 0};

  const uint32_t idxStart = threadIdx.x + blockIdx.x * blockDim.x;
  const uint32_t stride = blockDim.x * gridDim.x;

  // first, each thread walks over the vector with the stride to collapse as
  // much as possible into the registers
  for (uint32_t idx = idxStart; idx < nElements; idx += stride) {
    const float value = d_inputArray[idx];
    if (value > local.value) {
      local = {.value = value, .index = idx};
    }
  }

  // now each warp does a reduction of the most relevant element into lid == 0
  const int lid = threadIdx.x % WARP_SIZE;
  const int wid = threadIdx.x / WARP_SIZE;
  const int nw = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;

  extern __shared__ ArgMaxResult s_argMax[];

  local = WarpReduce(local);
  if (lid == 0) {
    s_argMax[wid] = local;
  }
  __syncthreads();

  if (wid == 0) {
    local =
        (lid < nw) ? s_argMax[lid] : ArgMaxResult{.value = FLT_LOWEST, .index = 0};
    local = WarpReduce(local);
    if (lid == 0) {
      d_blockWiseResult[blockIdx.x] = local;
    }
  }
}

Final reduction

After all blocks have produced their maxima, a second kernel reduces the block-wise results into a single global maximum and its index.

  • This two-level reduction avoids global memory contention.
  • Using a struct ensures that the value and index are stored together, keeping memory accesses coalesced and code readable.
c++
// launched with grid size == 1
__global__ void FindMaxOfMax(ArgMaxResult *__restrict__ d_blockWiseResult,
                             const uint32_t nElements) {
  const uint32_t idxStart = threadIdx.x;
  const uint32_t stride = blockDim.x;

  ArgMaxResult local{.value = FLT_MIN, .index = 0};

  for (uint32_t idx = idxStart; idx < nElements; idx += stride) {
    ArgMaxResult curr = d_blockWiseResult[idx];
    if (curr.value > local.value)
      local = curr;
  }

  // now each warp does a reduction of the most relevant element into lid == 0
  const int lid = threadIdx.x % WARP_SIZE;
  const int wid = threadIdx.x / WARP_SIZE;
  const int nw = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;

  extern __shared__ ArgMaxResult s_argMax[];

  local = WarpReduce(local);
  if (lid == 0) {
    s_argMax[wid] = local;
  }
  __syncthreads();

  if (wid == 0) {
    local =
        (lid < nw) ? s_argMax[lid] : ArgMaxResult{.value = FLT_MIN, .index = 0};
    local = WarpReduce(local);
    if (lid == 0) {
      d_blockWiseResult[0] = local;
    }
  }
}

The full code including boilerplate can be found at the bottom of the page.

Results

  • Tested against host-side computation for arrays up to 32 million elements.
  • The GPU result matches the CPU result exactly.
  • Memory-efficient: minimal shared memory and no extra temporary arrays.
  • Fully low-level: demonstrates warp primitives, shared memory, and reduction patterns without relying on high-level libraries.
  • Is memory bandwidth limited when inspected using NVIDIA NSight Compute (90% of maximum VRAM rate which is 391 Gb of used RTX3070)

Minimal working example

Full code snippet
c++
  
// retrieve the maximum value and index from an array

#include <cstdint>
#include <iostream>
#include <limits>
#include <stdexcept>

constexpr float FLT_LOWEST = -std::numeric_limits<float>::lowest();
constexpr int WARP_SIZE = 32;

#define CUDA_CHECK_LOG()                                                       \
  {                                                                            \
    const cudaError_t err = cudaGetLastError();                                \
    if (err != cudaSuccess) {                                                  \
      std::cout << "A CUDA error was caught in line "                          \
                << std::to_string(__LINE__) << ": " << cudaGetErrorString(err) \
                << std::endl;                                                  \
      std::abort();                                                            \
    }                                                                          \
  }

#define CUDA_SAFE_CALL_LOG(func)                                               \
  {                                                                            \
    const cudaError_t err = func;                                              \
    if (err != cudaSuccess) {                                                  \
      std::cout << "An CUDA error occured in line "                            \
                << std::to_string(__LINE__) << ": " << cudaGetErrorString(err) \
                << std::endl;                                                  \
      std::abort();                                                            \
    }                                                                          \
  }

struct ArgMaxResult {
  float value;
  uint32_t index;
};

__device__ __inline__ ArgMaxResult WarpReduce(ArgMaxResult local) {
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    float otherValue = __shfl_down_sync(0xFFFFFFFF, local.value, offset);
    uint32_t otherIndex = __shfl_down_sync(0xFFFFFFFF, local.index, offset);
    if (otherValue > local.value) {
      local.index = otherIndex;
      local.value = otherValue;
    }
  }
  return local;
}

__global__ void ArgMaxKernel(ArgMaxResult *__restrict__ d_blockWiseResult,
                             const float *__restrict__ d_inputArray,
                             const uint32_t nElements) {

  ArgMaxResult local{.value = FLT_LOWEST, .index = 0};

  const uint32_t idxStart = threadIdx.x + blockIdx.x * blockDim.x;
  const uint32_t stride = blockDim.x * gridDim.x;

  // first, each thread walks over the vector with the stride to collapse as
  // much as possible into the registers
  for (uint32_t idx = idxStart; idx < nElements; idx += stride) {
    const float value = d_inputArray[idx];
    if (value > local.value) {
      local = {.value = value, .index = idx};
    }
  }

  // now each warp does a reduction of the most relevant element into lid == 0
  const int lid = threadIdx.x % WARP_SIZE;
  const int wid = threadIdx.x / WARP_SIZE;
  const int nw = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;

  extern __shared__ ArgMaxResult s_argMax[];

  local = WarpReduce(local);
  if (lid == 0) {
    s_argMax[wid] = local;
  }
  __syncthreads();

  if (wid == 0) {
    local = (lid < nw) ? s_argMax[lid]
                       : ArgMaxResult{.value = FLT_LOWEST, .index = 0};
    local = WarpReduce(local);
    if (lid == 0) {
      d_blockWiseResult[blockIdx.x] = local;
    }
  }
  __syncthreads();
}

__global__ void FindMaxOfMax(ArgMaxResult *__restrict__ d_blockWiseResult,
                             const uint32_t nElements) {
  const uint32_t idxStart = threadIdx.x;
  const uint32_t stride = blockDim.x;

  ArgMaxResult local{.value = FLT_LOWEST, .index = 0};

  for (uint32_t idx = idxStart; idx < nElements; idx += stride) {
    ArgMaxResult curr = d_blockWiseResult[idx];
    if (curr.value > local.value)
      local = curr;
  }

  // now each warp does a reduction of the most relevant element into lid == 0
  const int lid = threadIdx.x % WARP_SIZE;
  const int wid = threadIdx.x / WARP_SIZE;
  const int nw = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;

  extern __shared__ ArgMaxResult s_argMax[];

  local = WarpReduce(local);
  if (lid == 0) {
    s_argMax[wid] = local;
  }
  __syncthreads();

  if (wid == 0) {
    local = (lid < nw) ? s_argMax[lid]
                       : ArgMaxResult{.value = FLT_LOWEST, .index = 0};
    local = WarpReduce(local);
    if (lid == 0) {
      d_blockWiseResult[0] = local;
    }
  }
}

int main() {
  std::cout << "### argmax ###" << std::endl;

  constexpr uint32_t nElements = 1024 * 1024 * 32;

  float *d_data;
  CUDA_SAFE_CALL_LOG(cudaMalloc(&d_data, nElements * sizeof(float)));

  float *h_data;
  CUDA_SAFE_CALL_LOG(cudaMallocHost(&h_data, nElements * sizeof(float)));

  float maxValue = FLT_LOWEST;
  uint32_t index = 0;
  for (uint32_t iElement = 0; iElement < nElements; iElement++) {
    const float value = -static_cast<float>(iElement % 255) * 10.0f +
                        static_cast<float>(iElement / 1024);
    h_data[iElement] = value;
    if (value > maxValue) {
      maxValue = value;
      index = iElement;
    }
  }

  CUDA_SAFE_CALL_LOG(cudaMemcpy(d_data, h_data, nElements * sizeof(float),
                                cudaMemcpyHostToDevice));

  cudaDeviceProp props;
  CUDA_SAFE_CALL_LOG(cudaGetDeviceProperties(&props, 0));
  if (props.warpSize != WARP_SIZE) {
    throw std::runtime_error("invalid warp size (must be 32)");
  }

  // each block performs a two level warp reduction. while the ideal size would
  // be
  constexpr int blockSize = WARP_SIZE * WARP_SIZE;
  if (blockSize % WARP_SIZE != 0) {
    throw std::runtime_error("block size should remain multiple of warp size");
  }
  const int gridSize = props.multiProcessorCount;

  ArgMaxResult *d_blockMax;
  CUDA_SAFE_CALL_LOG(cudaMalloc(&d_blockMax, sizeof(ArgMaxResult) * gridSize));

  {
    const int nSharedBytes = (blockSize / WARP_SIZE) * sizeof(ArgMaxResult);
    std::cout << "gs: " << gridSize << ", bs: " << blockSize
              << ", sm: " << nSharedBytes << std::endl;

    ArgMaxKernel<<<gridSize, blockSize, nSharedBytes>>>(d_blockMax, d_data,
                                                        nElements);
    CUDA_CHECK_LOG();
    CUDA_SAFE_CALL_LOG(cudaDeviceSynchronize());
  }

  {
    // todo here we potentially do too much work at the moment
    const int blockSizeReduction = std::min(
        blockSize, ((gridSize + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE);
    const int nSharedBytes =
        (blockSizeReduction / WARP_SIZE) * sizeof(ArgMaxResult);
    FindMaxOfMax<<<1, blockSizeReduction, nSharedBytes>>>(d_blockMax, gridSize);
    CUDA_CHECK_LOG();
    CUDA_SAFE_CALL_LOG(cudaDeviceSynchronize());
  }

  // the result is now in the first element
  ArgMaxResult *h_res;
  CUDA_SAFE_CALL_LOG(cudaMallocHost(&h_res, sizeof(ArgMaxResult)));

  CUDA_SAFE_CALL_LOG(cudaMemcpy(h_res, d_blockMax, sizeof(ArgMaxResult),
                                cudaMemcpyDeviceToHost));

  if (h_res[0].value == maxValue && index == h_res[0].index) {
    std::cout << "looks good" << std::endl;
  } else {
    std::cout << "results differ" << std::endl;
  }

  CUDA_SAFE_CALL_LOG(cudaFree(d_data));
  CUDA_SAFE_CALL_LOG(cudaFreeHost(h_data));
  CUDA_SAFE_CALL_LOG(cudaFreeHost(h_res));
  CUDA_SAFE_CALL_LOG(cudaFree(d_blockMax));
  return 0;
}

Can be compiled and run using nvcc <filename>.cu -o argmax-test && ./argmax-test.

Created by Urs Hofmann