// Program listing for file AlignedAlloc.hpp
//
// Documentation source path: core/include/AlignedAlloc.hpp

#pragma once
#include <cstddef>
#include <cstdlib>
#include <new>

#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#endif

namespace core::detail
{

#if defined(USE_CUDA_UM)
// Managed allocations are at least 256B aligned on most GPUs; 128 is safe+wide for host SIMD.
inline constexpr std::size_t HW_ALIGN = 128;
#else
// Default alignment used when Unified Memory is not requested.
inline constexpr std::size_t HW_ALIGN = 64;
#endif

/// @brief Allocate a block of at least @p bytes bytes aligned to @p alignment.
///
/// Backend selection is done at compile time:
///  - USE_CUDA_UM and HAVE_CUDA both defined: cudaMallocManaged with
///    cudaMemAttachGlobal. @p alignment is not forwarded on this path; the
///    alignment is whatever the CUDA runtime provides.
///  - otherwise, with _MSC_VER defined: _aligned_malloc.
///  - otherwise: std::aligned_alloc, with the size rounded up to a multiple of
///    the alignment as that function requires.
///
/// @param bytes      Requested size. A zero-byte request is served as a
///                   one-byte request so that every backend returns a valid,
///                   freeable pointer instead of a spurious failure.
/// @param alignment  Requested alignment. Zero or a non-power-of-two falls
///                   back to HW_ALIGN; values below sizeof(void*) are raised
///                   to sizeof(void*).
/// @return Non-null pointer that must be released with aligned_free().
/// @throws std::bad_alloc if the backend fails or the padded size would
///         overflow std::size_t.
[[nodiscard]] inline void* aligned_malloc(std::size_t bytes, std::size_t alignment = HW_ALIGN)
{
    // Sanitise the arguments once, ahead of the platform split, so that every
    // backend is handed the same (valid) request.
    if (alignment == 0 || (alignment & (alignment - 1)) != 0)
        alignment = HW_ALIGN;
    if (alignment < sizeof(void*))
        alignment = sizeof(void*); // some aligned_alloc implementations reject smaller values
    if (bytes == 0)
        bytes = 1;

#if defined(USE_CUDA_UM) && defined(HAVE_CUDA)
    // Unified Memory path: one pointer valid on host and device.
    static_cast<void>(alignment); // not used by cudaMallocManaged
    void* p = nullptr;
    if (cudaMallocManaged(&p, bytes, cudaMemAttachGlobal) != cudaSuccess)
    {
        throw std::bad_alloc{};
    }
    return p;
#elif defined(_MSC_VER)
    // Host-only path (or UM requested but CUDA not available), MSVC CRT.
    void* p = _aligned_malloc(bytes, alignment);
    if (!p)
        throw std::bad_alloc{};
    return p;
#else
    // Host-only path (or UM requested but CUDA not available).
    // std::aligned_alloc requires size to be a multiple of alignment; round up
    // and refuse requests whose rounding would wrap around std::size_t.
    const std::size_t bumped = bytes + (alignment - 1);
    if (bumped < bytes)
        throw std::bad_alloc{};
    const std::size_t padded = bumped & ~(alignment - 1); // alignment is a power of two here
    void* p = std::aligned_alloc(alignment, padded);
    if (!p)
        throw std::bad_alloc{};
    return p;
#endif
}

/// @brief Release a block obtained from aligned_malloc().
///
/// Uses the deallocation routine matching the backend aligned_malloc() was
/// compiled with (cudaFree, _aligned_free or std::free). Passing nullptr is
/// allowed and does nothing.
///
/// @param p  Pointer returned by aligned_malloc(), or nullptr.
inline void aligned_free(void* p) noexcept
{
#if defined(USE_CUDA_UM) && defined(HAVE_CUDA)
    // The function is noexcept and has no error channel, so the CUDA status
    // is deliberately discarded.
    if (p)
        static_cast<void>(cudaFree(p));
#elif defined(_MSC_VER)
    _aligned_free(p);
#else
    std::free(p);
#endif
}

} // namespace core::detail