// Program listing for file MemoryManager.hpp
// (core/include/MemoryManager.hpp)
#pragma once
#include "AlignedAlloc.hpp"
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <new>
#include <unordered_map>
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#endif
namespace core
{
/// Tag describing which kind of storage backs a Block.
enum class AllocKind : std::uint8_t
{
    /// Host memory obtained from core::detail::aligned_malloc; the default kind of a Block.
    HostAligned = 0,
    /// Host memory obtained from cudaHostAlloc (USE_PINNED_HOST builds).
    HostPinned = 1,
    /// Not assigned by any code visible in this header.
    /// NOTE(review): presumably a device-only allocation — confirm against the .cpp.
    Device = 2,
    /// Unified-memory block: the host and device pointers are the same address.
    Unified = 3
};
/// Bookkeeping record describing one tracked allocation.
struct Block
{
    /// Host-visible pointer.
    void* host{nullptr};
    /// Device mirror when unified memory is not in use; under UM this equals `host`.
    void* device{nullptr};
    /// Size of the allocation in bytes.
    std::size_t bytes{0};
    /// Kind of backing storage; a default-constructed Block is HostAligned.
    AllocKind kind{AllocKind::HostAligned};
};
/// Tracks host allocations and their device mirrors in a registry keyed by
/// host pointer.
///
/// The default constructor is private and copying is deleted, so callers
/// obtain the manager through instance().
class MemoryManager
{
public:
    /// Returns a reference to the manager object (defined out of line).
    static MemoryManager& instance();

    ~MemoryManager();

    /// Allocates storage for `n` elements of type `T`, records it in the
    /// registry and returns the host pointer. The definition visible in this
    /// header throws std::bad_alloc when a CUDA allocation fails.
    template <typename T> T* allocate(std::size_t n);

    /// Releases the block identified by its host pointer (defined out of line).
    void release(void* host_ptr) noexcept;

    // ---- Explicit H<->D copies -------------------------------------------
    // No-ops (or prefetch) in UM / CPU-only builds.
    // NOTE(review): `stream` is an opaque void*; presumably a cudaStream_t —
    // confirm against the implementation.

    /// Host-to-device transfer of `bytes` bytes for the block at `host_ptr`.
    void to_device(const void* host_ptr, std::size_t bytes, void* stream = nullptr);

    /// Device-to-host transfer of `bytes` bytes for the block at `host_ptr`.
    void to_host(const void* host_ptr, std::size_t bytes, void* stream = nullptr);

    // ---- Mirror queries --------------------------------------------------

    /// Looks up the device pointer associated with a host pointer.
    void* device_ptr(const void* host_ptr) const noexcept;

    /// Looks up the host pointer for a pointer that may be a device pointer.
    void* host_ptr(const void* maybe_device) const noexcept;

    /// Reports whether unified memory is in use; allocate() branches on this
    /// in HAVE_CUDA builds.
    bool using_unified_memory() const noexcept;

    // ---- Introspection for tests -----------------------------------------

    /// NOTE(review): presumably the number of registry entries — confirm.
    std::size_t debug_count() const noexcept;

private:
    // Construction is restricted to the class itself; copying is disabled.
    MemoryManager() = default;
    MemoryManager(const MemoryManager&) = delete;
    MemoryManager& operator=(const MemoryManager&) = delete;

    // Registry lookups by host pointer.
    // NOTE(review): the `_unlocked` suffix suggests the caller is expected to
    // already hold mtx_ — confirm against the definitions.
    const Block* find_block_unlocked(const void* host_ptr) const noexcept;
    Block* find_block_unlocked(const void* host_ptr) noexcept;

    // Taken by allocate() around registry insertion; mutable so that const
    // member functions can lock it as well.
    mutable std::mutex mtx_;

    // Tracked blocks, keyed by host pointer.
    std::unordered_map<const void*, Block> registry_;
};
// ---- template implementation ----
/// @brief Allocate storage for @p n objects of type @p T and record it in the registry.
///
/// The backing store depends on the build configuration:
///  - HAVE_CUDA and using_unified_memory() is true: a single buffer from
///    core::detail::aligned_malloc serves as both host and device pointer.
///  - HAVE_CUDA otherwise: a host buffer (cudaHostAlloc when USE_PINNED_HOST is
///    defined, core::detail::aligned_malloc if not) plus a cudaMalloc device
///    mirror of the same size.
///  - no HAVE_CUDA: a host buffer from core::detail::aligned_malloc, no mirror.
///
/// No objects are constructed in the returned storage. If anything fails after
/// memory has been obtained, that memory is freed again before the exception
/// leaves this function.
///
/// @tparam T  Element type; only sizeof(T) is used.
/// @param  n  Number of elements requested.
/// @return Host pointer to the block; the same pointer is the registry key.
/// @throws std::bad_alloc if n * sizeof(T) does not fit in std::size_t or an
///         underlying allocation fails. Exceptions raised while locking the
///         mutex or inserting into the registry are propagated unchanged.
template <class T> T* MemoryManager::allocate(std::size_t n)
{
    // Guard the multiplication: a wrapped byte count would hand the caller a
    // buffer far smaller than the n elements that were requested.
    if (n > SIZE_MAX / sizeof(T))
        throw std::bad_alloc{};
    const std::size_t bytes = n * sizeof(T);

    Block blk{};
    blk.bytes = bytes;
#ifdef HAVE_CUDA
    if (using_unified_memory())
    {
        blk.kind = AllocKind::Unified;
        // NOTE(review): the original comment here read "UM via cudaMallocManaged",
        // yet the call is aligned_malloc. Presumably aligned_malloc dispatches to
        // cudaMallocManaged in UM builds — confirm in AlignedAlloc.hpp.
        blk.host = core::detail::aligned_malloc(bytes);
        // aligned_malloc's failure behaviour is not visible from this header, so a
        // null result for a non-empty request is reported like the CUDA failures.
        if (blk.host == nullptr && bytes != 0)
            throw std::bad_alloc{};
        blk.device = blk.host;
    }
    else
    {
#ifdef USE_PINNED_HOST
        // Prefer pinned host memory for faster H2D/D2H and MPI staging
        void* pinned = nullptr;
        if (cudaHostAlloc(&pinned, bytes, cudaHostAllocDefault) != cudaSuccess)
            throw std::bad_alloc{};
        blk.kind = AllocKind::HostPinned;
        blk.host = pinned;
#else
        blk.kind = AllocKind::HostAligned;
        blk.host = core::detail::aligned_malloc(bytes);
        // Fail before creating a device mirror that would have no host side.
        if (blk.host == nullptr && bytes != 0)
            throw std::bad_alloc{};
#endif
        void* mirror = nullptr;
        if (cudaMalloc(&mirror, bytes) != cudaSuccess)
        {
            // Give the host buffer back with the deallocator matching its source.
#ifdef USE_PINNED_HOST
            if (blk.host != nullptr)
                cudaFreeHost(blk.host);
#else
            if (blk.host != nullptr)
                core::detail::aligned_free(blk.host);
#endif
            throw std::bad_alloc{};
        }
        blk.device = mirror;
    }
#else
    // CPU-only build
    blk.kind = AllocKind::HostAligned;
    blk.host = core::detail::aligned_malloc(bytes);
    if (blk.host == nullptr && bytes != 0)
        throw std::bad_alloc{};
    blk.device = nullptr;
#endif

    try
    {
        std::lock_guard<std::mutex> lk(mtx_);
        registry_.emplace(blk.host, blk);
    }
    catch (...)
    {
        // Registration failed: nothing will ever release these buffers through
        // the registry, so free them here before propagating the error.
#ifdef HAVE_CUDA
        // Under UM device == host, which must be freed only once (below).
        if (blk.device != nullptr && blk.device != blk.host)
            cudaFree(blk.device);
        if (blk.host != nullptr)
        {
            if (blk.kind == AllocKind::HostPinned)
                cudaFreeHost(blk.host);
            else
                core::detail::aligned_free(blk.host);
        }
#else
        if (blk.host != nullptr)
            core::detail::aligned_free(blk.host);
#endif
        throw;
    }
    return static_cast<T*>(blk.host);
}
} // namespace core