/* rpmalloc.c  -  Memory allocator  -  Public Domain  -  2016 Mattias Jansson
 *
 * This library provides a cross-platform lock free thread caching malloc implementation in C11.
 * The latest source code is always available at
 *
 * https://github.com/mjansson/rpmalloc
 *
 * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions.
 *
 */

#include "rpmalloc.h"

/// Build time configurable limits
#ifndef HEAP_ARRAY_SIZE
//! Size of heap hashmap
#define HEAP_ARRAY_SIZE           47
#endif
#ifndef ENABLE_THREAD_CACHE
//! Enable per-thread cache
#define ENABLE_THREAD_CACHE       1
#endif
#ifndef ENABLE_GLOBAL_CACHE
//! Enable global cache shared between all threads, requires thread cache
#define ENABLE_GLOBAL_CACHE       1
#endif
#ifndef ENABLE_VALIDATE_ARGS
//! Enable validation of args to public entry points
#define ENABLE_VALIDATE_ARGS      0
#endif
#ifndef ENABLE_STATISTICS
//! Enable statistics collection
#define ENABLE_STATISTICS         0
#endif
#ifndef ENABLE_ASSERTS
//! Enable asserts
#define ENABLE_ASSERTS            0
#endif
#ifndef ENABLE_OVERRIDE
//! Override standard library malloc/free and new/delete entry points
#define ENABLE_OVERRIDE           0
#endif
#ifndef ENABLE_PRELOAD
//! Support preloading
#define ENABLE_PRELOAD            0
#endif
#ifndef DISABLE_UNMAP
//! Disable unmapping memory pages
#define DISABLE_UNMAP             0
#endif
#ifndef DEFAULT_SPAN_MAP_COUNT
//! Default number of spans to map in a call to map more virtual memory (the default values yield 4MiB per map call)
#define DEFAULT_SPAN_MAP_COUNT    64
#endif
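//(Illustrative arithmetic, assuming the default configuration: 64 spans of 64KiB each is 64 * 64KiB = 4MiB of
//address space per map call, which is where the 4MiB figure above comes from.)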

#if ENABLE_THREAD_CACHE
#ifndef ENABLE_UNLIMITED_CACHE
//! Unlimited thread and global cache
#define ENABLE_UNLIMITED_CACHE    0
#endif
#ifndef ENABLE_UNLIMITED_THREAD_CACHE
//! Unlimited cache disables any thread cache limitations
#define ENABLE_UNLIMITED_THREAD_CACHE ENABLE_UNLIMITED_CACHE
#endif
#if !ENABLE_UNLIMITED_THREAD_CACHE
#ifndef THREAD_CACHE_MULTIPLIER
//! Multiplier for thread cache (cache limit will be span release count multiplied by this value)
#define THREAD_CACHE_MULTIPLIER 16
#endif
#ifndef ENABLE_ADAPTIVE_THREAD_CACHE
//! Enable adaptive size of per-thread cache (still bounded by THREAD_CACHE_MULTIPLIER hard limit)
#define ENABLE_ADAPTIVE_THREAD_CACHE  0
#endif
#endif
#endif

#if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE
#ifndef ENABLE_UNLIMITED_GLOBAL_CACHE
//! Unlimited cache disables any global cache limitations
#define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE
#endif
#if !ENABLE_UNLIMITED_GLOBAL_CACHE
//! Multiplier for global cache (cache limit will be span release count multiplied by this value)
#define GLOBAL_CACHE_MULTIPLIER (THREAD_CACHE_MULTIPLIER * 6)
#endif
#else
#  undef ENABLE_GLOBAL_CACHE
#  define ENABLE_GLOBAL_CACHE 0
#endif

#if !ENABLE_THREAD_CACHE || ENABLE_UNLIMITED_THREAD_CACHE
#  undef ENABLE_ADAPTIVE_THREAD_CACHE
#  define ENABLE_ADAPTIVE_THREAD_CACHE 0
#endif

#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE
#  error Must use global cache if unmap is disabled
#endif

#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 )
#  define PLATFORM_WINDOWS 1
#  define PLATFORM_POSIX 0
#else
#  define PLATFORM_WINDOWS 0
#  define PLATFORM_POSIX 1
#endif

/// Platform and arch specifics
#if defined(_MSC_VER) && !defined(__clang__)
#  define FORCEINLINE inline __forceinline
#  define _Static_assert static_assert
#else
#  define FORCEINLINE inline __attribute__((__always_inline__))
#endif
#if PLATFORM_WINDOWS
#  define WIN32_LEAN_AND_MEAN
#  include <windows.h>
#  if ENABLE_VALIDATE_ARGS
#    include <Intsafe.h>
#  endif
#else
#  include <unistd.h>
#  include <stdio.h>
#  include <stdlib.h>
#  if defined(__APPLE__)
#    include <mach/mach_vm.h>
#    include <pthread.h>
#  endif
#  if defined(__HAIKU__)
#    include <OS.h>
#    include <pthread.h>
#  endif
#endif

#include <stdint.h>
#include <string.h>

#if ENABLE_ASSERTS
#  undef NDEBUG
#  if defined(_MSC_VER) && !defined(_DEBUG)
#    define _DEBUG
#  endif
#  include <assert.h>
#else
#  undef  assert
#  define assert(x) do {} while(0)
#endif
#if ENABLE_STATISTICS
#  include <stdio.h>
#endif

/// Atomic access abstraction
#if defined(_MSC_VER) && !defined(__clang__)

typedef volatile long      atomic32_t;
typedef volatile long long atomic64_t;
typedef volatile void*     atomicptr_t;

#define atomic_thread_fence_acquire()
#define atomic_thread_fence_release()

static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; }
static FORCEINLINE void    atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; }
static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)_InterlockedExchangeAdd(val, 1) + 1; }
static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)_InterlockedExchangeAdd(val, add) + add; }
static FORCEINLINE void*   atomic_load_ptr(atomicptr_t* src) { return (void*)*src; }
static FORCEINLINE void    atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; }
#  if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64)
static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchange64((volatile long long*)dst, (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; }
#else
static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchange((volatile long*)dst, (long)val, (long)ref) == (long)ref) ? 1 : 0; }
#endif

#define EXPECTED(x) (x)
#define UNEXPECTED(x) (x)

#else

#include <stdatomic.h>

typedef volatile _Atomic(int32_t) atomic32_t;
typedef volatile _Atomic(int64_t) atomic64_t;
typedef volatile _Atomic(void*) atomicptr_t;

#define atomic_thread_fence_acquire() atomic_thread_fence(memory_order_acquire)
#define atomic_thread_fence_release() atomic_thread_fence(memory_order_release)

static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return atomic_load_explicit(src, memory_order_relaxed); }
static FORCEINLINE void    atomic_store32(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_relaxed); }
static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; }
static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; }
static FORCEINLINE void*   atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); }
static FORCEINLINE void    atomic_store_ptr(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); }
static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_release, memory_order_acquire); }

#define EXPECTED(x) __builtin_expect((x), 1)
#define UNEXPECTED(x) __builtin_expect((x), 0)
    
#endif
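//Note: on the C11 path atomic_cas_ptr uses a weak compare-exchange and may therefore fail spuriously;
//all call sites in this file retry it in a loop rather than relying on a single attempt succeeding.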

/// Preconfigured limits and sizes
//! Granularity of a small allocation block
#define SMALL_GRANULARITY         16
//! Small granularity shift count
#define SMALL_GRANULARITY_SHIFT   4
//! Number of small block size classes
#define SMALL_CLASS_COUNT         65
//! Maximum size of a small block
#define SMALL_SIZE_LIMIT          (SMALL_GRANULARITY * (SMALL_CLASS_COUNT - 1))
//! Granularity of a medium allocation block
#define MEDIUM_GRANULARITY        512
//! Medium granularity shift count
#define MEDIUM_GRANULARITY_SHIFT  9
//! Number of medium block size classes
#define MEDIUM_CLASS_COUNT        61
//! Total number of small + medium size classes
#define SIZE_CLASS_COUNT          (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT)
//! Number of large block size classes
#define LARGE_CLASS_COUNT         32
//! Maximum size of a medium block
#define MEDIUM_SIZE_LIMIT         (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT))
//! Maximum size of a large block
#define LARGE_SIZE_LIMIT          ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE)
//! Size of a span header (must be a multiple of SMALL_GRANULARITY)
#define SPAN_HEADER_SIZE          96
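//Worked example of the limits above, assuming the default 64KiB span size: SMALL_SIZE_LIMIT is
//16 * 64 = 1024 bytes, MEDIUM_SIZE_LIMIT is 1024 + (512 * 61) = 32256 bytes and LARGE_SIZE_LIMIT is
//(32 * 65536) - 96 = 2097056 bytes; larger requests fall outside these size classes.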

#if ENABLE_VALIDATE_ARGS
//! Maximum allocation size to avoid integer overflow
#undef  MAX_ALLOC_SIZE
#define MAX_ALLOC_SIZE            (((size_t)-1) - _memory_span_size)
#endif

#define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs))
#define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second))

#define INVALID_POINTER ((void*)((uintptr_t)-1))

/// Data types
//! A memory heap, per thread
typedef struct heap_t heap_t;
//! Heap spans per size class
typedef struct heap_class_t heap_class_t;
//! Span of memory pages
typedef struct span_t span_t;
//! Span list
typedef struct span_list_t span_list_t;
//! Span active data
typedef struct span_active_t span_active_t;
//! Size class definition
typedef struct size_class_t size_class_t;
//! Global cache
typedef struct global_cache_t global_cache_t;

//! Flag indicating span is the first (master) span of a split superspan
#define SPAN_FLAG_MASTER 1U
//! Flag indicating span is a secondary (sub) span of a split superspan
#define SPAN_FLAG_SUBSPAN 2U
//! Flag indicating span has blocks with increased alignment
#define SPAN_FLAG_ALIGNED_BLOCKS 4U
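//A master span that has itself been logically unmapped, but whose header must stay in memory while
//subspans remain, carries both SPAN_FLAG_MASTER and SPAN_FLAG_SUBSPAN (see _memory_unmap_span below)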

#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
struct span_use_t {
  //! Current number of spans used (actually used, not in cache)
  uint32_t current;
  //! High water mark of spans used
  uint32_t high;
#if ENABLE_STATISTICS
  //! Number of spans transitioned to global cache
  uint32_t spans_to_global;
  //! Number of spans transitioned from global cache
  uint32_t spans_from_global;
  //! Number of spans transitioned to thread cache
  uint32_t spans_to_cache;
  //! Number of spans transitioned from thread cache
  uint32_t spans_from_cache;
  //! Number of spans transitioned to reserved state
  uint32_t spans_to_reserved;
  //! Number of spans transitioned from reserved state
  uint32_t spans_from_reserved;
  //! Number of raw memory map calls
  uint32_t spans_map_calls;
#endif
};
typedef struct span_use_t span_use_t;
#endif

#if ENABLE_STATISTICS
struct size_class_use_t {
  //! Current number of allocations
  atomic32_t alloc_current;
  //! Peak number of allocations
  int32_t alloc_peak;
  //! Total number of allocations
  int32_t alloc_total;
  //! Total number of frees
  atomic32_t free_total;
  //! Number of spans transitioned to cache
  uint32_t spans_to_cache;
  //! Number of spans transitioned from cache
  uint32_t spans_from_cache;
  //! Number of spans transitioned from reserved state
  uint32_t spans_from_reserved;
  //! Number of spans mapped
  uint32_t spans_map_calls;
};
typedef struct size_class_use_t size_class_use_t;
#endif

typedef enum span_state_t {
  SPAN_STATE_ACTIVE = 0,
  SPAN_STATE_PARTIAL,
  SPAN_STATE_FULL
} span_state_t;

//A span can either represent a single span of memory pages with size declared by the span_map_count configuration
//variable, or a set of spans in a contiguous region, a super span. Any reference to the term "span" usually refers to
//either a single span or a super span. A super span can further be divided into multiple spans (or smaller super
//spans), where the first (super)span is the master and subsequent (super)spans are subspans. The master span keeps
//track of how many subspans are still alive and mapped in virtual memory, and once all subspans and the master have
//been unmapped the entire superspan region is released and unmapped (on Windows, for example, the entire superspan
//range has to be released in the same call that releases the virtual memory range, but individual subranges can be
//decommitted individually to reduce physical memory use).
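//(Illustrative note: for a subspan, total_spans_or_distance holds the distance in spans back to the master,
//so the master header can be recovered as pointer_offset(span, -(distance * _memory_span_size));
//_memory_unmap_span below relies on this.)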
struct span_t {
  //! Free list
  void*       free_list;
  //! State
  uint32_t    state;
  //! Used count when not active (not including deferred free list)
  uint32_t    used_count;
  //! Block count
  uint32_t    block_count;
  //! Size class
  uint32_t    size_class;
  //! Index of last block initialized in free list
  uint32_t    free_list_limit;
  //! Span list size when part of a cache list, or size of deferred free list when partial/full
  uint32_t    list_size;
  //! Deferred free list
  atomicptr_t free_list_deferred;
  //! Size of a block
  uint32_t    block_size;
  //! Flags and counters
  uint32_t    flags;
  //! Number of spans
  uint32_t    span_count;
  //! Total span counter for master spans, distance for subspans
  uint32_t    total_spans_or_distance;
  //! Remaining span counter, for master spans
  atomic32_t  remaining_spans;
  //! Alignment offset
  uint32_t    align_offset;
  //! Owning heap
  heap_t*     heap;
  //! Next span
  span_t*     next;
  //! Previous span
  span_t*     prev;
};
_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch");

struct heap_class_t {
  //! Free list of active span
  void*        free_list;
  //! Double linked list of partially used spans with free blocks for each size class.
  //  Current active span is at head of list. Previous span pointer in head points to tail span of list.
  span_t*      partial_span;
};

struct heap_t {
  //! Active and semi-used span data per size class
  heap_class_t span_class[SIZE_CLASS_COUNT];
#if ENABLE_THREAD_CACHE
  //! List of free spans (single linked list)
  span_t*      span_cache[LARGE_CLASS_COUNT];
  //! List of deferred free spans of class 0 (single linked list)
  atomicptr_t  span_cache_deferred;
#endif
#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
  //! Current and high water mark of spans used per span count
  span_use_t   span_use[LARGE_CLASS_COUNT];
#endif
  //! Mapped but unused spans
  span_t*      span_reserve;
  //! Master span for mapped but unused spans
  span_t*      span_reserve_master;
  //! Number of mapped but unused spans
  size_t       spans_reserved;
  //! Next heap in id list
  heap_t*      next_heap;
  //! Next heap in orphan list
  heap_t*      next_orphan;
  //! Memory pages alignment offset
  size_t       align_offset;
  //! Heap ID
  int32_t      id;
#if ENABLE_STATISTICS
  //! Number of bytes transitioned thread -> global
  size_t       thread_to_global;
  //! Number of bytes transitioned global -> thread
  size_t       global_to_thread;
  //! Allocation stats per size class
  size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1];
#endif
};

struct size_class_t {
  //! Size of blocks in this class
  uint32_t block_size;
  //! Number of blocks in each chunk
  uint16_t block_count;
  //! Class index this class is merged with
  uint16_t class_idx;
};
_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch");

struct global_cache_t {
  //! Cache list pointer
  atomicptr_t cache;
  //! Cache size
  atomic32_t size;
  //! ABA counter
  atomic32_t counter;
};

/// Global data
//! Initialized flag
static int _rpmalloc_initialized;
//! Configuration
static rpmalloc_config_t _memory_config;
//! Memory page size
static size_t _memory_page_size;
//! Shift to divide by page size
static size_t _memory_page_size_shift;
//! Granularity at which memory pages are mapped by OS
static size_t _memory_map_granularity;
#if RPMALLOC_CONFIGURABLE
//! Size of a span of memory pages
static size_t _memory_span_size;
//! Shift to divide by span size
static size_t _memory_span_size_shift;
//! Mask to get to start of a memory span
static uintptr_t _memory_span_mask;
#else
//! Hardwired span size (64KiB)
#define _memory_span_size (64 * 1024)
#define _memory_span_size_shift 16
#define _memory_span_mask (~((uintptr_t)(_memory_span_size - 1)))
#endif
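//With the default 64KiB span size _memory_span_mask is ~(uintptr_t)0xFFFF, so masking a block pointer with it
//rounds the pointer down to the start of the span containing it (spans are mapped with span size alignment,
//with any adjustment recorded in the span align_offset field).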
//! Number of spans to map in each map call
static size_t _memory_span_map_count;
//! Number of spans to release from thread cache to global cache (single spans)
static size_t _memory_span_release_count;
//! Number of spans to release from thread cache to global cache (large multiple spans)
static size_t _memory_span_release_count_large;
//! Global size classes
static size_class_t _memory_size_class[SIZE_CLASS_COUNT];
//! Run-time size limit of medium blocks
static size_t _memory_medium_size_limit;
//! Heap ID counter
static atomic32_t _memory_heap_id;
//! Huge page support
static int _memory_huge_pages;
#if ENABLE_GLOBAL_CACHE
//! Global span cache
static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT];
#endif
//! All heaps
static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE];
//! Orphaned heaps
static atomicptr_t _memory_orphan_heaps;
//! Running orphan counter to avoid ABA issues in linked list
static atomic32_t _memory_orphan_counter;
#if ENABLE_STATISTICS
//! Active heap count
static atomic32_t _memory_active_heaps;
//! Number of currently mapped memory pages
static atomic32_t _mapped_pages;
//! Peak number of concurrently mapped memory pages
static int32_t _mapped_pages_peak;
//! Number of currently unused spans
static atomic32_t _reserved_spans;
//! Running counter of total number of mapped memory pages since start
static atomic32_t _mapped_total;
//! Running counter of total number of unmapped memory pages since start
static atomic32_t _unmapped_total;
//! Number of currently mapped memory pages in OS calls
static atomic32_t _mapped_pages_os;
//! Number of currently allocated pages in huge allocations
static atomic32_t _huge_pages_current;
//! Peak number of currently allocated pages in huge allocations
static int32_t _huge_pages_peak;
#endif

//! Current thread heap
#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
static pthread_key_t _memory_thread_heap;
#else
#  ifdef _MSC_VER
#    define _Thread_local __declspec(thread)
#    define TLS_MODEL
#  else
#    define TLS_MODEL __attribute__((tls_model("initial-exec")))
#    if !defined(__clang__) && defined(__GNUC__)
#      define _Thread_local __thread
#    endif
#  endif
static _Thread_local heap_t* _memory_thread_heap TLS_MODEL;
#endif

static inline heap_t*
get_thread_heap_raw(void) {
#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
  return pthread_getspecific(_memory_thread_heap);
#else
  return _memory_thread_heap;
#endif
}

//! Get the current thread heap
static inline heap_t*
get_thread_heap(void) {
  heap_t* heap = get_thread_heap_raw();
#if ENABLE_PRELOAD
  if (EXPECTED(heap != 0))
    return heap;
  rpmalloc_initialize();
  return get_thread_heap_raw();
#else
  return heap;
#endif
}

//! Set the current thread heap
static void
set_thread_heap(heap_t* heap) {
#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
  pthread_setspecific(_memory_thread_heap, heap);
#else
  _memory_thread_heap = heap;
#endif
}

//! Default implementation to map more virtual memory
static void*
_memory_map_os(size_t size, size_t* offset);

//! Default implementation to unmap virtual memory
static void
_memory_unmap_os(void* address, size_t size, size_t offset, size_t release);

//! Lookup a memory heap from heap ID
static heap_t*
_memory_heap_lookup(int32_t id) {
  uint32_t list_idx = id % HEAP_ARRAY_SIZE;
  heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]);
  while (heap && (heap->id != id))
    heap = heap->next_heap;
  return heap;
}

#if ENABLE_STATISTICS
#  define _memory_statistics_inc(counter, value) counter += value
#  define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value))
#  define _memory_statistics_add_peak(atomic_counter, value, peak) do { int32_t _cur_count = atomic_add32(atomic_counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0)
#  define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value))
#  define _memory_statistics_inc_alloc(heap, class_idx) do { \
  int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \
  if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \
    heap->size_class_use[class_idx].alloc_peak = alloc_current; \
  heap->size_class_use[class_idx].alloc_total++; \
} while(0)
#  define _memory_statistics_inc_free(heap, class_idx) do { \
  atomic_add32(&heap->size_class_use[class_idx].alloc_current, -1); \
  atomic_incr32(&heap->size_class_use[class_idx].free_total); \
} while(0)
#else
#  define _memory_statistics_inc(counter, value) do {} while(0)
#  define _memory_statistics_add(atomic_counter, value) do {} while(0)
#  define _memory_statistics_add_peak(atomic_counter, value, peak) do {} while (0)
#  define _memory_statistics_sub(atomic_counter, value) do {} while(0)
#  define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0)
#  define _memory_statistics_inc_free(heap, class_idx) do {} while(0)
#endif

static void
_memory_heap_cache_insert(heap_t* heap, span_t* span);

//! Map more virtual memory
static void*
_memory_map(size_t size, size_t* offset) {
  assert(!(size % _memory_page_size));
  assert(size >= _memory_page_size);
  _memory_statistics_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak);
  _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift));
  return _memory_config.memory_map(size, offset);
}

//! Unmap virtual memory
static void
_memory_unmap(void* address, size_t size, size_t offset, size_t release) {
  assert(!release || (release >= size));
  assert(!release || (release >= _memory_page_size));
  if (release) {
    assert(!(release % _memory_page_size));
    _memory_statistics_sub(&_mapped_pages, (release >> _memory_page_size_shift));
    _memory_statistics_add(&_unmapped_total, (release >> _memory_page_size_shift));
  }
  _memory_config.memory_unmap(address, size, offset, release);
}

//! Declare the span to be a subspan and store distance from master span and span count
static void
_memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) {
  assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER));
  if (subspan != master) {
    subspan->flags = SPAN_FLAG_SUBSPAN;
    subspan->total_spans_or_distance = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift);
    subspan->align_offset = 0;
  }
  subspan->span_count = (uint32_t)span_count;
}

//! Use reserved spans to fulfill a memory map request (reserve size must be checked by caller)
static span_t*
_memory_map_from_reserve(heap_t* heap, size_t span_count) {
  //Update the heap span reserve
  span_t* span = heap->span_reserve;
  heap->span_reserve = pointer_offset(span, span_count * _memory_span_size);
  heap->spans_reserved -= span_count;

  _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count);
  if (span_count <= LARGE_CLASS_COUNT)
    _memory_statistics_inc(heap->span_use[span_count - 1].spans_from_reserved, 1);

  return span;
}

//! Get the aligned number of spans to map, based on the wanted count, the configured mapping granularity and the page size
static size_t
_memory_map_align_span_count(size_t span_count) {
  size_t request_count = (span_count > _memory_span_map_count) ? span_count : _memory_span_map_count;
  if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size))
    request_count += _memory_span_map_count - (request_count % _memory_span_map_count); 
  return request_count;
}

//! Store the given spans as reserve in the given heap
static void
_memory_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) {
  heap->span_reserve_master = master;
  heap->span_reserve = reserve;
  heap->spans_reserved = reserve_span_count;
}

//! Setup a newly mapped span
static void
_memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) {
  span->total_spans_or_distance = (uint32_t)total_span_count;
  span->span_count = (uint32_t)span_count;
  span->align_offset = (uint32_t)align_offset;
  span->flags = SPAN_FLAG_MASTER;
  atomic_store32(&span->remaining_spans, (int32_t)total_span_count);  
}

//! Map an aligned set of spans, taking the configured mapping granularity and the page size into account
static span_t*
_memory_map_aligned_span_count(heap_t* heap, size_t span_count) {
  //If we already have some, but not enough, reserved spans, release those to heap cache and map a new
  //full set of spans. Otherwise we would waste memory if page size > span size (huge pages)
  size_t aligned_span_count = _memory_map_align_span_count(span_count);
  size_t align_offset = 0;
  span_t* span = _memory_map(aligned_span_count * _memory_span_size, &align_offset);
  if (!span)
    return 0;
  _memory_span_initialize(span, aligned_span_count, span_count, align_offset);
  _memory_statistics_add(&_reserved_spans, aligned_span_count);
  if (span_count <= LARGE_CLASS_COUNT)
    _memory_statistics_inc(heap->span_use[span_count - 1].spans_map_calls, 1);
  if (aligned_span_count > span_count) {
    if (heap->spans_reserved) {
      _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
      _memory_heap_cache_insert(heap, heap->span_reserve);
    }
    _memory_heap_set_reserved_spans(heap, span, pointer_offset(span, span_count * _memory_span_size), aligned_span_count - span_count);
  }
  return span;
}

//! Map in memory pages for the given number of spans (or use previously reserved pages)
static span_t*
_memory_map_spans(heap_t* heap, size_t span_count) {
  if (span_count <= heap->spans_reserved)
    return _memory_map_from_reserve(heap, span_count);
  return _memory_map_aligned_span_count(heap, span_count);
}

//! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings)
static void
_memory_unmap_span(span_t* span) {
  assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN));
  assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN));

  int is_master = !!(span->flags & SPAN_FLAG_MASTER);
  span_t* master = is_master ? span : (pointer_offset(span, -(int32_t)(span->total_spans_or_distance * _memory_span_size)));
  assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN));
  assert(master->flags & SPAN_FLAG_MASTER);

  size_t span_count = span->span_count;
  if (!is_master) {
    //Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master)
    assert(span->align_offset == 0);
    if (_memory_span_size >= _memory_page_size) {
      _memory_unmap(span, span_count * _memory_span_size, 0, 0);
      _memory_statistics_sub(&_reserved_spans, span_count);
    }
  } else {
    //Special double flag to denote an unmapped master
    //It must be kept in memory since the span header is still needed to track the remaining subspans
    span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN;
  }

  if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) {
    //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span
    assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN));
    size_t unmap_count = master->span_count;
    if (_memory_span_size < _memory_page_size)
      unmap_count = master->total_spans_or_distance;
    _memory_statistics_sub(&_reserved_spans, unmap_count);
    _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans_or_distance * _memory_span_size);
  }
}

#if ENABLE_THREAD_CACHE

//! Unmap a single linked list of spans
static void
_memory_unmap_span_list(span_t* span) {
  size_t list_size = span->list_size;
  for (size_t ispan = 0; ispan < list_size; ++ispan) {
    span_t* next_span = span->next;
    _memory_unmap_span(span);
    span = next_span;
  }
  assert(!span);
}

//! Add span to head of single linked span list
static size_t
_memory_span_list_push(span_t** head, span_t* span) {
  span->next = *head;
  if (*head)
    span->list_size = (*head)->list_size + 1;
  else
    span->list_size = 1;
  *head = span;
  return span->list_size;
}

//! Remove span from head of single linked span list, returns the new list head
static span_t*
_memory_span_list_pop(span_t** head) {
  span_t* span = *head;
  span_t* next_span = 0;
  if (span->list_size > 1) {
    assert(span->next);
    next_span = span->next;
    assert(next_span);
    next_span->list_size = span->list_size - 1;
  }
  *head = next_span;
  return span;
}

//! Split a single linked span list
static span_t*
_memory_span_list_split(span_t* span, size_t limit) {
  span_t* next = 0;
  if (limit < 2)
    limit = 2;
  if (span->list_size > limit) {
    uint32_t list_size = 1;
    span_t* last = span;
    next = span->next;
    while (list_size < limit) {
      last = next;
      next = next->next;
      ++list_size;
    }
    last->next = 0;
    assert(next);
    next->list_size = span->list_size - list_size;
    span->list_size = list_size;
    span->prev = 0;
  }
  return next;
}

#endif

//! Add a span to partial span double linked list at the head
static void
_memory_span_partial_list_add(span_t** head, span_t* span) {
  if (*head) {
    span->next = *head;
    //Maintain pointer to tail span
    span->prev = (*head)->prev;
    (*head)->prev = span;
  } else {
    span->next = 0;
    span->prev = span;
  }
  *head = span;
}

//! Add a span to partial span double linked list at the tail
static void
_memory_span_partial_list_add_tail(span_t** head, span_t* span) {
  span->next = 0;
  if (*head) {
    span_t* tail = (*head)->prev;
    tail->next = span;
    span->prev = tail;
    //Maintain pointer to tail span
    (*head)->prev = span;
  } else {
    span->prev = span;
    *head = span;
  }
}

//! Pop head span from partial span double linked list
static void
_memory_span_partial_list_pop_head(span_t** head) {
  span_t* span = *head;
  *head = span->next;
  if (*head) {
    //Maintain pointer to tail span
    (*head)->prev = span->prev;
  }
}

//! Remove a span from partial span double linked list
static void
_memory_span_partial_list_remove(span_t** head, span_t* span) {
  if (UNEXPECTED(*head == span)) {
    _memory_span_partial_list_pop_head(head);
  } else {
    span_t* next_span = span->next;
    span_t* prev_span = span->prev;
    prev_span->next = next_span;
    if (EXPECTED(next_span != 0)) {
      next_span->prev = prev_span;
    } else {
      //Update pointer to tail span
      (*head)->prev = prev_span;
    }
  }
}

#if ENABLE_GLOBAL_CACHE

//! Insert the given list of memory page spans in the global cache
static void
_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) {
  assert((span->list_size == 1) || (span->next != 0));
  int32_t list_size = (int32_t)span->list_size;
  //Unmap if cache has reached the limit
  if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) {
#if !ENABLE_UNLIMITED_GLOBAL_CACHE
    _memory_unmap_span_list(span);
    atomic_add32(&cache->size, -list_size);
    return;
#endif
  }
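  //Spans are aligned to the span size, so the bits of the cache pointer below _memory_span_mask are free
  //to hold an ABA counter; the real span pointer is recovered by masking with _memory_span_mask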
  void* current_cache, *new_cache;
  do {
    current_cache = atomic_load_ptr(&cache->cache);
    span->prev = (void*)((uintptr_t)current_cache & _memory_span_mask);
    new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask));
  } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache));
}

//! Extract a number of memory page spans from the global cache
static span_t*
_memory_cache_extract(global_cache_t* cache) {
  uintptr_t span_ptr;
  do {
    void* global_span = atomic_load_ptr(&cache->cache);
    span_ptr = (uintptr_t)global_span & _memory_span_mask;
    if (span_ptr) {
      span_t* span = (void*)span_ptr;
      //By accessing the span pointer before it is swapped out of the list we assume that a contending
      //thread does not manage to traverse the span list to the point where it is unmapped before we access it
      void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask));
      if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) {
        atomic_add32(&cache->size, -(int32_t)span->list_size);
        return span;
      }
    }
  } while (span_ptr);
  return 0;
}

//! Finalize a global cache, only valid from allocator finalization (not thread safe)
static void
_memory_cache_finalize(global_cache_t* cache) {
  void* current_cache = atomic_load_ptr(&cache->cache);
  span_t* span = (void*)((uintptr_t)current_cache & _memory_span_mask);
  while (span) {
    span_t* skip_span = (void*)((uintptr_t)span->prev & _memory_span_mask);
    atomic_add32(&cache->size, -(int32_t)span->list_size);
    _memory_unmap_span_list(span);
    span = skip_span;
  }
  assert(!atomic_load32(&cache->size));
  atomic_store_ptr(&cache->cache, 0);
  atomic_store32(&cache->size, 0);
}

//! Insert the given list of memory page spans in the global cache
static void
_memory_global_cache_insert(span_t* span) {
  size_t span_count = span->span_count;
#if ENABLE_UNLIMITED_GLOBAL_CACHE
  _memory_cache_insert(&_memory_span_cache[span_count - 1], span, 0);
#else
  const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large));
  _memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit);
#endif
}

//! Extract a number of memory page spans from the global cache for large blocks
static span_t*
_memory_global_cache_extract(size_t span_count) {
  span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]);
  assert(!span || (span->span_count == span_count));
  return span;
}

#endif

#if ENABLE_THREAD_CACHE
//! Adopt the deferred span cache list
static void
_memory_heap_cache_adopt_deferred(heap_t* heap) {
  atomic_thread_fence_acquire();
  span_t* span = atomic_load_ptr(&heap->span_cache_deferred);
  if (!span)
    return;
  do {
    span = atomic_load_ptr(&heap->span_cache_deferred);
  } while (!atomic_cas_ptr(&heap->span_cache_deferred, 0, span));
  while (span) {
    span_t* next_span = span->next;
    _memory_span_list_push(&heap->span_cache[0], span);
#if ENABLE_STATISTICS
    heap->size_class_use[span->size_class].spans_to_cache++;
#endif
    span = next_span;
  }
}
#endif

//! Insert a single span into thread heap cache, releasing to global cache if overflow
static void
_memory_heap_cache_insert(heap_t* heap, span_t* span) {
#if ENABLE_THREAD_CACHE
  size_t span_count = span->span_count;
  size_t idx = span_count - 1;
  _memory_statistics_inc(heap->span_use[idx].spans_to_cache, 1);
  if (!idx)
    _memory_heap_cache_adopt_deferred(heap);
#if ENABLE_UNLIMITED_THREAD_CACHE
  _memory_span_list_push(&heap->span_cache[idx], span);
#else
  const size_t release_count = (!idx ? _memory_span_release_count : _memory_span_release_count_large);
  size_t current_cache_size = _memory_span_list_push(&heap->span_cache[idx], span);
  if (current_cache_size <= release_count)
    return;
  const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER;
  if (current_cache_size <= hard_limit) {
#if ENABLE_ADAPTIVE_THREAD_CACHE
    //Require 25% of high water mark to remain in cache (and at least 1, if use is 0)
    const size_t high_mark = heap->span_use[idx].high;
    const size_t min_limit = (high_mark >> 2) + release_count + 1;
    if (current_cache_size < min_limit)
      return;
#else
    return;
#endif
  }
  heap->span_cache[idx] = _memory_span_list_split(span, release_count);
  assert(span->list_size == release_count);
#if ENABLE_STATISTICS
  heap->thread_to_global += (size_t)span->list_size * span_count * _memory_span_size;
  heap->span_use[idx].spans_to_global += span->list_size;
#endif
#if ENABLE_GLOBAL_CACHE
  _memory_global_cache_insert(span);
#else