/* rpmalloc.c  -  Memory allocator  -  Public Domain  -  2016 Mattias Jansson
 *
 * This library provides a cross-platform lock free thread caching malloc implementation in C11.
 * The latest source code is always available at
 *
 * https://github.com/mjansson/rpmalloc
 *
 * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions.
 *
 */

#include "rpmalloc.h"

/// Build time configurable limits
#ifndef HEAP_ARRAY_SIZE
//! Size of heap hashmap
#define HEAP_ARRAY_SIZE           47
#endif
#ifndef ENABLE_THREAD_CACHE
//! Enable per-thread cache
#define ENABLE_THREAD_CACHE       1
#endif
#ifndef ENABLE_GLOBAL_CACHE
//! Enable global cache shared between all threads, requires thread cache
#define ENABLE_GLOBAL_CACHE       1
#endif
#ifndef ENABLE_VALIDATE_ARGS
//! Enable validation of args to public entry points
#define ENABLE_VALIDATE_ARGS      0
#endif
#ifndef ENABLE_STATISTICS
//! Enable statistics collection
#define ENABLE_STATISTICS         0
#endif
#ifndef ENABLE_ASSERTS
//! Enable asserts
#define ENABLE_ASSERTS            0
#endif
#ifndef ENABLE_OVERRIDE
//! Override standard library malloc/free and new/delete entry points
#define ENABLE_OVERRIDE           0
#endif
#ifndef ENABLE_PRELOAD
//! Support preloading
#define ENABLE_PRELOAD            0
#endif
#ifndef DISABLE_UNMAP
//! Disable unmapping memory pages
#define DISABLE_UNMAP             0
#endif
#ifndef DEFAULT_SPAN_MAP_COUNT
//! Default number of spans to map in a call to map more virtual memory (the default values yield 4MiB per map call)
#define DEFAULT_SPAN_MAP_COUNT    64
#endif
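//(Illustrative arithmetic, assuming the default configuration: 64 spans of 64KiB each is 64 * 64KiB = 4MiB of
//address space per map call, which is where the 4MiB figure above comes from.)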

#if ENABLE_THREAD_CACHE
#ifndef ENABLE_UNLIMITED_CACHE
//! Unlimited thread and global cache
#define ENABLE_UNLIMITED_CACHE    0
#endif
#ifndef ENABLE_UNLIMITED_THREAD_CACHE
//! Unlimited cache disables any thread cache limitations
#define ENABLE_UNLIMITED_THREAD_CACHE ENABLE_UNLIMITED_CACHE
#endif
#if !ENABLE_UNLIMITED_THREAD_CACHE
#ifndef THREAD_CACHE_MULTIPLIER
//! Multiplier for thread cache (cache limit will be span release count multiplied by this value)
#define THREAD_CACHE_MULTIPLIER 16
#endif
#ifndef ENABLE_ADAPTIVE_THREAD_CACHE
//! Enable adaptive size of per-thread cache (still bounded by THREAD_CACHE_MULTIPLIER hard limit)
#define ENABLE_ADAPTIVE_THREAD_CACHE  0
#endif
#endif
#endif

#if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE
#ifndef ENABLE_UNLIMITED_GLOBAL_CACHE
//! Unlimited cache disables any global cache limitations
#define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE
#endif
#if !ENABLE_UNLIMITED_GLOBAL_CACHE
//! Multiplier for global cache (cache limit will be span release count multiplied by this value)
#define GLOBAL_CACHE_MULTIPLIER (THREAD_CACHE_MULTIPLIER * 6)
#endif
#else
#  undef ENABLE_GLOBAL_CACHE
#  define ENABLE_GLOBAL_CACHE 0
#endif

#if !ENABLE_THREAD_CACHE || ENABLE_UNLIMITED_THREAD_CACHE
#  undef ENABLE_ADAPTIVE_THREAD_CACHE
#  define ENABLE_ADAPTIVE_THREAD_CACHE 0
#endif

#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE
#  error Must use global cache if unmap is disabled
#endif

#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 )
#  define PLATFORM_WINDOWS 1
#  define PLATFORM_POSIX 0
#else
#  define PLATFORM_WINDOWS 0
#  define PLATFORM_POSIX 1
#endif

/// Platform and arch specifics
#if defined(_MSC_VER) && !defined(__clang__)
#  define FORCEINLINE inline __forceinline
#  define _Static_assert static_assert
#else
#  define FORCEINLINE inline __attribute__((__always_inline__))
#endif
#if PLATFORM_WINDOWS
#  define WIN32_LEAN_AND_MEAN
#  include <windows.h>
#  if ENABLE_VALIDATE_ARGS
#    include <Intsafe.h>
#  endif
#else
#  include <unistd.h>
#  include <stdio.h>
#  include <stdlib.h>
#  if defined(__APPLE__)
#    include <mach/mach_vm.h>
#    include <pthread.h>
#  endif
#  if defined(__HAIKU__)
#    include <OS.h>
#    include <pthread.h>
#  endif
#endif

#include <stdint.h>
#include <string.h>

#if ENABLE_ASSERTS
#  undef NDEBUG
#  if defined(_MSC_VER) && !defined(_DEBUG)
#    define _DEBUG
#  endif
#  include <assert.h>
#else
#  undef  assert
#  define assert(x) do {} while(0)
#endif
#if ENABLE_STATISTICS
#  include <stdio.h>
#endif

/// Atomic access abstraction
#if defined(_MSC_VER) && !defined(__clang__)

typedef volatile long      atomic32_t;
typedef volatile long long atomic64_t;
typedef volatile void*     atomicptr_t;

#define atomic_thread_fence_acquire()
#define atomic_thread_fence_release()

static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; }
static FORCEINLINE void    atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; }
static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)_InterlockedExchangeAdd(val, 1) + 1; }
static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)_InterlockedExchangeAdd(val, add) + add; }
static FORCEINLINE void*   atomic_load_ptr(atomicptr_t* src) { return (void*)*src; }
static FORCEINLINE void    atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; }
#  if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64)
static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchange64((volatile long long*)dst, (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; }
#else
static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchange((volatile long*)dst, (long)val, (long)ref) == (long)ref) ? 1 : 0; }
#endif

#define EXPECTED(x) (x)
#define UNEXPECTED(x) (x)

#else

#include <stdatomic.h>

typedef volatile _Atomic(int32_t) atomic32_t;
typedef volatile _Atomic(int64_t) atomic64_t;
typedef volatile _Atomic(void*) atomicptr_t;

#define atomic_thread_fence_acquire() atomic_thread_fence(memory_order_acquire)
#define atomic_thread_fence_release() atomic_thread_fence(memory_order_release)

static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return atomic_load_explicit(src, memory_order_relaxed); }
static FORCEINLINE void    atomic_store32(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_relaxed); }
static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; }
static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; }
static FORCEINLINE void*   atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); }
static FORCEINLINE void    atomic_store_ptr(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); }
static FORCEINLINE int     atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_release, memory_order_acquire); }

#define EXPECTED(x) __builtin_expect((x), 1)
#define UNEXPECTED(x) __builtin_expect((x), 0)
    
#endif
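//Note: on the C11 path atomic_cas_ptr uses a weak compare-exchange and may therefore fail spuriously;
//all call sites in this file retry it in a loop rather than relying on a single attempt succeeding.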

/// Preconfigured limits and sizes
//! Granularity of a small allocation block
#define SMALL_GRANULARITY         16
//! Small granularity shift count
#define SMALL_GRANULARITY_SHIFT   4
//! Number of small block size classes
#define SMALL_CLASS_COUNT         65
//! Maximum size of a small block
#define SMALL_SIZE_LIMIT          (SMALL_GRANULARITY * (SMALL_CLASS_COUNT - 1))
//! Granularity of a medium allocation block
#define MEDIUM_GRANULARITY        512
//! Medium granularity shift count
#define MEDIUM_GRANULARITY_SHIFT  9
//! Number of medium block size classes
#define MEDIUM_CLASS_COUNT        61
//! Total number of small + medium size classes
#define SIZE_CLASS_COUNT          (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT)
//! Number of large block size classes
#define LARGE_CLASS_COUNT         32
//! Maximum size of a medium block
#define MEDIUM_SIZE_LIMIT         (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT))
//! Maximum size of a large block
#define LARGE_SIZE_LIMIT          ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE)
//! Size of a span header (must be a multiple of SMALL_GRANULARITY)
#define SPAN_HEADER_SIZE          96
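//Worked example of the limits above, assuming the default 64KiB span size: SMALL_SIZE_LIMIT is
//16 * 64 = 1024 bytes, MEDIUM_SIZE_LIMIT is 1024 + (512 * 61) = 32256 bytes and LARGE_SIZE_LIMIT is
//(32 * 65536) - 96 = 2097056 bytes; larger requests fall outside these size classes.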

#if ENABLE_VALIDATE_ARGS
//! Maximum allocation size to avoid integer overflow
#undef  MAX_ALLOC_SIZE
#define MAX_ALLOC_SIZE            (((size_t)-1) - _memory_span_size)
#endif

#define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs))
#define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second))

#define INVALID_POINTER ((void*)((uintptr_t)-1))

/// Data types
//! A memory heap, per thread
typedef struct heap_t heap_t;
//! Heap spans per size class
typedef struct heap_class_t heap_class_t;
//! Span of memory pages
typedef struct span_t span_t;
//! Span list
typedef struct span_list_t span_list_t;
//! Span active data
typedef struct span_active_t span_active_t;
//! Size class definition
typedef struct size_class_t size_class_t;
//! Global cache
typedef struct global_cache_t global_cache_t;

//! Flag indicating span is the first (master) span of a split superspan
#define SPAN_FLAG_MASTER 1U
//! Flag indicating span is a secondary (sub) span of a split superspan
#define SPAN_FLAG_SUBSPAN 2U
//! Flag indicating span has blocks with increased alignment
#define SPAN_FLAG_ALIGNED_BLOCKS 4U
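//A master span that has itself been logically unmapped, but whose header must stay in memory while
//subspans remain, carries both SPAN_FLAG_MASTER and SPAN_FLAG_SUBSPAN (see _memory_unmap_span below)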

#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
struct span_use_t {
  //! Current number of spans used (actually used, not in cache)
  uint32_t current;
  //! High water mark of spans used
  uint32_t high;
#if ENABLE_STATISTICS
  //! Number of spans transitioned to global cache
  uint32_t spans_to_global;
  //! Number of spans transitioned from global cache
  uint32_t spans_from_global;
  //! Number of spans transitioned to thread cache
  uint32_t spans_to_cache;
  //! Number of spans transitioned from thread cache
  uint32_t spans_from_cache;
  //! Number of spans transitioned to reserved state
  uint32_t spans_to_reserved;
  //! Number of spans transitioned from reserved state
  uint32_t spans_from_reserved;
  //! Number of raw memory map calls
  uint32_t spans_map_calls;
#endif
};
typedef struct span_use_t span_use_t;
#endif

#if ENABLE_STATISTICS
struct size_class_use_t {
  //! Current number of allocations
  atomic32_t alloc_current;
  //! Peak number of allocations
  int32_t alloc_peak;
  //! Total number of allocations
  int32_t alloc_total;
  //! Total number of frees
  atomic32_t free_total;
  //! Number of spans transitioned to cache
  uint32_t spans_to_cache;
  //! Number of spans transitioned from cache
  uint32_t spans_from_cache;
  //! Number of spans transitioned from reserved state
  uint32_t spans_from_reserved;
  //! Number of spans mapped
  uint32_t spans_map_calls;
};
typedef struct size_class_use_t size_class_use_t;
#endif

typedef enum span_state_t {
  SPAN_STATE_ACTIVE = 0,
  SPAN_STATE_PARTIAL,
  SPAN_STATE_FULL
} span_state_t;

//A span can either represent a single span of memory pages with size declared by the span_map_count configuration
//variable, or a set of spans in a contiguous region, a super span. Any reference to the term "span" usually refers to
//either a single span or a super span. A super span can further be divided into multiple spans (or smaller super
//spans), where the first (super)span is the master and subsequent (super)spans are subspans. The master span keeps
//track of how many subspans are still alive and mapped in virtual memory, and once all subspans and the master have
//been unmapped the entire superspan region is released and unmapped (on Windows, for example, the entire superspan
//range has to be released in the same call that releases the virtual memory range, but individual subranges can be
//decommitted individually to reduce physical memory use).
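//(Illustrative note: for a subspan, total_spans_or_distance holds the distance in spans back to the master,
//so the master header can be recovered as pointer_offset(span, -(distance * _memory_span_size));
//_memory_unmap_span below relies on this.)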
struct span_t {
  //! Free list
  void*       free_list;
  //! State
  uint32_t    state;
  //! Used count when not active (not including deferred free list)
  uint32_t    used_count;
  //! Block count
  uint32_t    block_count;
  //! Size class
  uint32_t    size_class;
  //! Index of last block initialized in free list
  uint32_t    free_list_limit;
  //! Span list size when part of a cache list, or size of deferred free list when partial/full
  uint32_t    list_size;
  //! Deferred free list
  atomicptr_t free_list_deferred;
  //! Size of a block
  uint32_t    block_size;
  //! Flags and counters
  uint32_t    flags;
  //! Number of spans
  uint32_t    span_count;
  //! Total span counter for master spans, distance for subspans
  uint32_t    total_spans_or_distance;
  //! Remaining span counter, for master spans
  atomic32_t  remaining_spans;
  //! Alignment offset
  uint32_t    align_offset;
  //! Owning heap
  heap_t*     heap;
  //! Next span
  span_t*     next;
  //! Previous span
  span_t*     prev;
};
_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch");

struct heap_class_t {
  //! Free list of active span
  void*        free_list;
  //! Double linked list of partially used spans with free blocks for each size class.
  //  Current active span is at head of list. Previous span pointer in head points to tail span of list.
  span_t*      partial_span;
};

struct heap_t {
  //! Active and semi-used span data per size class
  heap_class_t span_class[SIZE_CLASS_COUNT];
#if ENABLE_THREAD_CACHE
  //! List of free spans (single linked list)
  span_t*      span_cache[LARGE_CLASS_COUNT];
  //! List of deferred free spans of class 0 (single linked list)
  atomicptr_t  span_cache_deferred;
#endif
#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
  //! Current and high water mark of spans used per span count
  span_use_t   span_use[LARGE_CLASS_COUNT];
#endif
  //! Mapped but unused spans
  span_t*      span_reserve;
  //! Master span for mapped but unused spans
  span_t*      span_reserve_master;
  //! Number of mapped but unused spans
  size_t       spans_reserved;
  //! Next heap in id list
  heap_t*      next_heap;
  //! Next heap in orphan list
  heap_t*      next_orphan;
  //! Memory pages alignment offset
  size_t       align_offset;
  //! Heap ID
  int32_t      id;
#if ENABLE_STATISTICS
  //! Number of bytes transitioned thread -> global
  size_t       thread_to_global;
  //! Number of bytes transitioned global -> thread
  size_t       global_to_thread;
  //! Allocation stats per size class
  size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1];
#endif
};

struct size_class_t {
  //! Size of blocks in this class
  uint32_t block_size;
  //! Number of blocks in each chunk
  uint16_t block_count;
  //! Class index this class is merged with
  uint16_t class_idx;
};
_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch");

struct global_cache_t {
  //! Cache list pointer
  atomicptr_t cache;
  //! Cache size
  atomic32_t size;
  //! ABA counter
  atomic32_t counter;
};

/// Global data
//! Initialized flag
static int _rpmalloc_initialized;
//! Configuration
static rpmalloc_config_t _memory_config;
//! Memory page size
static size_t _memory_page_size;
//! Shift to divide by page size
static size_t _memory_page_size_shift;
//! Granularity at which memory pages are mapped by OS
static size_t _memory_map_granularity;
#if RPMALLOC_CONFIGURABLE
//! Size of a span of memory pages
static size_t _memory_span_size;
//! Shift to divide by span size
static size_t _memory_span_size_shift;
//! Mask to get to start of a memory span
static uintptr_t _memory_span_mask;
#else
//! Hardwired span size (64KiB)
#define _memory_span_size (64 * 1024)
#define _memory_span_size_shift 16
#define _memory_span_mask (~((uintptr_t)(_memory_span_size - 1)))
#endif
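//With the default 64KiB span size _memory_span_mask is ~(uintptr_t)0xFFFF, so masking a block pointer with it
//rounds the pointer down to the start of the span containing it (spans are mapped with span size alignment,
//with any adjustment recorded in the span align_offset field).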
//! Number of spans to map in each map call
static size_t _memory_span_map_count;
//! Number of spans to release from thread cache to global cache (single spans)
static size_t _memory_span_release_count;
//! Number of spans to release from thread cache to global cache (large multiple spans)
static size_t _memory_span_release_count_large;
//! Global size classes
static size_class_t _memory_size_class[SIZE_CLASS_COUNT];
//! Run-time size limit of medium blocks
static size_t _memory_medium_size_limit;
//! Heap ID counter
static atomic32_t _memory_heap_id;
//! Huge page support
static int _memory_huge_pages;
#if ENABLE_GLOBAL_CACHE
//! Global span cache
static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT];
#endif
//! All heaps
static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE];
//! Orphaned heaps
static atomicptr_t _memory_orphan_heaps;
//! Running orphan counter to avoid ABA issues in linked list
static atomic32_t _memory_orphan_counter;
#if ENABLE_STATISTICS
//! Active heap count
static atomic32_t _memory_active_heaps;
//! Number of currently mapped memory pages
static atomic32_t _mapped_pages;
//! Peak number of concurrently mapped memory pages
static int32_t _mapped_pages_peak;
//! Number of currently unused spans
static atomic32_t _reserved_spans;
//! Running counter of total number of mapped memory pages since start
static atomic32_t _mapped_total;
//! Running counter of total number of unmapped memory pages since start
static atomic32_t _unmapped_total;
//! Number of currently mapped memory pages in OS calls
static atomic32_t _mapped_pages_os;
//! Number of currently allocated pages in huge allocations
static atomic32_t _huge_pages_current;
//! Peak number of currently allocated pages in huge allocations
static int32_t _huge_pages_peak;
#endif

//! Current thread heap
#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
static pthread_key_t _memory_thread_heap;
#else
#  ifdef _MSC_VER
#    define _Thread_local __declspec(thread)
#    define TLS_MODEL
#  else
#    define TLS_MODEL __attribute__((tls_model("initial-exec")))
#    if !defined(__clang__) && defined(__GNUC__)
#      define _Thread_local __thread
#    endif
#  endif
static _Thread_local heap_t* _memory_thread_heap TLS_MODEL;
#endif

static inline heap_t*
get_thread_heap_raw(void) {
#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
  return pthread_getspecific(_memory_thread_heap);
#else
  return _memory_thread_heap;
#endif
}

//! Get the current thread heap
static inline heap_t*
get_thread_heap(void) {
  heap_t* heap = get_thread_heap_raw();
#if ENABLE_PRELOAD
  if (EXPECTED(heap != 0))
    return heap;
  rpmalloc_initialize();
  return get_thread_heap_raw();
#else
  return heap;
#endif
}

//! Set the current thread heap
static void
set_thread_heap(heap_t* heap) {
#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
  pthread_setspecific(_memory_thread_heap, heap);
#else
  _memory_thread_heap = heap;
#endif
}

//! Default implementation to map more virtual memory
static void*
_memory_map_os(size_t size, size_t* offset);

//! Default implementation to unmap virtual memory
static void
_memory_unmap_os(void* address, size_t size, size_t offset, size_t release);

//! Lookup a memory heap from heap ID
static heap_t*
_memory_heap_lookup(int32_t id) {
  uint32_t list_idx = id % HEAP_ARRAY_SIZE;
  heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]);
  while (heap && (heap->id != id))
    heap = heap->next_heap;
  return heap;
}

#if ENABLE_STATISTICS
#  define _memory_statistics_inc(counter, value) counter += value
#  define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value))
#  define _memory_statistics_add_peak(atomic_counter, value, peak) do { int32_t _cur_count = atomic_add32(atomic_counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0)
#  define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value))
#  define _memory_statistics_inc_alloc(heap, class_idx) do { \
  int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \
  if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \
    heap->size_class_use[class_idx].alloc_peak = alloc_current; \
  heap->size_class_use[class_idx].alloc_total++; \
} while(0)
#  define _memory_statistics_inc_free(heap, class_idx) do { \
  atomic_add32(&heap->size_class_use[class_idx].alloc_current, -1); \
  atomic_incr32(&heap->size_class_use[class_idx].free_total); \
} while(0)
#else
#  define _memory_statistics_inc(counter, value) do {} while(0)
#  define _memory_statistics_add(atomic_counter, value) do {} while(0)
#  define _memory_statistics_add_peak(atomic_counter, value, peak) do {} while (0)
#  define _memory_statistics_sub(atomic_counter, value) do {} while(0)
#  define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0)
#  define _memory_statistics_inc_free(heap, class_idx) do {} while(0)
#endif

static void
_memory_heap_cache_insert(heap_t* heap, span_t* span);

//! Map more virtual memory
static void*
_memory_map(size_t size, size_t* offset) {
  assert(!(size % _memory_page_size));
  assert(size >= _memory_page_size);
  _memory_statistics_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak);
  _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift));
  return _memory_config.memory_map(size, offset);
}

//! Unmap virtual memory
static void
_memory_unmap(void* address, size_t size, size_t offset, size_t release) {
  assert(!release || (release >= size));
  assert(!release || (release >= _memory_page_size));
  if (release) {
    assert(!(release % _memory_page_size));
    _memory_statistics_sub(&_mapped_pages, (release >> _memory_page_size_shift));
    _memory_statistics_add(&_unmapped_total, (release >> _memory_page_size_shift));
  }
  _memory_config.memory_unmap(address, size, offset, release);
}

//! Declare the span to be a subspan and store distance from master span and span count
static void
_memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) {
  assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER));
  if (subspan != master) {
    subspan->flags = SPAN_FLAG_SUBSPAN;
    subspan->total_spans_or_distance = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift);
    subspan->align_offset = 0;
  }
  subspan->span_count = (uint32_t)span_count;
}

//! Use reserved spans to fulfill a memory map request (reserve size must be checked by caller)
static span_t*
_memory_map_from_reserve(heap_t* heap, size_t span_count) {
  //Update the heap span reserve
  span_t* span = heap->span_reserve;
  heap->span_reserve = pointer_offset(span, span_count * _memory_span_size);
  heap->spans_reserved -= span_count;

  _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count);
  if (span_count <= LARGE_CLASS_COUNT)
    _memory_statistics_inc(heap->span_use[span_count - 1].spans_from_reserved, 1);

  return span;
}

//! Get the aligned number of spans to map, based on the wanted count, the configured mapping granularity and the page size
static size_t
_memory_map_align_span_count(size_t span_count) {
  size_t request_count = (span_count > _memory_span_map_count) ? span_count : _memory_span_map_count;
  if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size))
    request_count += _memory_span_map_count - (request_count % _memory_span_map_count); 
  return request_count;
}

//! Store the given spans as reserve in the given heap
static void
_memory_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) {
  heap->span_reserve_master = master;
  heap->span_reserve = reserve;
  heap->spans_reserved = reserve_span_count;
}

//! Setup a newly mapped span
static void
_memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) {
  span->total_spans_or_distance = (uint32_t)total_span_count;
  span->span_count = (uint32_t)span_count;
  span->align_offset = (uint32_t)align_offset;
  span->flags = SPAN_FLAG_MASTER;
  atomic_store32(&span->remaining_spans, (int32_t)total_span_count);  
}

//! Map an aligned set of spans, taking the configured mapping granularity and the page size into account
static span_t*
_memory_map_aligned_span_count(heap_t* heap, size_t span_count) {
  //If we already have some, but not enough, reserved spans, release those to heap cache and map a new
  //full set of spans. Otherwise we would waste memory if page size > span size (huge pages)
  size_t aligned_span_count = _memory_map_align_span_count(span_count);
  size_t align_offset = 0;
  span_t* span = _memory_map(aligned_span_count * _memory_span_size, &align_offset);
  if (!span)
    return 0;
  _memory_span_initialize(span, aligned_span_count, span_count, align_offset);
  _memory_statistics_add(&_reserved_spans, aligned_span_count);
  if (span_count <= LARGE_CLASS_COUNT)
    _memory_statistics_inc(heap->span_use[span_count - 1].spans_map_calls, 1);
  if (aligned_span_count > span_count) {
    if (heap->spans_reserved) {
      _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
      _memory_heap_cache_insert(heap, heap->span_reserve);
    }
    _memory_heap_set_reserved_spans(heap, span, pointer_offset(span, span_count * _memory_span_size), aligned_span_count - span_count);
  }
  return span;
}

//! Map in memory pages for the given number of spans (or use previously reserved pages)
static span_t*
_memory_map_spans(heap_t* heap, size_t span_count) {
  if (span_count <= heap->spans_reserved)
    return _memory_map_from_reserve(heap, span_count);
  return _memory_map_aligned_span_count(heap, span_count);
}

//! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings)
static void
_memory_unmap_span(span_t* span) {
  assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN));
  assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN));

  int is_master = !!(span->flags & SPAN_FLAG_MASTER);
  span_t* master = is_master ? span : (pointer_offset(span, -(int32_t)(span->total_spans_or_distance * _memory_span_size)));
  assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN));
  assert(master->flags & SPAN_FLAG_MASTER);

  size_t span_count = span->span_count;
  if (!is_master) {
    //Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master)
    assert(span->align_offset == 0);
    if (_memory_span_size >= _memory_page_size) {
      _memory_unmap(span, span_count * _memory_span_size, 0, 0);
      _memory_statistics_sub(&_reserved_spans, span_count);
    }
  } else {
    //Special double flag to denote an unmapped master
    //It must be kept in memory since the span header is still needed to track the remaining subspans
    span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN;
  }

  if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) {
    //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span
    assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN));
    size_t unmap_count = master->span_count;
    if (_memory_span_size < _memory_page_size)
      unmap_count = master->total_spans_or_distance;
    _memory_statistics_sub(&_reserved_spans, unmap_count);
    _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans_or_distance * _memory_span_size);
  }
}

#if ENABLE_THREAD_CACHE

//! Unmap a single linked list of spans
static void
_memory_unmap_span_list(span_t* span) {
  size_t list_size = span->list_size;
  for (size_t ispan = 0; ispan < list_size; ++ispan) {
    span_t* next_span = span->next;
    _memory_unmap_span(span);
    span = next_span;
  }
  assert(!span);
}

//! Add span to head of single linked span list
static size_t
_memory_span_list_push(span_t** head, span_t* span) {
  span->next = *head;
  if (*head)
    span->list_size = (*head)->list_size + 1;
  else
    span->list_size = 1;
  *head = span;
  return span->list_size;
}

//! Remove span from head of single linked span list, returns the new list head
static span_t*
_memory_span_list_pop(span_t** head) {
  span_t* span = *head;
  span_t* next_span = 0;
  if (span->list_size > 1) {
    assert(span->next);
    next_span = span->next;
    assert(next_span);
    next_span->list_size = span->list_size - 1;
  }
  *head = next_span;
  return span;
}

//! Split a single linked span list
static span_t*
_memory_span_list_split(span_t* span, size_t limit) {
  span_t* next = 0;
  if (limit < 2)
    limit = 2;
  if (span->list_size > limit) {
    uint32_t list_size = 1;
    span_t* last = span;
    next = span->next;
    while (list_size < limit) {
      last = next;
      next = next->next;
      ++list_size;
    }
    last->next = 0;
    assert(next);
    next->list_size = span->list_size - list_size;
    span->list_size = list_size;
    span->prev = 0;
  }
  return next;
}

#endif

//! Add a span to partial span double linked list at the head
static void
_memory_span_partial_list_add(span_t** head, span_t* span) {
  if (*head) {
    span->next = *head;
    //Maintain pointer to tail span
    span->prev = (*head)->prev;
    (*head)->prev = span;
  } else {
    span->next = 0;
    span->prev = span;
  }
  *head = span;
}

//! Add a span to partial span double linked list at the tail
static void
_memory_span_partial_list_add_tail(span_t** head, span_t* span) {
  span->next = 0;
  if (*head) {
    span_t* tail = (*head)->prev;
    tail->next = span;
    span->prev = tail;
    //Maintain pointer to tail span
    (*head)->prev = span;
  } else {
    span->prev = span;
    *head = span;
  }
}

//! Pop head span from partial span double linked list
static void
_memory_span_partial_list_pop_head(span_t** head) {
  span_t* span = *head;
  *head = span->next;
  if (*head) {
    //Maintain pointer to tail span
    (*head)->prev = span->prev;
  }
}

//! Remove a span from partial span double linked list
static void
_memory_span_partial_list_remove(span_t** head, span_t* span) {
  if (UNEXPECTED(*head == span)) {
    _memory_span_partial_list_pop_head(head);
  } else {
    span_t* next_span = span->next;
    span_t* prev_span = span->prev;
    prev_span->next = next_span;
    if (EXPECTED(next_span != 0)) {
      next_span->prev = prev_span;
    } else {
      //Update pointer to tail span
      (*head)->prev = prev_span;
    }
  }
}

#if ENABLE_GLOBAL_CACHE

//! Insert the given list of memory page spans in the global cache
static void
_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) {
  assert((span->list_size == 1) || (span->next != 0));
  int32_t list_size = (int32_t)span->list_size;
  //Unmap if cache has reached the limit
  if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) {
#if !ENABLE_UNLIMITED_GLOBAL_CACHE
    _memory_unmap_span_list(span);
    atomic_add32(&cache->size, -list_size);
    return;
#endif
  }
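  //Spans are aligned to the span size, so the bits of the cache pointer below _memory_span_mask are free
  //to hold an ABA counter; the real span pointer is recovered by masking with _memory_span_mask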
  void* current_cache, *new_cache;
  do {
    current_cache = atomic_load_ptr(&cache->cache);
    span->prev = (void*)((uintptr_t)current_cache & _memory_span_mask);
    new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask));
  } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache));
}

//! Extract a number of memory page spans from the global cache
static span_t*
_memory_cache_extract(global_cache_t* cache) {
  uintptr_t span_ptr;
  do {
    void* global_span = atomic_load_ptr(&cache->cache);
    span_ptr = (uintptr_t)global_span & _memory_span_mask;
    if (span_ptr) {
      span_t* span = (void*)span_ptr;
      //By accessing the span pointer before it is swapped out of the list we assume that a contending
      //thread does not manage to traverse the span list to the point where it is unmapped before we access it
      void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask));
      if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) {
        atomic_add32(&cache->size, -(int32_t)span->list_size);
        return span;
      }
    }
  } while (span_ptr);
  return 0;
}

//! Finalize a global cache, only valid from allocator finalization (not thread safe)
static void
_memory_cache_finalize(global_cache_t* cache) {
  void* current_cache = atomic_load_ptr(&cache->cache);
  span_t* span = (void*)((uintptr_t)current_cache & _memory_span_mask);
  while (span) {
    span_t* skip_span = (void*)((uintptr_t)span->prev & _memory_span_mask);
    atomic_add32(&cache->size, -(int32_t)span->list_size);
    _memory_unmap_span_list(span);
    span = skip_span;
  }
  assert(!atomic_load32(&cache->size));
  atomic_store_ptr(&cache->cache, 0);
  atomic_store32(&cache->size, 0);
}

//! Insert the given list of memory page spans in the global cache
static void
_memory_global_cache_insert(span_t* span) {
  size_t span_count = span->span_count;
#if ENABLE_UNLIMITED_GLOBAL_CACHE
  _memory_cache_insert(&_memory_span_cache[span_count - 1], span, 0);
#else
  const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large));
  _memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit);
#endif
}

//! Extract a number of memory page spans from the global cache for large blocks
static span_t*
_memory_global_cache_extract(size_t span_count) {
  span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]);
  assert(!span || (span->span_count == span_count));
  return span;
}

#endif

#if ENABLE_THREAD_CACHE
//! Adopt the deferred span cache list
static void
_memory_heap_cache_adopt_deferred(heap_t* heap) {
  atomic_thread_fence_acquire();
  span_t* span = atomic_load_ptr(&heap->span_cache_deferred);
  if (!span)
    return;
  do {
    span = atomic_load_ptr(&heap->span_cache_deferred);
  } while (!atomic_cas_ptr(&heap->span_cache_deferred, 0, span));
  while (span) {
    span_t* next_span = span->next;
    _memory_span_list_push(&heap->span_cache[0], span);
#if ENABLE_STATISTICS
    heap->size_class_use[span->size_class].spans_to_cache++;
#endif
    span = next_span;
  }
}
#endif

//! Insert a single span into thread heap cache, releasing to global cache if overflow
static void
_memory_heap_cache_insert(heap_t* heap, span_t* span) {
#if ENABLE_THREAD_CACHE
  size_t span_count = span->span_count;
  size_t idx = span_count - 1;
  _memory_statistics_inc(heap->span_use[idx].spans_to_cache, 1);
  if (!idx)
    _memory_heap_cache_adopt_deferred(heap);
#if ENABLE_UNLIMITED_THREAD_CACHE
  _memory_span_list_push(&heap->span_cache[idx], span);
#else
  const size_t release_count = (!idx ? _memory_span_release_count : _memory_span_release_count_large);
  size_t current_cache_size = _memory_span_list_push(&heap->span_cache[idx], span);
  if (current_cache_size <= release_count)
    return;
  const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER;
  if (current_cache_size <= hard_limit) {
#if ENABLE_ADAPTIVE_THREAD_CACHE
    //Require 25% of high water mark to remain in cache (and at least 1, if use is 0)
    const size_t high_mark = heap->span_use[idx].high;
    const size_t min_limit = (high_mark >> 2) + release_count + 1;
    if (current_cache_size < min_limit)
      return;
#else
    return;
#endif
  }
  heap->span_cache[idx] = _memory_span_list_split(span, release_count);
  assert(span->list_size == release_count);
#if ENABLE_STATISTICS
  heap->thread_to_global += (size_t)span->list_size * span_count * _memory_span_size;
  heap->span_use[idx].spans_to_global += span->list_size;
#endif
#if ENABLE_GLOBAL_CACHE
  _memory_global_cache_insert(span);
#else