Loading...
--- /dev/null
+++ libmalloc/libmalloc-792.80.2/src/xzone_malloc/xzone_segment.c
@@ -0,0 +1,3270 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+Copyright © 2025 Apple Inc.
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" in the same directory as this file.
+-----------------------------------------------------------------------------*/
+
+#include "../internal.h"
+
+#if CONFIG_XZONE_MALLOC
+
+static void
+xzm_madvise(xzm_malloc_zone_t zone, uint8_t *start, size_t size);
+
+static void
+_xzm_segment_group_segment_deallocate(xzm_segment_group_t sg,
+ xzm_segment_t segment, bool free_from_table);
+
+#pragma mark segment map
+
+// mimalloc: _mi_segment_map_allocated_at
+static void
+_xzm_segment_table_allocated_at(xzm_main_malloc_zone_t main, void *data,
+ xzm_segment_t metadata, bool normal)
+{
+ xzm_debug_assert((uintptr_t)data % XZM_SEGMENT_SIZE == 0);
+
+ void *segment_end = _xzm_segment_end(metadata);
+ xzm_debug_assert((uintptr_t)data < (uintptr_t)segment_end);
+
+ xzm_segment_table_entry_s entry_val =
+ _xzm_segment_to_segment_table_entry(metadata, normal);
+
+ while (data < segment_end) {
+#if CONFIG_EXTERNAL_METADATA_LARGE
+ // If this allocation is in a new 64GB granule, allocate a new leaf
+ // table to store the metadata pointers in
+ size_t ext_idx = 0;
+ __assert_only size_t index = _xzm_segment_table_index_of(data, &ext_idx);
+ xzm_debug_assert(index < XZM_SEGMENT_TABLE_ENTRIES);
+ xzm_debug_assert(ext_idx < XZM_EXTENDED_SEGMENT_TABLE_ENTRIES);
+
+ if (ext_idx != 0) {
+ xzm_extended_segment_table_entry_s *ext_addr =
+ &main->xzmz_extended_segment_table[ext_idx];
+ xzm_extended_segment_table_entry_s ext_entry = { 0 };
+ ext_entry = os_atomic_load(ext_addr, relaxed);
+ if (ext_entry.xeste_val == 0) {
+ // Need to allocate a new segment table since this pointer is in
+ // a new segment table (64GB span)
+ _malloc_lock_lock(&main->xzmz_extended_segment_table_lock);
+ // Load the table entry again to see if another thread populated
+ // it while we were acquiring the lock
+ ext_entry = os_atomic_load(ext_addr, relaxed);
+ if (ext_entry.xeste_val == 0) {
+ xzm_metapool_t mp;
+ mp = &main->xzmz_metapools[XZM_METAPOOL_SEGMENT_TABLE];
+ void *leaf_table = xzm_metapool_alloc(mp);
+ xzm_assert(leaf_table);
+ xzm_debug_assert(((uintptr_t)leaf_table /
+ XZM_SEGMENT_TABLE_ALIGN) <= UINT32_MAX);
+ ext_entry.xeste_val = (uint32_t)((uintptr_t)leaf_table /
+ XZM_SEGMENT_TABLE_ALIGN);
+ os_atomic_store(ext_addr, ext_entry, relaxed);
+ }
+ _malloc_lock_unlock(&main->xzmz_extended_segment_table_lock);
+ }
+ }
+#endif // CONFIG_EXTERNAL_METADATA_LARGE
+
+ xzm_segment_table_entry_s *entry;
+ entry = _xzm_ptr_to_table_entry(data, main);
+ xzm_debug_assert(entry != NULL);
+
+ xzm_debug_assert(entry->xste_val == 0);
+
+ // Store-release to publish the segment and chunk initializations
+ // TODO: document all paired dependency/acquire loads
+ os_atomic_store(entry, entry_val, release);
+
+ data = (void *)((uintptr_t)data + XZM_SEGMENT_SIZE);
+ }
+}
+
+// mimalloc: _mi_segment_map_freed_at
+static void
+_xzm_segment_table_freed_at(xzm_main_malloc_zone_t main, void *data,
+ xzm_segment_t metadata, __assert_only bool full_segment)
+{
+ void *end = _xzm_segment_end(metadata);
+ xzm_debug_assert(!full_segment ||
+ _xzm_segment_start(metadata) == data);
+ while (data < end) {
+ xzm_segment_table_entry_s *entry;
+ entry = _xzm_ptr_to_table_entry(data, main);
+ xzm_debug_assert(entry != NULL);
+ xzm_debug_assert(_xzm_segment_to_segment_table_entry(metadata, false).xste_val ==
+ entry->xste_val);
+ xzm_segment_table_entry_s null_entry;
+ null_entry = _xzm_segment_to_segment_table_entry(NULL, false);
+ os_atomic_store(entry, null_entry, relaxed);
+
+ data = (void *)((uintptr_t)data + XZM_SEGMENT_SIZE);
+ }
+}
+
+#pragma mark vm reclaim
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+
+static struct xzm_reclaim_buffer_s xzm_reclaim_buffer;
+
+static bool
+_xzm_reclaim_id_cache_is_empty(xzm_reclaim_id_cache_t cache)
+{
+ return cache->ric_head == 0;
+}
+
+static uint64_t
+_xzm_reclaim_id_cache_pop(xzm_reclaim_id_cache_t cache)
+{
+ xzm_debug_assert(!_xzm_reclaim_id_cache_is_empty(cache));
+ uint64_t id = cache->ric_ids[--cache->ric_head];
+ xzm_debug_assert(id != VM_RECLAIM_ID_NULL);
+ return id;
+}
+
+static void
+_xzm_reclaim_id_cache_push(xzm_reclaim_id_cache_t cache, mach_vm_reclaim_id_t id)
+{
+ xzm_assert(cache->ric_head < cache->ric_len);
+ xzm_debug_assert(id != VM_RECLAIM_ID_NULL);
+ cache->ric_ids[cache->ric_head++] = id;
+}
+
+static void
+_xzm_reclaim_id_cache_init(xzm_reclaim_buffer_t buffer)
+{
+ xzm_reclaim_id_cache_t id_cache = &buffer->xrb_id_cache;
+ mach_vm_reclaim_count_t max_buffer_count;
+ mach_vm_reclaim_error_t kr = mach_vm_reclaim_ring_capacity(
+ buffer->xrb_ringbuffer, &max_buffer_count);
+ xzm_assert(kr == VM_RECLAIM_SUCCESS);
+ size_t min_id_cache_size =
+ max_buffer_count * sizeof(mach_vm_reclaim_id_t);
+ size_t id_cache_size = round_page(min_id_cache_size);
+ if (id_cache->ric_ids == NULL ||
+ id_cache->ric_len < max_buffer_count) {
+ mach_vm_reclaim_id_t *ids = (mach_vm_reclaim_id_t *)
+ mvm_allocate_pages(id_cache_size, 0, MALLOC_ABORT_ON_ERROR,
+ VM_MEMORY_MALLOC);
+ if (id_cache->ric_ids != NULL) {
+ // Deallocate the old cache
+ mvm_deallocate_pages((void *)(id_cache->ric_ids),
+ id_cache->ric_len * sizeof(mach_vm_reclaim_id_t),
+ MALLOC_ABORT_ON_ERROR);
+ }
+ id_cache->ric_ids = ids;
+ id_cache->ric_len = id_cache_size / sizeof(mach_vm_reclaim_id_t);
+ }
+ id_cache->ric_head = 0;
+ xzm_debug_assert(id_cache->ric_len >= max_buffer_count);
+}
+
+bool
+xzm_reclaim_init(xzm_main_malloc_zone_t main,
+ mach_vm_reclaim_count_t initial_count, mach_vm_reclaim_count_t max_count)
+{
+ // Pick a sane minimum number of entries and let vm_reclaim round up
+ // to a page boundary. The intention is for the initial size to be
+ // one page.
+ mach_vm_reclaim_count_t buffer_capacity =
+ mach_vm_reclaim_round_capacity(initial_count);
+ mach_vm_reclaim_count_t max_buffer_capacity =
+ mach_vm_reclaim_round_capacity(max_count);
+ xzm_reclaim_buffer.xrb_id_cache.ric_len = 0;
+ xzm_reclaim_buffer.xrb_id_cache.ric_ids = NULL;
+ _malloc_lock_init(&xzm_reclaim_buffer.xrb_lock);
+ mach_vm_reclaim_error_t err = mach_vm_reclaim_ring_allocate(
+ &xzm_reclaim_buffer.xrb_ringbuffer, buffer_capacity,
+ max_buffer_capacity);
+ if (err == VM_RECLAIM_SUCCESS) {
+ xzm_reclaim_buffer.xrb_len = buffer_capacity;
+ main->xzmz_reclaim_buffer = &xzm_reclaim_buffer;
+ _xzm_reclaim_id_cache_init(&xzm_reclaim_buffer);
+ } else {
+ malloc_report(ASL_LEVEL_ERR,
+ "xzm: failed to initialize deferred "
+ "reclamation buffer [%d] %s\n",
+ err_get_code(err), mach_error_string(err));
+ }
+ return (err == VM_RECLAIM_SUCCESS);
+}
+
+static mach_vm_reclaim_state_t
+_xzm_reclaim_mark_used_locked(xzm_reclaim_buffer_t buffer,
+ mach_vm_reclaim_id_t id, uint8_t *addr, size_t size, bool reusable,
+ bool *update_accounting_out)
+{
+ mach_vm_reclaim_error_t err;
+ mach_vm_reclaim_state_t state;
+
+ xzm_debug_assert(size <= UINT32_MAX);
+ mach_vm_reclaim_action_t behavior = reusable ?
+ VM_RECLAIM_FREE : VM_RECLAIM_DEALLOCATE;
+
+ err = mach_vm_reclaim_try_cancel(buffer->xrb_ringbuffer, id,
+ (mach_vm_address_t)addr, (mach_vm_size_t)size,
+ behavior, &state, update_accounting_out);
+ xzm_assert(err == VM_RECLAIM_SUCCESS);
+
+ if (state == VM_RECLAIM_UNRECLAIMED) {
+ _xzm_reclaim_id_cache_push(&buffer->xrb_id_cache, id);
+ }
+
+ return state;
+}
+
+static mach_vm_reclaim_state_t
+_xzm_reclaim_mark_used(xzm_reclaim_buffer_t buffer, mach_vm_reclaim_id_t id,
+ uint8_t *addr, size_t size, bool reusable)
+{
+ bool update_accounting = false;
+
+ _malloc_lock_lock(&buffer->xrb_lock);
+
+ mach_vm_reclaim_state_t state = _xzm_reclaim_mark_used_locked(buffer, id,
+ addr, size, reusable, &update_accounting);
+
+ _malloc_lock_unlock(&buffer->xrb_lock);
+
+ if (update_accounting) {
+ __assert_only mach_vm_reclaim_error_t err =
+ mach_vm_reclaim_update_kernel_accounting(buffer->xrb_ringbuffer);
+ xzm_debug_assert(err == VM_RECLAIM_SUCCESS);
+ }
+
+ return state;
+}
+
+static bool
+_xzm_reclaim_is_reusable(xzm_reclaim_buffer_t buffer, mach_vm_reclaim_id_t reclaim_id, bool deallocate)
+{
+ mach_vm_reclaim_error_t err;
+ mach_vm_reclaim_state_t state;
+ err = mach_vm_reclaim_query_state(buffer->xrb_ringbuffer, reclaim_id,
+ deallocate ? VM_RECLAIM_DEALLOCATE : VM_RECLAIM_FREE, &state);
+ xzm_assert(err == VM_RECLAIM_SUCCESS);
+ return mach_vm_reclaim_is_reusable(state);
+}
+
+uint64_t
+xzm_reclaim_mark_free_locked(xzm_reclaim_buffer_t buffer, uint8_t *addr,
+ size_t size, bool reusable, bool *update_accounting_out)
+{
+ mach_vm_reclaim_error_t kr;
+ mach_vm_reclaim_id_t id;
+ mach_vm_address_t vm_addr = (mach_vm_address_t)addr;
+ uint32_t vm_size = (uint32_t)size;
+ xzm_debug_assert(size <= UINT32_MAX);
+ xzm_debug_assert(vm_addr % XZM_SEGMENT_SLICE_SIZE == 0);
+ xzm_debug_assert(vm_size % XZM_SEGMENT_SLICE_SIZE == 0);
+#ifdef DEBUG
+ _malloc_lock_assert_owner(&buffer->xrb_lock);
+#endif // DEBUG
+
+ mach_vm_reclaim_action_t behavior = reusable ?
+ VM_RECLAIM_FREE : VM_RECLAIM_DEALLOCATE;
+
+ while (!_xzm_reclaim_id_cache_is_empty(&buffer->xrb_id_cache)) {
+ id = _xzm_reclaim_id_cache_pop(&buffer->xrb_id_cache);
+ kr = mach_vm_reclaim_try_enter(
+ buffer->xrb_ringbuffer,
+ vm_addr, vm_size, behavior, &id,
+ update_accounting_out);
+ xzm_assert(kr == VM_RECLAIM_SUCCESS);
+ if (id != VM_RECLAIM_ID_NULL) {
+ goto done;
+ }
+ }
+ do {
+ id = VM_RECLAIM_ID_NULL;
+ kr = mach_vm_reclaim_try_enter(buffer->xrb_ringbuffer, vm_addr, vm_size,
+ behavior, &id, update_accounting_out);
+ xzm_assert(kr == VM_RECLAIM_SUCCESS);
+ if (id == VM_RECLAIM_ID_NULL) {
+ // If the ringbuffer is full, reap all of its contents and resize
+ xzm_reclaim_sync_and_resize(buffer);
+ }
+ } while (id == VM_RECLAIM_ID_NULL);
+
+done:
+ return id;
+}
+
+static uint64_t
+_xzm_reclaim_mark_free(xzm_reclaim_buffer_t buffer, uint8_t *addr, size_t size,
+ bool reusable)
+{
+ uint64_t id;
+ bool should_update_kernel_accounting = false;
+
+ _malloc_lock_lock(&buffer->xrb_lock);
+
+ id = xzm_reclaim_mark_free_locked(buffer, addr, size, reusable,
+ &should_update_kernel_accounting);
+
+ _malloc_lock_unlock(&buffer->xrb_lock);
+
+ if (should_update_kernel_accounting) {
+ __assert_only mach_vm_reclaim_error_t kr =
+ mach_vm_reclaim_update_kernel_accounting(buffer->xrb_ringbuffer);
+ xzm_debug_assert(kr == VM_RECLAIM_SUCCESS);
+ }
+ return id;
+}
+
+static bool
+xzm_reclaim_mark_smaller(xzm_reclaim_buffer_t buffer, uint64_t *front_id,
+ uint64_t *back_id, uint8_t *front_start, size_t front_free_size,
+ size_t used_size, size_t back_free_size, bool deferred, bool pristine,
+ bool reusable)
+{
+ const size_t span_size = front_free_size + used_size + back_free_size;
+ xzm_debug_assert(span_size <= UINT32_MAX);
+
+ bool should_update_used = false;
+ bool should_update_free_front = false, should_update_free_back = false;
+
+ _malloc_lock_lock(&buffer->xrb_lock);
+
+ bool usable = true;
+ mach_vm_reclaim_state_t state;
+ if (deferred) {
+ xzm_debug_assert(*front_id != VM_RECLAIM_ID_NULL);
+ // Mark the entire span as used
+ state = _xzm_reclaim_mark_used_locked(buffer, *front_id, front_start,
+ span_size, reusable, &should_update_used);
+ usable = mach_vm_reclaim_is_reusable(state);
+ if (usable) {
+ *front_id = VM_RECLAIM_ID_NULL;
+ }
+ }
+ if (usable) {
+ if (front_free_size && !pristine) {
+ // Mark the front as free. Note that it already has a reclaim id
+ xzm_debug_assert(*front_id == VM_RECLAIM_ID_NULL);
+ *front_id = xzm_reclaim_mark_free_locked(buffer, front_start,
+ front_free_size, reusable, &should_update_free_front);
+ }
+
+ if (back_free_size) {
+ xzm_debug_assert(back_id);
+ if (!pristine) {
+ // Mark the back as free
+ uint8_t *back_start = front_start + front_free_size + used_size;
+ *back_id = xzm_reclaim_mark_free_locked(buffer, back_start,
+ back_free_size, reusable, &should_update_free_back);
+ } else {
+ // Initialize the reclaim id now, because when the span metadata
+ // is updated, it cannot overwrite any reclaim id we set
+ *back_id = VM_RECLAIM_ID_NULL;
+ }
+ }
+ }
+
+ _malloc_lock_unlock(&buffer->xrb_lock);
+
+ if (should_update_used || should_update_free_front ||
+ should_update_free_back) {
+ mach_vm_reclaim_update_kernel_accounting(buffer->xrb_ringbuffer);
+ }
+
+ return usable;
+}
+
+void
+xzm_reclaim_force_sync(xzm_reclaim_buffer_t buffer)
+{
+ // This function is called in a loop when reclaim_mark_used fails while
+ // trying to free a span in the reclaim buffer.
+ mach_vm_reclaim_count_t capacity;
+ __assert_only mach_vm_reclaim_error_t err;
+ err = mach_vm_reclaim_ring_capacity(buffer->xrb_ringbuffer, &capacity);
+ xzm_assert(err == VM_RECLAIM_SUCCESS);
+ err = mach_vm_reclaim_ring_flush(buffer->xrb_ringbuffer, capacity);
+ xzm_assert(err == VM_RECLAIM_SUCCESS);
+}
+
+void
+xzm_reclaim_sync_and_resize(xzm_reclaim_buffer_t buffer)
+{
+ mach_vm_reclaim_error_t kr;
+ mach_vm_reclaim_count_t count;
+ kr = mach_vm_reclaim_ring_capacity(buffer->xrb_ringbuffer, &count);
+ xzm_assert(kr == VM_RECLAIM_SUCCESS);
+ mach_vm_reclaim_count_t new_count =
+ mach_vm_reclaim_round_capacity(2 * count);
+
+ kr = mach_vm_reclaim_ring_resize(buffer->xrb_ringbuffer, new_count);
+ if (kr == VM_RECLAIM_SUCCESS) {
+ _xzm_reclaim_id_cache_init(buffer);
+ } else {
+ // Must explicitly flush if the resize operation failed
+ xzm_reclaim_force_sync(buffer);
+ }
+}
+
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+#pragma mark range group
+
+OS_OPTIONS(xzm_range_group_alloc_flags, uint32_t,
+ XZM_RANGE_GROUP_ALLOC_FLAGS_HUGE = 1 << 0,
+ XZM_RANGE_GROUP_ALLOC_FLAGS_PURGEABLE = 1 << 1,
+#if CONFIG_MTE
+ XZM_RANGE_GROUP_ALLOC_FLAGS_MTE = 1 << 2,
+#endif
+);
+
+static int
+_xzm_range_group_vm_tag_for_segment(size_t size, bool huge)
+{
+ // Note: although there is already a VM_MEMORY_MALLOC_HUGE tag, which has
+ // been there since prehistory, we'll use LARGE for huge segments to ensure
+ // that any special handling from the kernel or other tools works exactly as
+ // before (e.g. VM_MEMORY_MALLOC_HUGE is not included in
+ // vm_memory_malloc_no_cow_mask)
+ //
+ // We use VM_MEMORY_MALLOC_SMALL for normal segment allocations so that they
+ // are easily distinguisable from metadata allocations purely by tag.
+ return huge ? VM_MEMORY_MALLOC_LARGE : VM_MEMORY_MALLOC_SMALL;
+}
+
+static void * __alloc_size(2)
+_xzm_range_group_alloc_mvm_segment(xzm_main_malloc_zone_t main, size_t size,
+ size_t align, plat_map_t *map, xzm_range_group_alloc_flags_t rga_flags)
+{
+ bool huge = (rga_flags & XZM_RANGE_GROUP_ALLOC_FLAGS_HUGE);
+ bool purgeable = (rga_flags & XZM_RANGE_GROUP_ALLOC_FLAGS_PURGEABLE);
+
+ uint32_t flags = 0;
+ if (os_unlikely(purgeable)) {
+ flags |= MALLOC_PURGEABLE;
+ }
+
+#if XZM_NARROW_BUCKETING
+ // If we're doing narrow bucketing, and we ourselves aren't enabling
+ // VM user ranges, but we've detected that VM user ranges are active in the
+ // address space (<-> entropic_base is set), we want to pass DISABLE_ASLR to
+ // skip the mvm-layer ASLR, which would cause our allocations to be placed
+ // at the opposite end of the heap range from other pure data allocations
+ // and use an additional PTE
+ if (main->xzmz_narrow_bucketing && !main->xzmz_use_ranges && entropic_base) {
+ flags |= DISABLE_ASLR;
+ }
+#endif
+
+#if CONFIG_MTE
+ if (rga_flags & XZM_RANGE_GROUP_ALLOC_FLAGS_MTE) {
+ flags |= MALLOC_MTE_TAGGABLE;
+ }
+#endif
+
+ int tag = _xzm_range_group_vm_tag_for_segment(size, huge);
+ if (os_likely(align == 0)) {
+ return mvm_allocate_pages_plat(size, XZM_SEGMENT_SHIFT, flags, tag, map);
+ } else {
+ // mvm_allocate_pages_plat takes the log2 of the alignment
+ size_t align_pow = __builtin_ctzl(align);
+ xzm_debug_assert(align_pow < UINT8_MAX);
+ align_pow = MAX(align_pow, XZM_SEGMENT_SHIFT);
+ return mvm_allocate_pages_plat(size, align_pow, flags, tag, map);
+ }
+}
+
+MALLOC_USED
+static void * __alloc_size(1)
+_xzm_range_group_alloc_anywhere_segment(mach_vm_address_t hint, size_t size,
+ size_t align, plat_map_t *map, xzm_range_group_alloc_flags_t rga_flags)
+{
+ bool huge = (rga_flags & XZM_RANGE_GROUP_ALLOC_FLAGS_HUGE);
+ bool purgeable = (rga_flags & XZM_RANGE_GROUP_ALLOC_FLAGS_PURGEABLE);
+
+ (void)map;
+ int tag = _xzm_range_group_vm_tag_for_segment(size, huge);
+
+ mach_vm_address_t vm_addr = hint;
+ mach_vm_size_t allocation_size = (mach_vm_size_t)size;
+ int flags = VM_FLAGS_ANYWHERE | VM_MAKE_TAG(tag);
+ if (os_unlikely(purgeable)) {
+ flags |= VM_FLAGS_PURGABLE;
+ }
+
+#if CONFIG_MTE
+ if (rga_flags & XZM_RANGE_GROUP_ALLOC_FLAGS_MTE) {
+ flags |= VM_FLAGS_MTE;
+ }
+#endif
+
+ align = MAX(align, XZM_SEGMENT_SIZE);
+ // alignment must be a power of 2 for the allocation mask to work
+ xzm_debug_assert(powerof2(align));
+ mach_vm_offset_t allocation_mask = (mach_vm_offset_t)align - 1;
+ kern_return_t kr = mach_vm_map(mach_task_self(), &vm_addr, allocation_size,
+ allocation_mask, flags, MEMORY_OBJECT_NULL, 0, FALSE,
+ VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
+ if (kr) {
+ if (kr != KERN_NO_SPACE) {
+ xzm_debug_abort_with_reason("Failed to allocate data segment", kr);
+ malloc_zone_error(0, false,
+ "Failed to allocate segment (size=%lu, flags=%x, kr=%d)\n",
+ (unsigned long)size, flags, kr);
+ }
+
+ return NULL;
+ }
+
+ xzm_debug_assert(vm_addr);
+ xzm_debug_assert(vm_addr % align == 0);
+ return (void *)vm_addr;
+}
+
+static uintptr_t
+_xzm_range_group_bump_alloc_segment(xzm_range_group_t rg, size_t size,
+ bool warn_on_exhaustion)
+{
+ uintptr_t segment_addr = 0;
+
+ if (rg->xzrg_warned_full) {
+ return segment_addr;
+ }
+
+ // Reserve space for a new segment
+ _malloc_lock_lock(&rg->xzrg_lock);
+ if (rg->xzrg_remaining >= size) {
+ if (rg->xzrg_next == rg->xzrg_skip_addr) {
+ if (rg->xzrg_direction == XZM_FRONT_INCREASING) {
+ rg->xzrg_next += rg->xzrg_skip_size;
+ } else {
+ xzm_debug_assert(rg->xzrg_direction == XZM_FRONT_DECREASING);
+ rg->xzrg_next -= rg->xzrg_skip_size;
+ }
+ }
+
+ // In the decreasing direction, xzrg_next points to the _end_ of what
+ // will be the next segment we serve, and we subtract the size to be
+ // allocated from its initial value. In the increasing direction, the
+ // initial value is the start of the segment we're going to serve, and
+ // we increase afterward.
+ if (rg->xzrg_direction == XZM_FRONT_DECREASING) {
+ rg->xzrg_next -= size;
+ }
+
+ segment_addr = rg->xzrg_next;
+ xzm_debug_assert(segment_addr % size == 0);
+
+ if (rg->xzrg_direction == XZM_FRONT_INCREASING) {
+ rg->xzrg_next += size;
+ }
+
+ rg->xzrg_remaining -= size;
+ }
+
+ if (!segment_addr && warn_on_exhaustion) {
+ if (!rg->xzrg_warned_full) {
+ rg->xzrg_warned_full = true;
+ malloc_report(ASL_LEVEL_WARNING, "Failed to allocate segment from range group - out of space\n");
+ }
+ }
+
+ _malloc_lock_unlock(&rg->xzrg_lock);
+
+ return segment_addr;
+}
+
+static void * __alloc_size(2)
+_xzm_range_group_alloc_data_segment(xzm_range_group_t rg, size_t size,
+ size_t alignment, plat_map_t *map, xzm_range_group_alloc_flags_t rga_flags)
+{
+ xzm_debug_assert(rg->xzrg_id == XZM_RANGE_GROUP_DATA);
+
+#if CONFIG_VM_USER_RANGES || CONFIG_MACOS_RANGES
+ if (rg->xzrg_main_ref->xzmz_use_ranges) {
+ // On systems with VM user ranges, an ANYWHERE allocation with one of the
+ // VM_MEMORY_MALLOC tags will be placed in the data range automatically.
+ mach_vm_address_t hint = 0;
+
+#if CONFIG_MACOS_RANGES
+ // On macOS, the data range isn't strongly isolated. We just choose an
+ // otherwise empty normal range of the address space to allocate into
+ // using a hint.
+ hint = rg->xzrg_base;
+#endif // CONFIG_MACOS_RANGES
+
+ return _xzm_range_group_alloc_anywhere_segment(hint, size, alignment,
+ map, rga_flags);
+ }
+#endif // CONFIG_VM_USER_RANGES || CONFIG_MACOS_RANGES
+
+ return _xzm_range_group_alloc_mvm_segment(rg->xzrg_main_ref, size,
+ alignment, map, rga_flags);
+}
+
+static void * __alloc_size(2)
+_xzm_range_group_alloc_ptr_segment(xzm_range_group_t rg, size_t size,
+ plat_map_t *map, xzm_range_group_alloc_flags_t rga_flags)
+{
+ xzm_debug_assert(rg->xzrg_id == XZM_RANGE_GROUP_PTR);
+ xzm_debug_assert(!(rga_flags & XZM_RANGE_GROUP_ALLOC_FLAGS_HUGE));
+ xzm_debug_assert(size == XZM_SEGMENT_SIZE);
+ xzm_debug_assert(rg->xzrg_main_ref->xzmz_segment_group_count !=
+ XZM_SEGMENT_GROUP_IDS_COUNT_DATA_ONLY);
+
+#if CONFIG_VM_USER_RANGES || CONFIG_MACOS_RANGES
+ if (rg->xzrg_main_ref->xzmz_use_ranges)
+#else
+ if ((0))
+#endif // MALLOC_TARGET_EXCLAVES
+ {
+ bool allow_fallback = false;
+#if CONFIG_MALLOC_PROCESS_IDENTITY && CONFIG_MACOS_RANGES
+ if (!malloc_process_is_security_critical(malloc_process_identity)) {
+ allow_fallback = true;
+ }
+#endif // CONFIG_MALLOC_PROCESS_IDENTITY && CONFIG_MACOS_RANGES
+
+ mach_vm_address_t segment_addr = _xzm_range_group_bump_alloc_segment(rg,
+ size, !allow_fallback);
+ if (!segment_addr) {
+#if CONFIG_MALLOC_PROCESS_IDENTITY && CONFIG_MACOS_RANGES
+ if (allow_fallback) {
+ goto fallback;
+ }
+#endif // CONFIG_MALLOC_PROCESS_IDENTITY && CONFIG_MACOS_RANGES
+
+ xzm_debug_abort("Pointer range exhausted");
+ return NULL;
+ }
+
+#if CONFIG_VM_USER_RANGES || CONFIG_MACOS_RANGES
+ mach_vm_address_t vm_addr = segment_addr;
+ mach_vm_size_t vm_size = (mach_vm_size_t)size;
+ int alloc_flags = VM_FLAGS_OVERWRITE |
+ VM_MAKE_TAG(VM_MEMORY_MALLOC_SMALL);
+
+#if CONFIG_MTE
+ if (rga_flags & XZM_RANGE_GROUP_ALLOC_FLAGS_MTE) {
+ alloc_flags |= VM_FLAGS_MTE;
+ }
+#endif
+
+ kern_return_t kr = mach_vm_map(mach_task_self(), &vm_addr, vm_size,
+ /* mask */ 0, alloc_flags, MEMORY_OBJECT_NULL,
+ /* offset */ 0, /* copy */ FALSE, VM_PROT_DEFAULT,
+ VM_PROT_ALL, VM_INHERIT_DEFAULT);
+ if (kr != KERN_SUCCESS) {
+ xzm_abort_with_reason(
+ "pointer range mach_vm_map() overwrite failed", kr);
+ }
+#endif // CONFIG_VM_USER_RANGES || CONFIG_MACOS_RANGES
+
+ return (void *)segment_addr;
+ }
+
+#if CONFIG_MALLOC_PROCESS_IDENTITY && CONFIG_MACOS_RANGES
+fallback:
+#endif // CONFIG_MALLOC_PROCESS_IDENTITY && CONFIG_MACOS_RANGES
+ return _xzm_range_group_alloc_mvm_segment(rg->xzrg_main_ref, size, 0, map,
+ rga_flags);
+}
+
+static void * __alloc_size(2)
+xzm_range_group_alloc_segment(xzm_range_group_t rg, size_t size,
+ size_t alignment, plat_map_t *map,
+ xzm_range_group_alloc_flags_t rga_flags)
+{
+ if (rg->xzrg_id == XZM_RANGE_GROUP_DATA) {
+ return _xzm_range_group_alloc_data_segment(rg, size, alignment, map,
+ rga_flags);
+ } else {
+ xzm_debug_assert(alignment == 0);
+ // Only huge segment bodies (which must be in the data range) can be
+ // purgable
+ xzm_debug_assert(!(rga_flags & XZM_RANGE_GROUP_ALLOC_FLAGS_PURGEABLE));
+ return _xzm_range_group_alloc_ptr_segment(rg, size, map, rga_flags);
+ }
+}
+
+static void
+xzm_range_group_free_segment_body(xzm_range_group_t rg, void *body,
+ size_t size, plat_map_t *map)
+{
+ xzm_debug_assert(rg->xzrg_id == XZM_RANGE_GROUP_DATA);
+
+ int debug_flags = 0;
+#ifdef DEBUG
+ debug_flags = MALLOC_ABORT_ON_ERROR;
+#endif // DEBUG
+ mvm_deallocate_plat(body, size, debug_flags, map);
+}
+
+#if CONFIG_VM_USER_RANGES
+static bool
+parse_void_ranges(struct mach_vm_range *left_void,
+ struct mach_vm_range *right_void)
+{
+ char buf[256];
+ size_t bsz = sizeof(buf) - 1;
+ char *s;
+
+ int rc = sysctlbyname("vm.malloc_ranges", buf, &bsz, NULL, 0);
+ if (rc == -1) {
+ switch (errno) {
+ case ENOENT:
+#ifdef DEBUG
+ malloc_report(ASL_LEVEL_INFO, "VM user ranges not supported\n");
+#endif
+ break;
+ case EPERM:
+ // TODO: make this fatal in processes that strictly need VM user
+ // ranges
+ malloc_report(ASL_LEVEL_ERR,
+ "sysctlbyname(\"vm.malloc_ranges\") denied\n");
+ break;
+ default:
+ xzm_abort_with_reason("sysctlbyname(\"vm.malloc_ranges\") failed",
+ errno);
+ break;
+ }
+ return false;
+ }
+ buf[bsz] = '\0';
+
+ s = buf;
+
+ left_void->min_address = strtoull(s, &s, 16);
+ s++;
+
+ left_void->max_address = strtoull(s, &s, 16);
+ s++;
+
+ right_void->min_address = strtoull(s, &s, 16);
+ s++;
+
+ right_void->max_address = strtoull(s, &s, 16);
+
+ return true;
+}
+#endif // CONFIG_VM_USER_RANGES
+
+#if MALLOC_TARGET_EXCLAVES || CONFIG_VM_USER_RANGES || CONFIG_MACOS_RANGES
+
+#define XZM_RANGE_SEPARATION GiB(4)
+
+#define XZM_DATA_RANGE_SIZE GiB(10)
+#define XZM_POINTER_RANGE_SIZE GiB(16)
+
+#define XZM_PAGE_TABLE_GRANULE MiB(32)
+#define XZM_PAGE_TABLE_BITS 25
+
+// Exclaves don't have struct mach_vm_range, so we'll just define our own little
+// identical type
+struct xzm_vm_range {
+ uint64_t min_address;
+ uint64_t max_address;
+};
+
+
+static_assert(sizeof(struct mach_vm_range) == sizeof(struct xzm_vm_range),
+ "compatible vm range size");
+static_assert(offsetof(struct mach_vm_range, min_address) ==
+ offsetof(struct xzm_vm_range, min_address),
+ "compatible vm range min_address offset");
+static_assert(offsetof(struct mach_vm_range, max_address) ==
+ offsetof(struct xzm_vm_range, max_address),
+ "compatible vm range max_address offset");
+
+
+static void
+_xzm_main_malloc_zone_init_ptr_fronts(xzm_range_group_t range_groups,
+ size_t allocation_front_count, struct xzm_vm_range *ranges,
+ size_t range_count, plat_map_t *map)
+{
+ xzm_assert(allocation_front_count == 2);
+ xzm_assert(range_count > 0);
+ xzm_assert(ranges[0].min_address < ranges[0].max_address);
+ if (range_count > 1) {
+#if CONFIG_VM_USER_RANGES
+ xzm_assert(range_count == 2);
+ xzm_assert(ranges[1].min_address > ranges[0].max_address);
+ xzm_assert(ranges[1].min_address < ranges[1].max_address);
+#else
+ xzm_abort_with_reason("unsupported range_count", range_count);
+#endif
+ }
+
+ uint64_t total_span = 0;
+ for (size_t i = 0; i < range_count; i++) {
+ total_span += ranges[i].max_address - ranges[i].min_address;
+ }
+ uint64_t middle_pte_offset = roundup(total_span / 2,
+ XZM_PAGE_TABLE_GRANULE);
+
+ if (ranges[0].min_address + middle_pte_offset >= ranges[0].max_address) {
+ xzm_assert(range_count == 2);
+ middle_pte_offset += ranges[1].min_address - ranges[0].max_address;
+ }
+
+ uint64_t middle_pte = ranges[0].min_address + middle_pte_offset;
+ xzm_assert(middle_pte % XZM_PAGE_TABLE_GRANULE == 0);
+
+ uint64_t middle_pte_middle = middle_pte + (XZM_PAGE_TABLE_GRANULE / 2);
+
+ uint64_t rg_up_size = 0;
+ uint64_t rg_up_skip_addr = 0;
+ uint64_t rg_up_skip_size = 0;
+
+ uint64_t rg_down_size = 0;
+ uint64_t rg_down_skip_addr = 0;
+ uint64_t rg_down_skip_size = 0;
+
+ if (range_count == 2) {
+ if (middle_pte_middle > ranges[0].max_address) {
+ xzm_assert(middle_pte_middle > ranges[1].min_address);
+ xzm_assert(middle_pte_middle < ranges[1].max_address);
+
+ // The right side (up) is not split
+ rg_up_size = ranges[1].max_address - middle_pte_middle;
+
+ // The left side (down) is split
+ rg_down_size = (middle_pte_middle - ranges[1].min_address) +
+ (ranges[0].max_address - ranges[0].min_address);
+ rg_down_skip_addr = ranges[1].min_address;
+ rg_down_skip_size = ranges[1].min_address - ranges[0].max_address;
+ } else {
+ xzm_assert(middle_pte_middle < ranges[0].max_address);
+ xzm_assert(middle_pte_middle > ranges[0].min_address);
+
+ // The right side (up) is split
+ rg_up_size = (ranges[1].max_address - ranges[1].min_address) +
+ (ranges[0].max_address - middle_pte_middle);
+ rg_up_skip_addr = ranges[0].max_address;
+ rg_up_skip_size = ranges[1].min_address - ranges[0].max_address;
+
+ // The left side (down) is not split
+ rg_down_size = middle_pte_middle - ranges[0].min_address;
+ }
+ } else {
+ xzm_assert(ranges[0].min_address < middle_pte_middle);
+ xzm_assert(middle_pte_middle < ranges[0].max_address);
+
+ rg_up_size = ranges[0].max_address - middle_pte_middle;
+ rg_down_size = middle_pte_middle - ranges[0].min_address;
+ }
+
+ xzm_range_group_t ptr_rg_up = &range_groups[XZM_RANGE_GROUP_PTR + 0];
+ xzm_debug_assert(ptr_rg_up->xzrg_id == XZM_RANGE_GROUP_PTR);
+
+ ptr_rg_up->xzrg_base = middle_pte_middle;
+ ptr_rg_up->xzrg_next = ptr_rg_up->xzrg_base;
+ ptr_rg_up->xzrg_size = rg_up_size;
+ ptr_rg_up->xzrg_remaining = ptr_rg_up->xzrg_size;
+ ptr_rg_up->xzrg_skip_addr = rg_up_skip_addr;
+ ptr_rg_up->xzrg_skip_size = rg_up_skip_size;
+ ptr_rg_up->xzrg_direction = XZM_FRONT_INCREASING;
+
+ xzm_range_group_t ptr_rg_down = &range_groups[XZM_RANGE_GROUP_PTR + 1];
+ xzm_debug_assert(ptr_rg_down->xzrg_id == XZM_RANGE_GROUP_PTR);
+
+ ptr_rg_down->xzrg_base = middle_pte_middle;
+ ptr_rg_down->xzrg_next = ptr_rg_down->xzrg_base;
+ ptr_rg_down->xzrg_size = rg_down_size;
+ ptr_rg_down->xzrg_remaining = ptr_rg_down->xzrg_size;
+ ptr_rg_down->xzrg_skip_addr = rg_down_skip_addr;
+ ptr_rg_down->xzrg_skip_size = rg_down_skip_size;
+ ptr_rg_down->xzrg_direction = XZM_FRONT_DECREASING;
+
+}
+
+#if CONFIG_VM_USER_RANGES
+
+static void
+_xzm_main_malloc_zone_choose_ptr_ranges(struct mach_vm_range left_void,
+ struct mach_vm_range right_void, size_t ptr_rg_size, uint64_t entropy,
+ struct mach_vm_range *ranges_out, size_t *ranges_count_inout)
+{
+ // For now, the caller needs to be able to handle 2 result ranges
+ xzm_assert(*ranges_count_inout == 2);
+
+ xzm_assert(left_void.min_address);
+ xzm_assert(left_void.max_address >= left_void.min_address);
+ xzm_assert(right_void.min_address >= left_void.max_address);
+ xzm_assert(right_void.max_address >= right_void.min_address);
+
+#define xzm_trunc_page_table_granule(addr) \
+ ((addr) & ~(XZM_PAGE_TABLE_GRANULE - 1))
+
+ // Note: the void boundaries should already be aligned to the page table
+ // granule anyway
+
+ // |<----------------total span--------------->|
+ // |<-left void->|<-data body->|<-right void->|
+ // |<usable>|<pad>|<-data body->|<pad>|<usable>|
+ // |<usable>|<-------data span------->|<usable>|
+
+ uint64_t left_void_min = roundup(left_void.min_address,
+ XZM_PAGE_TABLE_GRANULE);
+ uint64_t left_void_limit =
+ xzm_trunc_page_table_granule(left_void.max_address);
+ if (left_void_limit < left_void_min) {
+ // Shouldn't ever happen - the kernel would have to give us a
+ // sub-granule left void that isn't granule-aligned. If it does, we can
+ // pretend it gave us an empty left void that's actually "in" the data
+ // range, technically.
+ left_void_min = left_void_limit;
+ }
+ xzm_assert(left_void_min <= left_void_limit);
+
+ uint64_t right_void_min = roundup(right_void.min_address,
+ XZM_PAGE_TABLE_GRANULE);
+ uint64_t right_void_limit =
+ xzm_trunc_page_table_granule(right_void.max_address);
+ if (right_void_limit < right_void_min) {
+ // Same thing, shouldn't happen
+ right_void_limit = right_void_min;
+ }
+ xzm_assert(right_void_min <= right_void_limit);
+
+ xzm_assert(left_void_limit <= right_void_min);
+
+ uint64_t total_span = right_void_limit - left_void_min;
+
+ uint64_t data_body_span = right_void_min - left_void_limit;
+
+ uint64_t data_left_pad = MIN(XZM_RANGE_SEPARATION,
+ left_void_limit - left_void_min);
+ uint64_t data_left_pad_start = left_void_limit - data_left_pad;
+
+ uint64_t data_right_pad = MIN(XZM_RANGE_SEPARATION,
+ right_void_limit - right_void_min);
+ uint64_t data_right_pad_limit = right_void_min + data_right_pad;
+
+ uint64_t data_span = data_left_pad + data_body_span + data_right_pad;
+
+ xzm_assert(data_span < total_span);
+ uint64_t usable_space = total_span - data_span;
+
+ xzm_assert(usable_space >= ptr_rg_size);
+ uint64_t starting_space = usable_space - ptr_rg_size;
+
+ xzm_assert(starting_space % XZM_PAGE_TABLE_GRANULE == 0);
+
+ // Note: + 1 because the final granule address is also usable
+ uint64_t starting_candidate_granules =
+ (starting_space / XZM_PAGE_TABLE_GRANULE) + 1;
+
+ // Note: start_granules is small relative to entropy, so the modulo bias is
+ // not significant
+ uint64_t start_granule = entropy % starting_candidate_granules;
+
+ uint64_t start_address = left_void_min +
+ (start_granule * XZM_PAGE_TABLE_GRANULE);
+
+ if (start_address >= data_left_pad_start) {
+ start_address += data_span;
+ }
+
+ uint64_t limit_address = start_address + ptr_rg_size;
+
+ if (start_address < data_left_pad_start &&
+ limit_address > data_left_pad_start) {
+ // The pointer range is split across the data range
+ ranges_out[0] = (struct mach_vm_range){
+ .min_address = start_address,
+ .max_address = data_left_pad_start,
+ };
+
+ uint64_t left_range_span = data_left_pad_start - start_address;
+ uint64_t right_range_span = ptr_rg_size - left_range_span;
+ ranges_out[1] = (struct mach_vm_range){
+ .min_address = data_right_pad_limit,
+ .max_address = data_right_pad_limit + right_range_span,
+ };
+
+ *ranges_count_inout = 2;
+ } else {
+ // The pointer range is fully on one side of the data range
+ ranges_out[0] = (struct mach_vm_range){
+ .min_address = start_address,
+ .max_address = limit_address,
+ };
+
+ *ranges_count_inout = 1;
+ }
+}
+
+static kern_return_t
+_xzm_main_malloc_zone_create_ptr_range(struct mach_vm_range range)
+{
+ // It's important that we use a malloc tag in the recipe so that the kernel
+ // gives us a single object rather than chunking into many.
+ mach_vm_range_recipe_v1_t recipe = {
+ .range = range,
+ .range_tag = MACH_VM_RANGE_FIXED,
+ .vm_tag = VM_MEMORY_MALLOC_SMALL,
+ };
+
+ kern_return_t kr = mach_vm_range_create(mach_task_self(),
+ MACH_VM_RANGE_FLAVOR_V1, (mach_vm_range_recipes_raw_t)&recipe,
+ sizeof(recipe));
+ switch (kr) {
+ case KERN_SUCCESS:
+ break;
+ case KERN_DENIED:
+ // TODO: make this fatal in processes that strictly need VM user ranges
+ malloc_report(ASL_LEVEL_ERR, "mach_vm_range_create() denied\n");
+ return kr;
+ case KERN_NOT_SUPPORTED:
+ // Strange - in a process that doesn't have VM user ranges we would have
+ // expected the sysctl to fail
+ xzm_debug_abort("mach_vm_range_create() not supported?");
+ return kr;
+ default:
+ xzm_abort_with_reason("unexpected error from mach_vm_range_create()",
+ kr);
+ return kr;
+ }
+
+ // Avoid malloc-no-CoW semantics on the pointer range reservation by
+ // replacing the VM object for it with one that has a non-malloc tag.
+ // Giving it VM_PROT_NONE causes the kernel to give us a single object
+ // rather than chunking (which is important to avoid creating tons of
+ // pointless VM objects), and hides it in vmmap by default.
+ mach_vm_address_t overwrite_addr = (mach_vm_address_t)range.min_address;
+ mach_vm_size_t overwrite_size =
+ (mach_vm_size_t)(range.max_address - range.min_address);
+ int alloc_flags = VM_FLAGS_OVERWRITE;
+ kr = mach_vm_map(mach_task_self(), &overwrite_addr, overwrite_size,
+ /* mask */ 0, alloc_flags, MEMORY_OBJECT_NULL, /* offset */ 0,
+ /* copy */ FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
+ if (kr != KERN_SUCCESS) {
+ xzm_abort_with_reason(
+ "pointer range initial overwrite failed", kr);
+ }
+
+ return KERN_SUCCESS;
+}
+
+#endif // CONFIG_VM_USER_RANGES
+
+#endif // MALLOC_TARGET_EXCLAVES || CONFIG_VM_USER_RANGES || CONFIG_MACOS_RANGES
+
+void
+xzm_main_malloc_zone_init_range_groups(xzm_main_malloc_zone_t main)
+{
+ // Basic initialization is done in xzm_main_malloc_zone_create() - here we
+ // mainly deal with VM user ranges.
+ MALLOC_STATIC_ASSERT(XZM_RANGE_GROUP_COUNT == 3,
+ "all range groups need to be initialized");
+
+#if CONFIG_VM_USER_RANGES
+ struct mach_vm_range left_void, right_void;
+ bool user_ranges_supported = parse_void_ranges(&left_void, &right_void);
+ if (!user_ranges_supported) {
+ return;
+ }
+
+ // VM user range support:
+ //
+ // We'll use the kernel default heap range for the DATA range.
+ //
+ // The ranges in the PTR range group:
+ // - Should be separated from the data range (as defined by
+ // [void1.max_address, void2.min_address)) by at least 4G
+ // - Should allow each allocation front to span 8G, possibly crossing the
+ // DATA range if necessary
+
+ // The configurations we support are:
+ // - No user ranges at all, in which case we shouldn't get here
+ // - User ranges support with 2 allocation fronts
+ if (main->xzmz_allocation_front_count != 2) {
+ xzm_abort_with_reason("unsupported allocation front count",
+ main->xzmz_allocation_front_count);
+ }
+
+ size_t ptr_rg_size = XZM_POINTER_RANGE_SIZE;
+
+ struct mach_vm_range ptr_ranges[2];
+ size_t ptr_range_count = 2;
+ _xzm_main_malloc_zone_choose_ptr_ranges(left_void, right_void, ptr_rg_size,
+ malloc_entropy[1], ptr_ranges, &ptr_range_count);
+
+ for (size_t i = 0; i < ptr_range_count; i++) {
+ kern_return_t kr =
+ _xzm_main_malloc_zone_create_ptr_range(ptr_ranges[i]);
+ if (kr != KERN_SUCCESS) {
+ return;
+ }
+ }
+
+ main->xzmz_use_ranges = true;
+
+ _xzm_main_malloc_zone_init_ptr_fronts(main->xzmz_range_groups,
+ main->xzmz_allocation_front_count,
+ (struct xzm_vm_range *)ptr_ranges, ptr_range_count, NULL);
+
+ xzm_range_group_t data_rg = &main->xzmz_range_groups[XZM_RANGE_GROUP_DATA];
+ xzm_debug_assert(data_rg->xzrg_id == XZM_RANGE_GROUP_DATA);
+
+ // Note: these are recorded purely for introspection purposes
+ data_rg->xzrg_base = (mach_vm_address_t)left_void.max_address;
+ data_rg->xzrg_size = right_void.min_address - left_void.max_address;
+
+ // end of CONFIG_VM_USER_RANGES
+#elif CONFIG_MACOS_RANGES
+ // We want a similar layout to embedded, with:
+ // - A data range and a pointer range located in the first 64GB (L2) of the
+ // address space to economize PTE usage
+ // - Guaranteed minimum separation between the pointer range and everything
+ // else
+ // - Both ranges separated from the traditional "low space" by a few GB of
+ // buffer distance
+ //
+ // However, on macOS there are no "voids" for us to need the
+ // mach_vm_range_create() interface to access, nor is there a special data
+ // range that the kernel knows about. Instead, we create our own strongly
+ // isolated pointer range reservation, and have a more relaxed model for the
+ // data range that permits reuse with general VA, allowing us to model it as
+ // a simple starting address hint. An implication of the data range not
+ // being strongly isolated is that it doesn't need to be contiguous.
+ //
+ // Either range should be able to grow to their standard size without
+ // overflowing the first L2.
+ //
+ // So, our placement strategy will be:
+ // - Place the pointer range, with its guards, in the space
+ // - Then choose the data range hint somewhere in the remaining space
+
+ // Start at 16GB to leave room in the low space for other VM allocations
+#define XZM_MACOS_RANGES_START GiB(16)
+ // End at 63GB to avoid crossing the commpage
+#define XZM_MACOS_RANGES_END GiB(63)
+
+ uint64_t range_first_candidate = XZM_MACOS_RANGES_START;
+ uint64_t ptr_reservation_size = XZM_RANGE_SEPARATION +
+ XZM_POINTER_RANGE_SIZE + XZM_RANGE_SEPARATION;
+ uint64_t range_last_candidate = XZM_MACOS_RANGES_END - ptr_reservation_size;
+
+ uint64_t ptr_candidate_span = range_last_candidate - range_first_candidate;
+ uint64_t ptr_candidate_granules =
+ ptr_candidate_span / XZM_PAGE_TABLE_GRANULE;
+
+ uint64_t ptr_entropy = (uint32_t)(malloc_entropy[1]);
+ uint64_t ptr_granule = ptr_entropy % ptr_candidate_granules;
+
+ uint64_t ptr_start =
+ range_first_candidate + (ptr_granule * XZM_PAGE_TABLE_GRANULE);
+
+ xzm_assert(ptr_start + ptr_reservation_size <= XZM_MACOS_RANGES_END);
+
+ // Reserve the pointer range with a big max-protection == PROT_NONE region.
+ // It is important that we not give it a malloc tag or protection above
+ // PROT_NONE to avoid chunking or special CoW treatment from the VM - we
+ // need for this to be just one entry.
+ mach_vm_address_t ptr_addr = (mach_vm_address_t)ptr_start;
+ mach_vm_size_t reservation_size = (mach_vm_size_t)ptr_reservation_size;
+ int alloc_flags = 0; // fixed, no tag
+ kern_return_t kr = mach_vm_map(mach_task_self(), &ptr_addr,
+ reservation_size, /* mask */ 0, alloc_flags, MEMORY_OBJECT_NULL,
+ /* offset */ 0, /* copy */ FALSE, VM_PROT_NONE, VM_PROT_NONE,
+ VM_INHERIT_DEFAULT);
+ if (kr != KERN_SUCCESS) {
+ // We could fall back to mvm allocation, but we want this to fail loudly
+ // if something starts preventing us from being able to make the
+ // reservation we need
+ xzm_abort_with_reason(
+ "pointer range initial reservation failed", kr);
+ }
+
+ main->xzmz_use_ranges = true;
+
+ mach_vm_address_t ptr_base = ptr_addr + XZM_RANGE_SEPARATION;
+
+ struct xzm_vm_range range = {
+ .min_address = ptr_base,
+ .max_address = ptr_base + XZM_POINTER_RANGE_SIZE,
+ };
+ _xzm_main_malloc_zone_init_ptr_fronts(main->xzmz_range_groups,
+ main->xzmz_allocation_front_count, &range, 1, NULL);
+
+ // Choose a starting hint for the data range
+
+ uint64_t data_candidate_span = ptr_candidate_span - XZM_DATA_RANGE_SIZE;
+ uint64_t data_candidate_granules =
+ data_candidate_span / XZM_PAGE_TABLE_GRANULE;
+
+ uint64_t data_entropy = malloc_entropy[1] >> 32;
+ uint64_t data_granule = data_entropy % data_candidate_granules;
+
+ uint64_t data_start;
+ if (data_granule < ptr_granule) {
+ data_start = XZM_MACOS_RANGES_START +
+ (data_granule * XZM_PAGE_TABLE_GRANULE);
+ } else {
+ uint64_t ptr_reservation_granules =
+ ptr_reservation_size / XZM_PAGE_TABLE_GRANULE;
+ uint64_t data_adjusted_granule =
+ data_granule + ptr_reservation_granules;
+ data_start = XZM_MACOS_RANGES_START +
+ (data_adjusted_granule * XZM_PAGE_TABLE_GRANULE);
+ }
+
+ xzm_assert(data_start < ptr_start ||
+ data_start >= ptr_start + ptr_reservation_size);
+ xzm_assert(data_start + XZM_DATA_RANGE_SIZE <= XZM_MACOS_RANGES_END);
+
+ xzm_range_group_t data_rg = &main->xzmz_range_groups[XZM_RANGE_GROUP_DATA];
+ xzm_debug_assert(data_rg->xzrg_id == XZM_RANGE_GROUP_DATA);
+
+ data_rg->xzrg_base = (mach_vm_address_t)data_start;
+#endif // CONFIG_MACOS_RANGES
+}
+
+#pragma mark segment group
+
+static void _xzm_segment_group_clear_chunk(xzm_segment_group_t sg,
+ uint8_t *start, size_t size);
+
+static void _xzm_segment_group_split_huge_segment(xzm_segment_group_t sg,
+ xzm_segment_t segment, xzm_slice_count_t required_slices);
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+
+static void
+__xzm_segment_cache_remove(xzm_segment_cache_t cache,
+ xzm_segment_t segment)
+{
+ xzm_debug_assert(cache->xzsc_count > 0);
+ cache->xzsc_count--;
+ TAILQ_REMOVE(&cache->xzsc_head, segment, xzs_cache_entry);
+}
+
+static void
+__xzm_segment_cache_insert(xzm_segment_cache_t cache, xzm_segment_t segment)
+{
+ xzm_debug_assert(cache->xzsc_count < cache->xzsc_max_count);
+ TAILQ_INSERT_HEAD(&cache->xzsc_head, segment, xzs_cache_entry);
+ cache->xzsc_count++;
+}
+
+static void
+_xzm_segment_group_cache_invalidate(xzm_segment_group_t sg,
+ xzm_segment_t segment)
+{
+#ifdef DEBUG
+ _malloc_lock_assert_owner(&sg->xzsg_cache.xzsc_lock);
+#endif
+ __xzm_segment_cache_remove(&sg->xzsg_cache, segment);
+ // Free memory backing segment header
+ xzm_metapool_free(&sg->xzsg_main_ref->xzmz_metapools[XZM_METAPOOL_SEGMENT],
+ segment);
+}
+
+static void
+_xzm_segment_group_cache_mark_free(xzm_segment_group_t sg,
+ xzm_segment_t segment)
+{
+#ifdef DEBUG
+ _malloc_lock_assert_owner(&sg->xzsg_cache.xzsc_lock);
+ // Make sure that this segment isn't in the segment table before we put it
+ // into the cache
+ xzm_segment_table_entry_s *entry;
+ entry = _xzm_ptr_to_table_entry(_xzm_segment_start(segment),
+ sg->xzsg_main_ref);
+ xzm_debug_assert(entry->xste_val == 0);
+#endif
+ xzm_debug_assert(segment->xzs_reclaim_id == VM_RECLAIM_ID_NULL);
+
+ xzm_main_malloc_zone_t main = sg->xzsg_main_ref;
+ xzm_reclaim_buffer_t buffer = main->xzmz_reclaim_buffer;
+
+ uint8_t *addr = _xzm_segment_start(segment);
+ size_t size = _xzm_segment_size(segment);
+ segment->xzs_reclaim_id = _xzm_reclaim_mark_free(buffer, addr, size, false);
+ __xzm_segment_cache_insert(&sg->xzsg_cache, segment);
+}
+
+// Attempt to re-use a segment from the cache. Returns true if successful.
+// If unsuccessful, the caller should invalidate the segment's cache entry.
+static bool
+_xzm_segment_group_cache_mark_used(xzm_segment_group_t sg,
+ xzm_segment_t segment)
+{
+#ifdef DEBUG
+ _malloc_lock_assert_owner(&sg->xzsg_cache.xzsc_lock);
+#endif
+ xzm_debug_assert(segment->xzs_reclaim_id != VM_RECLAIM_ID_NULL);
+ xzm_main_malloc_zone_t main = sg->xzsg_main_ref;
+ xzm_reclaim_buffer_t buffer = main->xzmz_reclaim_buffer;
+ mach_vm_reclaim_state_t state;
+
+ state = _xzm_reclaim_mark_used(buffer, segment->xzs_reclaim_id,
+ _xzm_segment_start(segment), _xzm_segment_size(segment), false);
+ if (!mach_vm_reclaim_is_reusable(state)) {
+ // Entry has been reclaimed by the kernel since being placed in cache
+ _xzm_segment_group_cache_invalidate(sg, segment);
+ return false;
+ }
+ segment->xzs_reclaim_id = VM_RECLAIM_ID_NULL;
+ __xzm_segment_cache_remove(&sg->xzsg_cache, segment);
+ return true;
+}
+
+// Evict a segment from the cache
+static void
+_xzm_segment_group_cache_evict(xzm_segment_group_t sg)
+{
+#ifdef DEBUG
+ _malloc_lock_assert_owner(&sg->xzsg_cache.xzsc_lock);
+#endif
+ // approximate the oldest segment by evicting the tail
+ xzm_segment_t segment = TAILQ_LAST(&sg->xzsg_cache.xzsc_head,
+ xzm_segment_cache_head_s);
+ xzm_debug_assert(segment->xzs_reclaim_id != VM_RECLAIM_ID_NULL);
+ if (_xzm_segment_group_cache_mark_used(sg, segment)) {
+ _malloc_lock_unlock(&sg->xzsg_cache.xzsc_lock);
+ // Segment isn't in segment table while in the cache, so pass false for
+ // free_from_table while deallocating
+ _xzm_segment_group_segment_deallocate(sg, segment, false);
+ _malloc_lock_lock(&sg->xzsg_cache.xzsc_lock);
+ }
+}
+
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+// mimalloc: mi_slice_bin8
+MALLOC_ALWAYS_INLINE MALLOC_INLINE
+static size_t
+xzm_slice_bin8(xzm_slice_count_t slice_count)
+{
+ xzm_debug_assert(slice_count != 0);
+ if (slice_count <= 8) {
+ return slice_count - 1;
+ }
+
+ xzm_debug_assert(slice_count <= XZM_SLICES_PER_SEGMENT);
+ slice_count--;
+
+ int msb = 63 - __builtin_clzl(slice_count);
+ return ((msb << 2) + ((slice_count >> (msb - 2)) & 0x3)) - 5;
+}
+
+// mimalloc: mi_slice_bin
+MALLOC_ALWAYS_INLINE MALLOC_INLINE
+static size_t
+xzm_slice_bin(xzm_slice_count_t slice_count)
+{
+ xzm_debug_assert(slice_count * XZM_SEGMENT_SLICE_SIZE <= XZM_SEGMENT_SIZE);
+ xzm_debug_assert(xzm_slice_bin8(XZM_SLICES_PER_SEGMENT) <
+ XZM_SPAN_QUEUE_COUNT);
+ size_t bin = xzm_slice_bin8(slice_count);
+ xzm_debug_assert(bin < XZM_SPAN_QUEUE_COUNT);
+ return bin;
+}
+
+// mimalloc: mi_span_queue_for
+static xzm_span_queue_t
+xzm_span_queue_for(xzm_segment_group_t sg, xzm_slice_count_t slice_count)
+{
+ size_t bin = xzm_slice_bin(slice_count);
+ xzm_span_queue_t sq = &sg->xzsg_spans[bin];
+ xzm_debug_assert(sq->xzsq_slice_count >= slice_count);
+ return sq;
+}
+
+#ifdef DEBUG
+static void
+_xzm_segment_group_assert_correct_span_queue(xzm_segment_group_t sg,
+ xzm_slice_t slice)
+{
+ xzm_slice_kind_t kind = slice->xzc_bits.xzcb_kind;
+ xzm_assert(_xzm_slice_kind_is_free_span(kind));
+
+ xzm_slice_count_t slice_count;
+ if (kind == XZM_SLICE_KIND_SINGLE_FREE) {
+ slice_count = 1;
+ } else {
+ slice_count = slice->xzcs_slice_count;
+ }
+
+ xzm_span_queue_t sq = xzm_span_queue_for(sg, slice_count);
+ xzm_free_span_t span;
+ LIST_FOREACH(span, &sq->xzsq_queue, xzc_entry) {
+ if (span == slice) {
+ return;
+ }
+ }
+ xzm_abort("Didn't find free span in expected span queue");
+}
+
+// mimalloc: mi_segment_is_valid
+static bool
+_xzm_segment_group_segment_is_valid(xzm_segment_group_t sg,
+ xzm_segment_t segment)
+{
+ xzm_assert(segment->xzs_segment_group == sg);
+
+ xzm_slice_t end = _xzm_segment_slices_end(segment);
+ xzm_slice_t slice = _xzm_segment_slices_begin(segment);
+
+ if (segment->xzs_kind == XZM_SEGMENT_KIND_HUGE) {
+ xzm_assert(segment->xzs_used == 1);
+ xzm_chunk_t chunk = slice;
+ xzm_assert(chunk->xzc_bits.xzcb_kind == XZM_SLICE_KIND_HUGE_CHUNK);
+ xzm_assert(chunk->xzcs_slice_count == segment->xzs_slice_count);
+ return true;
+ }
+
+ _malloc_lock_assert_owner(&sg->xzsg_lock);
+
+ while (slice < end) {
+ xzm_slice_kind_t kind = slice->xzc_bits.xzcb_kind;
+ switch (kind) {
+ case XZM_SLICE_KIND_TINY_CHUNK:
+ slice++;
+ break;
+ case XZM_SLICE_KIND_SMALL_CHUNK:
+ case XZM_SLICE_KIND_SMALL_FREELIST_CHUNK:
+ case XZM_SLICE_KIND_LARGE_CHUNK: {
+ size_t slice_index = _xzm_slice_index(segment, slice);
+ size_t slice_count = slice->xzcs_slice_count;
+ xzm_assert(slice_count > 1);
+
+ slice++;
+ size_t extra = MIN(slice_count - 1, XZM_MAX_SLICE_OFFSET);
+ for (size_t i = 1; i <= extra; i++, slice++) {
+ xzm_assert(slice->xzc_bits.xzcb_kind ==
+ XZM_SLICE_KIND_MULTI_BODY);
+ xzm_assert(slice->xzsl_slice_offset_bytes ==
+ (uint32_t)(sizeof(struct xzm_slice_s) * i));
+ }
+
+ size_t last_slice_index = slice_index + slice_count - 1;
+ xzm_assert(last_slice_index < segment->xzs_slice_entry_count);
+ xzm_slice_t last = &segment->xzs_slices[last_slice_index];
+ if (last >= slice) {
+ xzm_assert(last->xzc_bits.xzcb_kind ==
+ XZM_SLICE_KIND_MULTI_BODY);
+ xzm_assert(last->xzsl_slice_offset_bytes ==
+ (uint32_t)(sizeof(struct xzm_slice_s) *
+ (slice_count - 1)));
+ }
+ slice = last + 1;
+ break;
+ }
+ case XZM_SLICE_KIND_GUARD: {
+ size_t slice_count = slice->xzcs_slice_count;
+ slice++;
+
+ for (size_t i = 1; i < slice_count; i++, slice++) {
+ xzm_assert(slice->xzc_bits.xzcb_kind ==
+ XZM_SLICE_KIND_MULTI_BODY);
+ xzm_assert(slice->xzsl_slice_offset_bytes ==
+ (uint32_t)(sizeof(struct xzm_slice_s) * i));
+ }
+
+ // Adjacent guards should always be coalesced
+ if (slice < end) {
+ xzm_assert(slice->xzc_bits.xzcb_kind != XZM_SLICE_KIND_GUARD);
+ }
+
+ break;
+ }
+ case XZM_SLICE_KIND_HUGE_CHUNK:
+ xzm_abort("huge chunk in normal segment");
+ break;
+ case XZM_SLICE_KIND_SINGLE_FREE: {
+ xzm_assert(slice->xzc_mzone_idx == XZM_MZONE_INDEX_INVALID);
+ _xzm_segment_group_assert_correct_span_queue(sg, slice);
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ mach_vm_reclaim_id_t *reclaim_id =
+ _xzm_segment_slice_meta_reclaim_id(segment, slice);
+ xzm_assert(*reclaim_id == VM_RECLAIM_ID_NULL ||
+ !slice->xzc_bits.xzcb_is_pristine);
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+ slice++;
+ break;
+ }
+ case XZM_SLICE_KIND_MULTI_FREE: {
+ xzm_assert(slice->xzc_mzone_idx == XZM_MZONE_INDEX_INVALID);
+ _xzm_segment_group_assert_correct_span_queue(sg, slice);
+
+ size_t slice_index = _xzm_slice_index(segment, slice);
+ size_t slice_count = slice->xzcs_slice_count;
+ xzm_assert(slice_count > 1);
+
+ size_t last_slice_index = slice_index + slice_count - 1;
+ xzm_assert(last_slice_index < segment->xzs_slice_entry_count);
+
+ xzm_slice_t last = &segment->xzs_slices[last_slice_index];
+ xzm_assert(last->xzc_bits.xzcb_kind ==
+ XZM_SLICE_KIND_MULTI_BODY);
+ xzm_assert(last->xzsl_slice_offset_bytes ==
+ (uint32_t)(sizeof(struct xzm_slice_s) * (slice_count - 1)));
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ mach_vm_reclaim_id_t *reclaim_id =
+ _xzm_segment_slice_meta_reclaim_id(segment, slice);
+ xzm_assert(*reclaim_id == VM_RECLAIM_ID_NULL ||
+ !slice->xzc_bits.xzcb_is_pristine);
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+ slice = last + 1;
+ break;
+ }
+ default:
+ xzm_abort_with_reason("Unexpected slice kind", (unsigned)kind);
+ break;
+ }
+ }
+
+ return true;
+}
+#endif // DEBUG
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+
+static void
+_xzm_segment_group_span_mark_free(xzm_segment_group_t sg, xzm_free_span_t span)
+{
+ xzm_debug_assert(_xzm_segment_group_uses_deferred_reclamation(sg));
+ xzm_debug_assert(_xzm_slice_kind_is_free_span(span->xzc_bits.xzcb_kind));
+
+ xzm_main_malloc_zone_t main = sg->xzsg_main_ref;
+ xzm_malloc_zone_t zone = &main->xzmz_base;
+ xzm_reclaim_buffer_t buffer = main->xzmz_reclaim_buffer;
+
+ mach_vm_reclaim_id_t *reclaim_id = _xzm_slice_meta_reclaim_id(zone, span);
+ xzm_debug_assert(*reclaim_id == VM_RECLAIM_ID_NULL);
+ size_t span_size = _xzm_free_span_size(span);
+ uint8_t *span_start = _xzm_slice_start(zone, span);
+
+ *reclaim_id = _xzm_reclaim_mark_free(buffer, span_start, span_size, true);
+}
+
+static bool
+_xzm_segment_group_span_mark_used(xzm_segment_group_t sg, xzm_free_span_t span)
+{
+ xzm_debug_assert(_xzm_segment_group_uses_deferred_reclamation(sg));
+ xzm_debug_assert(_xzm_slice_kind_is_free_span(span->xzc_bits.xzcb_kind));
+ xzm_main_malloc_zone_t main = sg->xzsg_main_ref;
+ xzm_malloc_zone_t zone = &main->xzmz_base;
+
+ if (!_xzm_slice_is_deferred(zone, span)) {
+ // span has not been marked free
+ return true;
+ }
+
+ xzm_reclaim_buffer_t buffer = main->xzmz_reclaim_buffer;
+
+ mach_vm_reclaim_id_t *reclaim_id = _xzm_slice_meta_reclaim_id(zone, span);
+ xzm_debug_assert(*reclaim_id != VM_RECLAIM_ID_NULL);
+ size_t span_size = _xzm_free_span_size(span);
+ uint8_t *span_start = _xzm_slice_start(zone, span);
+ mach_vm_reclaim_state_t state;
+
+ state = _xzm_reclaim_mark_used(buffer, *reclaim_id, span_start,
+ span_size, true);
+ if (mach_vm_reclaim_is_reusable(state)) {
+ *reclaim_id = VM_RECLAIM_ID_NULL;
+ return true;
+ }
+ return false;
+}
+
+static bool
+_xzm_segment_group_span_mark_smaller(xzm_segment_group_t sg,
+ xzm_free_span_t span, xzm_slice_count_t front_free_count,
+ xzm_slice_count_t used_count, xzm_slice_count_t back_free_count)
+{
+ xzm_debug_assert(_xzm_segment_group_uses_deferred_reclamation(sg));
+ xzm_debug_assert(_xzm_slice_kind_is_free_span(span->xzc_bits.xzcb_kind));
+ xzm_debug_assert(front_free_count + used_count + back_free_count ==
+ _xzm_free_span_slice_count(span));
+
+ xzm_main_malloc_zone_t main = sg->xzsg_main_ref;
+ xzm_malloc_zone_t zone = &main->xzmz_base;
+ xzm_reclaim_buffer_t buffer = main->xzmz_reclaim_buffer;
+
+ const bool deferred = _xzm_slice_is_deferred(zone, span);
+ uint64_t *span_id = _xzm_slice_meta_reclaim_id(zone, span);
+ uint8_t *span_start = _xzm_slice_start(zone, span);
+ bool pristine = span->xzc_bits.xzcb_is_pristine;
+
+ // Actual span metadata for the front/middle/back spans has not yet been
+ // updated, we only set the deferred reclaim metadata for these spans
+ xzm_free_span_t back_span = span + front_free_count + used_count;
+ const size_t front_size = front_free_count << XZM_SEGMENT_SLICE_SHIFT;
+ const size_t used_size = used_count << XZM_SEGMENT_SLICE_SHIFT;
+ const size_t back_size = back_free_count << XZM_SEGMENT_SLICE_SHIFT;
+ xzm_debug_assert(!back_size || span_start + front_size + used_size ==
+ _xzm_slice_start(zone, back_span));
+ uint64_t *back_id = back_size ?
+ _xzm_slice_meta_reclaim_id(zone, back_span) : NULL;
+ return xzm_reclaim_mark_smaller(buffer, span_id, back_id, span_start,
+ front_size, used_size, back_size, deferred, pristine, true);
+}
+
+void
+xzm_chunk_mark_free(xzm_malloc_zone_t zone, xzm_chunk_t chunk)
+{
+ xzm_main_malloc_zone_t main = _xzm_malloc_zone_main(zone);
+ xzm_debug_assert(_xzm_chunk_should_defer_reclamation(main, chunk));
+
+ xzm_reclaim_buffer_t buffer = main->xzmz_reclaim_buffer;
+
+ mach_vm_reclaim_id_t *reclaim_id = _xzm_slice_meta_reclaim_id(zone, chunk);
+ xzm_debug_assert(*reclaim_id == VM_RECLAIM_ID_NULL);
+ size_t chunk_size;
+ uint8_t *chunk_start = _xzm_chunk_start_ptr(zone, chunk, &chunk_size);
+
+ *reclaim_id = _xzm_reclaim_mark_free(buffer, chunk_start, chunk_size,
+ true);
+}
+
+bool
+xzm_chunk_mark_used(xzm_malloc_zone_t zone, xzm_chunk_t chunk,
+ bool *was_reclaimed)
+{
+ xzm_main_malloc_zone_t main = _xzm_malloc_zone_main(zone);
+ xzm_debug_assert(_xzm_slice_kind_is_chunk(chunk->xzc_bits.xzcb_kind));
+ xzm_debug_assert(_xzm_chunk_should_defer_reclamation(main, chunk));
+
+ xzm_reclaim_buffer_t buffer = main->xzmz_reclaim_buffer;
+
+ mach_vm_reclaim_id_t *reclaim_id = _xzm_slice_meta_reclaim_id(zone, chunk);
+ xzm_debug_assert(*reclaim_id != VM_RECLAIM_ID_NULL);
+ size_t chunk_size;
+ uint8_t *chunk_start = _xzm_chunk_start_ptr(zone, chunk, &chunk_size);
+ mach_vm_reclaim_state_t state;
+
+ state = _xzm_reclaim_mark_used(buffer, *reclaim_id, chunk_start,
+ chunk_size, true);
+
+ if (was_reclaimed) {
+ *was_reclaimed = (state != VM_RECLAIM_UNRECLAIMED);
+ }
+ if (mach_vm_reclaim_is_reusable(state)) {
+ *reclaim_id = VM_RECLAIM_ID_NULL;
+ return true;
+ }
+ return false;
+}
+
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+// mimalloc: mi_segment_span_free
+//
+// Precondition: sg is locked (except for huge segments)
+static void
+_xzm_segment_group_segment_span_free(xzm_segment_group_t sg,
+ xzm_segment_t segment, xzm_slice_count_t slice_index,
+ xzm_slice_count_t slice_count, bool set_id, bool is_pristine)
+{
+ xzm_debug_assert(slice_count != 0);
+ xzm_debug_assert(slice_index < segment->xzs_slice_entry_count);
+
+ // set first and last slice (the intermediates can be undetermined)
+ //
+ // TODO: leaving the intermediates undetermined means that you can't
+ // reliably check whether an arbitrary slice in a segment belongs to a
+ // chunk. That would be useful for:
+ // - the checked memcpy trick
+ // - malloc_claimed_address()
+ // - possibly other things?
+ //
+ // However, for large allocations it would require updating large numbers of
+ // slices, which is probably not worth the cost
+ xzm_free_span_t span = &segment->xzs_slices[slice_index];
+ span->xzc_bits.xzcb_is_pristine = is_pristine;
+ if (slice_count == 1) {
+ xzm_debug_assert(segment->xzs_kind != XZM_SEGMENT_KIND_HUGE);
+ span->xzc_bits.xzcb_kind = XZM_SLICE_KIND_SINGLE_FREE;
+ } else {
+ span->xzc_bits.xzcb_kind = XZM_SLICE_KIND_MULTI_FREE;
+ span->xzcs_slice_count = slice_count;
+
+ xzm_debug_assert(slice_index + slice_count - 1 < segment->xzs_slice_entry_count);
+ xzm_slice_t last = &segment->xzs_slices[slice_index + slice_count - 1];
+ last->xzc_bits.xzcb_kind = XZM_SLICE_KIND_MULTI_BODY;
+ last->xzsl_slice_offset_bytes =
+ (uint32_t)(sizeof(struct xzm_slice_s) * (slice_count - 1));
+ }
+
+ if (segment->xzs_kind == XZM_SEGMENT_KIND_NORMAL) {
+#ifdef DEBUG
+ _malloc_lock_assert_owner(&sg->xzsg_lock);
+#endif
+ xzm_span_queue_t sq = xzm_span_queue_for(sg, slice_count);
+ LIST_INSERT_HEAD(&sq->xzsq_queue, span, xzc_entry);
+ }
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ if (set_id) {
+ mach_vm_reclaim_id_t *reclaim_id = _xzm_segment_slice_meta_reclaim_id(
+ segment, span);
+ *reclaim_id = VM_RECLAIM_ID_NULL;
+ } else if (!is_pristine) {
+ xzm_debug_assert(*_xzm_segment_slice_meta_reclaim_id(segment, span) !=
+ VM_RECLAIM_ID_NULL);
+ }
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+}
+
+// mimalloc: mi_segment_slice_split
+static xzm_free_span_t
+_xzm_segment_group_segment_slice_split(xzm_segment_group_t sg,
+ xzm_segment_t segment, xzm_free_span_t span,
+ xzm_slice_count_t slice_count, bool uses_dr, bool front)
+{
+ xzm_debug_assert(_xzm_segment_for_slice(&sg->xzsg_main_ref->xzmz_base, span) == segment);
+ xzm_debug_assert(span->xzc_bits.xzcb_kind == XZM_SLICE_KIND_MULTI_FREE);
+ xzm_debug_assert(span->xzcs_slice_count > slice_count);
+ xzm_debug_assert(segment->xzs_kind != XZM_SEGMENT_KIND_HUGE);
+
+ // Find the start and length of the piece being split off and update its
+ // slices
+ xzm_free_span_t retval;
+ xzm_slice_count_t index_to_free;
+ xzm_slice_count_t count_to_free = span->xzcs_slice_count - slice_count;
+ if (front) {
+ retval = span + count_to_free;
+ // We don't update the backpointers here because this span is about to
+ // be used as a large chunk, but we do need to update the slice count
+ // and kind since this span could be given back to _segment_slice_split
+ // to split off the back end
+ retval->xzcs_slice_count = span->xzcs_slice_count - count_to_free;
+ // We could probably copy the bits wholesale, but for now only
+ // explicitly copy the ones we know we need
+ retval->xzc_bits.xzcb_kind = XZM_SLICE_KIND_MULTI_FREE;
+ // Preserve whether the span is pristine, since it was undefined
+ retval->xzc_bits.xzcb_is_pristine = span->xzc_bits.xzcb_is_pristine;
+ index_to_free = _xzm_slice_index(segment, span);
+ } else {
+ retval = span;
+ index_to_free = _xzm_slice_index(segment, span) + slice_count;
+ }
+ // If the segment group uses deferred reclaim, then the reclaim id for the
+ // split span has already been initialized, so don't overwrite it
+ _xzm_segment_group_segment_span_free(sg, segment, index_to_free,
+ count_to_free, !uses_dr, span->xzc_bits.xzcb_is_pristine);
+ return retval;
+}
+
+static void
+_xzm_segment_group_segment_create_guard(xzm_segment_group_t sg,
+ xzm_segment_t segment, xzm_slice_count_t index)
+{
+ xzm_slice_t slice = &segment->xzs_slices[index];
+ xzm_slice_count_t slice_count = 1;
+
+ // Coalesce with next guard page
+ if (&slice[1] < _xzm_segment_slices_end(segment) &&
+ slice[1].xzc_bits.xzcb_kind == XZM_SLICE_KIND_GUARD) {
+ slice_count += slice[1].xzcs_slice_count;
+ }
+
+ // Coalesce with previous guard page
+ if (slice > _xzm_segment_slices_begin(segment)) {
+ xzm_slice_t prev = _xzm_span_slice_first(slice - 1);
+ if (prev->xzc_bits.xzcb_kind == XZM_SLICE_KIND_GUARD) {
+ index -= prev->xzcs_slice_count;
+ slice_count += prev->xzcs_slice_count;
+ slice = prev;
+ }
+ }
+
+ if (slice_count > 1) {
+ // Setup backpointers
+ for (int i = 1; i < slice_count; i++) {
+ slice[i].xzsl_slice_offset_bytes = i * sizeof(struct xzm_slice_s);
+ slice[i].xzc_bits.xzcb_kind = XZM_SLICE_KIND_MULTI_BODY;
+ }
+ } else {
+ // This is a new guard page entry, increment segment count to avoid
+ // trying to free this segment while it has guards
+ segment->xzs_used++;
+ }
+
+ xzm_debug_assert(slice == &segment->xzs_slices[index]);
+
+ slice->xzcs_slice_count = slice_count;
+ // mprotect
+ size_t size = XZM_SEGMENT_SLICE_SIZE * slice_count;
+ void *start = _xzm_segment_slice_index_start(segment, index);
+ int rc = mprotect(start, size, PROT_NONE);
+ if (rc) {
+ xzm_abort_with_reason("Failed to mprotect guard page", errno);
+ }
+
+ // Atomic store maybe?
+ slice->xzc_bits.xzcb_kind = XZM_SLICE_KIND_GUARD;
+}
+
+// mimalloc: mi_segment_span_allocate
+static xzm_chunk_t
+_xzm_segment_group_segment_span_mark_allocated(xzm_segment_group_t sg,
+ xzm_segment_t segment, xzm_slice_kind_t kind, size_t slice_index,
+ xzm_slice_count_t slice_count)
+{
+ xzm_debug_assert(_xzm_slice_kind_is_chunk(kind));
+ xzm_debug_assert(slice_index < segment->xzs_slice_entry_count);
+
+ xzm_slice_t slice = &segment->xzs_slices[slice_index];
+ xzm_chunk_t chunk = slice;
+
+ // set slice back pointers for the first XZM_MAX_SLICE_OFFSET entries
+ size_t extra = MIN(slice_count - 1, XZM_MAX_SLICE_OFFSET);
+ if (slice_index + extra >= segment->xzs_slice_entry_count) {
+ // huge objects may have more slices than available entries in the
+ // segment->xzs_slices table
+ extra = segment->xzs_slice_entry_count - slice_index - 1;
+ }
+ slice++;
+ for (size_t i = 1; i <= extra; i++, slice++) {
+ slice->xzc_bits.xzcb_kind = XZM_SLICE_KIND_MULTI_BODY;
+ slice->xzsl_slice_offset_bytes =
+ (uint32_t)(sizeof(struct xzm_slice_s) * i);
+ }
+
+ // And also for the last one, if not set already (the last one is needed for
+ // coalescing)
+ size_t last_slice_index = slice_index + slice_count - 1;
+ if (kind != XZM_SLICE_KIND_HUGE_CHUNK) {
+ xzm_debug_assert(last_slice_index < segment->xzs_slice_entry_count);
+
+ xzm_slice_t last = &segment->xzs_slices[last_slice_index];
+ if (last >= slice) {
+ last->xzc_bits.xzcb_kind = XZM_SLICE_KIND_MULTI_BODY;
+ last->xzsl_slice_offset_bytes =
+ (uint32_t)(sizeof(struct xzm_slice_s) * (slice_count - 1));
+ }
+ }
+
+ // Update the chunk slice last, setting the kind at the very end to
+ // "publish" the chunk for the enumerator protocol
+ if (kind != XZM_SLICE_KIND_TINY_CHUNK) {
+ chunk->xzcs_slice_count = slice_count;
+ } else {
+ xzm_debug_assert(slice_count == 1);
+ }
+ // TODO: atomic store, compiler barrier
+ chunk->xzc_bits.xzcb_kind = kind;
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ mach_vm_reclaim_id_t *reclaim_id = _xzm_segment_slice_meta_reclaim_id(
+ segment, chunk);
+ *reclaim_id = VM_RECLAIM_ID_NULL;
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+ segment->xzs_used++;
+ return chunk;
+}
+
+// Taken from xnu/osfmk/kern/zalloc.c
+static inline uint32_t
+dist_u32(uint32_t a, uint32_t b)
+{
+ return a < b ? b - a : a - b;
+}
+
+static uint32_t
+_xzm_random_clear_n_bits(uint32_t mask, uint32_t pop, uint32_t n)
+{
+ for(; n--; pop--) {
+ uint32_t bit = arc4random_uniform(pop);
+ uint32_t m = mask;
+ // Clear the bottom `bit` bits from m...
+ for (; bit; bit--) {
+ m &= (m - 1);
+ }
+ // ... in order to clear the `bit`th least significant set bit in mask
+ mask ^= 1 << __builtin_ctz(m);
+ }
+ return mask;
+}
+
+// Create a bitmap `width` bits wide with `pop` set bits
+static uint32_t
+_xzm_random_bits(uint32_t pop, uint32_t width)
+{
+ uint32_t mask = (uint32_t)((1ull << width) - 1);
+ uint32_t retval;
+ uint32_t cur;
+
+ if (3 * width / 4 <= pop) {
+ // Caller wants >75% of the bits set, so set them all and clear <25%
+ retval = mask;
+ cur = width;
+ } else if (pop <= width / 4) {
+ retval = 0;
+ cur = 0;
+ } else {
+ // A masked value from arc4random should contain ~`width/2` set bits
+ retval = arc4random() & mask;
+ cur = __builtin_popcount(retval);
+
+ if (dist_u32(cur, pop) > dist_u32(width - cur, pop)) {
+ // If the opposite mask has a closer popcount, then start with that
+ cur = width - cur;
+ retval ^= mask;
+ }
+ }
+
+ if (cur < pop) {
+ // Setting `pop - cur` bits is really clearing that many from the
+ // opposite mask.
+ retval ^= mask;
+ retval = _xzm_random_clear_n_bits(retval, width - cur, pop - cur);
+ retval ^= mask;
+ } else if (pop < cur) {
+ retval = _xzm_random_clear_n_bits(retval, cur, cur - pop);
+ }
+ xzm_debug_assert(__builtin_popcount(retval) == pop);
+ xzm_debug_assert((retval & ~mask) == 0);
+ return retval;
+}
+
+static xzm_chunk_t
+_xzm_segment_group_segment_span_init_run(xzm_segment_group_t sg,
+ xzm_segment_t segment, xzm_slice_kind_t kind,
+ xzm_preallocate_list_s *preallocate_list, xzm_slice_count_t start_index,
+ xzm_slice_count_t total_slices, xzm_slice_count_t guard_count,
+ xzm_slice_count_t num_chunks)
+{
+ xzm_chunk_t retval = NULL;
+ uint32_t guard_mask;
+ if (guard_count) {
+ guard_mask = _xzm_random_bits(guard_count, num_chunks + 1);
+ } else {
+ guard_mask = 0;
+ }
+
+ xzm_slice_count_t slices_per_chunk = 0;
+ if (kind == XZM_SLICE_KIND_TINY_CHUNK) {
+ slices_per_chunk = 1;
+ } else if (kind == XZM_SLICE_KIND_SMALL_CHUNK) {
+ slices_per_chunk = XZM_SMALL_CHUNK_SIZE / XZM_SEGMENT_SLICE_SIZE;
+ } else if (kind == XZM_SLICE_KIND_SMALL_FREELIST_CHUNK) {
+ slices_per_chunk =
+ XZM_SMALL_FREELIST_CHUNK_SIZE / XZM_SEGMENT_SLICE_SIZE;
+ } else {
+ xzm_debug_assert(!preallocate_list);
+ xzm_debug_assert(guard_count == 0);
+ xzm_debug_assert(kind == XZM_SLICE_KIND_LARGE_CHUNK);
+ xzm_debug_assert(num_chunks == 1);
+ slices_per_chunk = total_slices;
+ }
+ xzm_debug_assert((num_chunks * slices_per_chunk + guard_count) ==
+ total_slices);
+ xzm_debug_assert((start_index + total_slices) <=
+ segment->xzs_slice_entry_count);
+
+ xzm_slice_count_t index = start_index;
+ bool is_pristine = segment->xzs_slices[index].xzc_bits.xzcb_is_pristine;
+
+ for (int i = 0; i < num_chunks; i++) {
+ if (guard_mask & 1) {
+ _xzm_segment_group_segment_create_guard(sg, segment, index);
+ index++;
+ }
+ guard_mask >>= 1;
+
+ xzm_chunk_t chunk = _xzm_segment_group_segment_span_mark_allocated(sg,
+ segment, kind, index, slices_per_chunk);
+ chunk->xzc_bits.xzcb_is_pristine = is_pristine;
+ index += slices_per_chunk;
+
+ if (i == 0) {
+ retval = chunk;
+ } else {
+ SLIST_INSERT_HEAD(preallocate_list, chunk, xzc_slist_entry);
+ }
+ }
+
+ xzm_debug_assert(guard_mask <= 1);
+ if (guard_mask) {
+ _xzm_segment_group_segment_create_guard(sg, segment, index);
+ index++;
+ }
+
+ xzm_debug_assert(index - start_index == total_slices);
+ return retval;
+}
+
+// mimalloc: mi_segments_page_find_and_allocate
+// Precondition: sg is locked
+static xzm_chunk_t
+_xzm_segment_group_find_and_allocate_chunk(xzm_segment_group_t sg,
+ xzm_slice_kind_t kind, xzm_xzone_guard_config_t guard_config,
+ xzm_preallocate_list_s *preallocate_list, xzm_slice_count_t slice_count,
+ size_t alignment)
+{
+ xzm_debug_assert(_xzm_slice_kind_is_chunk(kind));
+ xzm_debug_assert(kind != XZM_SLICE_KIND_TINY_CHUNK || slice_count == 1);
+ xzm_debug_assert(slice_count != 0);
+ xzm_debug_assert(slice_count * XZM_SEGMENT_SLICE_SIZE <=
+ XZM_LARGE_BLOCK_SIZE_MAX);
+ xzm_debug_assert(alignment == 0 || kind == XZM_SLICE_KIND_LARGE_CHUNK);
+
+ xzm_debug_assert(kind != XZM_SLICE_KIND_TINY_CHUNK || guard_config != NULL);
+ xzm_debug_assert(kind != XZM_SLICE_KIND_SMALL_CHUNK || guard_config != NULL);
+ xzm_debug_assert(kind != XZM_SLICE_KIND_SMALL_FREELIST_CHUNK ||
+ guard_config != NULL);
+ xzm_debug_assert(kind != XZM_SLICE_KIND_LARGE_CHUNK || guard_config == NULL);
+
+ if (alignment <= XZM_SEGMENT_SLICE_SIZE) {
+ // Large chunks guarantee page alignment
+ alignment = 0;
+ }
+ xzm_slice_count_t alignment_slices;
+ if (os_convert_overflow(alignment / XZM_SEGMENT_SLICE_SIZE, &alignment_slices)) {
+ xzm_debug_abort_with_reason("Unexpected align value", alignment);
+ return NULL;
+ }
+
+ xzm_slice_count_t total_slice_count;
+ uint8_t chunks_in_run;
+ uint8_t guards;
+ if (guard_config && guard_config->xxgc_max_run_length) {
+ chunks_in_run = arc4random_uniform(guard_config->xxgc_max_run_length) + 1;
+ total_slice_count = chunks_in_run * slice_count;
+ guards = (guard_config->xxgc_density * total_slice_count) / 256;
+ uint32_t remainder = (guard_config->xxgc_density * total_slice_count) %
+ 256;
+ // short circuit to avoid a call to corecrypto in common case that the
+ // density of guard pages goes perfectly into the allocated pages
+ if (remainder && remainder > arc4random_uniform(256)) {
+ guards++;
+ }
+ total_slice_count += guards;
+ } else {
+ total_slice_count = slice_count;
+ chunks_in_run = 1;
+ guards = 0;
+ }
+ xzm_debug_assert(total_slice_count <=
+ (XZM_LARGE_BLOCK_SIZE_MAX / XZM_SEGMENT_SLICE_SIZE) ||
+ // Aligned allocations can request more than LARGE_BLOCK_SIZE slices
+ // from the span queue
+ alignment != 0);
+ // At present, we only allow 1 guard page between chunks in a run, so it
+ // shouldn't be possible to have more guards than chunks
+ xzm_debug_assert(chunks_in_run >= guards);
+
+ if (alignment_slices) {
+ // We only need to allocate (slice_count + alignment_slices - 1) slices
+ // to guarantee that there will be a slice_count long span at the
+ // correct alignment
+ xzm_slice_count_t max_align_slices =
+ alignment_slices ? alignment_slices - 1 : 0;
+
+ if (os_add_overflow(total_slice_count, max_align_slices,
+ &total_slice_count)) {
+ xzm_debug_abort_with_reason("Unexpected total slice count",
+ slice_count + max_align_slices);
+ return NULL;
+ }
+
+ xzm_debug_assert(total_slice_count < XZM_SLICES_PER_SEGMENT);
+ }
+
+ for (xzm_span_queue_t sq = xzm_span_queue_for(sg, total_slice_count);
+ sq < &sg->xzsg_spans[XZM_SPAN_QUEUE_COUNT];
+ sq++) {
+ // TODO: rather than allowing a range of span sizes in a span queue,
+ // should all the spans be exactly the span queue size? Then this would
+ // be a pop rather than a list scan.
+ xzm_free_span_t span, tmp;
+ LIST_FOREACH_SAFE(span, &sq->xzsq_queue, xzc_entry, tmp) {
+ xzm_slice_count_t span_slice_count =
+ _xzm_free_span_slice_count(span);
+ if (span_slice_count >= total_slice_count) {
+ xzm_malloc_zone_t zone = &sg->xzsg_main_ref->xzmz_base;
+ xzm_segment_t segment = _xzm_segment_for_slice(zone, span);
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ xzm_slice_count_t old_total_slice_count = total_slice_count;
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+ xzm_slice_count_t front_free_count = 0;
+
+ if (alignment_slices) {
+ // Split off the front to round the address up to alignment
+ xzm_slice_count_t actual_index = _xzm_slice_index(segment,
+ span);
+ xzm_slice_count_t desired_index = roundup(actual_index,
+ alignment_slices);
+
+ front_free_count = desired_index - actual_index;
+ xzm_debug_assert(slice_count <= (total_slice_count - front_free_count));
+
+ // Take the alignment slices back out of our request
+ total_slice_count = slice_count;
+
+ if (front_free_count) {
+ span_slice_count -= front_free_count;
+ }
+ }
+
+ xzm_slice_count_t back_free_count =
+ span_slice_count - total_slice_count;
+
+ bool uses_dr = false;
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ uses_dr = _xzm_segment_group_uses_deferred_reclamation(sg);
+ if (uses_dr) {
+ if (!_xzm_segment_group_span_mark_smaller(sg, span,
+ front_free_count, total_slice_count,
+ back_free_count)) {
+ total_slice_count = old_total_slice_count;
+ // span is busy being reclaimed by the kernel
+ continue;
+ }
+ }
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+ LIST_REMOVE(span, xzc_entry);
+
+ if (front_free_count) {
+ span = _xzm_segment_group_segment_slice_split(sg, segment,
+ span, span_slice_count, uses_dr, true);
+ }
+
+ if (back_free_count) {
+ _xzm_segment_group_segment_slice_split(sg, segment, span,
+ total_slice_count, uses_dr, false);
+ }
+
+ xzm_slice_count_t index = _xzm_slice_index(segment, span);
+
+ xzm_chunk_t chunk;
+ chunk = _xzm_segment_group_segment_span_init_run(sg, segment,
+ kind, preallocate_list, index, total_slice_count,
+ guards, chunks_in_run);
+
+ xzm_debug_assert(chunk);
+ xzm_debug_assert(_xzm_segment_group_segment_is_valid(sg,
+ segment));
+
+ return chunk;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+// mi_segment_init
+static xzm_chunk_t
+_xzm_segment_group_init_segment(xzm_segment_group_t sg, xzm_segment_t segment,
+ void *body, size_t body_size, bool huge, bool is_pristine)
+{
+ xzm_chunk_t chunk = NULL;
+ xzm_assert((uintptr_t)segment < XZM_LIMIT_ADDRESS);
+ xzm_assert((uintptr_t)body < XZM_LIMIT_ADDRESS);
+ xzm_debug_assert((uintptr_t)segment % XZM_METAPOOL_SEGMENT_ALIGN == 0);
+ xzm_debug_assert((uintptr_t)body % XZM_SEGMENT_SIZE == 0);
+ xzm_debug_assert(body_size % XZM_SEGMENT_SLICE_SIZE == 0);
+
+ xzm_slice_count_t total_slices = 0;
+ if (os_convert_overflow(body_size / XZM_SEGMENT_SLICE_SIZE, &total_slices)) {
+ xzm_abort("Slice count too large in init_segment");
+ }
+ segment->xzs_segment_group = sg;
+ segment->xzs_slice_count = total_slices;
+ segment->xzs_slice_entry_count = MIN(total_slices, XZM_SLICES_PER_SEGMENT);
+ segment->xzs_used = 0;
+ segment->xzs_segment_body = body;
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ segment->xzs_reclaim_id = VM_RECLAIM_ID_NULL;
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+ segment->xzs_kind = huge ? XZM_SEGMENT_KIND_HUGE : XZM_SEGMENT_KIND_NORMAL;
+ if (huge) {
+ chunk = _xzm_segment_group_segment_span_mark_allocated(sg, segment,
+ XZM_SLICE_KIND_HUGE_CHUNK, 0, segment->xzs_slice_count);
+ chunk->xzc_bits.xzcb_is_pristine = is_pristine;
+ } else {
+ // Lock the segment group to add this span - we'll return to the caller
+ // with the segment group locked so they can then directly allocate what
+ // they need
+ _malloc_lock_lock(&sg->xzsg_lock);
+
+ _xzm_segment_group_segment_span_free(sg, segment, 0, total_slices,
+ true, is_pristine);
+ }
+ xzm_debug_assert(_xzm_segment_group_segment_is_valid(sg, segment));
+ return chunk;
+}
+
+// mimalloc: mi_segment_alloc
+//
+// Used to allocate both normal and huge segments.
+//
+// Postcondition: for normal segments, the segment group lock is held on
+// successful return
+static bool
+_xzm_segment_group_alloc_segment(xzm_segment_group_t sg, size_t required_bytes,
+ size_t alignment, xzm_chunk_t *huge_chunk, bool purgeable)
+{
+ xzm_chunk_t chunk;
+ xzm_debug_assert((required_bytes == 0 && huge_chunk == NULL) ||
+ (required_bytes > 0 && huge_chunk != NULL));
+
+ bool huge = (required_bytes != 0);
+
+ // non-default segment alignment is only supported for huge chunks
+ xzm_debug_assert(huge || alignment == 0);
+
+ // The total number of bytes we need to allocate is then:
+ // - For normal segments, exactly the standard segment size
+ // - For huge segments, the required body size, rounded up to the next slice
+ size_t total_required_bytes;
+ if (huge) {
+ total_required_bytes = roundup(required_bytes, XZM_SEGMENT_SLICE_SIZE);
+ } else {
+ total_required_bytes = XZM_SEGMENT_SIZE;
+ }
+
+ xzm_range_group_t rg = sg->xzsg_range_group;
+
+ xzm_range_group_alloc_flags_t rga_flags = 0;
+ if (huge) {
+ rga_flags |= XZM_RANGE_GROUP_ALLOC_FLAGS_HUGE;
+ }
+
+ if (purgeable) {
+ rga_flags |= XZM_RANGE_GROUP_ALLOC_FLAGS_PURGEABLE;
+ }
+
+#if CONFIG_MTE
+ // XXX Note: we need to allocate all data segments as taggable in order for
+ // tag_data to work, but the vast majority of the space will be for
+ // large/huge, which is a significant waste. We're okay with that because
+ // tag_data is not the default/production configuration, but we may need to
+ // be more efficient about this in the future.
+ if (_xzm_segment_group_memtag_enabled(sg)) {
+ rga_flags |= XZM_RANGE_GROUP_ALLOC_FLAGS_MTE;
+ }
+#endif
+
+ void *segment_body = xzm_range_group_alloc_segment(rg, total_required_bytes,
+ alignment, mvm_plat_map(*map_ptr), rga_flags);
+ if (!segment_body) {
+ return false;
+ }
+
+ xzm_assert((uintptr_t)segment_body < XZM_LIMIT_ADDRESS);
+
+ xzm_segment_t segment_meta = xzm_metapool_alloc(
+ &sg->xzsg_main_ref->xzmz_metapools[XZM_METAPOOL_SEGMENT]);
+
+
+ chunk = _xzm_segment_group_init_segment(sg, segment_meta, segment_body,
+ total_required_bytes, huge, true);
+
+ // Publish the segment in the segment table now that it has been properly
+ // initialized
+ _xzm_segment_table_allocated_at(sg->xzsg_main_ref, segment_body,
+ segment_meta, !huge);
+
+ if (huge) {
+ *huge_chunk = chunk;
+ }
+ return true;
+}
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+
+static xzm_chunk_t
+_xzm_segment_group_alloc_huge_chunk_from_cache(xzm_segment_group_t sg,
+ xzm_slice_count_t slice_count)
+{
+ xzm_debug_assert(sg->xzsg_id == XZM_SEGMENT_GROUP_DATA_LARGE);
+
+ xzm_segment_t best_seg, cur_seg, seg_tmp;
+ xzm_segment_cache_t cache = &sg->xzsg_cache;
+ xzm_chunk_t chunk = NULL;
+
+ _malloc_lock_lock(&cache->xzsc_lock);
+
+ if (cache->xzsc_count == 0) {
+ _malloc_lock_unlock(&cache->xzsc_lock);
+ return NULL;
+ }
+
+ xzm_reclaim_buffer_t buffer = sg->xzsg_main_ref->xzmz_reclaim_buffer;
+ while (1) {
+ best_seg = NULL;
+ TAILQ_FOREACH_SAFE(cur_seg, &cache->xzsc_head, xzs_cache_entry, seg_tmp) {
+ if (cur_seg->xzs_slice_count >= slice_count &&
+ // allow up to 50% fragmentation
+ (cur_seg->xzs_slice_count < (2 * slice_count)) &&
+ (best_seg == NULL ||
+ cur_seg->xzs_slice_count < best_seg->xzs_slice_count)) {
+ if (_xzm_reclaim_is_reusable(buffer,
+ cur_seg->xzs_reclaim_id, true)) {
+ best_seg = cur_seg;
+ } else {
+ // Kernel has already reclaimed this entry or
+ // is in the process of trying to reclaim it.
+ _xzm_segment_group_cache_invalidate(sg, cur_seg);
+ }
+ }
+ }
+
+ if (best_seg == NULL) {
+ // Unable to find a suitable entry
+ _malloc_lock_unlock(&cache->xzsc_lock);
+ return NULL;
+ }
+
+ if (_xzm_segment_group_cache_mark_used(sg, best_seg)) {
+ // entry has been reclaimed
+ break;
+ }
+ }
+
+ _malloc_lock_unlock(&cache->xzsc_lock);
+
+ // Mark segment as allocated since it has been removed from the cache
+ _xzm_segment_table_allocated_at(sg->xzsg_main_ref,
+ _xzm_segment_start(best_seg), best_seg, false);
+
+ chunk = (xzm_chunk_t)_xzm_segment_slices_begin(best_seg);
+
+ return chunk;
+}
+
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+// mimalloc: mi_segment_huge_page_alloc
+static xzm_chunk_t
+_xzm_segment_group_alloc_huge_chunk(xzm_segment_group_t sg,
+ xzm_slice_count_t slice_count, bool clear, size_t alignment,
+ bool purgeable)
+{
+ if (alignment < XZM_SEGMENT_SIZE) {
+ // Huge chunks guarantee segment alignment
+ alignment = 0;
+ }
+
+ xzm_debug_assert(alignment % XZM_SEGMENT_SIZE == 0);
+ __assert_only bool defer_large = sg->xzsg_main_ref->xzmz_defer_large;
+ xzm_debug_assert(sg->xzsg_id == XZM_SEGMENT_GROUP_DATA_LARGE ||
+ !defer_large);
+ xzm_debug_assert(sg->xzsg_id == XZM_SEGMENT_GROUP_DATA || defer_large);
+
+ size_t required_bytes = (size_t)slice_count * XZM_SEGMENT_SLICE_SIZE;
+ xzm_chunk_t chunk = NULL;
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ if (sg->xzsg_id == XZM_SEGMENT_GROUP_DATA_LARGE &&
+ sg->xzsg_cache.xzsc_max_count > 0 &&
+ slice_count <= sg->xzsg_cache.xzsc_max_entry_slices &&
+ alignment <= XZM_SEGMENT_SIZE) {
+ chunk = _xzm_segment_group_alloc_huge_chunk_from_cache(sg, slice_count);
+ if (chunk) {
+ if (clear) {
+ size_t chunk_size = 0;
+ uint8_t *start = _xzm_chunk_start_ptr(
+ &sg->xzsg_main_ref->xzmz_base,
+ chunk, &chunk_size);
+#if CONFIG_REALLOC_CAN_USE_VMCOPY
+ // rdar://140793773
+ bzero(start, chunk_size);
+#else
+ _xzm_segment_group_clear_chunk(sg, start, chunk_size);
+#endif
+ chunk->xzc_bits.xzcb_is_pristine = true;
+ } else {
+ chunk->xzc_bits.xzcb_is_pristine = false;
+ }
+#ifdef DEBUG
+ size_t chunk_size = 0;
+ uintptr_t start = (uintptr_t)_xzm_chunk_start_ptr(
+ &sg->xzsg_main_ref->xzmz_base, chunk, &chunk_size);
+ xzm_debug_assert(alignment == 0 || (start % alignment) == 0);
+#endif // DEBUG
+ return chunk;
+ }
+ }
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+ // huge chunks allocated from the VM are inherently clear
+ bool allocated = _xzm_segment_group_alloc_segment(sg, required_bytes,
+ alignment, &chunk, purgeable);
+ return allocated ? chunk : NULL;
+}
+
+static xzm_chunk_t
+_xzm_segment_group_alloc_segment_and_chunk(xzm_segment_group_t sg,
+ xzm_slice_kind_t kind, xzm_xzone_guard_config_t guard_config,
+ xzm_preallocate_list_s *preallocate_list, xzm_slice_count_t slice_count,
+ size_t alignment)
+{
+ xzm_chunk_t chunk = NULL;
+
+ bool allocated = _xzm_segment_group_alloc_segment(sg, 0, 0, NULL, false);
+ if (!allocated) {
+ goto alloc_done;
+ }
+
+ // We hold the main lock again (alloc took it for us). Since we were
+ // able to allocate, we should be sure to get the chunk.
+ chunk = _xzm_segment_group_find_and_allocate_chunk(sg, kind, guard_config,
+ preallocate_list, slice_count, alignment);
+ xzm_debug_assert(chunk);
+ _malloc_lock_unlock(&sg->xzsg_lock);
+
+alloc_done:
+ _malloc_lock_unlock(&sg->xzsg_alloc_lock);
+ return chunk;
+}
+
+static void
+_xzm_segment_group_bzero_chunk(xzm_segment_group_t sg, uint8_t *start, size_t size)
+{
+ // Put a ceiling on the amount of memory we dirty at a time
+ size_t max_clear_size = KiB(512);
+
+ while (size) {
+ size_t next_clear_size = MIN(size, max_clear_size);
+ bzero(start, next_clear_size);
+ xzm_madvise(&sg->xzsg_main_ref->xzmz_base, start, next_clear_size);
+
+ start += next_clear_size;
+ size -= next_clear_size;
+ }
+}
+
+static void
+_xzm_segment_group_clear_chunk(xzm_segment_group_t sg, uint8_t *start, size_t size)
+{
+#if CONFIG_MADV_ZERO
+ if (madvise(start, size, MADV_ZERO)) {
+#ifdef DEBUG
+ malloc_zone_error(0, false,
+ "Failed to madvise(MADV_ZERO) chunk at %p, error: %d\n",
+ start, errno);
+#endif
+ return _xzm_segment_group_bzero_chunk(sg, start, size);
+ }
+#else
+ return _xzm_segment_group_bzero_chunk(sg, start, size);
+#endif // CONFIG_MADV_ZERO
+}
+
+static void
+_xzm_segment_group_overwrite_chunk(uint8_t *start, size_t size,
+ xzm_range_group_alloc_flags_t rga_flags)
+{
+ mach_vm_address_t vm_addr = (mach_vm_address_t)start;
+ mach_vm_size_t vm_size = (mach_vm_size_t)size;
+ int alloc_flags = VM_FLAGS_OVERWRITE | VM_MAKE_TAG(VM_MEMORY_MALLOC_SMALL);
+#if CONFIG_MTE
+ if (rga_flags & XZM_RANGE_GROUP_ALLOC_FLAGS_MTE) {
+ alloc_flags |= VM_FLAGS_MTE;
+ }
+#endif
+ kern_return_t kr = mach_vm_map(mach_task_self(), &vm_addr, vm_size,
+ /* mask */ 0, alloc_flags, MEMORY_OBJECT_NULL,
+ /* offset */ 0, /* copy */ FALSE, VM_PROT_DEFAULT,
+ VM_PROT_ALL, VM_INHERIT_DEFAULT);
+ if (kr != KERN_SUCCESS) {
+ xzm_abort_with_reason("mach_vm_map() overwrite failed", kr);
+ }
+}
+
+// mimalloc: mi_segments_page_alloc
+xzm_chunk_t
+xzm_segment_group_alloc_chunk(xzm_segment_group_t sg, xzm_slice_kind_t kind,
+ xzm_xzone_guard_config_t guard_config, xzm_slice_count_t slice_count,
+ xzm_preallocate_list_s *preallocate_list, size_t alignment, bool clear,
+ bool purgeable) {
+ if (kind == XZM_SLICE_KIND_HUGE_CHUNK) {
+ xzm_debug_assert(guard_config == NULL);
+ xzm_debug_assert(preallocate_list == NULL);
+ xzm_debug_assert((slice_count >
+ XZM_LARGE_BLOCK_SIZE_MAX / XZM_SEGMENT_SLICE_SIZE) ||
+ (alignment > XZM_ALIGNMENT_MAX));
+ return _xzm_segment_group_alloc_huge_chunk(sg, slice_count, clear,
+ alignment, purgeable);
+ }
+ xzm_debug_assert(kind == XZM_SLICE_KIND_LARGE_CHUNK || alignment == 0);
+
+ // Due to alignment, it's possible for the xzone layer to request a single
+ // page large chunk. The segment layer assumes that such chunks can't exist,
+ // so we round up the slice count here
+ if (kind == XZM_SLICE_KIND_LARGE_CHUNK && slice_count == 1) {
+ slice_count = 2;
+ }
+
+ // Consider: round up slice_count like mimalloc does?
+
+ // We don't want to hold the main segment group lock while interacting with
+ // the VM so that other allocations and deallocations that don't need to can
+ // be served concurrently, but we do want to limit ourselves to allocating
+ // only one new segment at a time so that we don't overshoot what we need if
+ // many threads arrive during a period where a new segment is needed.
+ //
+ // So, we also have an "allocations lock", and the protocol is that a thread
+ // wanting to allocate new VM must acquire it before going off to the VM.
+
+ xzm_chunk_t chunk = NULL;
+
+ _malloc_lock_lock(&sg->xzsg_lock);
+ chunk = _xzm_segment_group_find_and_allocate_chunk(sg, kind, guard_config,
+ preallocate_list, slice_count, alignment);
+ if (chunk) {
+ // Happy path: we got the chunk and are done.
+ _malloc_lock_unlock(&sg->xzsg_lock);
+ goto done;
+ }
+
+ // First try didn't succeed, so we need a new segment. See if we can get
+ // the alloc lock to allocate a new segment.
+ bool gotlock = _malloc_lock_trylock(&sg->xzsg_alloc_lock);
+ if (os_likely(gotlock)) {
+ // We got it, so we can try to directly allocate a new segment.
+ _malloc_lock_unlock(&sg->xzsg_lock);
+ chunk = _xzm_segment_group_alloc_segment_and_chunk(sg, kind,
+ guard_config, preallocate_list, slice_count, alignment);
+ } else {
+ // We didn't get it, so somebody else is allocating. We need to drop
+ // the main lock...
+ _malloc_lock_unlock(&sg->xzsg_lock);
+
+ // ... and wait for them on the alloc lock.
+ _malloc_lock_lock(&sg->xzsg_alloc_lock);
+
+ // Now that we've got the alloc lock, reacquire the main lock and try to
+ // allocate from the new segment that the thread we were waiting for
+ // would have installed.
+ _malloc_lock_lock(&sg->xzsg_lock);
+ chunk = _xzm_segment_group_find_and_allocate_chunk(sg, kind,
+ guard_config, preallocate_list, slice_count, alignment);
+ _malloc_lock_unlock(&sg->xzsg_lock);
+
+ if (chunk) {
+ // We were able to allocate from the new segment.
+ _malloc_lock_unlock(&sg->xzsg_alloc_lock);
+ } else {
+ // The entire new segment has already been exhausted while we were
+ // waiting for the alloc lock. We have it now, so it's our turn to
+ // allocate a new segment.
+ chunk = _xzm_segment_group_alloc_segment_and_chunk(sg, kind,
+ guard_config, preallocate_list, slice_count, alignment);
+ }
+ }
+
+done:
+
+ if (chunk) {
+ size_t chunk_size;
+ uint8_t *start = _xzm_chunk_start_ptr(&sg->xzsg_main_ref->xzmz_base,
+ chunk, &chunk_size);
+#if CONFIG_MTE
+ const bool memtag_enabled =
+ _xzm_segment_group_memtag_block(sg, chunk_size);
+#endif
+ if (!chunk->xzc_bits.xzcb_is_pristine) {
+ if (_xzm_segment_group_has_madvise_workaround(sg) &&
+ kind == XZM_SLICE_KIND_LARGE_CHUNK) {
+ xzm_range_group_alloc_flags_t rga_flags = 0;
+#if CONFIG_MTE
+ if (memtag_enabled) {
+ rga_flags |= XZM_RANGE_GROUP_ALLOC_FLAGS_MTE;
+ }
+#endif
+ _xzm_segment_group_overwrite_chunk(start, chunk_size, rga_flags);
+ chunk->xzc_bits.xzcb_is_pristine = true;
+ } else if (clear) {
+ // TODO: is this the right cutoff?
+ if (kind == XZM_SLICE_KIND_TINY_CHUNK) {
+ // It's just one page that we're going to fault anyway
+ bzero(start, chunk_size);
+ } else {
+ _xzm_segment_group_clear_chunk(sg, start, chunk_size);
+ }
+
+ chunk->xzc_bits.xzcb_is_pristine = true;
+ }
+ }
+
+ if (os_unlikely(purgeable)) {
+ xzm_debug_assert(guard_config == NULL);
+ xzm_debug_assert(kind == XZM_SLICE_KIND_LARGE_CHUNK);
+ mach_vm_address_t vm_addr = (mach_vm_address_t)start;
+ mach_vm_size_t vm_size = (mach_vm_size_t)chunk_size;
+ int alloc_flags = VM_FLAGS_OVERWRITE |
+ VM_MAKE_TAG(VM_MEMORY_MALLOC_SMALL) | VM_FLAGS_PURGABLE;
+#if CONFIG_MTE
+ if (memtag_enabled) {
+ alloc_flags |= VM_FLAGS_MTE;
+ }
+#endif
+ kern_return_t kr = mach_vm_map(mach_task_self(), &vm_addr, vm_size,
+ /* mask */ 0, alloc_flags, MEMORY_OBJECT_NULL,
+ /* offset */ 0, /* copy */ FALSE, VM_PROT_DEFAULT,
+ VM_PROT_ALL, VM_INHERIT_DEFAULT);
+ if (kr != KERN_SUCCESS) {
+ xzm_abort_with_reason("mach_vm_map() overwrite failed", kr);
+ }
+ }
+ }
+
+ return chunk;
+}
+
+// mimalloc: mi_segment_span_remove_from_queue
+static void
+_xzm_segment_group_segment_span_remove_from_queue(xzm_segment_group_t sg,
+ xzm_free_span_t span, xzm_slice_count_t slice_count)
+{
+ (void)sg; (void)slice_count;
+ LIST_REMOVE(span, xzc_entry);
+}
+
+// mimalloc: mi_segment_span_free_coalesce
+//
+// TODO: more nuanced policy for zero-tracking
+// - Right now we do the easy thing, which is to mark the entire coalesced free
+// span as dirty because the chunk being deallocated is
+// - However, that's probably not optimal if we're coalescing something small
+// with a very large free span - e.g. the initial pristine span
+// - One possibility would be to compare the sizes of the chunk being freed and
+// the spans being coalesced with - if the spans we're coalescing with are
+// relatively large and already zero-initialized, it may be better to just
+// zero the chunk being freed and maintain the zero initialization of the new
+// span as a whole
+// - The risk of that, though, is that we may waste time zeroing chunks that
+// aren't going to wind up being used to serve cleared allocations anyway
+static xzm_free_span_t
+_xzm_segment_group_segment_span_free_coalesce(xzm_segment_group_t sg,
+ xzm_segment_t segment, xzm_chunk_t chunk, bool *success_out)
+{
+ xzm_slice_count_t slice_count;
+ if (_xzm_slice_kind_is_chunk(chunk->xzc_bits.xzcb_kind)) {
+ slice_count = _xzm_chunk_slice_count(chunk);
+ } else if (_xzm_slice_kind_is_free_span(chunk->xzc_bits.xzcb_kind)) {
+ slice_count = _xzm_free_span_slice_count(chunk);
+ } else {
+ xzm_abort("attempting to coalesce slice of unexpected type");
+ }
+
+ xzm_free_span_t span = chunk;
+
+ if (success_out) {
+ *success_out = true;
+ }
+
+ // "unpublish" the chunk for enumeration as early as possible by resetting
+ // its kind
+ span->xzc_bits.xzcb_kind = XZM_SLICE_KIND_INVALID;
+
+ xzm_slice_t next = chunk + slice_count;
+ if (next < _xzm_segment_slices_end(segment) &&
+ _xzm_slice_kind_is_free_span(next->xzc_bits.xzcb_kind)) {
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ if (_xzm_segment_group_uses_deferred_reclamation(sg)) {
+ if (!_xzm_segment_group_span_mark_used(sg, next)) {
+ if (success_out) {
+ *success_out = false;
+ }
+ goto previous;
+ }
+ }
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+ xzm_slice_count_t next_slice_count = _xzm_free_span_slice_count(next);
+ slice_count += next_slice_count; // extend
+ _xzm_segment_group_segment_span_remove_from_queue(sg, next,
+ next_slice_count);
+ }
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+previous:
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+ if (span > _xzm_segment_slices_begin(segment)) {
+ xzm_slice_t prev = _xzm_span_slice_first(span - 1);
+ if (_xzm_slice_kind_is_free_span(prev->xzc_bits.xzcb_kind)) {
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ if (_xzm_segment_group_uses_deferred_reclamation(sg)) {
+ if (!_xzm_segment_group_span_mark_used(sg, prev)) {
+ if (success_out) {
+ *success_out = false;
+ }
+ goto done;
+ }
+ }
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+ xzm_slice_count_t prev_slice_count =
+ _xzm_free_span_slice_count(prev);
+ slice_count += prev_slice_count;
+ _xzm_segment_group_segment_span_remove_from_queue(sg, prev,
+ prev_slice_count);
+ span = prev;
+ }
+ }
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+done:
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+ // and add the new free span
+ _xzm_segment_group_segment_span_free(sg, segment,
+ _xzm_slice_index(segment, span), slice_count, true, false);
+ return span;
+}
+
+static void
+_xzm_segment_group_segment_deallocate(xzm_segment_group_t sg,
+ xzm_segment_t segment, bool free_from_table)
+{
+ // Remove the segment from the segment map
+ if (free_from_table) {
+ _xzm_segment_table_freed_at(sg->xzsg_main_ref,
+ _xzm_segment_start(segment), segment, true);
+ }
+
+ size_t size = segment->xzs_slice_count * XZM_SEGMENT_SLICE_SIZE;
+ xzm_range_group_free_segment_body(sg->xzsg_range_group,
+ _xzm_segment_start(segment), size, mvm_plat_map(segment->xzs_map));
+ xzm_metapool_free(&sg->xzsg_main_ref->xzmz_metapools[XZM_METAPOOL_SEGMENT],
+ segment);
+}
+
+// mimalloc: mi_segment_free
+static void
+_xzm_segment_group_segment_free(xzm_segment_group_t sg, xzm_segment_t segment)
+{
+ xzm_debug_assert(segment->xzs_used == 0);
+ xzm_free_span_t span = _xzm_segment_slices_begin(segment);
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ xzm_free_span_t next;
+ if (_xzm_segment_group_uses_deferred_reclamation(sg)) {
+ if (!_xzm_segment_group_span_mark_used(sg, span)) {
+ // kernel is holding this span busy
+ goto fail;
+ }
+ while (_xzm_free_span_slice_count(span) < _xzm_segment_slice_count(segment)) {
+ bool success;
+ _xzm_segment_group_segment_span_remove_from_queue(sg, span,
+ span->xzcs_slice_count);
+ span = _xzm_segment_group_segment_span_free_coalesce(sg, segment,
+ span, &success);
+ if (!success) {
+ // kernel is holding an adjacent span busy
+ goto fail;
+ }
+ }
+ }
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+ // The segment should have exactly one free span, which we need to now
+ // remove from its span queue
+ xzm_debug_assert(span->xzc_bits.xzcb_kind == XZM_SLICE_KIND_MULTI_FREE);
+ xzm_debug_assert(span->xzcs_slice_count == segment->xzs_slice_count);
+
+ _xzm_segment_group_segment_span_remove_from_queue(sg, span,
+ span->xzcs_slice_count);
+
+ // Drop the segment group lock before going off to the VM
+ _malloc_lock_unlock(&sg->xzsg_lock);
+
+ _xzm_segment_group_segment_deallocate(sg, segment, true);
+ return;
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+fail:;
+ // Kernel is holding a span busy, place any re-used spans back in the
+ // buffer.
+ next = _xzm_segment_slices_begin(segment);
+ do {
+ span = next;
+ if (!_xzm_segment_slice_is_deferred(segment, span)) {
+ _xzm_segment_group_span_mark_free(sg, span);
+ }
+ next = span + _xzm_free_span_slice_count(span);
+ } while (next < _xzm_segment_slices_end(segment));
+ _malloc_lock_unlock(&sg->xzsg_lock);
+ return;
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+}
+
+// trim unneeded space off the end of a huge segment
+static void
+_xzm_segment_group_split_huge_segment(xzm_segment_group_t sg, xzm_segment_t segment,
+ xzm_slice_count_t required_slices)
+{
+ xzm_debug_assert(segment->xzs_kind == XZM_SEGMENT_KIND_HUGE);
+ xzm_debug_assert(segment->xzs_slice_count >= required_slices);
+ if (segment->xzs_slice_count == required_slices) {
+ return;
+ }
+
+ uint8_t *start = _xzm_segment_start(segment);
+ uint8_t *end = _xzm_segment_end(segment);
+
+ uint8_t *remainder = (uint8_t *)(start +
+ required_slices * XZM_SEGMENT_SLICE_SIZE);
+ if (remainder < end) {
+ size_t total_remainder_size = (size_t)(end - remainder);
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ // new segments must be created on a SEGMENT_SIZE boundary to be annotated
+ // in the segment table
+ uint8_t *remainder_seg = (uint8_t *)roundup((uintptr_t)remainder,
+ XZM_SEGMENT_SIZE);
+ xzm_metapool_t metapool =
+ &sg->xzsg_main_ref->xzmz_metapools[XZM_METAPOOL_SEGMENT];
+ xzm_segment_t remainder_metadata = xzm_metapool_alloc(metapool);
+ size_t remainder_seg_size = (end - remainder_seg);
+
+ // If the remainder that we're freeing spans a segment granule, we need
+ // to clear the entries from the segment map
+ if (remainder_seg < end) {
+ _xzm_segment_table_freed_at(sg->xzsg_main_ref, remainder_seg,
+ segment, false);
+ }
+
+ _malloc_lock_lock(&sg->xzsg_cache.xzsc_lock);
+ if (remainder_seg < end &&
+ remainder_seg_size > XZM_LARGE_BLOCK_SIZE_MAX &&
+ sg->xzsg_cache.xzsc_count < sg->xzsg_cache.xzsc_max_count) {
+ // create a new segment from the end of this one and add it back to
+ // the cache
+
+ _xzm_segment_group_init_segment(sg, remainder_metadata,
+ remainder_seg, remainder_seg_size, true, false);
+ _xzm_segment_group_cache_mark_free(sg, remainder_metadata);
+
+ _malloc_lock_unlock(&sg->xzsg_cache.xzsc_lock);
+
+ if (remainder_seg > remainder) {
+ // free the unused portion of the current segment
+ size_t remainder_size = total_remainder_size -
+ remainder_seg_size;
+ xzm_range_group_free_segment_body(sg->xzsg_range_group,
+ (void *)remainder, remainder_size, NULL);
+ }
+ } else {
+ _malloc_lock_unlock(&sg->xzsg_cache.xzsc_lock);
+ // cannot create a cached segment out of the remainder,
+ // free it instead.
+ xzm_metapool_free(metapool, remainder_metadata);
+ xzm_range_group_free_segment_body(sg->xzsg_range_group,
+ (void *)remainder, total_remainder_size, NULL);
+ }
+#else // CONFIG_XZM_DEFERRED_RECLAIM
+ uint8_t *remainder_seg = (uint8_t *)roundup((uintptr_t)remainder,
+ XZM_SEGMENT_SIZE);
+ // If the body that we're freeing spans a segment granule, we need to
+ // clear the entries from the segment map
+ if (remainder_seg < end) {
+ _xzm_segment_table_freed_at(sg->xzsg_main_ref, remainder_seg,
+ segment, false);
+ }
+ xzm_range_group_free_segment_body(sg->xzsg_range_group, (void *)remainder,
+ total_remainder_size, NULL);
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+ // re-initialize original segment with reduced slice count
+ _xzm_segment_group_init_segment(sg, segment,
+ _xzm_segment_start(segment),
+ required_slices * XZM_SEGMENT_SLICE_SIZE, true, false);
+ }
+ xzm_debug_assert(_xzm_segment_end(segment) == remainder);
+}
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+
+static bool
+_xzm_segment_group_free_huge_chunk_to_cache(xzm_segment_group_t sg,
+ xzm_segment_t segment, xzm_chunk_t chunk) {
+ xzm_segment_cache_t cache = &sg->xzsg_cache;
+ xzm_segment_t entry = NULL, tmp_entry = NULL;
+ xzm_debug_assert(cache->xzsc_max_count > 0);
+
+ if (segment->xzs_slice_count > cache->xzsc_max_entry_slices) {
+ // Do this check (and all others that can cause us to return false)
+ // before modifying the segment table
+ return false;
+ }
+
+ // The data for this segment could be asynchronously reclaimed and reused
+ // before the metadata is invalidated/removed from the segment table, so we
+ // need to remove this segment from the segment table before putting it into
+ // the cache. If reused, the segment will be marked allocated in
+ // _xzm_segment_group_alloc_huge_chunk_from_cache
+ _xzm_segment_table_freed_at(sg->xzsg_main_ref,
+ _xzm_segment_start(segment), segment, true);
+
+#if CONFIG_MTE
+ // We are committed to returning the chunk to the cache and have removed
+ // access to it from the segment table. We can safely retag now before taking
+ // the cache lock.
+ if (_xzm_segment_group_memtag_enabled(sg)) {
+ size_t chunk_size = 0;
+ void *ptr = _xzm_chunk_start_ptr(
+ &sg->xzsg_main_ref->xzmz_base, chunk, &chunk_size);
+ memtag_tag_canonical(ptr, chunk_size);
+ // Note: for better protection from canonical pointers into huge chunks we
+ // could retag with a random tag here (which will require code changes on
+ // the alloc path also).
+ }
+#endif
+
+ _malloc_lock_lock(&cache->xzsc_lock);
+
+ xzm_reclaim_buffer_t buffer = sg->xzsg_main_ref->xzmz_reclaim_buffer;
+
+ if (sg->xzsg_cache.xzsc_count == sg->xzsg_cache.xzsc_max_count) {
+ // cache is full, sweep through the cache to find invalid entries
+ TAILQ_FOREACH_SAFE(entry, &sg->xzsg_cache.xzsc_head,
+ xzs_cache_entry, tmp_entry) {
+ if (!_xzm_reclaim_is_reusable(buffer,
+ entry->xzs_reclaim_id, true)) {
+ _xzm_segment_group_cache_invalidate(sg, entry);
+ continue;
+ } else {
+ // cache entries are kept in LRU order - encountering an
+ // available one implies all other cache entries are also
+ // available
+ break;
+ }
+ }
+ }
+
+ while (cache->xzsc_count == cache->xzsc_max_count) {
+ // Cache is full, evict the oldest entry
+ _xzm_segment_group_cache_evict(sg);
+ }
+
+ // insert segment into cache
+ _xzm_segment_group_cache_mark_free(sg, segment);
+ _malloc_lock_unlock(&cache->xzsc_lock);
+ return true;
+}
+
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+// mimalloc: _mi_segment_huge_page_free
+static void
+_xzm_segment_group_free_huge_chunk(xzm_segment_group_t sg, xzm_chunk_t chunk,
+ bool purgeable)
+{
+ xzm_segment_t segment = _xzm_segment_for_slice(
+ &sg->xzsg_main_ref->xzmz_base, chunk);
+ xzm_debug_assert(segment->xzs_kind == XZM_SEGMENT_KIND_HUGE);
+ xzm_debug_assert(segment->xzs_used == 1);
+
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ if (sg->xzsg_cache.xzsc_max_count > 0 &&
+ !purgeable &&
+ segment->xzs_slice_count <= sg->xzsg_cache.xzsc_max_entry_slices &&
+ segment->xzs_slice_count >
+ (XZM_LARGE_BLOCK_SIZE_MAX / XZM_SEGMENT_SLICE_SIZE)) {
+ if (_xzm_segment_group_free_huge_chunk_to_cache(sg, segment, chunk)) {
+ return;
+ }
+ }
+#else
+ // No special handling of purgeable huge segments without the huge cache
+ (void)purgeable;
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+ _xzm_segment_group_segment_deallocate(sg, segment, true);
+}
+
+static void
+xzm_madvise(xzm_malloc_zone_t zone, uint8_t *start, size_t size)
+{
+ __assert_only int rc = mvm_madvise_plat(start, size, MADV_FREE_REUSABLE, 0,
+ mvm_plat_map(xzm_segment_table_query(_xzm_malloc_zone_main(zone),
+ start)->xzs_map));
+
+#ifdef DEBUG
+ if (rc) {
+ // TODO: time for a compatibility break? Make this fatal?
+ malloc_zone_error(0, false,
+ "Failed to madvise chunk at %p, error: %d\n", start, errno);
+ }
+#endif // DEBUG
+}
+
+void
+xzm_segment_group_segment_madvise_span(xzm_segment_group_t sg,
+ uint8_t *slice_start, xzm_slice_count_t count)
+{
+ xzm_debug_assert((uintptr_t)slice_start % XZM_SEGMENT_SLICE_SIZE == 0);
+ size_t span_size = count * XZM_SEGMENT_SLICE_SIZE;
+ xzm_madvise(&sg->xzsg_main_ref->xzmz_base, slice_start, span_size);
+}
+
+void
+xzm_segment_group_segment_madvise_chunk(xzm_segment_group_t sg,
+ xzm_chunk_t chunk)
+{
+ xzm_debug_assert(_xzm_slice_kind_is_chunk(chunk->xzc_bits.xzcb_kind));
+
+ size_t chunk_size = 0;
+ uint8_t *start = _xzm_chunk_start_ptr(&sg->xzsg_main_ref->xzmz_base, chunk,
+ &chunk_size);
+ xzm_madvise(&sg->xzsg_main_ref->xzmz_base, start, chunk_size);
+}
+
+// mimalloc: _mi_segment_page_free
+void
+xzm_segment_group_free_chunk(xzm_segment_group_t sg, xzm_chunk_t chunk,
+ bool purgeable, bool small_madvise_needed)
+{
+ xzm_slice_kind_t kind = chunk->xzc_bits.xzcb_kind;
+ xzm_debug_assert(_xzm_slice_kind_is_chunk(kind));
+
+ if (kind == XZM_SLICE_KIND_HUGE_CHUNK) {
+ _xzm_segment_group_free_huge_chunk(sg, chunk, purgeable);
+ return;
+ }
+
+ size_t chunk_size = 0;
+ uint8_t *start = _xzm_chunk_start_ptr(
+ &sg->xzsg_main_ref->xzmz_base, chunk, &chunk_size);
+ xzm_range_group_alloc_flags_t rga_flags = 0;
+#if CONFIG_MTE
+ if (_xzm_segment_group_memtag_enabled(sg)) {
+ rga_flags |= XZM_RANGE_GROUP_ALLOC_FLAGS_MTE;
+ // Clear tags for chunk before handing it back to segment group
+ memtag_tag_canonical(start, chunk_size);
+ }
+#endif
+
+ if (os_unlikely(purgeable)) {
+ xzm_debug_assert(kind == XZM_SLICE_KIND_LARGE_CHUNK);
+ // Remove the purgeability from this allocation before freeing back to
+ // the segment
+ _xzm_segment_group_overwrite_chunk(start, chunk_size, rga_flags);
+ }
+
+ xzm_segment_t segment = _xzm_segment_for_slice(
+ &sg->xzsg_main_ref->xzmz_base, chunk);
+
+ if (_xzm_segment_group_has_madvise_workaround(sg) &&
+ kind == XZM_SLICE_KIND_LARGE_CHUNK) {
+ _xzm_segment_group_overwrite_chunk(start, chunk_size, rga_flags);
+ } else if (!_xzm_segment_group_uses_deferred_reclamation(sg) &&
+ // Small chunks will have already been aggressively madvised
+ // by the time they are free
+ (kind != XZM_SLICE_KIND_SMALL_CHUNK || small_madvise_needed)) {
+ xzm_segment_group_segment_madvise_chunk(sg, chunk);
+ }
+
+ _malloc_lock_lock(&sg->xzsg_lock);
+
+ xzm_debug_assert(_xzm_segment_group_segment_is_valid(sg, segment));
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ xzm_debug_assert(!(_xzm_segment_group_uses_deferred_reclamation(sg) &&
+ _xzm_segment_slice_is_deferred(segment, chunk)));
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+ xzm_free_span_t span = _xzm_segment_group_segment_span_free_coalesce(sg, segment, chunk, NULL);
+ segment->xzs_used--;
+#if !CONFIG_XZM_DEFERRED_RECLAIM
+ (void)span;
+#endif // !CONFIG_XZM_DEFERRED_RECLAIM
+
+ xzm_debug_assert(kind != XZM_SLICE_KIND_HUGE_CHUNK);
+ const bool can_deallocate = sg->xzsg_main_ref->xzmz_deallocate_segment &&
+ _xzm_segment_group_id_is_data(segment->xzs_segment_group->xzsg_id);
+ if (segment->xzs_used == 0 && can_deallocate) {
+ // Drops the segment group lock
+ _xzm_segment_group_segment_free(sg, segment);
+ } else {
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ if (_xzm_segment_group_uses_deferred_reclamation(sg)) {
+ _xzm_segment_group_span_mark_free(sg, span);
+ }
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+ // TODO: sequester segments more efficiently - just leaving the final
+ // whole-segment span in its span queue means its metadata page stays
+ // dirty
+ xzm_debug_assert(_xzm_segment_group_segment_is_valid(sg, segment));
+ _malloc_lock_unlock(&sg->xzsg_lock);
+ }
+}
+
+bool
+xzm_segment_group_try_realloc_large_chunk(xzm_segment_group_t sg,
+ xzm_segment_t segment, xzm_chunk_t chunk,
+ xzm_slice_count_t new_slice_count)
+{
+ xzm_debug_assert(_xzm_segment_for_slice(&sg->xzsg_main_ref->xzmz_base,
+ chunk) == segment);
+ xzm_debug_assert(new_slice_count >
+ (XZM_SMALL_BLOCK_SIZE_MAX / XZM_SEGMENT_SLICE_SIZE));
+ xzm_debug_assert(new_slice_count <=
+ (XZM_LARGE_BLOCK_SIZE_MAX / XZM_SEGMENT_SLICE_SIZE));
+ xzm_debug_assert(chunk->xzc_bits.xzcb_kind == XZM_SLICE_KIND_LARGE_CHUNK);
+
+ if (chunk->xzcs_slice_count < new_slice_count) {
+ _malloc_lock_lock(&sg->xzsg_lock);
+ xzm_slice_count_t slices_to_add = (new_slice_count -
+ chunk->xzcs_slice_count);
+ xzm_slice_t next_slice = chunk + chunk->xzcs_slice_count;
+ xzm_slice_count_t next_free_slices = _xzm_free_span_slice_count(next_slice);
+ // Check if adjacent chunk is in the right segment, free, and
+ // large enough to realloc into
+ if (next_slice >= _xzm_segment_slices_end(segment) ||
+ !_xzm_slice_kind_is_free_span(next_slice->xzc_bits.xzcb_kind) ||
+ next_free_slices < slices_to_add) {
+ _malloc_lock_unlock(&sg->xzsg_lock);
+ return false;
+ }
+
+ const xzm_slice_count_t next_slices_to_free =
+ next_free_slices - slices_to_add;
+ bool uses_dr = false;
+#if CONFIG_XZM_DEFERRED_RECLAIM
+ uses_dr = _xzm_segment_group_uses_deferred_reclamation(sg);
+ if (uses_dr) {
+ if (!_xzm_segment_group_span_mark_smaller(sg, next_slice, 0,
+ slices_to_add, next_slices_to_free)) {
+ // kernel is holding next span busy
+ _malloc_lock_unlock(&sg->xzsg_lock);
+ return false;
+ }
+ }
+#endif // CONFIG_XZM_DEFERRED_RECLAIM
+
+ _xzm_segment_group_segment_span_remove_from_queue(sg, next_slice,
+ next_free_slices);
+
+ // We can only split if there will be 1 or more free slices left over
+ if (next_slices_to_free) {
+ _xzm_segment_group_segment_slice_split(sg, segment, next_slice,
+ slices_to_add, uses_dr, false);
+ }
+
+ for (int i = 0; i < slices_to_add; i++) {
+ next_slice[i].xzc_bits.xzcb_kind = XZM_SLICE_KIND_MULTI_BODY;
+ next_slice[i].xzsl_slice_offset_bytes = (uint32_t)
+ (((uintptr_t)&next_slice[i]) - ((uintptr_t)chunk));
+ }
+ chunk->xzcs_slice_count = new_slice_count;
+ xzm_debug_assert(_xzm_segment_group_segment_is_valid(sg, segment));
+ _malloc_lock_unlock(&sg->xzsg_lock);
+
+#if CONFIG_MTE
+ // If block grows on realloc(), tag additional size with same tag as
+ // allocation.
+ if (_xzm_segment_group_memtag_enabled(sg)) {
+ size_t additional_size = (slices_to_add * XZM_SEGMENT_SLICE_SIZE);
+ size_t chunk_size;
+ void *start = (void *)_xzm_chunk_start(
+ &sg->xzsg_main_ref->xzmz_base, chunk, &chunk_size);
+ size_t offset = chunk_size - additional_size;
+ void *additional_start = _memtag_load_tag(start) + offset;
+ memtag_set_tag(additional_start, additional_size);
+ }
+#endif
+
+ return true;
+ } else if (chunk->xzcs_slice_count > new_slice_count) {
+ _malloc_lock_lock(&sg->xzsg_lock);
+
+ xzm_slice_count_t slices_to_free = (chunk->xzcs_slice_count -
+ new_slice_count);
+ xzm_free_span_t span_to_free = chunk + new_slice_count;
+
+ chunk->xzcs_slice_count = new_slice_count;
+
+ xzm_slice_t last_slice = chunk + (chunk->xzcs_slice_count - 1);
+ last_slice->xzc_bits.xzcb_kind = XZM_SLICE_KIND_MULTI_BODY;
+ last_slice->xzsl_slice_offset_bytes = (uint32_t)
+ (((uintptr_t)last_slice) - ((uintptr_t)chunk));
+
+ // create a fake chunk out of the remainder before freeing it
+ xzm_segment_t segment = _xzm_segment_for_slice(
+ &sg->xzsg_main_ref->xzmz_base, chunk);
+ xzm_slice_kind_t tail_kind = slices_to_free > 1 ?
+ XZM_SLICE_KIND_LARGE_CHUNK : XZM_SLICE_KIND_TINY_CHUNK;
+ _xzm_segment_group_segment_span_mark_allocated(sg, segment, tail_kind,
+ _xzm_slice_index(segment, span_to_free), slices_to_free);
+ _malloc_lock_unlock(&sg->xzsg_lock);
+ // Realloc in place is disabled for the purgeable zone, so we can always
+ // pass purgeable=false here
+ xzm_segment_group_free_chunk(sg, span_to_free, false, false);
+ return true;
+ }
+ return true; // old size == new size, so no-op
+}
+
+bool
+xzm_segment_group_try_realloc_huge_chunk(xzm_segment_group_t sg,
+ xzm_malloc_zone_t zone, xzm_segment_t segment,
+ xzm_chunk_t chunk, xzm_slice_count_t new_slice_count)
+{
+ xzm_debug_assert(_xzm_segment_for_slice(&sg->xzsg_main_ref->xzmz_base,
+ chunk) == segment);
+ xzm_debug_assert(new_slice_count >
+ (XZM_LARGE_BLOCK_SIZE_MAX / XZM_SEGMENT_SLICE_SIZE));
+ xzm_debug_assert(chunk->xzc_bits.xzcb_kind == XZM_SLICE_KIND_HUGE_CHUNK);
+
+
+ if (chunk->xzcs_slice_count < new_slice_count) {
+ size_t current_size = 0;
+ vm_address_t current_ptr = (vm_address_t)_xzm_chunk_start(zone, chunk,
+ ¤t_size);
+ vm_address_t addr_to_request = current_ptr + current_size;
+ size_t slices_to_request = new_slice_count - chunk->xzcs_slice_count;
+ size_t size_to_request = slices_to_request * XZM_SEGMENT_SLICE_SIZE;
+
+ uintptr_t segment_to_check = roundup(addr_to_request, XZM_SEGMENT_SIZE);
+ while (segment_to_check < (addr_to_request+size_to_request)) {
+ // TODO: Once we have deferred reclaim for huge chunks, we have the
+ // option to do something more clever here (e.g. if all segments
+ // are unallocated or are still waiting to be reclaimed, then we
+ // can acquire those and realloc)
+ if (xzm_segment_table_query(sg->xzsg_main_ref,
+ (void *)segment_to_check)) {
+ return false;
+ }
+ segment_to_check += XZM_SEGMENT_SIZE;
+ }
+
+ int label = VM_MEMORY_REALLOC;
+ void *addr = mvm_allocate_plat(addr_to_request, size_to_request,
+ 0, VM_FLAGS_FIXED, 0, label, mvm_plat_map(segment->xzs_map));
+ if (addr) {
+ size_t new_body_size = new_slice_count * XZM_SEGMENT_SLICE_SIZE;
+ _xzm_segment_group_init_segment(sg, segment,
+ _xzm_segment_start(segment), new_body_size, true, false);
+
+ // If we expanded into new segment granules, mark them as allocated
+ uintptr_t first_new_segment = roundup(addr_to_request,
+ XZM_SEGMENT_SIZE);
+ if ((uintptr_t)current_ptr + new_body_size > first_new_segment) {
+ _xzm_segment_table_allocated_at(_xzm_malloc_zone_main(zone),
+ (void *)first_new_segment, segment, false);
+#if CONFIG_MTE
+ // If block grows on realloc(), tag additional size with same tag as
+ // allocation.
+ if (_xzm_segment_group_memtag_enabled(sg)) {
+ void *tagged_addr_to_request =
+ _memtag_load_tag((void *)current_ptr) +
+ current_size;
+ memtag_set_tag(tagged_addr_to_request, size_to_request);
+ }
+#endif
+ }
+
+ xzm_debug_assert(_xzm_segment_group_segment_is_valid(sg, segment));
+ return true;
+ }
+ return false;
+ } else if (chunk->xzcs_slice_count > new_slice_count) {
+ _xzm_segment_group_split_huge_segment(sg, segment, new_slice_count);
+ xzm_debug_assert(_xzm_segment_group_segment_is_valid(sg, segment));
+ return true;
+ }
+ return true; // old size == new size, so no-op
+}
+
+#endif // CONFIG_XZONE_MALLOC