Loading...
--- /dev/null
+++ dyld/dyld-1330/common/Algorithm.h
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2023 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#ifndef Algorithm_h
+#define Algorithm_h
+
+#include <atomic>
+#include <algorithm>
+#include <optional>
+#include <span>
+
+#include <dispatch/dispatch.h>
+
+#include "Array.h"
+
+// A global switch to force all uses of `dispatchApply` to run sequentially instead of in parallel.
+// This is useful for debugging parallel algorithms.
+extern bool gSerializeDispatchApply;
+
+template<typename T, typename Fn>
+void dispatchApply(T&& container, Fn fn);
+
+constexpr uint64_t defaultMapChunkSize = 0x2000;
+
+// mapReduce() is a generalized way to process the entire set of elements in parallel.
+// It uses a map-reduce style algorithm where the entire set of elements is broken up into
+// subranges (by default 8192 elements per subrange). In parallel, the subranges are passed
+// to the 'map' callback along with a T object. The map callback should process the range
+// of elements and update the results in the T object. Once all elements have been processed,
+// the 'reduce' callback is called with the full set of T objects. It should then combine
+// all the information from the Ts.
+//
+// Note: since many map callbacks are in flight at the same time, each should only store any
+// state information in the T object, and not in captured variables.
+//
+template<typename ElementTy, typename ChunkTy>
+inline void mapReduce(std::span<ElementTy> elements, size_t elementsPerChunk,
+ void (^map)(size_t, ChunkTy&, std::span<ElementTy>),
+ void (^reduce)(std::span<ChunkTy>)=nullptr)
+{
+ if ( elements.empty() )
+ return;
+
+ // map:
+ // divvy up all elements into chunks
+ // construct a T object for each chunk
+ // call map(elements range) on each MAP object for a subrange of elements
+ const size_t chunkCount = (elements.size() + (elementsPerChunk - 1)) / elementsPerChunk;
+ const size_t lastChunkIndex = chunkCount - 1;
+
+ // rdar://130080127 (Inputs with lots of potential duplicate functions may exhaust available stack space)
+ std::unique_ptr<ChunkTy[]> chunksHeapStorage;
+ ChunkTy* chunks = nullptr;
+ constexpr size_t MaxStackAllocaSize = 1 << 17; // limit at 128kb
+ if ( (sizeof(ChunkTy) * chunkCount) <= MaxStackAllocaSize ) {
+ void* space = alloca(sizeof(ChunkTy) * chunkCount);
+ chunks = new (space) ChunkTy[chunkCount];
+ } else {
+ chunksHeapStorage = std::make_unique<ChunkTy[]>(chunkCount);
+ chunks = chunksHeapStorage.get();
+ }
+
+ dispatchApply(std::span(chunks, chunkCount), ^(size_t i, ChunkTy& chunk) {
+ if ( i == lastChunkIndex )
+ map(i, chunk, elements.subspan(i * elementsPerChunk)); // Run the last chunk with whatever is left over
+ else
+ map(i, chunk, elements.subspan(i * elementsPerChunk, elementsPerChunk));
+ });
+
+ // reduce:
+ if ( reduce )
+ reduce(std::span<ChunkTy>(chunks, chunkCount));
+}
+
+template<typename ElementTy, typename Fn>
+inline void dispatchForEach(std::span<ElementTy> elements, size_t elementsPerChunk, Fn fn)
+{
+ if ( elements.empty() )
+ return;
+
+ // divvy up all elements into chunks
+ // call fn on each subrange of elements
+ const size_t chunkCount = (elements.size() + (elementsPerChunk - 1)) / elementsPerChunk;
+ const size_t lastChunkIndex = chunkCount - 1;
+ dispatchApply(chunkCount, [elements, elementsPerChunk, lastChunkIndex, fn](size_t chunkIndex) {
+ std::span<ElementTy> chunkElements;
+ if ( chunkIndex == lastChunkIndex )
+ chunkElements = elements.subspan(chunkIndex * elementsPerChunk); // Run the last chunk with whatever is left over
+ else
+ chunkElements = elements.subspan(chunkIndex * elementsPerChunk, elementsPerChunk);
+
+ size_t chunkStart = chunkIndex * elementsPerChunk;
+ for ( size_t i = 0; i < chunkElements.size(); ++i ) {
+ fn(chunkStart + i, chunkElements[i]);
+ }
+ });
+}
+
+template<typename ElementTy, typename Fn>
+inline void dispatchForEach(std::span<ElementTy> elements, Fn fn)
+{
+ dispatchForEach(elements, defaultMapChunkSize, fn);
+}
+
+template<typename ElementTy, typename ChunkTy>
+inline void mapReduce(std::span<ElementTy> elements, void (^map)(size_t, ChunkTy&, std::span<ElementTy>),
+ void (^reduce)(std::span<ChunkTy>)=nullptr)
+{
+ mapReduce(elements, defaultMapChunkSize, map, reduce);
+}
+
+#if 0
+// Unused. Leaving it here in case we want to use it for something in future.
+template<typename ElementTy, typename ChunkTy>
+inline void mapImmediateReduce(std::span<ElementTy> elements, size_t elementsPerChunk,
+ void (^map)(size_t, ChunkTy&, std::span<ElementTy>),
+ void (^reduce)(ChunkTy&))
+{
+ if ( elements.empty() )
+ return;
+
+ // map:
+ // divvy up all elements into chunks
+ // construct a T object for each chunk
+ // call map(elements range) on each MAP object for a subrange of elements
+ const size_t chunkCount = (elements.size() + (elementsPerChunk - 1)) / elementsPerChunk;
+ const size_t lastChunkIndex = chunkCount - 1;
+ ChunkTy chunksStorage[chunkCount];
+ ChunkTy* chunks = chunksStorage; // work around clang bug
+
+ // Chunks are applied (reduced) immediately when they are the next chunk in order
+ // We'll have a state per chunk and use atomics to compare them
+ // When a chunk is ready, it will be atomically swapped to the ready state
+ std::atomic_bool chunksStateStorage[chunkCount];
+ std::atomic_bool* chunksState = chunksStateStorage; // work around clang bug
+ for ( uint32_t i = 0; i != chunkCount; ++i )
+ chunksStateStorage[i].store(false, std::memory_order_relaxed);
+
+ // Chunk[0] is the next chunk we should apply
+ chunksStateStorage[0].store(true, std::memory_order_relaxed);
+
+ dispatchApply(std::span(chunks, chunkCount), ^(size_t i, ChunkTy& chunk) {
+ if ( i == lastChunkIndex )
+ map(i, chunk, elements.subspan(i * elementsPerChunk)); // Run the last chunk with whatever is left over
+ else
+ map(i, chunk, elements.subspan(i * elementsPerChunk, elementsPerChunk));
+
+ // Our chunk now has data. So we want to transition to the ready to be reduced state
+ // If we have already been marked as the next chunk to be reduced, then we can immediately reduce now
+ // If we are in the unset state, then just move to the ready state, and some other thread will do the reduce
+ bool expected = false;
+ if ( chunksState[i].compare_exchange_strong(expected, true) ) {
+ // Our chunk wasn't next to be reduced, but we've at least marked it as ready
+ // Some other thread will run reduce on it.
+ } else {
+ // This chunk is the next chunk, so we are going to reduce it
+ size_t chunkIndex = i;
+ while ( true ) {
+ reduce(chunks[chunkIndex]);
+
+ // Stop if we are out of chunks
+ if ( ++chunkIndex == chunkCount )
+ break;
+
+ // Try set the next chunk as being next to be reduced, from the false state
+ // In that succeeds, then we are done here, and some other thread
+ // will see the nextChunk state and handle it
+ expected = false;
+ if ( chunksState[chunkIndex].compare_exchange_strong(expected, true) ) {
+ break;
+ }
+ }
+ }
+ });
+}
+#endif
+
+template<typename T, typename Fn>
+inline void dispatchApply(T&& container, Fn fn)
+{
+ if constexpr ( std::is_convertible_v<T, size_t> ) {
+ size_t count = container;
+ if ( (count <= 1) || gSerializeDispatchApply ) {
+ for ( size_t i = 0; i < count; ++i )
+ fn(i);
+ } else {
+ dispatch_apply(count, DISPATCH_APPLY_AUTO, ^(size_t i) {
+ fn(i);
+ });
+ }
+ } else {
+ if ( (container.size() <= 1) || gSerializeDispatchApply ) {
+ for ( size_t i = 0; i < container.size(); ++i )
+ fn(i, container[i]);
+ } else {
+ dispatch_apply(container.size(), DISPATCH_APPLY_AUTO, ^(size_t i) {
+ fn(i, container[i]);
+ });
+ }
+ }
+}
+
+template<typename ValTy>
+inline void mergeVectorChunks(std::vector<ValTy>& outVec, std::span<std::vector<ValTy>> chunks)
+{
+ size_t totalSize = 0;
+ for ( auto& chunk : chunks ) {
+ totalSize += chunk.size();
+ }
+ if ( totalSize == 0 ) return;
+
+ outVec.reserve(outVec.size() + totalSize);
+ for ( auto& chunk : chunks ) {
+ if ( !chunk.empty() )
+ outVec.insert(outVec.end(), chunk.begin(), chunk.end());
+ }
+}
+
+template<typename ValTy>
+inline void mergeVectorChunks(std::vector<ValTy>& outVec, std::vector<ValTy>* chunks, size_t numChunks, size_t stride)
+{
+ size_t totalSize = 0;
+ for ( size_t i = 0; i < numChunks; ++i ) {
+ std::vector<ValTy>* chunk = (std::vector<ValTy>*)((uint8_t*)chunks + i * stride);
+ totalSize += chunk->size();
+ }
+ if ( totalSize == 0 ) return;
+
+ outVec.reserve(outVec.size() + totalSize);
+ for ( size_t i = 0; i < numChunks; ++i ) {
+ std::vector<ValTy>& chunk = *(std::vector<ValTy>*)((uint8_t*)chunks + i * stride);
+ if ( !chunk.empty() )
+ outVec.insert(outVec.end(), chunk.begin(), chunk.end());
+ }
+}
+
+namespace details
+{
+
+// Rturns a pair of iterators of the sorted range or std::nullopt if there are less than two
+// elements. Result iterators and input iterators form two sub-arrays at
+// [begin, result.first] and [result.second, end] that need to be sorted by calling
+// `quicksortPartTasks` recursively or with other algorithm.
+template <typename It, typename Comp>
+inline std::optional<std::pair<It, It>> quicksortPartTasks(It begin, It end, Comp comp) {
+ size_t size = std::distance(begin, end);
+ if ( size < 2 ) return std::nullopt;
+
+ const auto pivot = *(begin + size / 2);
+ auto low = begin - 1;
+ auto high = end;
+
+ while (true) {
+ do {
+ ++low;
+ } while (comp(*low, pivot));
+
+ do {
+ --high;
+ } while (comp(pivot, *high));
+
+ if (low >= high) {
+ return std::make_pair(low, high + 1);
+ }
+
+ std::swap(*low, *high);
+ }
+}
+
+constexpr size_t serialThreshold = 4096;
+}
+
+inline bool shouldUseParallelSort(size_t size) { return size > details::serialThreshold; }
+
+// NOTE: This implementation is suitable only for large ranges and when the comparison
+// is expensive, e.g. strings, so that it can be done in parallel. When sorting a simple
+// vector of integers the overhead of concurrency and simple quicksort implementation
+// will be slower than std::sort.
+//
+// Parallel sort algorithm based on divide-and-conquer and quicksort algorithm.
+// Regular quicksort algorithm could be written recursively as:
+// quicksort(It begin, It end)
+// if ( distance(begin, end) < 2 ) return
+// auto pivot = partition(begin, end)
+// quicksort(begin, pivot)
+// quicksort(pivot + 1, end)
+//
+// This parallel implementation works in the same principle, but instead of
+// serial recursive execution the subranges created by pivot partition
+// are gathered into a worklist and processed in parallel. e.g.:
+// 1. [begin, end] range is partitioned (sequentially) to create
+// two sub ranges [begin, pivot], [pivot + 1, end]
+// 2. the 2 subranges are added to the worklist array
+// and then both will be partitioned concurrently to create
+// 4 subranges
+// 3. then the 4 subranges will create 8 new subranges etc.
+// 4. this will be repeated until the number of elements in a subrange
+// exceed the details::serialThreshold limit, once the subrange
+// is smaller than the threshold it will be sorted with std::sort
+// and that will finish the recursion
+template <typename It, typename Comp>
+void parallelSort(It begin, It end, Comp comp)
+{
+ {
+ size_t size = std::distance(begin, end);
+ if ( size < 2 ) return;
+ if ( !shouldUseParallelSort(size) ) {
+ std::sort(begin, end, comp);
+ return;
+ }
+ }
+
+ auto startPair = details::quicksortPartTasks(begin, end, comp);
+ if ( !startPair ) return;
+
+ using TaskTy = std::pair<It, It>;
+ STACK_ALLOC_OVERFLOW_SAFE_ARRAY(TaskTy, nextTasks, 1024);
+ STACK_ALLOC_OVERFLOW_SAFE_ARRAY(TaskTy, currentTasks, 1024);
+ auto* nextTasksPtr = &nextTasks;
+
+ currentTasks.push_back(std::make_pair(begin, startPair->first));
+ currentTasks.push_back(std::make_pair(startPair->second, end));
+
+ do {
+ nextTasks.resize(currentTasks.count() * 2);
+
+ std::atomic_size_t nextTasksCount = 0;
+ auto* nextTasksCountPtr = &nextTasksCount;
+
+ auto doTask = ^(size_t i) {
+ auto curTask = currentTasks[i];
+
+ auto pair = details::quicksortPartTasks(curTask.first, curTask.second, comp);
+ if ( pair ) {
+ auto firstSize = std::distance(curTask.first, pair->first);
+ auto secondSize = std::distance(pair->second, curTask.second);
+
+ bool serialFirst = !shouldUseParallelSort(firstSize);
+ bool serialSecond = !shouldUseParallelSort(secondSize);
+
+ if ( serialFirst || serialSecond ) {
+ if ( serialFirst ) {
+ std::sort(curTask.first, pair->first, comp);
+ } else {
+ auto idx = nextTasksCountPtr->fetch_add(1, std::memory_order::relaxed);
+ (*nextTasksPtr)[idx] = std::make_pair(curTask.first, pair->first);
+ }
+
+ if ( serialSecond ) {
+ std::sort(pair->second, curTask.second, comp);
+ } else {
+ auto idx = nextTasksCountPtr->fetch_add(1, std::memory_order::relaxed);
+ (*nextTasksPtr)[idx] = std::make_pair(pair->second, curTask.second);
+ }
+ } else {
+ auto idx = nextTasksCountPtr->fetch_add(2, std::memory_order::relaxed);
+ (*nextTasksPtr)[idx] = std::make_pair(curTask.first, pair->first);
+ (*nextTasksPtr)[idx + 1] = std::make_pair(pair->second, curTask.second);
+ }
+ }
+ };
+ if ( currentTasks.count() == 1 ) {
+ doTask(0);
+ } else {
+ dispatch_apply(currentTasks.count(), DISPATCH_APPLY_AUTO, ^(size_t i) {
+ doTask(i);
+ });
+ }
+
+ nextTasks.resize(nextTasksCount);
+ {
+ currentTasks.clear();
+ auto tmp = std::move(currentTasks);
+ currentTasks = std::move(nextTasks);
+ nextTasks = std::move(tmp);
+ }
+ } while ( !currentTasks.empty() );
+}
+
+
+template <typename It>
+void parallelSort(It begin, It end)
+{
+ parallelSort(begin, end, std::less<std::remove_reference_t<decltype(*begin)>>{});
+}
+
+template <typename Container, typename Comp>
+void parallelSort(Container& c, Comp comp)
+{
+ parallelSort(c.begin(), c.end(), comp);
+}
+
+template <typename Container>
+void parallelSort(Container& c)
+{
+ parallelSort(c, std::less<std::remove_reference_t<decltype(*c.begin())>>{});
+}
+
+#endif /* Algorithm_h */