Loading...
common/Algorithm.h dyld-1340 /dev/null
--- dyld/dyld-1340/common/Algorithm.h
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
- * Copyright (c) 2023 Apple Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- */
-
-#ifndef Algorithm_h
-#define Algorithm_h
-
-#include <atomic>
-#include <algorithm>
-#include <optional>
-#include <span>
-
-#include <dispatch/dispatch.h>
-
-#include "Array.h"
-
-// A global switch to force all uses of `dispatchApply` to run sequentially instead of in parallel.
-// This is useful for debugging parallel algorithms.
-extern bool gSerializeDispatchApply;
-
-template<typename T, typename Fn>
-void dispatchApply(T&& container, Fn fn);
-
-constexpr uint64_t defaultMapChunkSize = 0x2000;
-
-// mapReduce() is a generalized way to process the entire set of elements in parallel.
-// It uses a map-reduce style algorithm where the entire set of elements is broken up into
-// subranges (by default 8192 elements per subrange). In parallel, the subranges are passed
-// to the 'map' callback along with a T object.  The map callback should process the range
-// of elements and update the results in the T object. Once all elements have been processed,
-// the 'reduce' callback is called with the full set of T objects. It should then combine
-// all the information from the Ts.
-//
-// Note: since many map callbacks are in flight at the same time, each should only store any
-// state information in the T object, and not in captured variables.
-//
-template<typename ElementTy, typename ChunkTy>
-inline void mapReduce(std::span<ElementTy> elements, size_t elementsPerChunk,
-                      void (^map)(size_t, ChunkTy&, std::span<ElementTy>),
-                      void (^reduce)(std::span<ChunkTy>)=nullptr)
-{
-    if ( elements.empty() )
-        return;
-
-    // map:
-    // divvy up all elements into chunks
-    // construct a T object for each chunk
-    // call map(elements range) on each MAP object for a subrange of elements
-    const size_t chunkCount     = (elements.size() + (elementsPerChunk - 1)) / elementsPerChunk;
-    const size_t lastChunkIndex = chunkCount - 1;
-
-    // rdar://130080127 (Inputs with lots of potential duplicate functions may exhaust available stack space)
-    std::unique_ptr<ChunkTy[]> chunksHeapStorage;
-    ChunkTy*                   chunks = nullptr;
-    constexpr size_t MaxStackAllocaSize = 1 << 17; // limit at 128kb
-    if ( (sizeof(ChunkTy) * chunkCount) <= MaxStackAllocaSize ) {
-        void* space = alloca(sizeof(ChunkTy) * chunkCount);
-        chunks = new (space) ChunkTy[chunkCount];
-    } else {
-        chunksHeapStorage = std::make_unique<ChunkTy[]>(chunkCount);
-        chunks = chunksHeapStorage.get();
-    }
-
-    dispatchApply(std::span(chunks, chunkCount), ^(size_t i, ChunkTy& chunk) {
-        if ( i == lastChunkIndex )
-            map(i, chunk, elements.subspan(i * elementsPerChunk)); // Run the last chunk with whatever is left over
-        else
-            map(i, chunk, elements.subspan(i * elementsPerChunk, elementsPerChunk));
-    });
-
-    // reduce:
-    if ( reduce )
-        reduce(std::span<ChunkTy>(chunks, chunkCount));
-}
-
-template<typename ElementTy, typename Fn>
-inline void dispatchForEach(std::span<ElementTy> elements, size_t elementsPerChunk, Fn fn)
-{
-    if ( elements.empty() )
-        return;
-
-    // divvy up all elements into chunks
-    // call fn on each subrange of elements
-    const size_t chunkCount     = (elements.size() + (elementsPerChunk - 1)) / elementsPerChunk;
-    const size_t lastChunkIndex = chunkCount - 1;
-    dispatchApply(chunkCount, [elements, elementsPerChunk, lastChunkIndex, fn](size_t chunkIndex) {
-        std::span<ElementTy> chunkElements;
-        if ( chunkIndex == lastChunkIndex )
-            chunkElements = elements.subspan(chunkIndex * elementsPerChunk); // Run the last chunk with whatever is left over
-        else
-            chunkElements = elements.subspan(chunkIndex * elementsPerChunk, elementsPerChunk);
-
-        size_t chunkStart = chunkIndex * elementsPerChunk;
-        for ( size_t i = 0; i < chunkElements.size(); ++i ) {
-            fn(chunkStart + i, chunkElements[i]);
-        }
-    });
-}
-
-template<typename ElementTy, typename Fn>
-inline void dispatchForEach(std::span<ElementTy> elements, Fn fn)
-{
-    dispatchForEach(elements, defaultMapChunkSize, fn);
-}
-
-template<typename ElementTy, typename ChunkTy>
-inline void mapReduce(std::span<ElementTy> elements, void (^map)(size_t, ChunkTy&, std::span<ElementTy>),
-                      void (^reduce)(std::span<ChunkTy>)=nullptr)
-{
-    mapReduce(elements, defaultMapChunkSize, map, reduce);
-}
-
-#if 0
-// Unused.  Leaving it here in case we want to use it for something in future.
-template<typename ElementTy, typename ChunkTy>
-inline void mapImmediateReduce(std::span<ElementTy> elements, size_t elementsPerChunk,
-                               void (^map)(size_t, ChunkTy&, std::span<ElementTy>),
-                               void (^reduce)(ChunkTy&))
-{
-    if ( elements.empty() )
-        return;
-
-    // map:
-    // divvy up all elements into chunks
-    // construct a T object for each chunk
-    // call map(elements range) on each MAP object for a subrange of elements
-    const size_t chunkCount     = (elements.size() + (elementsPerChunk - 1)) / elementsPerChunk;
-    const size_t lastChunkIndex = chunkCount - 1;
-    ChunkTy      chunksStorage[chunkCount];
-    ChunkTy*     chunks = chunksStorage; // work around clang bug
-
-    // Chunks are applied (reduced) immediately when they are the next chunk in order
-    // We'll have a state per chunk and use atomics to compare them
-    // When a chunk is ready, it will be atomically swapped to the ready state
-    std::atomic_bool chunksStateStorage[chunkCount];
-    std::atomic_bool* chunksState = chunksStateStorage; // work around clang bug
-    for ( uint32_t i = 0; i != chunkCount; ++i )
-        chunksStateStorage[i].store(false, std::memory_order_relaxed);
-
-    // Chunk[0] is the next chunk we should apply
-    chunksStateStorage[0].store(true, std::memory_order_relaxed);
-
-    dispatchApply(std::span(chunks, chunkCount), ^(size_t i, ChunkTy& chunk) {
-        if ( i == lastChunkIndex )
-            map(i, chunk, elements.subspan(i * elementsPerChunk)); // Run the last chunk with whatever is left over
-        else
-            map(i, chunk, elements.subspan(i * elementsPerChunk, elementsPerChunk));
-
-        // Our chunk now has data.  So we want to transition to the ready to be reduced state
-        // If we have already been marked as the next chunk to be reduced, then we can immediately reduce now
-        // If we are in the unset state, then just move to the ready state, and some other thread will do the reduce
-        bool expected = false;
-        if ( chunksState[i].compare_exchange_strong(expected, true) ) {
-            // Our chunk wasn't next to be reduced, but we've at least marked it as ready
-            // Some other thread will run reduce on it.
-        } else {
-            // This chunk is the next chunk, so we are going to reduce it
-            size_t chunkIndex = i;
-            while ( true ) {
-                reduce(chunks[chunkIndex]);
-
-                // Stop if we are out of chunks
-                if ( ++chunkIndex == chunkCount )
-                    break;
-
-                // Try set the next chunk as being next to be reduced, from the false state
-                // In that succeeds, then we are done here, and some other thread
-                // will see the nextChunk state and handle it
-                expected = false;
-                if ( chunksState[chunkIndex].compare_exchange_strong(expected, true) ) {
-                    break;
-                }
-            }
-        }
-    });
-}
-#endif
-
-template<typename T, typename Fn>
-inline void dispatchApply(T&& container, Fn fn)
-{
-    if constexpr ( std::is_convertible_v<T, size_t> ) {
-        size_t count = container;
-        if ( (count <= 1) || gSerializeDispatchApply ) {
-            for ( size_t i = 0; i < count; ++i )
-                fn(i);
-        } else {
-            dispatch_apply(count, DISPATCH_APPLY_AUTO, ^(size_t i) {
-                fn(i);
-            });
-        }
-    } else {
-        if ( (container.size() <= 1) || gSerializeDispatchApply ) {
-            for ( size_t i = 0; i < container.size(); ++i )
-                fn(i, container[i]);
-        } else {
-            dispatch_apply(container.size(), DISPATCH_APPLY_AUTO, ^(size_t i) {
-                fn(i, container[i]);
-            });
-        }
-    }
-}
-
-template<typename ValTy>
-inline void mergeVectorChunks(std::vector<ValTy>& outVec, std::span<std::vector<ValTy>> chunks)
-{
-    size_t totalSize = 0;
-    for ( auto& chunk : chunks ) {
-        totalSize += chunk.size();
-    }
-    if ( totalSize == 0 ) return;
-
-    outVec.reserve(outVec.size() + totalSize);
-    for ( auto& chunk : chunks ) {
-        if ( !chunk.empty() )
-            outVec.insert(outVec.end(), chunk.begin(), chunk.end());
-    }
-}
-
-template<typename ValTy>
-inline void mergeVectorChunks(std::vector<ValTy>& outVec, std::vector<ValTy>* chunks, size_t numChunks, size_t stride)
-{
-    size_t totalSize = 0;
-    for ( size_t i = 0; i < numChunks; ++i ) {
-        std::vector<ValTy>* chunk = (std::vector<ValTy>*)((uint8_t*)chunks + i * stride);
-        totalSize += chunk->size();
-    }
-    if ( totalSize == 0 ) return;
-
-    outVec.reserve(outVec.size() + totalSize);
-    for ( size_t i = 0; i < numChunks; ++i ) {
-        std::vector<ValTy>& chunk = *(std::vector<ValTy>*)((uint8_t*)chunks + i * stride);
-        if ( !chunk.empty() )
-            outVec.insert(outVec.end(), chunk.begin(), chunk.end());
-    }
-}
-
-namespace details
-{
-
-// Rturns a pair of iterators of the sorted range or std::nullopt if there are less than two
-// elements. Result iterators and input iterators form two sub-arrays at
-// [begin, result.first] and [result.second, end] that need to be sorted by calling
-// `quicksortPartTasks` recursively or with other algorithm.
-template <typename It, typename Comp>
-inline std::optional<std::pair<It, It>> quicksortPartTasks(It begin, It end, Comp comp) {
-    size_t size = std::distance(begin, end);
-    if ( size < 2 ) return std::nullopt;
-
-    const auto pivot = *(begin + size / 2);
-    auto low = begin - 1;
-    auto high = end;
-
-    while (true) {
-        do {
-            ++low;
-        } while (comp(*low, pivot));
-
-        do {
-            --high;
-        } while (comp(pivot, *high));
-
-        if (low >= high) {
-            return std::make_pair(low, high + 1);
-        }
-
-        std::swap(*low, *high);
-    }
-}
-
-constexpr size_t serialThreshold = 4096;
-}
-
-inline bool shouldUseParallelSort(size_t size) { return size > details::serialThreshold; }
-
-// NOTE: This implementation is suitable only for large ranges and when the comparison
-// is expensive, e.g. strings, so that it can be done in parallel. When sorting a simple
-// vector of integers the overhead of concurrency and simple quicksort implementation
-// will be slower than std::sort.
-//
-// Parallel sort algorithm based on divide-and-conquer and quicksort algorithm.
-// Regular quicksort algorithm could be written recursively as:
-// quicksort(It begin, It end)
-//  if ( distance(begin, end) < 2 ) return
-//  auto pivot = partition(begin, end)
-//  quicksort(begin, pivot)
-//  quicksort(pivot + 1, end)
-//
-// This parallel implementation works in the same principle, but instead of
-// serial recursive execution the subranges created by pivot partition
-// are gathered into a worklist and processed in parallel. e.g.:
-// 1. [begin, end] range is partitioned (sequentially) to create
-//    two sub ranges [begin, pivot], [pivot + 1, end]
-// 2. the 2 subranges are added to the worklist array
-//    and then both will be partitioned concurrently to create
-//    4 subranges
-// 3. then the 4 subranges will create 8 new subranges etc.
-// 4. this will be repeated until the number of elements in a subrange
-//    exceed the details::serialThreshold limit, once the subrange
-//    is smaller than the threshold it will be sorted with std::sort
-//    and that will finish the recursion
-template <typename It, typename Comp>
-void parallelSort(It begin, It end, Comp comp)
-{
-    {
-        size_t size = std::distance(begin, end);
-        if ( size < 2 ) return;
-        if ( !shouldUseParallelSort(size) ) {
-            std::sort(begin, end, comp);
-            return;
-        }
-    }
-
-    auto startPair = details::quicksortPartTasks(begin, end, comp);
-    if ( !startPair ) return;
-
-    using TaskTy = std::pair<It, It>;
-    STACK_ALLOC_OVERFLOW_SAFE_ARRAY(TaskTy, nextTasks, 1024);
-    STACK_ALLOC_OVERFLOW_SAFE_ARRAY(TaskTy, currentTasks, 1024);
-    auto* nextTasksPtr = &nextTasks;
-
-    currentTasks.push_back(std::make_pair(begin, startPair->first));
-    currentTasks.push_back(std::make_pair(startPair->second, end));
-
-    do {
-        nextTasks.resize(currentTasks.count() * 2);
-
-        std::atomic_size_t nextTasksCount = 0;
-        auto* nextTasksCountPtr = &nextTasksCount;
-
-        auto doTask = ^(size_t i) {
-            auto curTask = currentTasks[i];
-
-            auto pair = details::quicksortPartTasks(curTask.first, curTask.second, comp);
-            if ( pair ) {
-                auto firstSize = std::distance(curTask.first, pair->first);
-                auto secondSize = std::distance(pair->second, curTask.second);
-
-                bool serialFirst = !shouldUseParallelSort(firstSize);
-                bool serialSecond = !shouldUseParallelSort(secondSize);
-
-                if ( serialFirst || serialSecond ) {
-                    if ( serialFirst ) {
-                        std::sort(curTask.first, pair->first, comp);
-                    } else {
-                        auto idx = nextTasksCountPtr->fetch_add(1, std::memory_order::relaxed);
-                        (*nextTasksPtr)[idx] = std::make_pair(curTask.first, pair->first);
-                    }
-
-                    if ( serialSecond ) {
-                        std::sort(pair->second, curTask.second, comp);
-                    } else {
-                        auto idx = nextTasksCountPtr->fetch_add(1, std::memory_order::relaxed);
-                        (*nextTasksPtr)[idx] = std::make_pair(pair->second, curTask.second);
-                    }
-                } else {
-                    auto idx = nextTasksCountPtr->fetch_add(2, std::memory_order::relaxed);
-                    (*nextTasksPtr)[idx] = std::make_pair(curTask.first, pair->first);
-                    (*nextTasksPtr)[idx + 1] = std::make_pair(pair->second, curTask.second);
-                }
-            }
-        };
-        if ( currentTasks.count() == 1 ) {
-            doTask(0);
-        } else {
-            dispatch_apply(currentTasks.count(), DISPATCH_APPLY_AUTO, ^(size_t i) {
-                doTask(i);
-            });
-        }
-
-        nextTasks.resize(nextTasksCount);
-        {
-            currentTasks.clear();
-            auto tmp = std::move(currentTasks);
-            currentTasks = std::move(nextTasks);
-            nextTasks = std::move(tmp);
-        }
-    } while ( !currentTasks.empty() );
-}
-
-
-template <typename It>
-void parallelSort(It begin, It end)
-{
-    parallelSort(begin, end, std::less<std::remove_reference_t<decltype(*begin)>>{});
-}
-
-template <typename Container, typename Comp>
-void parallelSort(Container& c, Comp comp)
-{
-    parallelSort(c.begin(), c.end(), comp);
-}
-
-template <typename Container>
-void parallelSort(Container& c)
-{
-    parallelSort(c, std::less<std::remove_reference_t<decltype(*c.begin())>>{});
-}
-
-#endif /* Algorithm_h */