strtofp.c diff - stdlib/strtofp.c - Libc source code Libc-166

stdlib/strtofp.c /dev/null ⇄ Libc-1725.0.11
--- /dev/null
+++ Libc/Libc-1725.0.11/stdlib/strtofp.c
@@ -0,0 +1,3954 @@
+/*
+* Copyright (c) 2021 Apple Inc. All rights reserved.
+*
+* @APPLE_LICENSE_HEADER_START@
+*
+* This file contains Original Code and/or Modifications of Original Code
+* as defined in and that are subject to the Apple Public Source License
+* Version 2.0 (the 'License'). You may not use this file except in
+* compliance with the License. Please obtain a copy of the License at
+* http://www.opensource.apple.com/apsl/ and read it before using this
+* file.
+*
+* The Original Code and all software distributed under the License are
+* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+* Please see the License for the specific language governing rights and
+* limitations under the License.
+*
+* @APPLE_LICENSE_HEADER_END@
+*/
+
+// Implementation of `strtod` and related functions
+// Based on version 3 of "ffpp_strtofp"
+// Author: Tim Kientzle
+//
+// This file supports parsing floating-point values from ASCII/UTF8
+// text into the following commonly-used floating-point formats:
+//  * IEEE 754/ISO 60559 binary16 ("half")
+//  * IEEE 754/ISO 60559 binary32 ("float")
+//  * IEEE 754/ISO 60559 binary64 ("double")
+//  * Intel x87 80-bit extended format ("float80")
+//  * IEEE 754/ISO 60559 binary128
+//
+// The core functionality consists of the following private functions,
+// which can be compiled for use on any platform regardless of the
+// local floating-point support:
+//
+//  * _ffpp_strtoencf16_l (see footnote [1])
+//  * _ffpp_strtoencf32_l
+//  * _ffpp_strtoencf64_l
+//  * _ffpp_strtoencf80_l (see footnote [2])
+//  * _ffpp_strtoencf128_l
+//
+// The following standard functions are defined as wrappers around the
+// above to provide the APIs defined in the current C standardization
+// effort:
+//
+//  * strtof (ISO C17)
+//  * strtof_l (ISO C17)
+//  * strtod (ISO C17)
+//  * strtod_l (ISO C17)
+//  * strtold (ISO C17, see footnote [3])
+//  * strtold_l (ISO C17, see footnote [3])
+//  * strtoencf16 (TS 18661-3)
+//  * strtoencf32 (TS 18661-3)
+//  * strtoencf64 (TS 18661-3)
+//  * strtoencf64x (TS 18661-3, see footnote [2])
+//  * strtoencf128 (TS 18661-3)
+//
+// TODO: The following wrappers can easily be added for
+// any platform that defines the required `_Float##` types.
+//  * strtof16 (TS 18661-3)
+//  * strtof32 (TS 18661-3)
+//  * strtof64 (TS 18661-3)
+//  * strtof64x (TS 18661-3, see footnote [2])
+//  * strtof128 (TS 18661-3)
+//
+// IMPLEMENTATION NOTES
+// --------------------------------
+//
+// This is a new implementation that uses ideas from a number of
+// sources, including Clinger's 1990 paper, Gay's gdtoa library,
+// Lemire's fast_double_parser implementation, Google's abseil
+// library, as well as work I've done for the Swift standard library.
+//
+// All of the parsers use the same initial parsing logic and fall back
+// to the same arbitrary-precision integer code.  In between these,
+// they use varying format-specific optimizations.
+//
+// First Step: Initial Parsing
+//
+// The initial parsing of the textual input is handled by
+// `fastParse64`.  As the name suggests, this uses a fixed-size 64-bit
+// accumulator for speed and is heavily optimized assuming that the
+// input significand has at most 19 digits. Longer input will overflow
+// the accumulator, triggering an additional scan of the input.  This
+// initial parse also detects Hex floats, nans, and "inf"/"infinity"
+// strings and dispatches those to specialized implementations.
+//
+// With the initial parse complete, the challenge is to compute
+//    decimalSignificand * 10^decimalExponent
+// with precisely correct rounding as quickly as possible.
+//
+// Last Step: arbitrary-precision integer calculation (generalSlowpath)
+//
+// Specific formatters use a variety of optimized paths that provide
+// quick results for specific classes of input.  But none of those
+// work for every input value.  So we have a final fallback that uses
+// arbitrary-precision integer arithmetic to compute the exact results
+// with guaranteed accuracy for all inputs.  Of course, the required
+// arbitrary-precision arithmetic can be significantly more expensive,
+// especially when the significand is very long or the exponent is
+// very large.
+//
+// Two optimizations are worth mentioning:
+//
+// Powers of 5: We break the power of 10 computation into a power of 5
+// and a power of 2.  The latter can be simply folded into the final
+// FP exponent, so this effectively reduces the power of 10
+// computation to the computation of a power of 5, which is a
+// significantly smaller number.  For very large exponents, the run
+// time is dominated by this power of 5 computation.  (Up to 95% of
+// the CPU time for extreme binary128 values.)
+//
+// Limit on significand digits: I first saw this optimization in the
+// Abseil library.  First, consider the exact decimal expansions for
+// all the exact midpoints between adjacent pairs of floating-point
+// values.  There is some maximum number of significant digits
+// `maxDecimalMidpointDigits`.  Following an argument from Clinger, we
+// only need to be able to distinguish whether we are above or below
+// any such midpoint.  So it suffices to consider the first
+// `maxDecimalMidpointDigits`, appending a single digit that is
+// non-zero if the trailing digits are non-zero. This allows us to
+// limit the total size of the arithmetic used in this stage.  In
+// particular, for double, this limits us to less than 1024 bytes of
+// total space, which can easily fit on the stack, allowing us to
+// parse any double input regardless of length without allocation.
+//
+// For binary128, the comparable limit is 11564 digits, which gives a
+// maximum work buffer size of nearly 10k.  This seems a bit large for
+// the stack, but a buffer of 1536 bytes is big enough to process any
+// binary128 with less than 100 digits, regardless of exponent.  TODO:
+// For a smaller range of exponents, we can limit
+// maxDecimalMidpointDigits further.  That would allow us to process
+// any binary128 within a range of exponents regardless of number of
+// digits with the same 1536-byte buffer.
+//
+// Note: Compared to Clinger's AlgorithmR, this requires fewer
+// arbitrary-precision operations and gives the correct answer
+// directly without requiring a nearly-correct initial value.
+// Compared to Clinger's AlgorithmM, this takes advantage of the fact
+// that our integer arithmetic is occuring in the same base as used by
+// the final FP format.  This means we can interpret the bits from a
+// simple calculation instead of doing additional work to abstractly
+// compute the target format.
+//
+// These first and last steps (`fastParse64` and `generalSlowpath`)
+// are sufficient to provide guaranteed correct results for any
+// format.  The optimizations described next are accelerators that
+// allow us to provide a result more quickly for common cases where
+// the additional code complexity and testing cost can be justified.
+//
+// Optimization: Use a single Floating-point calculation
+//
+// Clinger(1990) observed that when converting to double, if the
+// significand and 10^k are both exactly representable by doubles,
+// then
+//    (double)significand * (double)10^k
+// is always correct with a single double multiplication.
+// Similarly, if 10^-k is exactly representable, then
+//    (double)significand / (double)10^(-k)
+// is always correct with a single double division.
+//
+// In particular, any significand of 15 digits or less can be exactly
+// represented by a double, as can any power of 10 up to 10^22.
+//
+// There are a few similar cases where we can provide exact inputs to
+// a single floating-point operation.  Since a single FP operation
+// always produces correctly-rounded results (in the current rounding
+// mode), these always produce correct results for the corresponding
+// range of inputs.  Since this relies on the hardware FPU, it is very
+// fast when it can be used.
+//
+// This optimization works especially well for hand-entered input,
+// which typically has few digits and a small exponent.  It works less
+// well for JSON, as random double values in JSON are typically
+// presented with 16 or 17 digits.  Fast FMA or mixed-precision
+// arithmetic can extend this technique further in certain
+// environments.  In particular, FPU-supported multiplication and
+// division of binary128 arguments with a binary64 result would handle
+// the common JSON cases.
+//
+// Optimization: Interval calculation
+//
+// We can easily compute fixed-precision upper and lower bounds for
+// the power-of-10 value from a lookup table.  Likewise, we can
+// construct bounds for an arbitrary-length significand by inspecting
+// just the first digits.  From these bounds, we can compute upper and
+// lower bounds for the final result, all with fast fixed-precision
+// integer arithmetic.  Depending on the precision, these upper and
+// lower bounds can coincide for more than 99% of all inputs,
+// guaranteeing the correct result in those cases.  This also allows
+// us to use fast fixed-precision arithmetic for very long inputs,
+// only using the first digits of the significand in cases where the
+// additional digits do not affect the result.
+//
+// PERFORMANCE
+// --------------------------------
+//
+// This strtod is about 10x faster than the implementation used by
+// Apple's libc prior to 2022.  On Lemire's `canada.txt` benchmark,
+// this implementation achieves over 700MB/s compared to 75MB/s for
+// the earlier implementation when parsing a large collection of
+// latitude/longitude values expressed as doubles with 15-17 digits.
+// That makes it about 70% the speed of Lemire's fast_double_parser
+// implementation, but still significantly faster than the other
+// implementations benchmarked by Lemire.
+//
+// Note: You can achieve speed similar to Lemire's implementation by
+// judiciously inlining supporting routines, disabling rounding mode
+// and locale support, and using a direct lookup into a fully-expanded
+// table instead of two table lookups and a multiplication.
+//
+// For other inputs, performance varies depending on the particular
+// optimization that gets used.  Binary128 and Float80 parsing here
+// uses a more heavily-factored power-of-10 table that requires up to
+// 3 multiplications to produce a lower bound.  Binary16 currently
+// has no format-specific optimizations since the arbitrary-precision
+// path is fast enough for such a small range of exponents.
+//
+// CORRECTNESS
+// --------------------------------
+//
+// Primary accuracy testing has relied on a modified form of the
+// "FFPP" test suite I developed for work on "SwiftDtoa.c".  This includes
+// over 100 million computed test cases.
+//
+// This strtod has also been fuzz-tested to ensure that the behavior
+// matches the previous macOS libc implementation based on gdtoa
+// (except for known bugs in that code):
+//  * It returns the same double result
+//  * It updates `end` identically
+//  * It updates `errno` identically
+//  * For any locale
+//  * For any standard FP rounding mode
+//
+// NOTES
+// --------------------------------
+//
+// General terminology note: I use "binary##" to refer to the IEEE 754
+// portable binary formats, and "float80" to refer to the Intel x87
+// 80-bit extended format.
+//
+// [1] TS18661-3 defines parsing functions that use the current
+// default locale, but it does not (as of early 2021) define any
+// variants that take an explicit locale argument. In order to provide
+// ISO C17 `strtod_l` functionality, I've found it useful to extend
+// the TS18661-3 suite of functions with `_l` variants.
+//
+// [2] The term "binary64x" or "f64x" in various standards refers
+// generally to any extended format, not specifically the Intel x87
+// 80-bit format.  This makes it difficult to use the term `f64x` for
+// portable implementations.  To keep things clear, I have
+// `strtoencf80_l` to specifically support the Intel float80 format
+// and define `strtoencf64x` as a wrapper that calls into either
+// `strtoencf64_l` or `strtoencf80_l` depending on the local system
+// conventions (using the same approach as used for `strtold`).
+//
+// [3] `strtold` is defined as a wrapper for whichever supported
+// format (binary64, float80, or binary128) corresponds to long double
+// on the local system.  Search below for uses of `LONG_DOUBLE_IS_` to
+// see how this is done.
+//
+//
+// TODO:
+//
+//  * Figure out how to test whether `_Float##` types are provided,
+//    use that to define `strtof##` automatically on systems that
+//    support TS18661-3.
+//
+//  * Support big-endian systems. Much of the support for
+//    binary16/32/64 should already work correctly since it builds the
+//    result bitwise in an unsigned integer and uses memcpy() to copy
+//    the bits.  This works if ints and floats have the same
+//    endianness, which I believe is universally true on all current
+//    hardware.  Other code builds up results assuming a particular
+//    byte order and will need to be modified to support big-endian
+//    environments.
+//
+
+#define _POSIX_C_SOURCE 200809L
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <float.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+// Check whether access to the floating-point environment is supported
+#if !defined(ENABLE_FENV_ACCESS)
+#if __has_include(<fenv.h>)
+  #include <fenv.h>
+
+  #define ENABLE_FENV_ACCESS 1
+#else
+  #define ENABLE_FENV_ACCESS 0
+#endif
+#endif
+
+// Check whether precision/range of floating-point values is defined
+#if __has_include(<math.h>)
+  #include <math.h>
+#else
+#if !defined(FLT_EVAL_METHOD)
+  // Note that Clang's builtin float.h header does define this macro
+  #define FLT_EVAL_METHOD -1
+#endif
+#endif
+
+// Check whether locales are supported
+#if !defined(ENABLE_LOCALE_SUPPORT)
+#if __has_include(<langinfo.h>) && __has_include(<locale.h>)
+  #include <langinfo.h>
+  #include <locale.h>
+#if defined(__APPLE__)
+  #include <xlocale.h>
+  #include <xlocale_private.h>
+#endif
+
+  #define ENABLE_LOCALE_SUPPORT 1
+#else
+  #define ENABLE_LOCALE_SUPPORT 0
+#endif
+#endif
+
+// ================================================================
+// Detect the floating-point formats supported on this platform
+// by testing the standard macros defined in float.h
+// ================================================================
+
+// Does "float" on this system use IEEE 754 binary32 format?
+// (Almost all modern systems do this.)
+#if (FLT_RADIX == 2) && (FLT_MANT_DIG == 24) && (FLT_MIN_EXP == -125) && (FLT_MAX_EXP == 128)
+  #define FLOAT_IS_BINARY32 1
+#else
+  #define FLOAT_IS_BINARY32 0
+#endif
+
+// Does "double" on this system use IEEE 754 binary64 format?
+// (Almost all modern systems do this.)
+#if (FLT_RADIX == 2) && (DBL_MANT_DIG == 53) && (DBL_MIN_EXP == -1021) && (DBL_MAX_EXP == 1024)
+  #define DOUBLE_IS_BINARY64 1
+#else
+  #define DOUBLE_IS_BINARY64 0
+#endif
+
+// Does "long double" on this system use IEEE 754 binary64 format?
+// (Example: Windows on all hardware, macOS on ARM.)
+#if (FLT_RADIX == 2) && (LDBL_MANT_DIG == 53) && (LDBL_MIN_EXP == -1021) && (LDBL_MAX_EXP == 1024)
+  #define LONG_DOUBLE_IS_BINARY64 1
+#else
+  #define LONG_DOUBLE_IS_BINARY64 0
+#endif
+
+// Is "long double" on this system the same as Float80?
+// (Example: macOS, Linux, and FreeBSD when running on x86 or x86_64 processors.)
+#if (FLT_RADIX == 2) && (LDBL_MANT_DIG == 64) && (LDBL_MIN_EXP == -16381) && (LDBL_MAX_EXP == 16384)
+  #define LONG_DOUBLE_IS_FLOAT80 1
+#else
+  #define LONG_DOUBLE_IS_FLOAT80 0
+#endif
+
+// Does "long double" on this system use IEEE 754 binary128 format?
+// (Example: Android on LP64 hardware.)
+#if (FLT_RADIX == 2) && (LDBL_MANT_DIG == 113) && (LDBL_MIN_EXP == -16381) && (LDBL_MAX_EXP == 16384)
+  #define LONG_DOUBLE_IS_BINARY128 1
+#else
+  #define LONG_DOUBLE_IS_BINARY128 0
+#endif
+
+// ================================================================
+// Detect/configure local platform arithmetic
+
+#if defined(__SIZEOF_INT128__)
+  // We get a significant speed boost if we can use the __uint128_t
+  // type that's present in GCC and Clang on 64-bit architectures.
+  #define HAVE_UINT128_T 1
+  #define MP_WORD_BITS 32 // Multi-precision ints use 32-bit words on 64-bit platforms
+#else
+  #define HAVE_UINT128_T 0
+  #define MP_WORD_BITS 16 // Multi-precision ints use 16-bit words on 32-bit platforms
+#endif
+
+// ================================================================
+// How to get locale-specific decimal point character
+// (This is more complicated than it should be, largely because
+// the standard functions are a lot slower than we'd like and
+// the standard mechanisms return an arbitrary-length string
+// for the decimal point.  So we go to some lengths to avoid
+// calling the standard functions:  We defer such calls until
+// we see a character that is not a digit or other known character,
+// and we handle the "C" locale specially.)
+
+// Define macros used for locale information below:
+//
+// strtofp_locale_decimal_point(loc)
+//  - Return the decimal point string for the specified locale
+//
+// strtofp_locale_t
+//  - Type of locale info:  locale_t on systems that support locales, else void *
+//
+// strtofp_C_locale
+//  - Constant that defines the "C" locale, used to bypass other locale checks
+//
+// strtofp_current_locale()
+//  - Returns the default locale to use when no explicit locale is given
+//
+#if !ENABLE_LOCALE_SUPPORT
+ // If no locale support, always use "." as the decimal point
+ #define strtofp_locale_t void *
+ #define strtofp_current_locale() (NULL)
+ #define strtofp_C_locale (NULL)
+ #define strtofp_locale_decimal_point(loc) ((const unsigned char *)("."))
+#elif defined(__linux__)
+ #define strtofp_locale_t locale_t
+ #define strtofp_current_locale() (uselocale(NULL))
+ #define strtofp_C_locale (NULL)
+ #define strtofp_locale_decimal_point(loc) \
+  ((const unsigned char *)((loc) == LC_GLOBAL_LOCALE ? nl_langinfo(RADIXCHAR) : nl_langinfo_l(RADIXCHAR, (loc))))
+#elif defined(__APPLE__)
+ #define strtofp_locale_t locale_t
+ #define strtofp_C_locale (NULL)
+ // For use inside of Libc: Use thread-specific locale if set, else global locale
+ #define strtofp_current_locale() (__current_locale())
+ // For use outside of libc
+ // #define strtofp_current_locale() (uselocale(NULL))
+ #define strtofp_locale_decimal_point(loc) ((const unsigned char *)(localeconv_l((loc))->decimal_point))
+#else
+ #error Need definition for strtofp_locale_decimal_point and strtofp_current_locale for this platform
+#endif
+
+// ================================================================
+// Floating-point rounding mode
+
+#if ENABLE_FENV_ACCESS
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wignored-pragmas"
+  #pragma STDC FENV_ACCESS ON
+  #pragma GCC diagnostic pop
+
+  #define FENV_ROUNDING_MODE() fegetround()
+#else
+  // By default, we use FE_TONEAREST, which rounds to the nearest representable value
+  #define FENV_ROUNDING_MODE() 0
+#endif
+
+// In certain scenarios, we require FE_TOWARDZERO, which rounds towards zero,
+// but need to provide a sentinel value if it is not defined
+#ifndef FE_TOWARDZERO
+  #define FE_TOWARDZERO 0xFE00
+#endif
+
+#if defined(FE_DOWNWARD) && FE_TOWARDZERO == FE_DOWNWARD
+  #error "Definition of FE_TOWARDZERO conflicts with FE_DOWNWARD"
+#endif
+#if defined(FE_TONEAREST) && FE_TOWARDZERO == FE_TONEAREST
+  #error "Definition of FE_TOWARDZERO conflicts with FE_TONEAREST"
+#endif
+#if defined(FE_UPWARD) && FE_TOWARDZERO == FE_UPWARD
+  #error "Definition of FE_TOWARDZERO conflicts with FE_UPWARD"
+#endif
+
+// ================================================================
+//
+// Enable/disable particular formats
+//
+// ================================================================
+
+// Enable binary16/32/64 by default everywhere.
+#ifndef ENABLE_BINARY16_SUPPORT
+ #define ENABLE_BINARY16_SUPPORT 1
+#endif
+#ifndef ENABLE_BINARY32_SUPPORT
+ #define ENABLE_BINARY32_SUPPORT 1
+#endif
+#ifndef ENABLE_BINARY64_SUPPORT
+ #define ENABLE_BINARY64_SUPPORT 1
+#endif
+// Enable float80 by default only if necessary to support long double
+#ifndef ENABLE_FLOAT80_SUPPORT
+ #if LONG_DOUBLE_IS_FLOAT80
+  #define ENABLE_FLOAT80_SUPPORT 1
+ #else
+  #define ENABLE_FLOAT80_SUPPORT 0
+ #endif
+#endif
+// Enable binary128 by default only if necessary to support long double
+#ifndef ENABLE_BINARY128_SUPPORT
+ #if LONG_DOUBLE_IS_BINARY128
+  #define ENABLE_BINARY128_SUPPORT 1
+ #else
+  #define ENABLE_BINARY128_SUPPORT 0
+ #endif
+#endif
+
+// Enable float80 interval optimization by default.
+// If you want to be able to tick off "float80" support but don't want to
+// pay for it, define this to 0.  Then you'll just get the minimum.
+// Enabling this costs about 3.5k of code.
+// Note: The code is shared with binary128, so there's no point to enabling
+// it for one but not the other.
+#if ENABLE_FLOAT80_SUPPORT
+ #ifndef ENABLE_FLOAT80_OPTIMIZATIONS
+  #define ENABLE_FLOAT80_OPTIMIZATIONS 1
+ #endif
+#else
+ #define ENABLE_FLOAT80_OPTIMIZATIONS 0
+#endif
+
+// Enable binary128 interval optimization by default.
+#if ENABLE_BINARY128_SUPPORT
+  #ifndef ENABLE_BINARY128_OPTIMIZATIONS
+   #define ENABLE_BINARY128_OPTIMIZATIONS 1
+ #endif
+#else
+ #define ENABLE_BINARY128_OPTIMIZATIONS 0
+#endif
+
+// At least one format must be enabled
+#if !ENABLE_BINARY16_SUPPORT && !ENABLE_BINARY32_SUPPORT && !ENABLE_BINARY64_SUPPORT && !ENABLE_FLOAT80_SUPPORT && !ENABLE_BINARY128_SUPPORT
+#error At least one format must be enabled
+#endif
+
+// ================================================================
+// Look up tables
+
+static const unsigned char hexdigit[256] = {
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+     0,    1,    2,    3,     4,    5,    6,    7,
+     8,    9, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff,   10,   11,   12,    13,   14,   15, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff,   10,   11,   12,    13,   14,   15, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff,  0xff, 0xff, 0xff, 0xff
+};
+
+// ================================================================
+// ================================================================
+//
+// Power-of-10 tables
+//
+// ================================================================
+// ================================================================
+
+
+#define binaryExponentFor10ToThe(p) ((int)(((((int64_t)(p)) * 55732705) >> 24) + 1))
+
+// For binary32 only, we need a single 880-byte powersOf10_Float table below
+
+// For binary64 only, we need a 224-byte subset of the first table and
+// a 232-byte additional table for a total of 456 bytes
+
+// For both, we can overlap somewhat so we only need the 880-byte binary32
+// table and the auxiliary 232-byte table for binary64, for a total of 1112 bytes.
+
+#if ENABLE_BINARY32_SUPPORT || ENABLE_BINARY64_SUPPORT || ENABLE_FLOAT80_OPTIMIZATIONS || ENABLE_BINARY128_OPTIMIZATIONS
+// These are the 64-bit binary significands of the
+// largest binary floating-point value that is not greater
+// than the corresponding power of 10.
+
+// That is, when these are not exact, they are less than
+// the exact decimal value.  This allows us to use these
+// to construct tight intervals around the true power
+// of ten value.
+static const uint64_t powersOf10_Float[] = {
+#if ENABLE_BINARY32_SUPPORT
+    0xb0af48ec79ace837ULL, // x 2^-232 ~= 10^-70
+    0xdcdb1b2798182244ULL, // x 2^-229 ~= 10^-69
+    0x8a08f0f8bf0f156bULL, // x 2^-225 ~= 10^-68
+    0xac8b2d36eed2dac5ULL, // x 2^-222 ~= 10^-67
+    0xd7adf884aa879177ULL, // x 2^-219 ~= 10^-66
+    0x86ccbb52ea94baeaULL, // x 2^-215 ~= 10^-65
+    0xa87fea27a539e9a5ULL, // x 2^-212 ~= 10^-64
+    0xd29fe4b18e88640eULL, // x 2^-209 ~= 10^-63
+    0x83a3eeeef9153e89ULL, // x 2^-205 ~= 10^-62
+    0xa48ceaaab75a8e2bULL, // x 2^-202 ~= 10^-61
+    0xcdb02555653131b6ULL, // x 2^-199 ~= 10^-60
+    0x808e17555f3ebf11ULL, // x 2^-195 ~= 10^-59
+    0xa0b19d2ab70e6ed6ULL, // x 2^-192 ~= 10^-58
+    0xc8de047564d20a8bULL, // x 2^-189 ~= 10^-57
+    0xfb158592be068d2eULL, // x 2^-186 ~= 10^-56
+    0x9ced737bb6c4183dULL, // x 2^-182 ~= 10^-55
+    0xc428d05aa4751e4cULL, // x 2^-179 ~= 10^-54
+    0xf53304714d9265dfULL, // x 2^-176 ~= 10^-53
+    0x993fe2c6d07b7fabULL, // x 2^-172 ~= 10^-52
+    0xbf8fdb78849a5f96ULL, // x 2^-169 ~= 10^-51
+    0xef73d256a5c0f77cULL, // x 2^-166 ~= 10^-50
+    0x95a8637627989aadULL, // x 2^-162 ~= 10^-49
+    0xbb127c53b17ec159ULL, // x 2^-159 ~= 10^-48
+    0xe9d71b689dde71afULL, // x 2^-156 ~= 10^-47
+    0x9226712162ab070dULL, // x 2^-152 ~= 10^-46
+    0xb6b00d69bb55c8d1ULL, // x 2^-149 ~= 10^-45
+    0xe45c10c42a2b3b05ULL, // x 2^-146 ~= 10^-44
+    0x8eb98a7a9a5b04e3ULL, // x 2^-142 ~= 10^-43
+    0xb267ed1940f1c61cULL, // x 2^-139 ~= 10^-42
+    0xdf01e85f912e37a3ULL, // x 2^-136 ~= 10^-41
+    0x8b61313bbabce2c6ULL, // x 2^-132 ~= 10^-40
+    0xae397d8aa96c1b77ULL, // x 2^-129 ~= 10^-39
+    0xd9c7dced53c72255ULL, // x 2^-126 ~= 10^-38
+    0x881cea14545c7575ULL, // x 2^-122 ~= 10^-37
+    0xaa242499697392d2ULL, // x 2^-119 ~= 10^-36
+    0xd4ad2dbfc3d07787ULL, // x 2^-116 ~= 10^-35
+    0x84ec3c97da624ab4ULL, // x 2^-112 ~= 10^-34
+    0xa6274bbdd0fadd61ULL, // x 2^-109 ~= 10^-33
+    0xcfb11ead453994baULL, // x 2^-106 ~= 10^-32
+    0x81ceb32c4b43fcf4ULL, // x 2^-102 ~= 10^-31
+    0xa2425ff75e14fc31ULL, // x 2^-99 ~= 10^-30
+    0xcad2f7f5359a3b3eULL, // x 2^-96 ~= 10^-29
+    0xfd87b5f28300ca0dULL, // x 2^-93 ~= 10^-28
+    0x9e74d1b791e07e48ULL, // x 2^-89 ~= 10^-27
+    0xc612062576589ddaULL, // x 2^-86 ~= 10^-26
+    0xf79687aed3eec551ULL, // x 2^-83 ~= 10^-25
+    0x9abe14cd44753b52ULL, // x 2^-79 ~= 10^-24
+    0xc16d9a0095928a27ULL, // x 2^-76 ~= 10^-23
+    0xf1c90080baf72cb1ULL, // x 2^-73 ~= 10^-22
+    0x971da05074da7beeULL, // x 2^-69 ~= 10^-21
+    0xbce5086492111aeaULL, // x 2^-66 ~= 10^-20
+    0xec1e4a7db69561a5ULL, // x 2^-63 ~= 10^-19
+    0x9392ee8e921d5d07ULL, // x 2^-59 ~= 10^-18
+    0xb877aa3236a4b449ULL, // x 2^-56 ~= 10^-17
+    0xe69594bec44de15bULL, // x 2^-53 ~= 10^-16
+    0x901d7cf73ab0acd9ULL, // x 2^-49 ~= 10^-15
+    0xb424dc35095cd80fULL, // x 2^-46 ~= 10^-14
+    0xe12e13424bb40e13ULL, // x 2^-43 ~= 10^-13
+    0x8cbccc096f5088cbULL, // x 2^-39 ~= 10^-12
+    0xafebff0bcb24aafeULL, // x 2^-36 ~= 10^-11
+    0xdbe6fecebdedd5beULL, // x 2^-33 ~= 10^-10
+    0x89705f4136b4a597ULL, // x 2^-29 ~= 10^-9
+    0xabcc77118461cefcULL, // x 2^-26 ~= 10^-8
+    0xd6bf94d5e57a42bcULL, // x 2^-23 ~= 10^-7
+    0x8637bd05af6c69b5ULL, // x 2^-19 ~= 10^-6
+    0xa7c5ac471b478423ULL, // x 2^-16 ~= 10^-5
+    0xd1b71758e219652bULL, // x 2^-13 ~= 10^-4
+    0x83126e978d4fdf3bULL, // x 2^-9 ~= 10^-3
+    0xa3d70a3d70a3d70aULL, // x 2^-6 ~= 10^-2
+    0xccccccccccccccccULL, // x 2^-3 ~= 10^-1
+#endif
+    // These values are exact; we use them together with
+    // _CoarseBinary64 below for binary64 format.
+    0x8000000000000000ULL, // x 2^1 == 10^0 exactly
+    0xa000000000000000ULL, // x 2^4 == 10^1 exactly
+    0xc800000000000000ULL, // x 2^7 == 10^2 exactly
+    0xfa00000000000000ULL, // x 2^10 == 10^3 exactly
+    0x9c40000000000000ULL, // x 2^14 == 10^4 exactly
+    0xc350000000000000ULL, // x 2^17 == 10^5 exactly
+    0xf424000000000000ULL, // x 2^20 == 10^6 exactly
+    0x9896800000000000ULL, // x 2^24 == 10^7 exactly
+    0xbebc200000000000ULL, // x 2^27 == 10^8 exactly
+    0xee6b280000000000ULL, // x 2^30 == 10^9 exactly
+    0x9502f90000000000ULL, // x 2^34 == 10^10 exactly
+    0xba43b74000000000ULL, // x 2^37 == 10^11 exactly
+    0xe8d4a51000000000ULL, // x 2^40 == 10^12 exactly
+    0x9184e72a00000000ULL, // x 2^44 == 10^13 exactly
+    0xb5e620f480000000ULL, // x 2^47 == 10^14 exactly
+    0xe35fa931a0000000ULL, // x 2^50 == 10^15 exactly
+    0x8e1bc9bf04000000ULL, // x 2^54 == 10^16 exactly
+    0xb1a2bc2ec5000000ULL, // x 2^57 == 10^17 exactly
+    0xde0b6b3a76400000ULL, // x 2^60 == 10^18 exactly
+    0x8ac7230489e80000ULL, // x 2^64 == 10^19 exactly
+    0xad78ebc5ac620000ULL, // x 2^67 == 10^20 exactly
+    0xd8d726b7177a8000ULL, // x 2^70 == 10^21 exactly
+    0x878678326eac9000ULL, // x 2^74 == 10^22 exactly
+    0xa968163f0a57b400ULL, // x 2^77 == 10^23 exactly
+    0xd3c21bcecceda100ULL, // x 2^80 == 10^24 exactly
+    0x84595161401484a0ULL, // x 2^84 == 10^25 exactly
+    0xa56fa5b99019a5c8ULL, // x 2^87 == 10^26 exactly
+    0xcecb8f27f4200f3aULL, // x 2^90 == 10^27 exactly
+#if ENABLE_BINARY32_SUPPORT
+    0x813f3978f8940984ULL, // x 2^94 ~= 10^28
+    0xa18f07d736b90be5ULL, // x 2^97 ~= 10^29
+    0xc9f2c9cd04674edeULL, // x 2^100 ~= 10^30
+    0xfc6f7c4045812296ULL, // x 2^103 ~= 10^31
+    0x9dc5ada82b70b59dULL, // x 2^107 ~= 10^32
+    0xc5371912364ce305ULL, // x 2^110 ~= 10^33
+    0xf684df56c3e01bc6ULL, // x 2^113 ~= 10^34
+    0x9a130b963a6c115cULL, // x 2^117 ~= 10^35
+    0xc097ce7bc90715b3ULL, // x 2^120 ~= 10^36
+    0xf0bdc21abb48db20ULL, // x 2^123 ~= 10^37
+    0x96769950b50d88f4ULL, // x 2^127 ~= 10^38
+    0xbc143fa4e250eb31ULL, // x 2^130 ~= 10^39
+#endif
+};
+#endif
+
+#if ENABLE_BINARY64_SUPPORT || ENABLE_FLOAT80_OPTIMIZATIONS || ENABLE_BINARY128_OPTIMIZATIONS
+ #if ENABLE_BINARY32_SUPPORT
+  static const uint64_t *powersOf10_Exact64 = powersOf10_Float + 70;
+ #else
+  static const uint64_t *powersOf10_Exact64 = powersOf10_Float;
+ #endif
+#endif
+
+#if ENABLE_BINARY64_SUPPORT
+// Rounded-down values supporting the full range of binary64.
+// As above, when not exact, these are rounded down to the
+// nearest value lower than or equal to the exact power of 10.
+//
+// Table size: 232 bytes
+//
+// We only store every 28th power of ten here.
+// We can multiply by an exact 64-bit power of
+// ten from powersOf10_Exact64 above to reconstruct the
+// significand for any power of 10.
+static const uint64_t powersOf10_CoarseBinary64[30] = {
+    0xdd5a2c3eab3097cbULL, // x 2^-1395 ~= 10^-420
+    0xdf82365c497b5453ULL, // x 2^-1302 ~= 10^-392
+    0xe1afa13afbd14d6dULL, // x 2^-1209 ~= 10^-364
+    0xe3e27a444d8d98b7ULL, // x 2^-1116 ~= 10^-336
+    0xe61acf033d1a45dfULL, // x 2^-1023 ~= 10^-308
+    0xe858ad248f5c22c9ULL, // x 2^-930 ~= 10^-280
+    0xea9c227723ee8bcbULL, // x 2^-837 ~= 10^-252
+    0xece53cec4a314ebdULL, // x 2^-744 ~= 10^-224
+    0xef340a98172aace4ULL, // x 2^-651 ~= 10^-196
+    0xf18899b1bc3f8ca1ULL, // x 2^-558 ~= 10^-168
+    0xf3e2f893dec3f126ULL, // x 2^-465 ~= 10^-140
+    0xf64335bcf065d37dULL, // x 2^-372 ~= 10^-112
+    0xf8a95fcf88747d94ULL, // x 2^-279 ~= 10^-84
+    0xfb158592be068d2eULL, // x 2^-186 ~= 10^-56
+    0xfd87b5f28300ca0dULL, // x 2^-93 ~= 10^-28
+    0x8000000000000000ULL, // x 2^1 == 10^0 exactly
+    0x813f3978f8940984ULL, // x 2^94 == 10^28 exactly
+    0x82818f1281ed449fULL, // x 2^187 ~= 10^56
+    0x83c7088e1aab65dbULL, // x 2^280 ~= 10^84
+    0x850fadc09923329eULL, // x 2^373 ~= 10^112
+    0x865b86925b9bc5c2ULL, // x 2^466 ~= 10^140
+    0x87aa9aff79042286ULL, // x 2^559 ~= 10^168
+    0x88fcf317f22241e2ULL, // x 2^652 ~= 10^196
+    0x8a5296ffe33cc92fULL, // x 2^745 ~= 10^224
+    0x8bab8eefb6409c1aULL, // x 2^838 ~= 10^252
+    0x8d07e33455637eb2ULL, // x 2^931 ~= 10^280
+    0x8e679c2f5e44ff8fULL, // x 2^1024 ~= 10^308
+    0x8fcac257558ee4e6ULL, // x 2^1117 ~= 10^336
+    0x91315e37db165aa9ULL, // x 2^1210 ~= 10^364
+    0x929b7871de7f22b9ULL, // x 2^1303 ~= 10^392
+};
+#endif
+
+#if ENABLE_FLOAT80_OPTIMIZATIONS || ENABLE_BINARY128_OPTIMIZATIONS
+// Every 56th power of 10 across the range of Float80/Binary128
+//
+// Table size: 2880 bytes
+static const uint64_t powersOf10_Binary128[180] = {
+    // Low-order ... high-order
+    0x0000000000000000ULL, 0x8000000000000000ULL, // x 2^1 == 10^0 exactly
+    0xbff8f10e7a8921a4ULL, 0x82818f1281ed449fULL, // x 2^187 ~= 10^56
+    0x03e2cf6bc604ddb0ULL, 0x850fadc09923329eULL, // x 2^373 ~= 10^112
+    0x90fb44d2f05d0842ULL, 0x87aa9aff79042286ULL, // x 2^559 ~= 10^168
+    0x82bd6b70d99aaa6fULL, 0x8a5296ffe33cc92fULL, // x 2^745 ~= 10^224
+    0xdb0b487b6423e1e8ULL, 0x8d07e33455637eb2ULL, // x 2^931 ~= 10^280
+    0x213a4f0aa5e8a7b1ULL, 0x8fcac257558ee4e6ULL, // x 2^1117 ~= 10^336
+    0x1c306f5d1b0b5fdfULL, 0x929b7871de7f22b9ULL, // x 2^1303 ~= 10^392
+    0xa7ea9c8838ce9437ULL, 0x957a4ae1ebf7f3d3ULL, // x 2^1489 ~= 10^448
+    0xbf1d49cacccd5e68ULL, 0x9867806127ece4f4ULL, // x 2^1675 ~= 10^504
+    0x655494c5c95d77f2ULL, 0x9b63610bb9243e46ULL, // x 2^1861 ~= 10^560
+    0x02e008393fd60b55ULL, 0x9e6e366733f85561ULL, // x 2^2047 ~= 10^616
+    0x55e04dba4b3bd4ddULL, 0xa1884b69ade24964ULL, // x 2^2233 ~= 10^672
+    0x44b222741eb1ebbfULL, 0xa4b1ec80f47c84adULL, // x 2^2419 ~= 10^728
+    0x1cf4a5c3bc09fa6fULL, 0xa7eb6799e8aec999ULL, // x 2^2605 ~= 10^784
+    0x3c4a575151b294dcULL, 0xab350c27feb90accULL, // x 2^2791 ~= 10^840
+    0x870a8d87239d8f35ULL, 0xae8f2b2ce3d5dbe9ULL, // x 2^2977 ~= 10^896
+    0xdd929f09c3eff5acULL, 0xb1fa17404a30e5e8ULL, // x 2^3163 ~= 10^952
+    0x1931b583a9431d7eULL, 0xb5762497dbf17a9eULL, // x 2^3349 ~= 10^1008
+    0xe30db03e0f8dd286ULL, 0xb903a90f561d25e2ULL, // x 2^3535 ~= 10^1064
+    0x9eb5cb19647508c5ULL, 0xbca2fc30cc19f090ULL, // x 2^3721 ~= 10^1120
+    0x24bd4c00042ad125ULL, 0xc054773d149bf26bULL, // x 2^3907 ~= 10^1176
+    0x7ea30dbd7ea479e3ULL, 0xc418753460cdcca9ULL, // x 2^4093 ~= 10^1232
+    0x764f4cf916b4deceULL, 0xc7ef52defe87b751ULL, // x 2^4279 ~= 10^1288
+    0xbeb7fbdc1cbe8b37ULL, 0xcbd96ed6466cf081ULL, // x 2^4465 ~= 10^1344
+    0xdce472c619aa3f63ULL, 0xcfd7298db6cb9672ULL, // x 2^4651 ~= 10^1400
+    0xe47defc14a406e4fULL, 0xd3e8e55c3c1f43d0ULL, // x 2^4837 ~= 10^1456
+    0xb7157c60a24a0569ULL, 0xd80f0685a81b2a81ULL, // x 2^5023 ~= 10^1512
+    0xfb0b98f6bbc4f0cbULL, 0xdc49f3445824e360ULL, // x 2^5209 ~= 10^1568
+    0xc6c6c1764e047e15ULL, 0xe09a13d30c2dba62ULL, // x 2^5395 ~= 10^1624
+    0x87e8dcfc09dbc33aULL, 0xe4ffd276eedce658ULL, // x 2^5581 ~= 10^1680
+    0xb1a3642a8da3cf4fULL, 0xe97b9b89d001dab3ULL, // x 2^5767 ~= 10^1736
+    0x2d4070f33b21ab7bULL, 0xee0ddd84924ab88cULL, // x 2^5953 ~= 10^1792
+    0xa2bf0c63a814e04eULL, 0xf2b70909cd3fd35cULL, // x 2^6139 ~= 10^1848
+    0x08f13995cf9c2747ULL, 0xf77790f0a48a45ceULL, // x 2^6325 ~= 10^1904
+    0x7a37993eb21444faULL, 0xfc4fea4fd590b40aULL, // x 2^6511 ~= 10^1960
+    0xb7b1ada9cdeba84dULL, 0x80a046447e3d49f1ULL, // x 2^6698 ~= 10^2016
+    0x0cc6866c5d69b2cbULL, 0x8324f8aa08d7d411ULL, // x 2^6884 ~= 10^2072
+    0x7fe2b4308dcbf1a3ULL, 0x85b64a659077660eULL, // x 2^7070 ~= 10^2128
+    0x1d73ef3eaac3c964ULL, 0x88547abb1d8e5bd9ULL, // x 2^7256 ~= 10^2184
+    0x1e34291b1ef566c7ULL, 0x8affca2bd1f88549ULL, // x 2^7442 ~= 10^2240
+    0x9e9383d73d486881ULL, 0x8db87a7c1e56d873ULL, // x 2^7628 ~= 10^2296
+    0x9cc5ee51962c011aULL, 0x907eceba168949b3ULL, // x 2^7814 ~= 10^2352
+    0x413407cfeeac9743ULL, 0x93530b43e5e2c129ULL, // x 2^8000 ~= 10^2408
+    0x7efa7d29c44e11b7ULL, 0x963575ce63b6332dULL, // x 2^8186 ~= 10^2464
+    0x5a848859645d1c6fULL, 0x9926556bc8defe43ULL, // x 2^8372 ~= 10^2520
+    0x51edea897b34601fULL, 0x9c25f29286e9ddb6ULL, // x 2^8558 ~= 10^2576
+    0xb50008d92529e91fULL, 0x9f3497244186fca4ULL, // x 2^8744 ~= 10^2632
+    0xf09e780bcc8238d9ULL, 0xa2528e74eaf101fcULL, // x 2^8930 ~= 10^2688
+    0x3a5828869701a165ULL, 0xa580255203f84b47ULL, // x 2^9116 ~= 10^2744
+    0x8b231a70eb5444ceULL, 0xa8bdaa0a0064fa44ULL, // x 2^9302 ~= 10^2800
+    0xfa1bde1f473556a4ULL, 0xac0b6c73d065f8ccULL, // x 2^9488 ~= 10^2856
+    0x7730e00421da4d55ULL, 0xaf69bdf68fc6a740ULL, // x 2^9674 ~= 10^2912
+    0x7f959cb702329d14ULL, 0xb2d8f1915ba88ca5ULL, // x 2^9860 ~= 10^2968
+    0x40c3a071220f5567ULL, 0xb6595be34f821493ULL, // x 2^10046 ~= 10^3024
+    0x11c48d02b8326bd3ULL, 0xb9eb5333aa272e9bULL, // x 2^10232 ~= 10^3080
+    0x566765461bd2f61bULL, 0xbd8f2f7a1ba47d6dULL, // x 2^10418 ~= 10^3136
+    0xb889018e4f6e9a52ULL, 0xc1454a673cb9b1ceULL, // x 2^10604 ~= 10^3192
+    0xf85333a94848659fULL, 0xc50dff6d30c3aefcULL, // x 2^10790 ~= 10^3248
+    0x1a1aeae7cf8a9d3dULL, 0xc8e9abc872eb2bc1ULL, // x 2^10976 ~= 10^3304
+    0x12e29f09d9061609ULL, 0xccd8ae88cf70ad84ULL, // x 2^11162 ~= 10^3360
+    0xdf7601457ca20b35ULL, 0xd0db689a89f2f9b1ULL, // x 2^11348 ~= 10^3416
+    0xcbdcd02f23cc7690ULL, 0xd4f23ccfb1916df5ULL, // x 2^11534 ~= 10^3472
+    0x44289dd21b589d7aULL, 0xd91d8fe9a3d019ccULL, // x 2^11720 ~= 10^3528
+    0x95aa118ec1d08317ULL, 0xdd5dc8a2bf27f3f7ULL, // x 2^11906 ~= 10^3584
+    0x72c4d2cad73b0a7bULL, 0xe1b34fb846321d04ULL, // x 2^12092 ~= 10^3640
+    0xe20a88f1134f906dULL, 0xe61e8ff47461cda9ULL, // x 2^12278 ~= 10^3696
+    0xc7c91d5c341ed39dULL, 0xea9ff638c54554e1ULL, // x 2^12464 ~= 10^3752
+    0xf659ede2159a45ecULL, 0xef37f1886f4b6690ULL, // x 2^12650 ~= 10^3808
+    0x78d946bab954b82fULL, 0xf3e6f313130ef0efULL, // x 2^12836 ~= 10^3864
+    0xc9b1474d8f89c269ULL, 0xf8ad6e3fa030bd15ULL, // x 2^13022 ~= 10^3920
+    0x6b1d2745340e7b14ULL, 0xfd8bd8b770cb469eULL, // x 2^13208 ~= 10^3976
+    0xf22e502fcdd4bca2ULL, 0x81415538ce493bd5ULL, // x 2^13395 ~= 10^4032
+    0x7c1735fc3b813c8cULL, 0x83c92edf425b292dULL, // x 2^13581 ~= 10^4088
+    0x0367500a8e9a178fULL, 0x865db7a9ccd2839eULL, // x 2^13767 ~= 10^4144
+    0xc9ac50475e25293aULL, 0x88ff2f2bade74531ULL, // x 2^13953 ~= 10^4200
+    0x0879b2e5f6ee8b1cULL, 0x8badd636cc48b341ULL, // x 2^14139 ~= 10^4256
+    0x2f33c652bd12fab7ULL, 0x8e69eee1f23f2be5ULL, // x 2^14325 ~= 10^4312
+    0xad6a6308a8e8b557ULL, 0x9133bc8f2a130fe5ULL, // x 2^14511 ~= 10^4368
+    0x9dbaa465efe141a0ULL, 0x940b83f23a55842aULL, // x 2^14697 ~= 10^4424
+    0x888c9ab2fc5b3437ULL, 0x96f18b1742aad751ULL, // x 2^14883 ~= 10^4480
+    0xba00864671d1053fULL, 0x99e6196979b978f1ULL, // x 2^15069 ~= 10^4536
+    0x61d59d402aae4feaULL, 0x9ce977ba0ce3a0bdULL, // x 2^15255 ~= 10^4592
+    0x803c1cd864033781ULL, 0x9ffbf04722750449ULL, // x 2^15441 ~= 10^4648
+    0xa28a151725a55e10ULL, 0xa31dcec2fef14b30ULL, // x 2^15627 ~= 10^4704
+    0x5b8452af2302fe13ULL, 0xa64f605b4e3352cdULL, // x 2^15813 ~= 10^4760
+    0x82b84cabc828bf93ULL, 0xa990f3c09110c544ULL, // x 2^15999 ~= 10^4816
+    0x8d29dd5122e4278dULL, 0xace2d92db0390b59ULL, // x 2^16185 ~= 10^4872
+    0x58f8fde02c03a6c6ULL, 0xb045626fb50a35e7ULL, // x 2^16371 ~= 10^4928
+    0xd950102978dbd0ffULL, 0xb3b8e2eda91a232dULL, // x 2^16557 ~= 10^4984
+};
+#endif
+
+// ================================================================
+// ================================================================
+//
+// Power-of-5 tables
+//
+// ================================================================
+// ================================================================
+
+// Integer powers of 5 used to drive the power-of-5 calculator used
+// in the slow path
+const static uint64_t powersOfFive[] = {1, 5, 25, 125, 625, 3125, 15625,
+  78125, 390625, 1953125, 9765625, 48828125UL, 244140625UL, 1220703125UL,
+  6103515625ULL, 30517578125ULL, 152587890625ULL, 762939453125ULL,
+  3814697265625ULL, 19073486328125ULL, 95367431640625ULL,
+  476837158203125ULL, 2384185791015625ULL, 11920928955078125ULL,
+  59604644775390625ULL, 298023223876953125ULL, 1490116119384765625ULL,
+  7450580596923828125ULL
+};
+
+//
+// Large integer powers of 5.  These are used to accelerate the
+// power-of-5 calculator when computing very large powers needed for
+// float80 and binary128 in the slow path.
+//
+// TODO: Consider dropping these.  If float80/binary128 optimizations
+// are enabled, then the interval optimization removes a lot of the
+// need for these.  (I've already disabled these to save code for
+// anyone that prefers code size to performance for these formats.)
+
+#if ENABLE_FLOAT80_OPTIMIZATIONS || ENABLE_BINARY128_OPTIMIZATIONS
+// Useful python snippet to generate these constants:
+// a = hex(5 ** 2000)[2:]
+// while a:
+//   print("0x" + a[-8:] + "UL,")
+//   a = a[:-8]
+
+#if MP_WORD_BITS == 32
+// Broken down into 32-bit words for use on 64-bit processors.
+// Stored from least-significant to most-significant.
+
+const static uint32_t fiveToThe1000[] = {
+  0xe76e2661UL, 0xa75f463cUL, 0x8f9ac297UL, 0xb426e5d5UL, 0x8021f455UL,
+  0x484b538eUL, 0xd39bcb19UL, 0x32c52790UL, 0x69805e7cUL, 0x78612ddbUL,
+  0x84a48774UL, 0x4b7ad105UL, 0xa1bca9a9UL, 0x16d4cc76UL, 0x1b3bc79dUL,
+  0x22c38759UL, 0x4ec6864aUL, 0x670dbd77UL, 0xa93ad95bUL, 0xfd07b514UL,
+  0x185d4af8UL, 0x491981abUL, 0x7f6e513bUL, 0x1a8298bfUL, 0xccdd06fcUL,
+  0x9c225999UL, 0x7a000d77UL, 0x211ecc86UL, 0x678b85c6UL, 0xccb243d3UL,
+  0x8c2fb72dUL, 0xf95cada8UL, 0x9c82b396UL, 0x3ef53cd3UL, 0x9093cb88UL,
+  0xb5aff7d4UL, 0x7da0bc3eUL, 0x522aea78UL, 0xa0165d55UL, 0x3b34e560UL,
+  0x80a282e5UL, 0xf0c08162UL, 0x9d7e4698UL, 0xe0fe7772UL, 0x3d9d710eUL,
+  0x8d98ec4eUL, 0xb5954022UL, 0xa0e542c3UL, 0x3be09b67UL, 0x5da7d253UL,
+  0x86949f98UL, 0x68641b9eUL, 0xaa166a05UL, 0xdfb84e6bUL, 0x2955510aUL,
+  0xe62bf617UL, 0xe8c18fc6UL, 0x8afb443aUL, 0x22f12a84UL, 0x048f59a7UL,
+  0xaf620341UL, 0xd377548eUL, 0xefd7f15cUL, 0x2f1121d5UL, 0x9aa2146bUL,
+  0xf36453b5UL, 0xc7a3e8ecUL, 0xcab2ae32UL, 0xdb4abe87UL, 0x5a54e6f2UL,
+  0xb015e34aUL, 0xc7e774f6UL, 0x3ce36UL,
+};
+
+#elif MP_WORD_BITS == 16
+
+// Broken down into 16-bit words for use on 32-bit processors.
+// Stored from least-significant to most-significant.
+const static uint16_t fiveToThe1000[] = {
+  0x2661U, 0xe76eU, 0x463cU, 0xa75fU, 0xc297U, 0x8f9aU, 0xe5d5U,
+  0xb426U, 0xf455U, 0x8021U, 0x538eU, 0x484bU, 0xcb19U, 0xd39bU,
+  0x2790U, 0x32c5U, 0x5e7cU, 0x6980U, 0x2ddbU, 0x7861U, 0x8774U,
+  0x84a4U, 0xd105U, 0x4b7aU, 0xa9a9U, 0xa1bcU, 0xcc76U, 0x16d4U,
+  0xc79dU, 0x1b3bU, 0x8759U, 0x22c3U, 0x864aU, 0x4ec6U, 0xbd77U,
+  0x670dU, 0xd95bU, 0xa93aU, 0xb514U, 0xfd07U, 0x4af8U, 0x185dU,
+  0x81abU, 0x4919U, 0x513bU, 0x7f6eU, 0x98bfU, 0x1a82U, 0x06fcU,
+  0xccddU, 0x5999U, 0x9c22U, 0x0d77U, 0x7a00U, 0xcc86U, 0x211eU,
+  0x85c6U, 0x678bU, 0x43d3U, 0xccb2U, 0xb72dU, 0x8c2fU, 0xada8U,
+  0xf95cU, 0xb396U, 0x9c82U, 0x3cd3U, 0x3ef5U, 0xcb88U, 0x9093U,
+  0xf7d4U, 0xb5afU, 0xbc3eU, 0x7da0U, 0xea78U, 0x522aU, 0x5d55U,
+  0xa016U, 0xe560U, 0x3b34U, 0x82e5U, 0x80a2U, 0x8162U, 0xf0c0U,
+  0x4698U, 0x9d7eU, 0x7772U, 0xe0feU, 0x710eU, 0x3d9dU, 0xec4eU,
+  0x8d98U, 0x4022U, 0xb595U, 0x42c3U, 0xa0e5U, 0x9b67U, 0x3be0U,
+  0xd253U, 0x5da7U, 0x9f98U, 0x8694U, 0x1b9eU, 0x6864U, 0x6a05U,
+  0xaa16U, 0x4e6bU, 0xdfb8U, 0x510aU, 0x2955U, 0xf617U, 0xe62bU,
+  0x8fc6U, 0xe8c1U, 0x443aU, 0x8afbU, 0x2a84U, 0x22f1U, 0x59a7U,
+  0x048fU, 0x0341U, 0xaf62U, 0x548eU, 0xd377U, 0xf15cU, 0xefd7U,
+  0x21d5U, 0x2f11U, 0x146bU, 0x9aa2U, 0x53b5U, 0xf364U, 0xe8ecU,
+  0xc7a3U, 0xae32U, 0xcab2U, 0xbe87U, 0xdb4aU, 0xe6f2U, 0x5a54U,
+  0xe34aU, 0xb015U, 0x74f6U, 0xc7e7U, 0xce36U, 0x3U,
+};
+#endif
+#endif
+
+// ================================================================
+// ================================================================
+//
+// Fixed-width integer routines
+//
+// ================================================================
+// ================================================================
+
+#if HAVE_UINT128_T
+#define multiply64x64RoundingDown(lhs, rhs) (((__uint128_t)(lhs) * (rhs)) >> 64)
+#else
+static uint64_t multiply64x64RoundingDown(uint64_t lhs, uint64_t rhs) {
+  uint64_t a = (lhs >> 32) * (rhs >> 32);
+  uint64_t b = (lhs >> 32) * (rhs & UINT32_MAX);
+  uint64_t c = (lhs & UINT32_MAX) * (rhs >> 32);
+  uint64_t d = (lhs & UINT32_MAX) * (rhs & UINT32_MAX);
+  b += (c & UINT32_MAX) + (d >> 32);
+  return a + (b >> 32) + (c >> 32);
+}
+#endif
+
+// ================================================================
+// ================================================================
+//
+// Multi-Precision Integer Routines
+//
+// ================================================================
+// ================================================================
+
+// Configure our multi-precision integer machinery with appropriate types
+#if MP_WORD_BITS == 16
+typedef uint16_t mp_word_t; // Type of a single word in an MP integer
+typedef uint32_t mp_dword_t; // Type big enough to hold a two-word MP integer
+static const mp_word_t MP_WORD_MAX = UINT16_MAX;
+#elif MP_WORD_BITS == 32
+typedef uint32_t mp_word_t; // Type of a single word in an MP integer
+typedef uint64_t mp_dword_t; // Type big enough to hold a two-word MP integer
+static const mp_word_t MP_WORD_MAX = UINT32_MAX;
+#else
+#error MP_WORD_BITS undefined
+#endif
+static const int mp_word_bits = sizeof(mp_word_t) * 8;
+// __builtin_clz() takes an `unsigned` which may be different from mp_word_t
+// This adjusts the result accordingly.
+#define CLZ_WORD(word) ((unsigned)__builtin_clz((word)) + (mp_word_bits - sizeof(unsigned) * 8))
+
+// A multi-precision integer is represented as two pointers:
+// lsw - points to least-significant word (lowest address in memory)
+// msw - points to one beyond the most-significant word
+// In particular, the following are true:
+//  number of words == msw - lsw
+//  mp is empty/zero if msw == lsw
+//  lsw[0] is least-significant word
+//  lsw[1] is second-least-significant word
+//  ...
+//  msw[-2] is second-most-significant word
+//  msw[-1] is most-significant word
+
+// Caveat: The MP routines in this file are optimized to just extend
+// the value in-place as necessary without any checking to see whether
+// the allocated memory is sufficient.  That requires that the original
+// allocation be sufficient for the largest value that will occur.
+typedef struct { mp_word_t *lsw; mp_word_t *msw; } mp_t;
+
+// Shift the mpint left by the indicated number of
+// bits.  This will extend the mpint at the msw end
+// as necessary.
+static void shiftLeftMP(mp_t *work, int shift) {
+  int wordsShift = shift / mp_word_bits;
+  int bitsShift = shift % mp_word_bits;
+
+  if (wordsShift > 0) {
+    memmove(work->lsw + wordsShift,
+            work->lsw,
+            (work->msw - work->lsw) * sizeof(mp_word_t));
+    memset(work->lsw, 0, wordsShift * sizeof(mp_word_t));
+    work->msw += wordsShift;
+  }
+
+  if (bitsShift > 0) {
+    mp_dword_t t = 0;
+    mp_word_t *p = work->lsw;
+    for (; p < work->msw; p++) {
+      t |= (mp_dword_t)*p << bitsShift;
+      *p = (mp_word_t)t;
+      t >>= mp_word_bits;
+    }
+    if (t != 0) {
+      *p = (mp_word_t)t;
+      work->msw = p + 1;
+    }
+  }
+}
+
+// Return the index of the most-significant set bit
+static int bitCountMP(mp_t work) {
+  if (work.msw == work.lsw) {
+    return 0;
+  }
+  assert(work.msw[-1] != 0); // High-order word cannot be zero
+  return (int)(mp_word_bits - CLZ_WORD(work.msw[-1])) // Bits in high-order word
+    + (int)(mp_word_bits * (work.msw - work.lsw - 1)); // Bits in all other words
+}
+
+// Add small integer to varint
+static void addToMP(mp_t *work, uint64_t addend) {
+  uint64_t t = addend;
+  mp_word_t *p = work->lsw;
+  while (t > 0 && p < work->msw) {
+    t += *p;
+    *p = (mp_word_t)t;
+    t >>= mp_word_bits;
+    p += 1;
+  }
+  while (t > 0) {
+    *p = (mp_word_t)t;
+    t >>= mp_word_bits;
+    p += 1;
+  }
+  if (p > work->msw) {
+    work->msw = p;
+  }
+}
+
+// Shift an MP right, rounding the result according to
+// the current FP rounding mode.
+static mp_t shiftRightMPWithRounding(mp_t work,
+                                     int shift,
+                                     int trailingNonZero,
+                                     bool negative,
+                                     int roundingMode) {
+  (void)negative;
+
+  if (shift == 0) {
+    return work;
+  }
+  if (shift < 0) {
+    shiftLeftMP(&work, -shift);
+    return work;
+  }
+
+  mp_t result = work;
+  int wordsShift = shift / mp_word_bits;
+  int bitsShift = shift % mp_word_bits;
+
+  if (bitsShift == 0) {
+    // We don't really need to shift anything, just drop the low-order
+    // words and possibly increment.
+    result.lsw += wordsShift;
+    switch (roundingMode) {
+#ifdef FE_TOWARDZERO
+    case FE_TOWARDZERO:
+      return result;
+#endif
+#ifdef FE_DOWNWARD
+    case FE_DOWNWARD:
+      // Upwards & downwards rounding are symmetric
+      negative = !negative;
+      // FALL THROUGH
+#endif
+#ifdef FE_UPWARD
+    case FE_UPWARD:
+      for (mp_word_t *p = work.lsw; p < result.lsw; p++) {
+        trailingNonZero |= *p;
+      }
+      if (negative || !trailingNonZero) {
+        return result;
+      } else {
+        break; // Increment and return
+      }
+#endif
+#ifdef FE_TONEAREST
+    case FE_TONEAREST:
+#endif
+    default: {
+      // shift is non-zero, so result.lsw[-1] is valid
+      // and is the most-significant-word of the fraction:
+      mp_word_t fractionMsw = result.lsw[-1];
+      mp_word_t oneHalf = 1U << (mp_word_bits - 1);
+      if (fractionMsw < oneHalf) {
+        return result;
+      } else if (fractionMsw > oneHalf) {
+        break; // Increment and return
+      } else {
+        for (mp_word_t *p = work.lsw; p < result.lsw - 1; p++) {
+          trailingNonZero |= *p;
+        }
+        if (trailingNonZero || ((result.msw > result.lsw) && (result.lsw[0] & 1))) {
+          break; // Increment and return
+        }
+        return result;
+      }
+    }
+    }
+    // Increment and return
+    addToMP(&result, 1);
+    return result;
+  }
+
+  result.lsw += wordsShift;
+  mp_word_t fraction = result.lsw[0] & ((1U << bitsShift) - 1);
+
+  {
+    mp_word_t *p = result.lsw;
+    mp_dword_t t = *p++ >> bitsShift;
+    for (; p < result.msw; p++) {
+      t |= (mp_dword_t)*p << (mp_word_bits - bitsShift);
+      p[-1] = (mp_word_t)t;
+      t >>= mp_word_bits;
+    }
+    if (t == 0) {
+      result.msw -= 1;
+    } else {
+      p[-1] = (mp_word_t)t;
+    }
+  }
+
+  switch (roundingMode) {
+#ifdef FE_TOWARDZERO
+  case FE_TOWARDZERO:
+    return result;
+#endif
+#ifdef FE_DOWNWARD
+  case FE_DOWNWARD:
+    // Upwards & downwards rounding are symmetric
+    negative = !negative;
+    // FALL THROUGH
+#endif
+#ifdef FE_UPWARD
+  case FE_UPWARD:
+    trailingNonZero |= fraction;
+    for (mp_word_t *p = work.lsw; p < result.lsw; p++) {
+      trailingNonZero |= *p;
+    }
+    if (negative || !trailingNonZero) {
+      return result;
+    } else {
+      break; // Increment and return
+    }
+#endif
+#ifdef FE_TONEAREST
+  case FE_TONEAREST:
+#endif
+  default: {
+    mp_word_t half = (mp_word_t)(1U << (bitsShift - 1));
+    if (fraction < half) {
+      return result;
+    } else if (fraction > half) {
+      break; // Increment and return
+    } else { // First part of fraction is exact half..
+      for (mp_word_t *p = work.lsw; p < result.lsw; p++) {
+        trailingNonZero |= *p;
+      }
+      if (trailingNonZero || ((result.msw > result.lsw) && (result.lsw[0] & 1))) {
+        break; // Increment and return
+      } else {
+        return result;
+      }
+    }
+  }
+  }
+  // Increment and return
+  addToMP(&result, 1);
+  return result;
+}
+
+static mp_t shiftRightMPWithTruncation(mp_t work,
+                                       int shift) {
+  return shiftRightMPWithRounding(work, shift,
+                                  0, 0, FE_TOWARDZERO);
+}
+
+// Multiply varint by N
+static void multiplyMPByN(mp_t *work, uint32_t n) {
+  uint64_t t = 0;
+  for (mp_word_t *p = work->lsw; p < work->msw; p++) {
+    t += *p * (uint64_t)n;
+    *p = (mp_word_t)t;
+    t >>= mp_word_bits;
+  }
+  while (t > 0) {
+    work->msw[0] = (mp_word_t)t;
+    t >>= mp_word_bits;
+    work->msw += 1;
+  }
+}
+
+static void multiplyByFiveToTheN(mp_t *dest, int power) {
+#if HAVE_UINT128_T
+  // 128-bit arithmetic lets us multiply 32-bit words by 5^40 (93 bits).
+  // For a double, this can loop up to 8 times.
+  // Without the large-power optimization in fiveToTheN, this can
+  // loop > 100 times for a binary128.  :-/
+  while (power > 40) {
+    static const uint64_t fiveToThe20 = 95367431640625ULL;
+    static const __uint128_t fiveToThe40 = (__uint128_t)fiveToThe20 * fiveToThe20;
+    __uint128_t t = 0;
+    mp_word_t *p = dest->lsw;
+    while (p < dest->msw) {
+      t += *p * fiveToThe40;
+      *p++ = (mp_word_t)t;
+      t >>= mp_word_bits;
+    }
+    while (t > 0) {
+      *p++ = (mp_word_t)t;
+      t >>= mp_word_bits;
+    }
+    dest->msw = p;
+    power -= 40;
+  }
+#endif
+
+  while (power > 0) {
+#if MP_WORD_BITS == 16
+    // Limit to 5^20 (47 bits) so we don't overflow
+    // a 64-bit accumulator.
+    const static int maxPower = 13; // For 64-bit
+    uint64_t t = 0;
+    uint64_t powerOfFive = powersOfFive[power > maxPower ? maxPower : power];
+#elif MP_WORD_BITS == 32
+    // With a 128-bit accumulator, we can use the full table
+    // of 64-bit powers (up to 5^27).
+    const static int maxPower = 27;
+    __uint128_t t = 0;
+    __uint128_t powerOfFive = powersOfFive[power > maxPower ? maxPower : power];
+#endif
+    mp_word_t *p = dest->lsw;
+    while (p < dest->msw) {
+      t += *p * powerOfFive;
+      *p++ = (mp_word_t)t;
+      t >>= mp_word_bits;
+    }
+    while (t > 0) {
+      *p++ = (mp_word_t)t;
+      t >>= mp_word_bits;
+    }
+    dest->msw = p;
+    power -= maxPower;
+  }
+}
+
+// Compute 5^N
+// This is _THE_ performance-critical function for the slow path
+// when handling float80 or binary128 with large exponents.
+// (Of course, that only applies to the few percent of inputs
+// that don't get handled by the optimized paths.)
+static void fiveToTheN(mp_t *dest, int power) {
+
+#if ENABLE_FLOAT80_OPTIMIZATIONS || ENABLE_BINARY128_OPTIMIZATIONS
+  // Accelerate a very large power with a pre-computed initial value
+  // Only for float80/binary128 since binary64 only goes up to 10^325
+  if (power >= 1000) {
+    memcpy(dest->lsw, fiveToThe1000, sizeof(fiveToThe1000));
+    dest->msw = dest->lsw + sizeof(fiveToThe1000) / sizeof(fiveToThe1000[0]);
+    power -= 1000;
+  }
+  else
+#endif
+  {
+    // Initialize with as large a power of 5 as we can from the standard table
+    const static int maxTablePower = sizeof(powersOfFive) / sizeof(powersOfFive[0]) - 1;
+    int thisPower = power > maxTablePower ? maxTablePower : power;
+    uint64_t t = powersOfFive[thisPower];
+    mp_word_t *p = dest->lsw;
+    while (t > 0) {
+      *p++ = (mp_word_t)t;
+      t >>= mp_word_bits;
+    }
+    dest->msw = p;
+    power -= thisPower;
+  }
+
+  multiplyByFiveToTheN(dest, power);
+}
+
+// Following "Algorithm D" from Knuth AOCP Section 4.3.1
+// Accepts:
+//   Numerator
+//   Denominator
+//   *nonZeroRemainder: integer to hold remainder status
+// Returns:
+//   quotient stored in numerator area
+//   numerator is destroyed
+//   *nonZeroRemainder set to non-zero iff remainder was non-zero
+static mp_t divideMPByMP(mp_t numerator, mp_t denominator, int *nonZeroRemainder) {
+  // Make sure we haven't picked up a leading zero word anywhere...
+  assert(numerator.msw > numerator.lsw);
+  assert(denominator.msw > denominator.lsw);
+  assert(numerator.msw[-1] != 0);
+  assert(denominator.msw[-1] != 0);
+
+  // Full long division algorithm assumes denominator is more than 1 word,
+  // so we need to handle 1-word case separately.
+  if (denominator.msw - denominator.lsw == 1) {
+    mp_dword_t n = denominator.lsw[0];
+    mp_dword_t t = 0;
+    mp_word_t *p = numerator.msw;
+    while (p > numerator.lsw) {
+      p -= 1;
+      t <<= mp_word_bits;
+      t += *p;
+      mp_word_t q0 = (mp_word_t)(t / n);
+      *p = q0;
+      t -= q0 * n;
+    }
+    *nonZeroRemainder = (t != 0);
+    while (numerator.msw[-1] == 0) {
+      numerator.msw -= 1;
+    }
+    return numerator;
+  }
+
+  // D1. Normalize: Multiply numerator and denominator by a power of 2
+  // so that denominator has the most significant bit set in the
+  // most significant word.  This guarantees that qhat below
+  // will always be very good.
+  int shift = (int)CLZ_WORD(denominator.msw[-1]);
+  shiftLeftMP(&denominator, shift);
+  shiftLeftMP(&numerator, shift);
+
+  // Add a high-order word to the numerator if necessary
+  if (numerator.msw[-1] >= denominator.msw[-1]) {
+    numerator.msw[0] = 0;
+    numerator.msw += 1;
+  }
+
+  // D2. Iterate
+  // Numerator and denominator must not be immediately adjacent in
+  // memory, since we need an extra word for the quotient to fit in.
+  // TODO: Rearrange this so that the quotient can exactly overlay
+  // the numerator instead of going one word beyond.  The requirement
+  // for an empty word beyond the numerator is non-obvious and easy
+  // to screw up.
+  assert((numerator.msw < denominator.lsw) || (denominator.msw < numerator.lsw));
+  mp_t quotient = {numerator.msw + 1, numerator.msw + 1}; // Quotient will overwrite numerator
+  int iterations = (int)((numerator.msw - numerator.lsw) - (denominator.msw - denominator.lsw));
+  for (int j = 0; j < iterations; ++j) {
+    // D3. Trial division of high-order bits
+    mp_word_t qhat;
+    mp_dword_t numerator2 =
+      ((mp_dword_t)numerator.msw[-1] << mp_word_bits)
+      + numerator.msw[-2];
+    if (numerator.msw[-1] == denominator.msw[-1]) {
+      qhat = MP_WORD_MAX;
+    } else {
+      qhat = (mp_word_t)(numerator2 / denominator.msw[-1]);
+    }
+    while (1) {
+      mp_dword_t r = numerator2 - qhat * (mp_dword_t)denominator.msw[-1];
+      if (r <= MP_WORD_MAX
+          && ((denominator.msw[-2] * (mp_dword_t)qhat) > (numerator.msw[-3] + (r << mp_word_bits)))) {
+        qhat -= 1;
+      } else {
+        break;
+      }
+    }
+
+    // D4. numerator -= qhat * denominator
+    mp_dword_t t = 0;
+    for (mp_word_t *den = denominator.lsw,
+           *num = numerator.msw - (denominator.msw - denominator.lsw) - 1;
+         den < denominator.msw;
+         num += 1, den += 1) {
+      t += qhat * (mp_dword_t)*den;
+      unsigned borrow = *num < (mp_word_t)t;
+      *num -= (mp_word_t)t;
+      t >>= mp_word_bits;
+      t += borrow;
+    }
+
+    // D5/D6. qhat may have been one too high; if so, correct for that
+    // Per Knuth, this happens very infrequently
+    if (numerator.msw[-1] < t) {
+      qhat -= 1;
+      t = 0;
+      for (mp_word_t *den = denominator.lsw,
+             *num = numerator.msw - (denominator.msw - denominator.lsw) - 1;
+           den < denominator.msw;
+           num += 1, den += 1) {
+        t += *num + (mp_dword_t)*den;
+        *num = (mp_word_t)t;
+        t >>= mp_word_bits;
+      }
+    }
+    --quotient.lsw;
+    *quotient.lsw = qhat;
+
+    // D7. Iterate
+    numerator.msw -= 1;
+  }
+
+  // D8. Post-process the remainder
+  mp_word_t remainderHash = 0;
+  for (mp_word_t *num = numerator.lsw; num < numerator.msw; ++num) {
+    remainderHash |= *num;
+  }
+  *nonZeroRemainder = remainderHash != 0;
+
+  while (quotient.msw[-1] == 0) {
+    quotient.msw -= 1;
+  }
+  return quotient;
+}
+
+// ================================================================
+// ================================================================
+//
+// Parse state
+//
+// ================================================================
+// ================================================================
+
+// To reduce argument-passing overhead, we store all the state
+// in this struct and pass around pointers to it.
+struct parseInfo {
+  // ================================================================
+  // Basic parameters for the target FP format
+
+  // Number of significant bits (including hidden bit for IEEE formats)
+  int sigBits;
+  // Minimum/maximum binary exponents
+  int minBinaryExp;
+  int maxBinaryExp;
+  // Total number of bytes in this format
+  int bytes;
+  // Approximate bounds on the decimal exponent, used for
+  // early overflow/underflow checks.
+  int minDecimalExp;
+  int maxDecimalExp;
+  // Maximum number of significant digits in the full
+  // decimal representation of any exact midpoint.
+  int maxDecimalMidpointDigits;
+
+  // ================================================================
+  // Inputs
+
+  // Location where the value should be written
+  unsigned char *dest;
+  // First character of the provided string
+  const char *start;
+  // Where to put the address of the first unparsed character
+  char **end;
+  // Locale
+  strtofp_locale_t loc;
+
+  // ================================================================
+  // Outputs from the initial parse
+
+  // First 19 digits of significand as an integer
+  uint64_t digits;
+  // Total number of significand digits (ignoring decimal point)
+  int digitCount;
+  // Address of the 20th digit or NULL if digitCount < 20
+  const unsigned char *firstUnparsedDigit;
+  // Number of unparsed digits = max(digitCount - 19, 0)
+  int unparsedDigitCount;
+  // Decimal exponent, corrected for decimal point location
+  int base10Exponent;
+  // True if number is negative
+  bool negative;
+};
+
+// ================================================================
+// ================================================================
+//
+// Over/Underflow
+//
+// ================================================================
+// ================================================================
+
+// Store a suitably-signed infinity to the destination
+static void infinity(struct parseInfo *info) {
+  // 16/32/64-bit formats we can hardcode the full value
+  // and memcpy() it.  This is endian-safe (assuming that
+  // integers and FP values have the same endianness).
+  switch (info->bytes) {
+#if ENABLE_BINARY16_SUPPORT
+  case 2: { // binary16
+    uint16_t raw = info->negative ? 0xfc00 : 0x7c00;
+    memcpy(info->dest, &raw, sizeof(raw));
+    return;
+  }
+#endif
+#if ENABLE_BINARY32_SUPPORT
+  case 4: { // binary32
+    uint32_t raw = info->negative ? 0xff800000UL : 0x7f800000UL;
+    memcpy(info->dest, &raw, sizeof(raw));
+    return;
+  }
+#endif
+#if ENABLE_BINARY64_SUPPORT
+  case 8: { // binary64
+    uint64_t raw = info->negative ? 0xfff0000000000000ULL : 0x7ff0000000000000ULL;
+    memcpy(info->dest, &raw, sizeof(raw));
+    return;
+  }
+#endif
+  default:
+    break;
+  }
+
+  // 80- and 128-bit formats we build up incrementally.
+  // TODO: Support big-endian.
+  memset(info->dest, 0, (size_t)info->bytes);
+  switch(info->bytes) {
+#if ENABLE_FLOAT80_SUPPORT
+  case 10: // float80
+    info->dest[7] = 0x80;
+    info->dest[8] = 0xff;
+    info->dest[9] = info->negative ? 0xff : 0x7f;
+#endif
+#if ENABLE_BINARY128_SUPPORT
+  case 16: // binary128
+    info->dest[14] = 0xff;
+    info->dest[15] = info->negative ? 0xff : 0x7f;
+    break;
+#endif
+  }
+}
+
+// Store the max normal value to the destination, suitably signed.
+static void max_value(struct parseInfo *info) {
+  switch (info->bytes) {
+#if ENABLE_BINARY16_SUPPORT
+  case 2: { // binary16
+    uint16_t raw = info->negative ? 0xfbff : 0x7bff;
+    memcpy(info->dest, &raw, sizeof(raw));
+    return;
+  }
+#endif
+#if ENABLE_BINARY32_SUPPORT
+  case 4: { // binary32
+    uint32_t raw = info->negative ? 0xff7fffffUL : 0x7f7fffffUL;
+    memcpy(info->dest, &raw, sizeof(raw));
+    return;
+  }
+#endif
+#if ENABLE_BINARY64_SUPPORT
+  case 8: { // binary64
+    uint64_t raw = info->negative ? 0xffefffffffffffffULL : 0x7fefffffffffffffULL;
+    memcpy(info->dest, &raw, sizeof(raw));
+    return;
+  }
+#endif
+  default:
+    break;
+  }
+
+  // 80- and 128-bit formats we build up incrementally.
+  // TODO: Support big-endian.
+  memset(info->dest, 0xff, info->bytes);
+  switch(info->bytes) {
+#if ENABLE_FLOAT80_SUPPORT
+  case 10: // float80
+    info->dest[7] = 0xff;
+    info->dest[8] = 0xfe;
+    info->dest[9] = info->negative ? 0xff : 0x7f;
+#endif
+#if ENABLE_BINARY128_SUPPORT
+  case 16: // binary128
+    info->dest[14] = 0xfe;
+    info->dest[15] = info->negative ? 0xff : 0x7f;
+    break;
+#endif
+  }
+}
+
+// This gets invoked for inputs whose magnitude is greater
+// than the max normal value + 1 ulp.
+// Gdtoa returns signed INF for such values regardless of the
+// rounding mode.  Glibc rounds them correctly.
+static void overflow(struct parseInfo *info) {
+  // Overflow is always an ERANGE error
+  errno = ERANGE;
+  int roundingMode = FENV_ROUNDING_MODE();
+  int negative = info->negative;
+
+  if (0
+#ifdef FE_TOWARDZERO
+      || roundingMode == FE_TOWARDZERO
+#endif
+#ifdef FE_DOWNWARD
+      || (roundingMode == FE_DOWNWARD && !negative)
+#endif
+#ifdef FE_UPWARD
+      || (roundingMode == FE_UPWARD && negative)
+#endif
+      ) {
+    max_value(info);
+  } else {
+    infinity(info);
+  }
+}
+
+// This gets invoked for inputs that are nonzero, but closer to zero
+// than to the least positive or negative subnormal.  For IEEE754
+// formats (and Intel x87 Extended format), these get returned as
+// either +/- zero or the least subnormal, depending on the current
+// rounding mode.  In all of these cases, the bit pattern is all zero
+// bits except possibly for the top bit (sign) and bottom bit (either zero for
+// zero or 1 for the least subnormal).
+static void underflow(struct parseInfo *info) {
+  // Note: C17 allows implementations to set ERANGE for underflow or not.
+  // Traditionally, ERANGE has been set somewhat inconsistently:
+  // gdtoa sets it for subnormals expressed in decimal format but not
+  // hexadecimal; glibc uses ERANGE as an inexact flag in various cases.
+
+  // To generally match traditional usage, this implementation sets ERANGE
+  // for any subnormal return.
+  // It also sets it when a non-zero input is rounded to zero.  This provides
+  // a way for clients to distinguish a true zero (such as "0.0e0") from a
+  // very small non-zero (such as "1e-999999").
+  errno = ERANGE;
+  uint8_t bottomBit = 0;
+  int roundingMode = FENV_ROUNDING_MODE();
+  if (0
+#ifdef FE_DOWNWARD
+      || (roundingMode == FE_DOWNWARD && info->negative)
+#endif
+#ifdef FE_UPWARD
+      || (roundingMode == FE_UPWARD && !info->negative)
+#endif
+    ) {
+    bottomBit = 1;
+  }
+
+  // 16/32/64-bit formats we can hardcode the full value
+  // and memcpy() it.  This is endian-safe (assuming that
+  // integers and FP values have the same endianness).
+  switch (info->bytes) {
+#if ENABLE_BINARY16_SUPPORT
+  case 2: { // binary16
+    uint16_t raw = (info->negative ? 0x8000 : 0) | bottomBit;
+    memcpy(info->dest, &raw, sizeof(raw));
+    break;
+  }
+#endif
+#if ENABLE_BINARY32_SUPPORT
+  case 4: { // binary32
+    uint32_t raw = (info->negative ? 0x80000000UL : 0) | bottomBit;
+    memcpy(info->dest, &raw, sizeof(raw));
+    break;
+  }
+#endif
+#if ENABLE_BINARY64_SUPPORT
+  case 8: { // binary64
+    uint64_t raw = (info->negative ? 0x8000000000000000ULL : 0) | bottomBit;
+    memcpy(info->dest, &raw, sizeof(raw));
+    break;
+  }
+#endif
+#if ENABLE_FLOAT80_SUPPORT || ENABLE_BINARY128_SUPPORT
+  case 10: case 16: {
+    // TODO: Make this endian-safe
+    memset(info->dest, 0, (size_t)info->bytes); // Initialize to +0
+    info->dest[0] = bottomBit;
+    info->dest[info->bytes - 1] = info->negative ? 0x80 : 0;
+    break;
+  }
+#endif
+  }
+}
+
+// ================================================================
+// ================================================================
+//
+// General slow path
+//
+// ================================================================
+// ================================================================
+
+
+// Given a pointer to an ASCII digit string, initialize
+// the mpint with the decimal value of those digits.
+//
+// Any non-digit characters (e.g., decimal points) are ignored.
+//
+// If there are more than maxDecimalMidpointDigits, then
+// we stop when we've reached that many digits and then
+// add exactly one digit: zero if all the remaining digits
+// are zero, else one.
+//
+// Arguments:
+//  work - pointer to the mpint
+//  info - parseInfo struct
+static void initMPFromDigits(mp_t *work, struct parseInfo *info) {
+  // Break first19 digits into words
+  mp_word_t *component = work->lsw;
+  uint64_t first19 = info->digits;
+  while (first19 > 0) {
+    *component = (mp_word_t)first19;
+    first19 >>= mp_word_bits;
+    component += 1;
+  }
+  work->msw = component;
+
+  // Figure out how many more digits we should parse,
+  // "extra" digits are the ones beyond maxDecimalMidpointDigits.
+  int remainingDigitCount = info->unparsedDigitCount;
+  int extraDigitCount = 0;
+  if (info->digitCount > info->maxDecimalMidpointDigits) {
+    remainingDigitCount
+      = info->maxDecimalMidpointDigits - (info->digitCount - info->unparsedDigitCount);
+    extraDigitCount = info->unparsedDigitCount - remainingDigitCount;
+  }
+
+  // Handle remaining digits in batches of <= 9 digits
+  static const uint32_t powersOfTen[] = {1, 10, 100, 1000, 10000UL,
+    100000UL, 1000000UL, 10000000UL, 100000000UL, 1000000000UL};
+  const unsigned char *p = info->firstUnparsedDigit;
+  while (remainingDigitCount > 0) {
+    int batchSize = remainingDigitCount > 9 ? 9 : remainingDigitCount;
+    uint64_t batch = 0;
+    for (int i = 0; i < batchSize; i++, p++) {
+      unsigned t = (unsigned)(*p - '0');
+      while (t > 9) {
+        p += 1; // Skip non-digits (decimal point)
+        t = (unsigned)(*p - '0');
+      }
+      batch = batch * 10 + t;
+    }
+    multiplyMPByN(work, powersOfTen[batchSize]);
+    addToMP(work, batch);
+    remainingDigitCount -= batchSize;
+  }
+
+  // Extra digits are summarized in a single digit:
+  // zero if all the extra digits are zero, else one.
+  if (extraDigitCount > 0) {
+    multiplyMPByN(work, 10);
+    while (extraDigitCount > 0) {
+      if (*p == '0') {
+        extraDigitCount -= 1;
+      } else if (*p >= '1' && *p <= '9') {
+        addToMP(work, 1);
+        return;
+      }
+      // Note: Non-digit (decimal point) chars are ignored
+      p += 1;
+    }
+  }
+}
+
+// This takes a significand and power of ten and returns the high-order
+// bits of the product (correctly rounded) using exact varint arithmetic.
+// This is the common slowpath used by all of the parsers below.
+
+// General strategy: For positive exponents, the significand is
+// multiplied by the power of 10 to form a large varint.  The
+// high-order bits (suitably rounded) are returned as the binary
+// significand.  The number of bits are returned as the
+// resultExponent.  For negative exponents, the significand is divided
+// by the power of ten with suitable scaling to ensure the quotient
+// will have sufficient bits to return a correctly-rounded binary significand.
+
+// This takes a pointer to a stack work area; that allows each
+// format-specific implementation to pass a buffer that is
+// suitably-sized.  For binary16/32/64, the passed-in buffer is sized
+// so that we will never have to allocate a larger buffer on the heap,
+// and those versions pass `false` for `heapAllocOK` to assert this.
+// For float80/binary128, the buffer is large enough for common
+// requests; larger inputs may require us to allocate a temporary
+// stack buffer for the duration of this call.
+
+// Arguments:
+//  info - parseInfo struct with format details, original request
+//     information, and parser output
+//  roundingMode - result from fegetround()
+//  stackWorkBuffer - pointer to an on-stack work area
+//  stackWorkBufferBytes - size of work area in mp_word_t
+//  heapAllocOK - if zero, we'll assert on any heap allocation
+static void generalSlowpath(struct parseInfo *info,
+                            int roundingMode,
+                            mp_word_t *stackWorkArea,
+                            int stackWorkAreaWords,
+                            bool heapAllocOK)
+{
+  mp_t mpSignificand;
+  int binaryExponent;
+
+  // If stackWorkArea is not big enough, space will be allocated
+  // on the heap that will need to be released before we return.
+  mp_word_t *heapAlloc = NULL;
+
+  // We'll process `digitCount` digits but no more than
+  // maxDecimalMidpointDigits + 1.  (See initMPWithDigits above)
+  int significandDigits = info->digitCount;
+  if (significandDigits > info->maxDecimalMidpointDigits) {
+    significandDigits = info->maxDecimalMidpointDigits + 1;
+  }
+  int base10Exponent = info->base10Exponent - significandDigits + info->digitCount;
+
+  // Figure out how many words we need for the significand and power of 5
+  // 1701 / 512 is slightly bigger than log2(10) ~= 3.32
+  // 1189 / 512 is slightly bigger than log2(5) ~= 2.32
+  int significandBitsNeeded = (significandDigits * 1701) >> 9;
+  int significandWordsNeeded = (significandBitsNeeded + (mp_word_bits - 1)) / mp_word_bits;
+
+  int exponentBitsNeeded = (((base10Exponent < 0 ? -base10Exponent : base10Exponent) + 1) * 1189) >> 9;
+  int exponentWordsNeeded = (exponentBitsNeeded + (mp_word_bits - 1)) / mp_word_bits;
+
+  if (base10Exponent >= 0) {
+    // ================================================================
+    // Slow path for exponent > 0
+    int totalWordsNeeded = significandWordsNeeded + exponentWordsNeeded;
+    mp_t workMP;
+    if (totalWordsNeeded <= stackWorkAreaWords) {
+      memset(stackWorkArea, 0, (size_t)stackWorkAreaWords * sizeof(mp_word_t));
+      workMP.lsw = workMP.msw = stackWorkArea;
+    } else {
+      assert(heapAllocOK);
+      heapAlloc = (mp_word_t *)calloc((size_t)totalWordsNeeded, sizeof(mp_word_t));
+      if (heapAlloc == NULL) {
+        memset(info->dest, 0, (size_t)info->bytes);
+        return;
+      }
+      workMP.lsw = workMP.msw = heapAlloc;
+    }
+
+    // Load workMP with digits * 5^n
+    initMPFromDigits(&workMP, info);
+    multiplyByFiveToTheN(&workMP, base10Exponent);
+    assert((workMP.msw - workMP.lsw) <= totalWordsNeeded);
+
+    // Bit count => binary exponent
+    int bits = bitCountMP(workMP);
+    binaryExponent = bits + base10Exponent; // Factor of 2^n from above
+    // Leading bits => significand
+    mpSignificand = shiftRightMPWithRounding(workMP,
+      bits - info->sigBits, 0, info->negative, roundingMode);
+    // Adjust the exponent if a round-up gave us one too many bits
+    if (bitCountMP(mpSignificand) > info->sigBits) {
+      binaryExponent += 1;
+      mpSignificand = shiftRightMPWithTruncation(mpSignificand, 1);
+    }
+    if (binaryExponent > info->maxBinaryExp) {
+      // Overflow.
+      free(heapAlloc);
+      overflow(info);
+      return;
+    }
+  } else {
+    // ================================================================
+    // Slow path for exponent < 0
+
+    // Basic idea: Since base10Exponent is negative, we can't work
+    // directly with 10^base10Exponent (it's an infinite binary
+    // fraction), but 10^-base10Exponent is an integer.  So we use
+    // varint arithmetic to compute
+    //    digits / 10^(-base10Exponent)
+    // scaling the numerator so that the quotient will have at least
+    // 53 bits. The tricky part is keeping the right information to do
+    // accurate rounding of the result.
+
+    // Widen numerator so the result after division will have enough bits to round
+    int numeratorBitsNeeded = significandBitsNeeded;
+    if (numeratorBitsNeeded < exponentBitsNeeded + info->sigBits + 2) {
+      numeratorBitsNeeded = exponentBitsNeeded + info->sigBits + 2;
+    }
+    int numeratorWordsNeeded = (numeratorBitsNeeded + (mp_word_bits - 1)) / mp_word_bits + 2;
+    int denominatorWordsNeeded = exponentWordsNeeded;
+    int totalWordsNeeded = numeratorWordsNeeded + denominatorWordsNeeded;
+
+    mp_word_t *work;
+    if (totalWordsNeeded <= stackWorkAreaWords) {
+      memset(stackWorkArea, 0, stackWorkAreaWords * sizeof(mp_word_t));
+      work = stackWorkArea;
+    } else {
+      assert(heapAllocOK);
+      heapAlloc = (mp_word_t *)calloc((size_t)totalWordsNeeded, sizeof(mp_word_t));
+      if (heapAlloc == NULL) {
+        memset(info->dest, 0, (size_t)info->bytes);
+        return;
+      }
+      work = heapAlloc;
+    }
+
+    mp_t numerator = {work, work};
+    mp_t denominator = {work + numeratorWordsNeeded, work + numeratorWordsNeeded};
+
+    // Denominator holds power of 10^N
+    // (Actually 5^N, the remaining factor of 2^N is handled later.)
+    fiveToTheN(&denominator, -base10Exponent);
+    assert((denominator.msw - denominator.lsw) <= denominatorWordsNeeded);
+    assert(denominator.msw[-1] != 0);
+
+    // Populate numerator with digits, widen it to ensure final
+    // quotient has at least sigBits + 2 bits.
+    initMPFromDigits(&numerator, info);
+    assert(numerator.msw[-1] != 0);
+    int numeratorShift = bitCountMP(denominator) - bitCountMP(numerator) + info->sigBits + 2;
+    if (numeratorShift > 0) {
+      shiftLeftMP(&numerator, numeratorShift);
+      assert(numerator.msw[-1] != 0);
+      assert((numerator.msw - numerator.lsw) < numeratorWordsNeeded);
+    } else {
+      numeratorShift = 0;
+    }
+
+    // Divide, compute exact binaryExponent
+    // Note: division is destructive; overwrites numerator with quotient
+    int remainderNonZero;
+    mp_t quotient = divideMPByMP(numerator, denominator, &remainderNonZero);
+    // Binary exponent starts from number of bits in quotient
+    int quotientBits = bitCountMP(quotient);
+    binaryExponent = quotientBits;
+    // 2^base10Exponent was omitted from the 10^N denominator above
+    binaryExponent += base10Exponent;
+    // We multiplied by 2^numeratorShift above, so divide by
+    // 2^numeratorShift to cancel it out.
+    binaryExponent -= numeratorShift;
+
+    if (binaryExponent > info->minBinaryExp) {
+      // Normal decimal
+      mpSignificand = shiftRightMPWithRounding(quotient,
+        quotientBits - info->sigBits, remainderNonZero, info->negative, roundingMode);
+      if (bitCountMP(mpSignificand) > info->sigBits) {
+        binaryExponent += 1;
+        mpSignificand = shiftRightMPWithTruncation(mpSignificand, 1);
+      }
+      if (binaryExponent > info->maxBinaryExp) {
+        free(heapAlloc);
+        overflow(info);
+        return;
+      }
+    } else if (binaryExponent > info->minBinaryExp - info->sigBits) {
+      // Subnormal decimal
+      int bitsNeeded = binaryExponent - (info->minBinaryExp - info->sigBits + 1);
+      binaryExponent = info->minBinaryExp;
+      mpSignificand = shiftRightMPWithRounding(quotient,
+        quotientBits - bitsNeeded, remainderNonZero, info->negative, roundingMode);
+
+      // Usually, overflowing the expected number of bits doesn't
+      // break anything; it just results in a significand 1 bit longer
+      // than we expected.
+
+      // Except when the significand overflows into the exponent.
+      // Then we have a normal, so the extra overflow bit
+      // will naturally get dropped, we just have to bump the
+      // exponent.
+      if (bitCountMP(mpSignificand) >= info->sigBits) {
+        binaryExponent += 1;
+      } else {
+        errno = ERANGE; // This will be a true subnormal return, set ERANGE
+      }
+    } else {
+      // Underflow.
+      free(heapAlloc);
+      underflow(info);
+      return;
+    }
+  }
+
+  // Zero-extend to sigBits and copy to dest
+  size_t mpWords = (size_t)(mpSignificand.msw - mpSignificand.lsw);
+  size_t expectedWords = (size_t)((info->sigBits + mp_word_bits - 1) / mp_word_bits);
+  if (mpWords < expectedWords) {
+    memset(mpSignificand.lsw + mpWords, 0, (expectedWords - mpWords) * sizeof(mp_word_t));
+  }
+  // TODO: Endianness.  This only works for little-endian systems.
+  memcpy(info->dest, mpSignificand.lsw, ((unsigned)info->sigBits + 7) / 8);
+  // Free the heap work area (if any)
+  free(heapAlloc);
+
+  // Set the exponent & sign bits
+  uint16_t exponentBits = (uint16_t)(binaryExponent - info->minBinaryExp);
+  if (info->bytes <= 8) {
+    // float80 and binary128 have 16 bit exponent+sign, so no shift needed
+    exponentBits <<= 16 - (info->bytes * 8 - info->sigBits + 1);
+  }
+  exponentBits |= info->negative ? 0x8000 : 0;
+
+  unsigned char *p = info->dest + info->bytes;
+  switch (info->bytes) {
+  case 2:
+    p[-1] = (info->dest[1] & 0x03) | (unsigned char)(exponentBits >> 8);
+    return;
+  case 4:
+    p[-2] = (info->dest[2] & 0x7f) | (unsigned char)exponentBits;
+    break;
+  case 8:
+    p[-2] = (info->dest[6] & 0x0f) | (unsigned char)exponentBits;
+    break;
+  case 10:
+  case 16:
+    p[-2] = (unsigned char)exponentBits;
+    break;
+  }
+  p[-1] = (unsigned char)(exponentBits >> 8);
+}
+
+// ================================================================
+// ================================================================
+//
+// Hex Float parsing
+//
+// ================================================================
+// ================================================================
+
+// This is called with `start` pointing to the `0x` that begins
+// the hex float.  We assume the first two characters have already
+// been verified to be '0x'.
+static void
+hexFloat(const unsigned char *start, struct parseInfo *info) {
+  const unsigned char *p = start + 2; // Skip leading '0x'
+
+  // Two 64-bit ints that we use as a joint 128-bit accumulator
+  uint64_t significand_lsw = 0, significand_msw = 0;
+
+  // Digits before the decimal point
+  const unsigned char *firstDigit = p;
+  unsigned remainder = 0;
+  int base2Exponent = 0;
+  // Perf: Just use the lower 64 bits until it's full...
+  while (hexdigit[*p] < 16 && significand_lsw < (uint64_t)1 << 60) {
+    significand_lsw <<= 4;
+    significand_lsw += hexdigit[*p];
+    p += 1;
+  }
+  // ... then work with the full 128 bits until it's full ...
+  while (hexdigit[*p] < 16 && significand_msw < (uint64_t)1 << 59) {
+    significand_msw <<= 4;
+    significand_msw |= (significand_lsw >> 60);
+    significand_lsw <<= 4;
+    significand_lsw += hexdigit[*p];
+    p += 1;
+  }
+  // ... if there's more beyond that, just track whether it's non-zero.
+  while (hexdigit[*p] < 16) {
+    remainder |= hexdigit[*p];
+    base2Exponent += 4;
+    p += 1;
+  }
+  int digitCount = (int)(p - firstDigit);
+
+  // Try to match decimal point
+  if (info->loc == strtofp_C_locale) {
+    if (*p == '.') {
+      p += 1;
+    } else {
+      goto possible_exponent;
+    }
+  } else {
+    const unsigned char *decimalPoint = strtofp_locale_decimal_point(info->loc);
+    const unsigned char *startOfPotentialDecimalPoint = p;
+    for (const unsigned char *d = decimalPoint; *d; d++) {
+      if (*p != *d) {
+        p = startOfPotentialDecimalPoint;
+        goto possible_exponent;
+      }
+      p++;
+    }
+  }
+
+  // Collect digits after decimal point
+  const unsigned char *firstFractionDigit = p;
+  if (significand_msw == 0) {
+    while (hexdigit[*p] < 16 && significand_lsw < (uint64_t)1 << 60) {
+      significand_lsw <<= 4;
+      significand_lsw += hexdigit[*p];
+      p += 1;
+    }
+  }
+  while (hexdigit[*p] < 16 && significand_msw < (uint64_t)1 << 59) {
+    significand_msw <<= 4;
+    significand_msw |= (significand_lsw >> 60);
+    significand_lsw <<= 4;
+    significand_lsw += hexdigit[*p];
+    p += 1;
+  }
+  // Initialize exponent from the number of digits after
+  // the decimal point.
+  base2Exponent -= 4 * (p - firstFractionDigit);
+  // Any remaining digits may impact rounding...
+  while (hexdigit[*p] < 16) {
+    remainder |= hexdigit[*p];
+    p += 1;
+  }
+  digitCount += p - firstFractionDigit;
+
+ possible_exponent:
+  if (*p == 'p' || *p == 'P') {
+    const unsigned char *exponentPhrase = p;
+    p += 1;
+    int negativeExponent = 0;
+    if (*p == '-') {
+      negativeExponent = 1;
+      p += 1;
+    } else if (*p == '+') {
+      p += 1;
+    }
+    if (*p < '0' || *p > '9') {
+      // Ignore 'e' or 'E' not followed by number.
+      p = exponentPhrase;
+    } else {
+      // Skip zeros in "0x1p+0000000000000000000000000001"
+      int exp = 0;
+      unsigned t = (unsigned)(*p - '0');
+      while (t < 10) {
+        if (exp > 99999999) {
+          exp = 99999999;
+        } else {
+          exp = exp * 10 + (int)t;
+        }
+        p += 1;
+        t = (unsigned)(*p - '0');
+      }
+      if (negativeExponent) {
+        exp = -exp;
+      }
+      base2Exponent += exp;
+    }
+  }
+
+  if (significand_lsw == 0 && significand_msw == 0) {
+    if (digitCount == 0) {
+      // Malformed hexfloat with no digits after '0x',
+      // BUT '0x' is still a valid zero followed by non-parsed 'x'
+      p = start + 1; // Address of 'x'
+    } else {
+      // Just a regular zero
+    }
+    base2Exponent = info->minBinaryExp;
+  } else {
+    // Normalize to 127 bits
+    if (significand_msw == 0) {
+      if ((significand_lsw >> 63) == 0) {
+        significand_msw = significand_lsw;
+        significand_lsw = 0;
+        base2Exponent -= 64;
+      } else {
+        significand_msw = significand_lsw >> 1;
+        significand_lsw <<= 63;
+        base2Exponent -= 63;
+      }
+    }
+    int normalizeShift = __builtin_clzll(significand_msw) - 1;
+    if (normalizeShift > 0) {
+      significand_msw <<= normalizeShift;
+      significand_msw |= significand_lsw >> (64 - normalizeShift);
+      significand_lsw <<= normalizeShift;
+      base2Exponent -= normalizeShift;
+    }
+    base2Exponent += 127;
+    if (remainder) significand_lsw |= 1;
+
+    if (base2Exponent <= info->maxBinaryExp
+        && base2Exponent >= info->minBinaryExp - info->sigBits + 1) {
+      int fractionBits;
+      uint64_t fraction;
+      if (base2Exponent > info->minBinaryExp) {
+        // Hexfloat normal
+        fractionBits = 127 - info->sigBits;
+      } else {
+        fractionBits = 127 - (base2Exponent - info->minBinaryExp + info->sigBits - 1);
+        base2Exponent = info->minBinaryExp;
+      }
+      if (fractionBits < 64) {
+        fraction = significand_lsw << (64 - fractionBits);
+        significand_lsw >>= fractionBits;
+        significand_lsw |= significand_msw << (64 - fractionBits);
+        significand_msw >>= fractionBits;
+      } else {
+        fraction = (significand_msw << (128 - fractionBits))
+          | (significand_lsw >> (fractionBits - 64));
+        if ((significand_lsw << (128 - fractionBits)) != 0) {
+          fraction |= 1;
+        }
+        significand_lsw = significand_msw >> (fractionBits - 64);
+        significand_msw = 0;
+      }
+
+      switch (FENV_ROUNDING_MODE()) {
+#ifdef FE_TOWARDZERO
+      case FE_TOWARDZERO:
+        break;
+#endif
+#ifdef FE_DOWNWARD
+      case FE_DOWNWARD:
+        if (info->negative && (fraction != 0)) {
+          significand_lsw += 1;
+          if (significand_lsw == 0) {
+            significand_msw += 1;
+          }
+        }
+        break;
+#endif
+#ifdef FE_UPWARD
+      case FE_UPWARD:
+        if (!info->negative && (fraction != 0)) {
+          significand_lsw += 1;
+          if (significand_lsw == 0) {
+            significand_msw += 1;
+          }
+        }
+        break;
+#endif
+#ifdef FE_TONEAREST
+      case FE_TONEAREST:
+#endif
+      default: {
+        const uint64_t oneHalf = (uint64_t)1 << 63;
+        if (fraction > oneHalf
+            || (fraction == oneHalf && (significand_lsw & 1))) {
+          significand_lsw += 1;
+          if (significand_lsw == 0) {
+            significand_msw += 1;
+          }
+        }
+        break;
+      }
+      }
+
+      // Rounding up may have caused us to overflow:
+      // For subnormals, overflow to sigBits converts this to a normal
+      // For normal, overflow just needs to be renormalized
+      int overflowBits = (base2Exponent == info->minBinaryExp) ? (info->sigBits - 1) : info->sigBits;
+      if (((overflowBits > 64) && ((significand_msw >> (overflowBits - 64)) != 0))
+          || ((overflowBits == 64) && (significand_msw != 0))
+          || ((overflowBits < 64) && ((significand_msw != 0) || ((significand_lsw >> overflowBits) != 0)))) {
+        if (base2Exponent > info->minBinaryExp) {
+          significand_lsw >>= 1;
+          significand_lsw |= significand_msw << 63;
+          significand_msw >>= 1;
+        }
+        base2Exponent += 1;
+      } else if (base2Exponent == info->minBinaryExp // Subnormal
+                 && fraction != 0) { // Not exact
+        errno = ERANGE;
+      }
+    }
+  }
+
+  if (info->end) *info->end = (char *)(uintptr_t)p;
+  if (base2Exponent > info->maxBinaryExp) {
+    overflow(info);
+  } else if (base2Exponent < info->minBinaryExp - info->sigBits + 1) {
+    underflow(info);
+  } else {
+    switch (info->bytes) {
+#if ENABLE_BINARY16_SUPPORT
+    case 2: {
+      uint16_t exponentBits = (uint16_t)(base2Exponent - info->minBinaryExp);
+      uint16_t raw = (uint16_t)
+        ((info->negative ? 0x8000U : 0U)
+        | (uint16_t)(exponentBits << 10)
+        | (significand_lsw & 0x3ffU));
+      memcpy(info->dest, &raw, sizeof(raw));
+      break;
+    }
+#endif
+#if ENABLE_BINARY32_SUPPORT
+    case 4: {
+      uint32_t exponentBits = (uint32_t)(base2Exponent - info->minBinaryExp);
+      uint32_t raw = (uint32_t)
+        ((info->negative ? 0x80000000UL : 0UL)
+        | (exponentBits << 23U)
+        | (uint32_t)(significand_lsw & 0x7fffffULL));
+      memcpy(info->dest, &raw, sizeof(raw));
+      break;
+    }
+#endif
+#if ENABLE_BINARY64_SUPPORT
+    case 8: {
+      uint64_t exponentBits = (uint64_t)(base2Exponent - info->minBinaryExp);
+      uint64_t raw = (uint64_t)
+        ((info->negative ? 0x8000000000000000ULL : 0UL)
+        | (exponentBits << 52U)
+        | (significand_lsw & 0xfffffffffffffULL));
+      memcpy(info->dest, &raw, sizeof(raw));
+      break;
+    }
+#endif
+#if ENABLE_FLOAT80_SUPPORT
+    case 10: {
+      // TODO: Support big-endian
+      uint16_t exponentBits = (uint16_t)(base2Exponent - info->minBinaryExp);
+      memcpy(info->dest, &significand_lsw, sizeof(significand_lsw));
+      info->dest[8] = exponentBits & 0xff;
+      info->dest[9] = (exponentBits >> 8) | (info->negative ? 0x80 : 0);
+      break;
+    }
+#endif
+#if ENABLE_BINARY128_SUPPORT
+    case 16: {
+      // TODO: Support big-endian
+      uint16_t exponentBits = (uint16_t)(base2Exponent - info->minBinaryExp);
+      memcpy(info->dest, &significand_lsw, sizeof(significand_lsw));
+      memcpy(info->dest + 8, &significand_msw, sizeof(significand_msw));
+      info->dest[14] = exponentBits & 0xff;
+      info->dest[15] = (exponentBits >> 8) | (info->negative ? 0x80 : 0);
+      break;
+    }
+#endif
+    }
+  }
+}
+
+// ================================================================
+// ================================================================
+//
+// NaN parsing
+//
+// ================================================================
+// ================================================================
+
+// Parse a NaN.
+// This implements the same logic as Apple's previous libc strtod.  It
+// recognizes an optional payload between parentheses and uses
+// that to construct a valid NaN return value.
+
+static void parseNan(const unsigned char *start, struct parseInfo *info) {
+  const unsigned char *p = start + 3; // Skip "nan"
+  unsigned char _Alignas(sizeof(mp_word_t)) stackWorkArea[20];
+  memset(stackWorkArea, 0, sizeof(stackWorkArea));
+  const unsigned char *endNan = p;
+  if (*p == '(') {
+    p += 1;
+    unsigned base = 10;
+    if (*p == '0') {
+      p += 1;
+      if (*p == 'x') {
+        base = 16;
+        p += 1;
+      } else {
+        base = 8;
+      }
+    }
+    mp_t stackMP = { (mp_word_t *)stackWorkArea,
+      (mp_word_t *)(stackWorkArea + 16) };
+    mp_t payload = stackMP;
+    while (hexdigit[*p] < base) {
+      multiplyMPByN(&payload, base);
+      addToMP(&payload, hexdigit[*p]);
+      payload.msw = stackMP.msw; // Prune off excess bits
+      p += 1;
+    }
+    if (*p == ')') {
+      p += 1;
+    } else {
+      memset(stackWorkArea, 0, sizeof(stackWorkArea));
+      while (*p != '\0' && *p != ')') {
+        p += 1;
+      }
+      if (*p == ')') {
+        p += 1;
+      } else {
+        p = endNan;
+      }
+    }
+  }
+  // TODO: Endianness.  This only works for little-endian.
+  memcpy(info->dest, stackWorkArea, (size_t)info->bytes);
+  switch (info->bytes) {
+#if ENABLE_BINARY16_SUPPORT
+  case 2: {
+    info->dest[1] = info->dest[1] | (info->negative ? 0xfe : 0x7e);
+    break;
+  }
+#endif
+#if ENABLE_BINARY32_SUPPORT
+  case 4: {
+    info->dest[2] |= 0xc0; // Set quiet bit and low-order exponent bit
+    info->dest[3] = info->negative ? 0xff : 0x7f; // exponent and sign bit
+    break;
+  }
+#endif
+#if ENABLE_BINARY64_SUPPORT
+  case 8: {
+    info->dest[6] |= 0xf8; // Set quiet bit and low-order exponent bits
+    info->dest[7] = info->negative ? 0xff : 0x7f; // exponent and sign bit
+    break;
+  }
+#endif
+#if ENABLE_FLOAT80_SUPPORT
+  case 10: {
+    info->dest[7] |= 0xc0; // Set bit 63 and quiet bit
+    info->dest[8] = 0xff;
+    info->dest[9] = info->negative ? 0xff : 0x7f;
+    break;
+  }
+#endif
+#if ENABLE_BINARY128_SUPPORT
+  case 16: {
+    info->dest[13] |= 0x80; // Set quiet bit
+    info->dest[14] = 0xff;
+    info->dest[15] = info->negative ? 0xff : 0x7f;
+    break;
+  }
+#endif
+  }
+  if (info->end) *info->end = (char *)(uintptr_t)p;
+}
+
+// This is used as the initial parse for all formats.
+//
+// It verifies the format and handles accordingly:
+// * hexFloat is parsed by calling hexFloat() above
+// * NaN payloads are parse by calling parseNan() above
+// * Infinity and NaN w/o payload are parsed directly
+// * True zero is parsed directly
+// The above are fully handled within this function and
+// it returns zero (false) to flag that there's nothing
+// more to do.
+//
+// For a decimal input, it collects the digits into a 64-bit
+// accumulator and returns that and the parsed exponent.
+// If there are <= 19 digits, this is enough to generate
+// a final result directly.
+//
+// If there are more than 19 digits, this logic first lets the
+// accumulator overflow.  After we detect the overflow, we re-parse
+// the first 19 digits and return those.  Callers who need more digits
+// will have to use firstUnparsedDigit and digitCount to re-parse the
+// significand (they can ignore the decimal point, though, since that
+// has already been factored into the base10Exponent).
+static int
+fastParse64(struct parseInfo *info) {
+  const unsigned char *p = (const unsigned char *)info->start;
+
+  uint64_t digits = 0;
+  int digitCount = 0;
+  int base10Exponent = 0;
+  info->negative = 0;
+
+  // Skip leading whitespace.  Stop at a +/-/digit or other character.
+  // This is a little oddly phrased in order to avoid a pointless call
+  // to `isspace` for common cases.
+  while (1) {
+    if (*p >= '0' && *p <= '9') {
+      break;
+    } else if (*p == '-') {
+      info->negative = 1;
+      p += 1;
+      break;
+    } else if (*p == '+') {
+      p += 1;
+      break;
+    } else if (*p == ' ' || isspace(*p)) {
+      p += 1;
+    } else {
+      break;
+    }
+  }
+
+  if (*p == '0') {
+    if (p[1] == 'x' || p[1] == 'X') {
+      hexFloat(p, info);
+      return 0;
+    }
+    while (*p == '0') {
+      p += 1;
+    }
+  } else if (*p == 'i' || *p == 'I') {
+    if ((p[1] == 'n' || p[1] == 'N')
+        && (p[2] == 'f' || p[2] == 'F')) {
+      if ((p[3] == 'i' || p[3] == 'I')
+          && (p[4] == 'n' || p[4] == 'N')
+          && (p[5] == 'i' || p[5] == 'I')
+          && (p[6] == 't' || p[6] == 'T')
+          && (p[7] == 'y' || p[7] == 'Y')) {
+        p += 8;
+      } else {
+        p += 3;
+      }
+      // Matched 'inf' or 'infinity' case-insensitive
+      if (info->end) *info->end = (char *)(uintptr_t)p;
+      infinity(info);
+      return 0;
+    }
+    goto fail;
+  } else if (*p == 'n' || *p == 'N') {
+    if ((p[1] == 'a' || p[1] == 'A')
+        && (p[2] == 'n' || p[2] == 'N')) {
+      parseNan(p, info);
+      return 0;
+    }
+    goto fail;
+  } else if (*p < '0' || *p > '9') {
+    // If this isn't a hexfloat, nan, or infinity and does
+    // start with a digit, it must start with a decimal point,
+    // and that decimal point must be immediately followed
+    // by a digit.
+    if (info->loc == strtofp_C_locale) {
+      // Decimal point is '.' in C locale, avoid calling localeconv_l
+      if (*p == '.') {
+        p += 1;
+        if (*p >= '0' && *p <= '9') {
+          goto parseFraction;
+        }
+      }
+    } else {
+      // Look up decimal point in locale
+      const unsigned char *decimalPoint = strtofp_locale_decimal_point(info->loc);
+      if (decimalPoint[1] == '\0') {
+        if (decimalPoint[0] == *p) {
+          p += 1;
+          if (*p >= '0' && *p <= '9') {
+            goto parseFraction;
+          }
+        }
+      } else {
+        // Multi-byte decimal point
+        int matchedDecimalPoint = (1 == 1);
+        for (const unsigned char *d = decimalPoint; *d; d++) {
+          matchedDecimalPoint &= (*p == *d);
+          p++;
+        }
+        if (matchedDecimalPoint && *p >= '0' && *p <= '9') {
+          goto parseFraction;
+        }
+      }
+    }
+    goto fail;
+  }
+
+  // Collect digits before the decimal point
+  const unsigned char *firstUnparsedDigit = p;
+  {
+    uint8_t t = (uint8_t)(*p - '0');
+    if (t < 10) {
+      digits = t;
+      p += 1;
+      t = (uint8_t)(*p - '0');
+      while(t < 10) {
+        digits = 10 * digits + t;
+        p += 1;
+        t = (uint8_t)(*p - '0');
+      }
+      digitCount = (int)(p - firstUnparsedDigit);
+    }
+  }
+
+  // Try to match an optional decimal point
+  if (info->loc == strtofp_C_locale) {
+    // Decimal point is '.' in C locale, avoid calling localeconv_l
+    if (*p == '.') {
+      p += 1;
+      goto parseFraction;
+    }
+    goto maybeParseExponent;
+  } else if (*p == ' ' || *p == '\0' || *p == 'e' || *p == 'E') {
+    // Don't call localeconv_l if we know the next character
+    // cannot possibly be a decimal point.
+    goto maybeParseExponent;
+  } else {
+    // Look up decimal point in locale
+    const unsigned char *decimalPoint = strtofp_locale_decimal_point(info->loc);
+    if (decimalPoint[1] == '\0') {
+      if (decimalPoint[0] == *p) {
+        p += 1;
+        goto parseFraction;
+      }
+    } else {
+      // Multi-byte decimal point
+      const unsigned char *startOfPotentialDecimalPoint = p;
+      int matchedDecimalPoint = (1 == 1);
+      for (const unsigned char *d = decimalPoint; *d; d++) {
+        matchedDecimalPoint &= (*p == *d);
+        p++;
+      }
+      if (matchedDecimalPoint) {
+        goto parseFraction;
+      } else {
+        p = startOfPotentialDecimalPoint;
+        goto maybeParseExponent;
+      }
+    }
+  }
+
+ parseFraction:
+  {
+    const unsigned char *firstDigitAfterDecimalPoint = p;
+    if (digitCount == 0) {
+      while (*p == '0') {
+        p += 1;
+      }
+      // "0.000000001234" has 4 digits
+      firstUnparsedDigit = p;
+      unsigned t = (unsigned)(*p - '0');
+      if (t < 10) {
+        p += 1;
+        digits = t;
+        t = (unsigned)(*p - '0');
+        while (t < 10) {
+          digits = 10 * digits + t;
+          p += 1;
+          t = (unsigned)(*p - '0');
+        }
+      }
+      digitCount = (int)(p - firstUnparsedDigit);
+    } else {
+      // Perf: For canada.txt benchmark, this loop is ~30% of total runtime
+      unsigned t = (unsigned)(*p - '0');
+      if (t < 10) {
+        p += 1;
+        digits = 10 * digits + t;
+        t = (unsigned)(*p - '0');
+        while (t < 10) {
+          p += 1;
+          digits = 10 * digits + t;
+          t = (unsigned)(*p - '0');
+        }
+      }
+      digitCount += p - firstDigitAfterDecimalPoint;
+    }
+    base10Exponent = (int)(firstDigitAfterDecimalPoint - p);
+  }
+
+  // ================================================================
+  // Step 1e: Parse the optional exponent
+ maybeParseExponent:
+  if (*p == 'e' || *p == 'E') {
+    const unsigned char *exponentPhraseStart = p;
+    p += 1;
+    int negativeExponent = 1;
+    if (*p == '-') {
+      negativeExponent = -1;
+      p += 1;
+    } else if (*p == '+') {
+      p += 1;
+    }
+    uint8_t t = (uint8_t)(*p - '0');
+    if (t < 10) {
+      unsigned exp = t;
+      p += 1;
+      t = (uint8_t)(*p - '0');
+      while (t < 10) {
+        p += 1;
+        exp = 10 * exp + t;
+        t = (uint8_t)(*p - '0');
+      }
+      if (p - exponentPhraseStart > 9) {
+        // The exponent text was unusually long... re-parse
+        // it more carefully to see if it really should overflow.
+        const unsigned char *q = exponentPhraseStart + 1;
+        if (*q == '-' || *q == '+') {
+          q += 1;
+        }
+        while (*q == '0') {
+          q += 1;
+        }
+        if (p - q > 8) {
+          // If there were more than 8 digits with leading zeros
+          // excluded, we've definitely overflowed.
+          exp = 99999999;
+        }
+      }
+      base10Exponent += (int)exp * negativeExponent;
+    } else {
+      p = exponentPhraseStart;
+    }
+  }
+
+  if (info->end) *info->end = (char *)(uintptr_t)p;
+
+  // No non-zero digits, must be an explicit zero:
+  // "0", ".000", "0.0", "0e0", "0.0e999", etc.
+  if (digitCount == 0) {
+    memset(info->dest, 0, info->bytes);
+    info->dest[info->bytes - 1] = info->negative ? 0x80 : 0;
+    return 0;
+  }
+
+  // Coarse over/underflow check
+  if (base10Exponent + digitCount < info->minDecimalExp) {
+    underflow(info);
+    return 0;
+  }
+  if (base10Exponent + digitCount > info->maxDecimalExp) {
+    overflow(info);
+    return 0;
+  }
+
+  int unparsedDigitCount = 0;
+  if (digitCount > 19) {
+    digits = 0;
+    int i = 0;
+    const unsigned char *q = firstUnparsedDigit;
+    while (i < 19) {
+      // Note: Skip non-digit chars (e.g., decimal point)
+      unsigned t = (unsigned)(*q - '0');
+      if (t < 10) {
+        digits = digits * 10 + t;
+        i += 1;
+      }
+      q += 1;
+    }
+    firstUnparsedDigit = q;
+    unparsedDigitCount = digitCount - 19;
+  } else {
+    firstUnparsedDigit = NULL;
+  }
+
+  info->digitCount = digitCount;
+  info->firstUnparsedDigit = firstUnparsedDigit;
+  info->unparsedDigitCount = unparsedDigitCount;
+  info->digits = digits;
+  info->base10Exponent = base10Exponent;
+
+  return 1; // Regular decimal case...
+
+ fail:
+  if (info->end) *info->end = (char *)(uintptr_t)info->start;
+  memset(info->dest, 0, info->bytes);
+  return 0;
+}
+
+// ================================================================
+// ================================================================
+//
+// Parse an IEEE 754 Binary16 (aka "Half")
+//
+// ================================================================
+// ================================================================
+
+#if ENABLE_BINARY16_SUPPORT
+static void _ffpp_strtoencf16_l(unsigned char *dest, const char *start, char **end, strtofp_locale_t loc) {
+  static const int bytes = 2;
+  static const int sigBits = 11;
+  static const int minBinaryExp = -14;
+  static const int maxBinaryExp = 16;
+  static const int minDecimalExp = -7;
+  static const int maxDecimalExp = 5;
+  static const int maxDecimalMidpointDigits = 22;
+  struct parseInfo info = {
+    .bytes = bytes,
+    .sigBits = sigBits,
+    .minBinaryExp = minBinaryExp,
+    .maxBinaryExp = maxBinaryExp,
+    .minDecimalExp = minDecimalExp,
+    .maxDecimalExp = maxDecimalExp,
+    .maxDecimalMidpointDigits = maxDecimalMidpointDigits,
+    .dest = dest,
+    .start = start,
+    .end = end,
+    .loc = loc,
+  };
+
+  // ================================================================
+  // Parse the input (mostly)
+  // ================================================================
+  if (!fastParse64(&info)) {
+    return;
+  }
+
+  // TODO: Someday, we might implement fast paths for binary16
+  // But the range of binary16 is so small that the varint slow
+  // path is actually reasonably fast.
+
+  // ================================================================
+  // Slow Path (varint calculation)
+  // ================================================================
+  char _Alignas(sizeof(mp_word_t)) stackWorkArea[32];
+  static const size_t stackWorkAreaWords = sizeof(stackWorkArea) / sizeof(mp_word_t);
+  generalSlowpath(&info, FENV_ROUNDING_MODE(), (mp_word_t *)stackWorkArea, stackWorkAreaWords, false);
+}
+#endif
+
+// ================================================================
+// ================================================================
+//
+// Parse an IEEE 754 Binary32 (aka "Float" aka "Single")
+//
+// ================================================================
+// ================================================================
+#if ENABLE_BINARY32_SUPPORT
+static void _ffpp_strtoencf32_l(unsigned char *dest, const char *start, char **end, strtofp_locale_t loc) {
+  static const int bytes = 4;
+  static const int sigBits = 24;
+  static const int minBinaryExp = -126;
+  static const int maxBinaryExp = 128;
+  static const int minDecimalExp = -46;
+  static const int maxDecimalExp = 40;
+  static const int maxDecimalMidpointDigits = 113;
+  struct parseInfo info = {
+    .bytes = bytes,
+    .sigBits = sigBits,
+    .minBinaryExp = minBinaryExp,
+    .maxBinaryExp = maxBinaryExp,
+    .minDecimalExp = minDecimalExp,
+    .maxDecimalExp = maxDecimalExp,
+    .maxDecimalMidpointDigits = maxDecimalMidpointDigits,
+    .dest = dest,
+    .start = start,
+    .end = end,
+    .loc = loc,
+  };
+
+  // ================================================================
+  // Parse the input (mostly)
+  // ================================================================
+  if (!fastParse64(&info)) {
+    return;
+  }
+
+#if FLOAT_IS_BINARY32
+  // ================================================================
+  // Use a single float operation
+  // ================================================================
+  const static float floatPowerOf10[] = {1.0f, 10.0f, 100.0f,
+    1e3f, 1e4f, 1e5f, 1e6f, 1e7f, 1e8f, 1e9f, 1e10f};
+  if (info.base10Exponent > -11 && info.base10Exponent < 11 && info.digitCount < 8) {
+    int32_t sdigits = info.negative ? -(int32_t)info.digits : (int32_t)info.digits;
+    if (info.base10Exponent < 0) {
+      float result = (float)sdigits / floatPowerOf10[-info.base10Exponent];
+      memcpy(info.dest, &result, sizeof(result));
+      return;
+    } else {
+      float result = (float)sdigits * floatPowerOf10[info.base10Exponent];
+      memcpy(info.dest, &result, sizeof(result));
+      return;
+    }
+  }
+#endif
+
+  int roundingMode = FENV_ROUNDING_MODE();
+
+#if 1
+  // ================================================================
+  // Fixed-precision interval arithmetic
+  // ================================================================
+  // The idea:  Use 64-bit fixed-precision arithmetic to compute
+  // upper/lower bounds for the correct answer.  If those bounds
+  // agree, then we can return the result.
+  int16_t exp10;
+  unsigned upperBoundOffset;
+  if (info.digitCount <= 19) {
+    exp10 = (int16_t)info.base10Exponent;
+    upperBoundOffset = 4;
+  } else {
+    exp10 = (int16_t)(info.base10Exponent + info.digitCount - 19);
+    upperBoundOffset = 36;
+  }
+
+  // Powers in the table are rounded so that
+  //    powerOfTenRoundedDown <= true value <= powerOfTenRoundedDown + 1
+  const uint64_t powerOfTenRoundedDown = (powersOf10_Float + 70)[exp10];
+
+  // Binary exponent for the power-of-10 product
+  const int powerOfTenExponent = binaryExponentFor10ToThe(exp10);
+
+  // Normalize the digits, adjust binary exponent
+  int normalizeDigits = __builtin_clzll(info.digits); // 0 <= normalizeDigits <= 4
+  assert(normalizeDigits <= 4 || info.digitCount < 20);
+  uint64_t d = info.digits << normalizeDigits;
+  // For <= 19 digits, the upper bound for d is just d
+  // For > 19 digits, the upper bound is 1 << normalizeDigits <= 16
+  int binaryExponent = powerOfTenExponent - normalizeDigits + 64;
+
+  // A 64-bit lower bound on the binary significand
+  uint64_t l = multiply64x64RoundingDown(powerOfTenRoundedDown, d);
+  // An upper bound:
+  //  <= 19 digits: (powerOfTenRoundedDown + 1) * d == l128 + d
+  //   > 19 digits: (powerOfTenRoundedDown + 1) * (d + 16)
+  //               == l128 + d + 16 * powerOfTenRoundedDown + 16
+  // For <=19 digits, upper bound is l + 2
+  // For >19 digits, upper bound is l + 18
+
+  // Normalize the product, adjust binary exponent
+  // (In particular, this lets us shift by a constant below.)
+  int normalizeProduct = __builtin_clzll(l); // 0 <= normalizeProduct <= 1
+  assert(normalizeProduct <= 1);
+  // Upper bound is <= (l + 2) << 1  or (l + 18) << 1
+  l <<= normalizeProduct;
+  // Upper bound is <= l + 4   or   l + 36
+  binaryExponent -= normalizeProduct;
+
+  // Upper/lower bounds for the 24-bit significand
+  uint64_t u = l + upperBoundOffset;
+  uint32_t lowerSignificand, upperSignificand;
+  switch (roundingMode) {
+#ifdef FE_TOWARDZERO
+  case FE_TOWARDZERO:
+    lowerSignificand = (l) >> 40;
+    upperSignificand = (u) >> 40;
+    break;
+#endif
+#ifdef FE_DOWNWARD
+  case FE_DOWNWARD:
+    if (info.negative) {
+      lowerSignificand = (l + 0x0ffffffffff) >> 40;
+      upperSignificand = (u + 0x10000000000) >> 40;
+    } else {
+      lowerSignificand = (l) >> 40;
+      upperSignificand = (l + 4) >> 40;
+    }
+    break;
+#endif
+#ifdef FE_UPWARD
+  case FE_UPWARD:
+    if (!info.negative) {
+      lowerSignificand = (l + 0x0ffffffffff) >> 40;
+      upperSignificand = (u + 0x10000000000) >> 40;
+    } else {
+      lowerSignificand = (l) >> 40;
+      upperSignificand = (u) >> 40;
+    }
+    break;
+#endif
+#ifdef FE_TONEAREST
+  case FE_TONEAREST:
+#endif
+  default:
+    // Instead of worrying about exact ties-round-even, round lower
+    // down (adding 0x7ff...ff) and upper up (adding 0x800...00) so
+    // that exact ties fall through to be handled elsewhere.
+    lowerSignificand = (l + 0x7fffffffff) >> 40;
+    upperSignificand = (u + 0x8000000000) >> 40;
+  }
+  if (lowerSignificand == 0) { // lowerSignificand wrapped...
+    binaryExponent += 1;
+  }
+  if (binaryExponent > maxBinaryExp) {
+    overflow(&info);
+    return;
+  } else if (binaryExponent <= minBinaryExp) {
+    if (binaryExponent <= minBinaryExp - sigBits) {
+      underflow(&info);
+      return;
+    }
+    // TODO: ... Subnormal? ...
+  } else if (upperSignificand == lowerSignificand) {
+    uint32_t exponentBits = (uint32_t)(binaryExponent - minBinaryExp) << (sigBits - 1);
+    uint32_t significandMask = (((uint32_t)1 << (sigBits - 1)) - 1);
+    uint32_t significandBits = lowerSignificand & significandMask;
+    uint32_t signbit = info.negative ? 0x80000000UL : 0ULL;
+    uint32_t raw = signbit | exponentBits | significandBits;
+    memcpy(info.dest, &raw, sizeof(raw));
+    return;
+  }
+#endif
+
+  // ================================================================
+  // Slow Path (varint calculation)
+  // ================================================================
+  char _Alignas(sizeof(mp_word_t)) stackWorkArea[128];
+  static const size_t stackWorkAreaWords = sizeof(stackWorkArea) / sizeof(mp_word_t);
+  generalSlowpath(&info, FENV_ROUNDING_MODE(), (mp_word_t *)stackWorkArea, stackWorkAreaWords, false);
+}
+#endif
+
+// ================================================================
+// ================================================================
+//
+// Parse an IEEE 754 Binary64 (Double)
+//
+// ================================================================
+// ================================================================
+
+#if ENABLE_BINARY64_SUPPORT
+static void _ffpp_strtoencf64_l(unsigned char *dest, const char *start, char **end, strtofp_locale_t loc) {
+  static const int bytes = 8;
+  static const int sigBits = 53;
+  static const int minBinaryExp = -1022;
+  static const int maxBinaryExp = 1024;
+  static const int minDecimalExp = -325;
+  static const int maxDecimalExp = 310;
+  static const int maxDecimalMidpointDigits = 768;
+  struct parseInfo info = {
+    .bytes = bytes,
+    .sigBits = sigBits,
+    .minBinaryExp = minBinaryExp,
+    .maxBinaryExp = maxBinaryExp,
+    .minDecimalExp = minDecimalExp,
+    .maxDecimalExp = maxDecimalExp,
+    .maxDecimalMidpointDigits = maxDecimalMidpointDigits,
+    .dest = dest,
+    .start = start,
+    .end = end,
+    .loc = loc,
+  };
+
+  // ================================================================
+  // Parse the input (mostly)
+  // ================================================================
+  if (!fastParse64(&info)) {
+    return;
+  }
+
+  // If digitCount <= 19, then the result we want is:
+  //  (info.negative ? -1 : 1) * info.digits * 10^info.base10Exponent
+
+  // The rest of this function consists of several different methods
+  // of computing this product with varying trade-offs of input range
+  // and performance.  The first ones are fast but only work for
+  // certain inputs; the later ones are slower and more general.
+
+  // ================================================================
+  // Floating-point Calculation
+  // ================================================================
+  // Note: This optimization relies on the host platform `double` type
+  // supporting true IEEE754 binary64 arithmetic.
+#if DOUBLE_IS_BINARY64 && (FLT_EVAL_METHOD == 0 || FLT_EVAL_METHOD == 1)
+
+  const static double doublePowerOf10[] = { 1.0, 10.0, 100.0, 1e3, 1e4, 1e5,
+    1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16,
+    1e17, 1e18, 1e19, 1e20, 1e21, 1e22 };
+
+  if (info.base10Exponent >= -22 && info.base10Exponent <= 18) {
+    if (info.base10Exponent < 0) {
+      // We can give exact inputs to a division operation if
+      // we have <= 15 digits and base10Exponent is in [-22..0]
+      // For example, this handles 1.23456789012345
+      if (info.digitCount <= 15) {
+        int64_t sdigits = info.negative ? -(int64_t)info.digits : (int64_t)info.digits;
+        double result = (double)sdigits / doublePowerOf10[-info.base10Exponent];
+        memcpy(info.dest, &result, sizeof(result));
+        return;
+      }
+      // TODO: I really want to handle 35.678912345678934 here
+      // (17 digits and negative exponent).  This case is the common
+      // case for Lemire's "canada.txt" benchmark (which is derived from
+      // latitude/longitude data).
+    } else if (info.base10Exponent == 0) {
+      if (info.digitCount <= 19) {
+        if (!info.negative) {
+          // We can use HW conversion of unsigned (positive) int
+          double result = (double)info.digits;
+          memcpy(info.dest, &result, sizeof(result));
+          return;
+        } else if (info.digits <= INT64_MAX) {
+          // We can use HW conversion of signed (negative) int
+          double result = (double)(-(int64_t)info.digits);
+          memcpy(info.dest, &result, sizeof(result));
+          return;
+        }
+      }
+    } else {
+      if (info.digitCount <= 19) {
+        // We can give exact inputs to fma if we have <= 19 digits
+        // and base10Exponent is in [1..18].
+        uint64_t lowMask = 0x7ff;
+        uint64_t highMask = ~lowMask;
+        // digits has <= 64 bits (19 digits)
+        double highDigits = (double)(info.digits & highMask); // <= 53 bits
+        double lowDigits = (double)(info.digits & lowMask); // 11 bits
+        double p10 = doublePowerOf10[info.base10Exponent]; // Exact (10^18 has 42 bits)
+        if (info.negative) p10 = -p10;
+        double u = lowDigits * p10; // Exact (11 bits + 42 bits)
+        // Inputs to fma are all exact, so result is correctly rounded
+        double result = __builtin_fma(highDigits, p10, u);
+        memcpy(info.dest, &result, sizeof(result));
+        return;
+      }
+    }
+  }
+#endif
+
+  int roundingMode = FENV_ROUNDING_MODE();
+
+#if 1
+  // ================================================================
+  // Fixed-width interval arithmetic
+  // ================================================================
+  int16_t exp10;
+  // Total possible inaccuracy in our calculations below
+  unsigned intervalWidth;
+  if (info.digitCount <= 19) {
+    exp10 = (int16_t)info.base10Exponent;
+    // We have all the digits, so our upper/lower bounds
+    // on the significand are the same, which makes the final
+    // bounds fairly tight.
+    intervalWidth = 12;
+  } else {
+    // We're going to treat the input as if it were:
+    //    (first 19 digits).(remaining digits) * 10^p
+    // In this form, (first 19 digits) is a lower bound for the decimal
+    // significand and (first 19 digits) + 1 is an upper bound. We also
+    // have upper/lower bounds for 10^p.
+    // info.digits already has the first 19 digits, we just need to
+    // adjust the effective exponent.
+    exp10 = (int16_t)(info.base10Exponent + info.digitCount - 19);
+    // The non-trivial significand bounds lead to a wider final interval.
+    // This still gives us success ~96% of the time.
+    intervalWidth = 80;
+  }
+
+  // Multiply an exact value for 10^{0..27} times a rounded value 10^{n * 28}
+  const int16_t coarseIndex = (exp10 * 585 + 256) >> 14; // exp10 / 28;
+  const int16_t coarsePower = (coarseIndex * 32) - (coarseIndex * 4); // coarseIndex * 28
+  const int16_t exactPower = exp10 - coarsePower; // exp10 % 28
+  const uint64_t exact = powersOf10_Exact64[exactPower]; // Exact
+  const uint64_t coarse = (powersOf10_CoarseBinary64 + 15)[coarseIndex];
+
+  // Powers in the coarse table are never rounded up, so
+  //    coarse <= true value <= coarse + 1
+  const uint64_t powerOfTenRoundedDown = multiply64x64RoundingDown(coarse, exact);
+  // And powerOfTenRoundedUp
+  //    <= ((coarse + 1) * exact + UINT64_MAX) >> 64
+  //    <= (coarse * exact + exact + UINT64_MAX) >> 64
+  //    <= powerOfTenRoundedDown + 2
+
+  // Binary exponent for the power-of-10 product
+  const int powerOfTenExponent = binaryExponentFor10ToThe(coarsePower)
+    + binaryExponentFor10ToThe(exactPower);
+
+  // Normalize the digits, adjust binary exponent
+  int normalizeDigits = __builtin_clzll(info.digits);
+  assert(normalizeDigits <= 4 || info.digitCount < 20);
+  uint64_t d = info.digits << normalizeDigits;
+  int binaryExponent = powerOfTenExponent - normalizeDigits + 64;
+  // The upper bound for d is:
+  //   exactly d (for <= 19 digit case)
+  //   d + 16 (for > 19 digit case)
+
+  // A 64-bit lower bound on the binary significand
+  uint64_t l = multiply64x64RoundingDown(powerOfTenRoundedDown, d);
+  // Corresponding upper bound:
+  // <= 19 digits:  (powerOfTenRoundedDown + 2) * d == l128 + d + d
+  //  > 19 digits:  (powerOfTenRoundedDown + 2) * (d + 16)
+  // l is a lower bound for the 64-bit binary significand
+  // An upper bound for <= 19 digit case:
+  //    (l128 + d + d + UINT64_MAX) >> 64   <=   l + 3
+  // For >19 digits, we similarly get l + 20 as an upper bound.
+
+  // Normalize the product, adjust binary exponent
+  // (In particular, this lets us shift by a constant below.)
+  int normalizeProduct = __builtin_clzll(l); // 0 <= normalizeProduct <= 2
+  assert(normalizeProduct <= 2);
+  // Upper bound is  (l + 3) or  (l + 20)
+  l <<= normalizeProduct;
+  // Upper bound is (l + 12)  or (l + 80)
+  binaryExponent -= normalizeProduct;
+
+  // Upper/lower bounds for the 53-bit significand, with rounding
+  //
+  // We round by adding an offset and then truncating the bottom 11
+  // bits. For example, to precisely round away from zero, we
+  // would ideally want to add 0x7ff.ffffffffffffffff.  Rounding that
+  // value down/up to the nearest integer gives us 0x7ff, 0x800.
+  unsigned lowerRoundOffset, upperRoundOffset;
+  {
+    bool negative = info.negative;
+    switch (roundingMode) {
+#ifdef FE_UPWARD
+    case FE_UPWARD:
+      negative = !negative;
+      // FALL THROUGH
+#endif
+#ifdef FE_DOWNWARD
+    case FE_DOWNWARD:
+      if (negative) {
+        lowerRoundOffset = 0x7ff; upperRoundOffset = 0x800 + intervalWidth;
+        break;
+      }
+      // FALL THROUGH
+#endif
+#ifdef FE_TOWARDZERO
+    case FE_TOWARDZERO:
+      lowerRoundOffset = 0; upperRoundOffset = intervalWidth;
+      break;
+#endif
+#ifdef FE_TONEAREST
+    case FE_TONEAREST:
+#endif
+    default:
+      (void)negative;
+      lowerRoundOffset = 0x3ff; upperRoundOffset = 0x400 + intervalWidth;
+      break;
+    }
+  }
+  uint64_t lowerSignificand = (l + lowerRoundOffset) >> 11;
+  uint64_t upperSignificand = (l + upperRoundOffset) >> 11;
+
+  int adjustedBinaryExponent = binaryExponent;
+  if (lowerSignificand == 0) { // lowerSignificand wrapped...
+    adjustedBinaryExponent += 1;
+  }
+  if (adjustedBinaryExponent > maxBinaryExp) {
+    overflow(&info);
+    return;
+  } else if (adjustedBinaryExponent <= minBinaryExp) {
+    if (adjustedBinaryExponent <= minBinaryExp - sigBits) {
+      underflow(&info);
+      return;
+    }
+    // Might be subnormal; we need to re-round to the appropriate
+    // number of bits, which requires recomputing the shift amount and
+    // the rounding offsets.  This is a generalized version of the
+    // code above that's also slightly slower due to the variable shifts.
+    int shift = 65 - binaryExponent + (minBinaryExp - sigBits);
+    if (shift < 64) {
+      assert(0 < shift && shift < 64);
+      uint64_t lro, uro;
+      bool negative = info.negative;
+      switch (roundingMode) {
+#ifdef FE_UPWARD
+      case FE_UPWARD:
+        negative = !negative;
+        // FALL THROUGH
+#endif
+#ifdef FE_DOWNWARD
+      case FE_DOWNWARD:
+        if (negative) {
+          lro = (1ULL << (shift)) - 1; uro = lro + intervalWidth; break;
+        }
+        // FALL THROUGH
+#endif
+#ifdef FE_TOWARDZERO
+      case FE_TOWARDZERO:
+        lro = 0; uro = intervalWidth; break;
+#endif
+#ifdef FE_TONEAREST
+      case FE_TONEAREST:
+#endif
+      default:
+        (void)negative;
+        lro = (1ULL << (shift - 1)) - 1; uro = (1ULL << (shift - 1)) + intervalWidth; break;
+      }
+      lowerSignificand = (l + lro) >> shift;
+      upperSignificand = (l + uro) >> shift;
+      if (upperSignificand == lowerSignificand) {
+        if (lowerSignificand == 0) { // lowerSignificand wrapped...
+          lowerSignificand = 1ULL << (64 - shift);
+        }
+        // Subnormal
+        uint64_t signbit = info.negative ? 0x8000000000000000ULL : 0ULL;
+        uint64_t raw = signbit | lowerSignificand;
+        if (raw != 0x0010000000000000ULL) {
+          errno = ERANGE;
+        }
+        memcpy(info.dest, &raw, sizeof(raw));
+        return;
+      }
+    }
+  } else if (upperSignificand == lowerSignificand) {
+    // Normal
+    uint64_t exponentBits = (uint64_t)(adjustedBinaryExponent - minBinaryExp) << (sigBits - 1);
+    uint64_t significandMask = (((uint64_t)1 << (sigBits - 1)) - 1);
+    uint64_t significandBits = lowerSignificand & significandMask;
+    uint64_t signbit = info.negative ? 0x8000000000000000ULL : 0ULL;
+    uint64_t raw = signbit | exponentBits | significandBits;
+    memcpy(info.dest, &raw, sizeof(raw));
+    return;
+  }
+  // TODO: If we fall through, should we try refining the
+  // estimate?  If we can extend `l` to 128 bits with very
+  // little work, it might be worth the effort.
+#endif
+
+  // ================================================================
+  // Slow Path (varint calculation)
+  // ================================================================
+  // Random testing suggests this step only runs about 3% of the
+  // time, so we focus here on optimizing for code size rather than perf.
+  char _Alignas(sizeof(mp_word_t)) stackWorkArea[656];
+  static const size_t stackWorkAreaWords = sizeof(stackWorkArea) / sizeof(mp_word_t);
+  generalSlowpath(&info, roundingMode, (mp_word_t *)stackWorkArea, stackWorkAreaWords, false);
+}
+#endif
+
+// ================================================================
+// ================================================================
+//
+// 128-bit interval calculation for Float80 and Binary128
+//
+// Unlike binary32/64 above (where we optimize for performance),
+// this logic has been generalized to support both float80 and
+// binary128 with common code.  It's a little slower, but these
+// formats are less commonly used, so the code savings are worth
+// it.  (And we're still about 80x faster than gdtoa even with this,
+// so I doubt anyone will complain about the performance here!)
+//
+// ================================================================
+// ================================================================
+
+#if ENABLE_FLOAT80_OPTIMIZATIONS || ENABLE_BINARY128_OPTIMIZATIONS
+#if HAVE_UINT128_T
+typedef __uint128_t my_uint128_t;
+#define create128FromHighLow64(high,low) ((low) + ((__uint128_t)(high) << 64))
+#define multiply128xInt(lhs, rhs) ((lhs) * (rhs))
+#define fullMultiply64x64(lhs, rhs) ((__uint128_t)(lhs) * (rhs))
+#define add128x64(lhs, rhs) ((lhs) + (rhs))
+#define add128x128(lhs, rhs) ((lhs) + (rhs))
+#define extractLow64(x) ((uint64_t)(x))
+#define extractHigh64(x) ((uint64_t)((x) >> 64))
+#define isZero(x) ((x) == 0)
+#define isEqual(lhs, rhs) ((lhs) == (rhs))
+#define shiftLeft(lhs, rhs) ((lhs) << (rhs))
+#define shiftRight(lhs, rhs) ((lhs) >> (rhs))
+#else
+typedef struct { uint64_t low; uint64_t high; } my_uint128_t;
+#define extractLow64(x) ((x).low)
+#define extractHigh64(x) ((x).high)
+#define isZero(x) ((x).low == 0 && (x).high == 0)
+#define isEqual(lhs, rhs) ((lhs).low == (rhs).low && ((lhs).high == (rhs).high))
+static my_uint128_t shiftLeft(my_uint128_t lhs, int rhs) {
+  if (rhs > 64) {
+    lhs.high = lhs.low << (rhs - 64);
+    lhs.low = 0;
+  } else if (rhs == 64) {
+    lhs.high = lhs.low;
+    lhs.low = 0;
+  } else if (rhs > 0) {
+    lhs.high = (lhs.high << rhs) + (lhs.low >> (64 - rhs));
+    lhs.low <<= rhs;
+  }
+  return lhs;
+}
+static my_uint128_t shiftRight(my_uint128_t lhs, int rhs) {
+  if (rhs > 64) {
+    lhs.low = lhs.high >> (rhs - 64);
+    lhs.high = 0;
+  } else if (rhs == 64) {
+    lhs.low = lhs.high;
+    lhs.high = 0;
+  } else if (rhs > 0) {
+    lhs.low = (lhs.low >> rhs) + (lhs.high << (64 - rhs));
+    lhs.high >>= rhs;
+  }
+  return lhs;
+}
+static my_uint128_t create128FromHighLow64(uint64_t high, uint64_t low) {
+  my_uint128_t result = {low, high};
+  return result;
+}
+static my_uint128_t add128x64(my_uint128_t lhs, uint64_t rhs) {
+  if (lhs.low > UINT64_MAX - rhs) {
+    lhs.high += 1;
+  }
+  lhs.low += rhs;
+  return lhs;
+}
+static my_uint128_t add128x128(my_uint128_t lhs, my_uint128_t rhs) {
+  if (lhs.low > UINT64_MAX - rhs.low) {
+    lhs.high += 1;
+  }
+  lhs.low += rhs.low;
+  lhs.high += rhs.high;
+  return lhs;
+}
+static my_uint128_t fullMultiply64x64(uint64_t lhs, uint64_t rhs) {
+  uint64_t a = (lhs >> 32) * (rhs >> 32);
+  uint64_t b = (lhs >> 32) * (rhs & UINT32_MAX);
+  uint64_t c = (lhs & UINT32_MAX) * (rhs >> 32);
+  uint64_t d = (lhs & UINT32_MAX) * (rhs & UINT32_MAX);
+  b += (c & UINT32_MAX) + (d >> 32);
+  return create128FromHighLow64(a + (b >> 32) + (c >> 32),
+                                (b << 32) + (d & UINT32_MAX));
+}
+static my_uint128_t multiply128xInt(my_uint128_t lhs, int rhs) {
+  uint64_t a = (lhs.low & UINT32_MAX) * (uint64_t)rhs;
+  uint64_t b = (lhs.low >> 32) * (uint64_t)rhs;
+  b += (a >> 32);
+  lhs.high = (lhs.high * (uint64_t)rhs) + (b >> 32);
+  lhs.low = (a & UINT32_MAX) + (b << 32);
+  return lhs;
+}
+#endif
+static my_uint128_t multiply128x128RoundingDown(my_uint128_t lhs, my_uint128_t rhs) {
+  my_uint128_t a = fullMultiply64x64(extractHigh64(lhs), extractHigh64(rhs));
+  my_uint128_t b = fullMultiply64x64(extractHigh64(lhs), extractLow64(rhs));
+  my_uint128_t c = fullMultiply64x64(extractLow64(lhs), extractHigh64(rhs));
+  my_uint128_t d = fullMultiply64x64(extractLow64(lhs), extractLow64(rhs));
+  b = add128x64(b, extractLow64(c));
+  b = add128x64(b, extractHigh64(d));
+  a = add128x64(a, extractHigh64(b));
+  a = add128x64(a, extractHigh64(c));
+  return a;
+}
+
+static my_uint128_t getPowerOfTenRoundedDown(int p, int *exponent) {
+  my_uint128_t result;
+  int e;
+
+  // If power is < 0, multiply by 10^-5040 and adjust p
+  // That lets us use a table with only positive powers (half the size)
+  if (p < 0) {
+    result = create128FromHighLow64(0xb2d31bf022977fd8ULL, 0xbf034c011f5000deULL);
+    p += 5040;
+    e = binaryExponentFor10ToThe(-5040);
+  } else {
+    result = create128FromHighLow64(1ULL << 63, 0);
+    e = 1;
+  }
+
+  int finePower = p % 56;
+  my_uint128_t fine;
+  p -= finePower;
+  if (finePower <= 27) {
+    fine = create128FromHighLow64(powersOf10_Exact64[finePower], 0);
+  } else if (finePower <= 54) {
+    fine = fullMultiply64x64(powersOf10_Exact64[finePower - 27],
+                             0xcecb8f27f4200f3aULL); // 10^27
+    if ((extractHigh64(fine) >> 63) == 0) {
+      fine = shiftLeft(fine, 1);
+    }
+  } else {
+    fine = create128FromHighLow64(0xd0cf4b50cfe20765ULL, 0xfff4b4e3f741cf6dULL);
+  }
+  e += binaryExponentFor10ToThe(finePower);
+  result = multiply128x128RoundingDown(result, fine);
+
+  int coarseIndex = p / 56;
+  const uint64_t *c = powersOf10_Binary128 + coarseIndex * 2;
+  my_uint128_t coarse = create128FromHighLow64(c[1], c[0]);
+  e += binaryExponentFor10ToThe(coarseIndex * 56);
+  result = multiply128x128RoundingDown(result, coarse);
+
+  *exponent = e;
+  return result;
+}
+
+static int highPrecisionIntervalPath(struct parseInfo *info, int roundingMode) {
+  int16_t exp10;
+  unsigned upperBoundOffset;
+  if (info->digitCount <= 38) {
+    exp10 = (int16_t)info->base10Exponent;
+    upperBoundOffset = 16; // FIXME
+  } else {
+    exp10 = (int16_t)(info->base10Exponent + info->digitCount - 38);
+    upperBoundOffset = 272;  // FIXME
+  }
+
+  my_uint128_t digits = create128FromHighLow64(0, info->digits);
+  int normalizeDigits;
+  if (info->digitCount <= 19) {
+    normalizeDigits = __builtin_clzll(info->digits) + 64;
+  } else {
+    int remaining = info->digitCount - 19;
+    if (remaining > 19) { remaining = 19; }
+    const unsigned char *p = info->firstUnparsedDigit;
+    for (int i = 0; i < remaining; i++) {
+      // Skip decimal point (depends on locale, can be any non-digit!)
+      while (*p < '0' || *p > '9') {
+        p++;
+      }
+      digits = multiply128xInt(digits, 10);
+      digits = add128x64(digits, (uint64_t)(*p - '0'));
+      p += 1;
+    }
+    if (extractHigh64(digits) != 0) {
+      normalizeDigits = __builtin_clzll(extractHigh64(digits));
+    } else {
+      normalizeDigits = __builtin_clzll(extractLow64(digits)) + 64;
+    }
+  }
+  assert(normalizeDigits <= 5 || info->digitCount <= 38);
+  digits = shiftLeft(digits, normalizeDigits);
+  int binaryExponent = 128 - normalizeDigits;
+  // For <= 38 digits, the upper bound for d is just d
+  // For > 38 digits, the upper bound is 1 << normalizeDigits <= 32
+
+  int powerOfTenExponent;
+  const my_uint128_t powerOfTenRoundedDown = getPowerOfTenRoundedDown(exp10, &powerOfTenExponent);
+  binaryExponent += powerOfTenExponent;
+  //    powerOfTenRoundedDown <= true value <= powerOfTenRoundedDown + 2
+
+  // A 128-bit lower bound on the binary significand
+  my_uint128_t l = multiply128x128RoundingDown(powerOfTenRoundedDown, digits);
+
+  // (In particular, this lets us shift by a constant below.)
+  int normalizeProduct = __builtin_clzll(extractHigh64(l)); // 0 <= normalizeProduct <= 1
+  // Upper bound is <= (l + 2) or (l + 34)
+  assert(normalizeProduct <= 3);
+  l = shiftLeft(l, normalizeProduct);
+  // Upper bound is <= l + 16   or   l + 272
+  binaryExponent -= normalizeProduct;
+
+  // Upper/lower bounds for the 64-bit significand
+  my_uint128_t u = add128x64(l, upperBoundOffset);
+  bool negative = info->negative;
+  switch (roundingMode) {
+#ifdef FE_DOWNWARD
+  case FE_DOWNWARD:
+    negative = !negative;
+    // FALL THROUGH
+#endif
+#ifdef FE_UPWARD
+  case FE_UPWARD:
+    if (!negative) {
+      my_uint128_t offset = create128FromHighLow64(UINT64_MAX, UINT64_MAX);
+      offset = shiftRight(offset, info->sigBits);
+      l = add128x128(l, offset);
+      u = add128x128(u, offset);
+      u = add128x64(u, 1);
+      break;
+    }
+    // FALL THROUGH
+#endif
+#ifdef FE_TOWARDZERO
+  case FE_TOWARDZERO:
+    break;
+#endif
+#ifdef FE_TONEAREST
+  case FE_TONEAREST:
+#endif
+  default: {
+    (void)negative;
+    my_uint128_t offset = create128FromHighLow64(UINT64_MAX, UINT64_MAX);
+    offset = shiftRight(offset, info->sigBits + 1);
+    l = add128x128(l, offset);
+    u = add128x128(u, offset);
+    u = add128x64(u, 1);
+  }
+  }
+  my_uint128_t lowerSignificand = shiftRight(l, (128 - info->sigBits));
+  my_uint128_t upperSignificand = shiftRight(u, (128 - info->sigBits));
+  if (isZero(lowerSignificand)) { // lowerSignificand wrapped...
+    binaryExponent += 1;
+    lowerSignificand = create128FromHighLow64(1ULL << 63, 0);
+    lowerSignificand = shiftRight(lowerSignificand, 128 - info->sigBits);
+  }
+  if (binaryExponent > info->maxBinaryExp) {
+    overflow(info);
+    return 1;
+  } else if (binaryExponent <= info->minBinaryExp) {
+    if (binaryExponent <= info->minBinaryExp - info->sigBits) {
+      underflow(info);
+      return 1;
+    }
+    // TODO: ... Subnormal? ...
+  } else if (isEqual(upperSignificand, lowerSignificand)) {
+    switch (info->bytes) {
+    case 10: {
+      uint16_t signbit = info->negative ? 0x8000U : 0U;
+      uint16_t exponentBits = signbit | (uint16_t)(binaryExponent - info->minBinaryExp);
+      uint64_t significandBits = extractLow64(lowerSignificand);
+      memcpy(info->dest, &significandBits, sizeof(significandBits));
+      memcpy(info->dest + 8, &exponentBits, sizeof(exponentBits));
+      return 1;
+    }
+    case 16: {
+      uint16_t signbit = info->negative ? 0x8000U : 0U;
+      uint16_t exponentBits = signbit | (uint16_t)(binaryExponent - info->minBinaryExp);
+      memcpy(info->dest, &lowerSignificand, sizeof(lowerSignificand));
+      memcpy(info->dest + 14, &exponentBits, sizeof(exponentBits));
+      return 1;
+    }
+    }
+  }
+  return 0;
+}
+#endif
+
+// ================================================================
+// ================================================================
+//
+// Parse an Intel x87 80-bit extended format value
+//
+// ================================================================
+// ================================================================
+
+#if ENABLE_FLOAT80_SUPPORT
+static void _ffpp_strtoencf80_l(unsigned char *dest, const char *start, char **end, strtofp_locale_t loc) {
+  static const int bytes = 10;
+  static const int sigBits = 64;
+  static const int minBinaryExp = -16382;
+  static const int maxBinaryExp = 16384;
+  static const int minDecimalExp = -5000;
+  static const int maxDecimalExp = 5000;
+  static const int maxDecimalMidpointDigits = 11515;
+  struct parseInfo info = {
+    .bytes = bytes,
+    .sigBits = sigBits,
+    .minBinaryExp = minBinaryExp,
+    .maxBinaryExp = maxBinaryExp,
+    .minDecimalExp = minDecimalExp,
+    .maxDecimalExp = maxDecimalExp,
+    .maxDecimalMidpointDigits = maxDecimalMidpointDigits,
+    .dest = dest,
+    .start = start,
+    .end = end,
+    .loc = loc,
+  };
+
+  // ================================================================
+  // Parse the input (mostly)
+  // ================================================================
+  if (!fastParse64(&info)) {
+    return;
+  }
+
+#if ENABLE_FLOAT80_OPTIMIZATIONS && LONG_DOUBLE_IS_FLOAT80
+  // ================================================================
+  // Use a single float80 operation when we can
+  // ================================================================
+  const static long double longDoublePowerOf10[] = {1.0L, 10.0L, 100.0L,
+    1e3L, 1e4L, 1e5L, 1e6L, 1e7L, 1e8L, 1e9L, 1e10L, 1e11L, 1e12L,
+    1e13L, 1e14L, 1e15L, 1e16L, 1e17L, 1e18L, 1e19L, 1e20L, 1e21L,
+    1e22L, 1e23L, 1e24L, 1e25L, 1e26L, 1e27L};
+  if (info.base10Exponent > -28 && info.base10Exponent < 28 && info.digitCount <= 19) {
+    if (info.base10Exponent < 0) {
+      long double p = longDoublePowerOf10[-info.base10Exponent];
+      if (info.negative) p = -p;
+      long double result = (long double)info.digits / p;
+      memcpy(info.dest, &result, sizeof(result));
+      return;
+    } else {
+      long double p = longDoublePowerOf10[info.base10Exponent];
+      if (info.negative) p = -p;
+      long double result = (long double)info.digits * p;
+      memcpy(info.dest, &result, sizeof(result));
+      return;
+    }
+  }
+#endif
+
+  int roundingMode = FENV_ROUNDING_MODE();
+
+#if ENABLE_FLOAT80_OPTIMIZATIONS
+  if (highPrecisionIntervalPath(&info, roundingMode)) {
+    return;
+  }
+#endif
+
+  // ================================================================
+  // Slow Path (varint calculation)
+  // ================================================================
+  char _Alignas(sizeof(mp_word_t)) stackWorkArea[1536];
+  static const size_t stackWorkAreaWords = sizeof(stackWorkArea) / sizeof(mp_word_t);
+  generalSlowpath(&info, roundingMode, (mp_word_t *)stackWorkArea, stackWorkAreaWords, true);
+}
+#endif
+
+// ================================================================
+// ================================================================
+//
+// Parse an IEEE 754 Binary128
+//
+// ================================================================
+// ================================================================
+
+#if ENABLE_BINARY128_SUPPORT
+static void _ffpp_strtoencf128_l(unsigned char *dest, const char *start, char **end, strtofp_locale_t loc) {
+  static const int bytes = 16;
+  static const int sigBits = 113;
+  static const int minBinaryExp = -16382;
+  static const int maxBinaryExp = 16384;
+  static const int minDecimalExp = -5000;
+  static const int maxDecimalExp = 5000;
+  static const int maxDecimalMidpointDigits = 11564;
+  struct parseInfo info = {
+    .bytes = bytes,
+    .sigBits = sigBits,
+    .minBinaryExp = minBinaryExp,
+    .maxBinaryExp = maxBinaryExp,
+    .minDecimalExp = minDecimalExp,
+    .maxDecimalExp = maxDecimalExp,
+    .maxDecimalMidpointDigits = maxDecimalMidpointDigits,
+    .dest = dest,
+    .start = start,
+    .end = end,
+    .loc = loc,
+  };
+
+  // ================================================================
+  // Parse the input (mostly)
+  // ================================================================
+  if (!fastParse64(&info)) {
+    return;
+  }
+
+#if LONG_DOUBLE_IS_BINARY128 && ENABLE_BINARY128_OPTIMIZATIONS
+  // ================================================================
+  // Use a single binary128 operation when we can
+  // ================================================================
+  const static long double longDoublePowerOf10[] = {
+    1.0L, 10.0L, 100.0L, 1e3L, 1e4L, 1e5L, 1e6L, 1e7L, 1e8L, 1e9L,
+    1e10L, 1e11L, 1e12L, 1e13L, 1e14L, 1e15L, 1e16L, 1e17L, 1e18L, 1e19L,
+    1e20L, 1e21L, 1e22L, 1e23L, 1e24L, 1e25L, 1e26L, 1e27L, 1e28L, 1e29L,
+    1e30L, 1e31L, 1e32L, 1e33L, 1e34L, 1e35L, 1e36L, 1e37L, 1e38L, 1e39L,
+    1e40L, 1e41L, 1e42L, 1e43L, 1e44L, 1e45L, 1e46L, 1e47L, 1e48L};
+  if (info.base10Exponent > -49 && info.base10Exponent < 49 && info.digitCount <= 19) {
+    if (info.base10Exponent < 0) {
+      long double p = longDoublePowerOf10[-info.base10Exponent];
+      if (info.negative) p = -p;
+      long double result = (long double)info.digits / p;
+      memcpy(info.dest, &result, sizeof(result));
+      return;
+    } else {
+      long double p = longDoublePowerOf10[info.base10Exponent];
+      if (info.negative) p = -p;
+      long double result = (long double)info.digits * p;
+      memcpy(info.dest, &result, sizeof(result));
+      return;
+    }
+  }
+#endif
+
+  int roundingMode = FENV_ROUNDING_MODE();
+
+#if ENABLE_BINARY128_OPTIMIZATIONS
+  if (highPrecisionIntervalPath(&info, roundingMode)) {
+    return;
+  }
+#endif
+
+  // ================================================================
+  // Slow Path (varint calculation)
+  // ================================================================
+  char _Alignas(sizeof(mp_word_t)) stackWorkArea[1536];
+  static const size_t stackWorkAreaWords = sizeof(stackWorkArea) / sizeof(mp_word_t);
+  generalSlowpath(&info, roundingMode, (mp_word_t *)stackWorkArea, stackWorkAreaWords, true);
+}
+#endif
+
+// ================================================================
+// ================================================================
+//
+// Public APIs
+//
+// The public functions exported from this file are all defined
+// in terms of the private `_ffpp_strtoencf**_l` functions defined
+// above.
+//
+// ================================================================
+// ================================================================
+
+// ================================================================
+// Wrappers for Binary16
+#if ENABLE_BINARY16_SUPPORT
+// TS 18661-3 `strtoencf16` API that can be supported on
+// every platform regardless of local FP
+void strtoencf16(unsigned char * restrict encptr,
+                      const char * restrict nptr,
+                      char ** restrict endptr) {
+  _ffpp_strtoencf16_l(encptr, nptr, endptr, strtofp_current_locale());
+}
+#endif
+
+// ================================================================
+// Wrappers for Binary32
+
+#if ENABLE_BINARY32_SUPPORT
+// TS 18661-3 `strtoencf32` API that can be supported on
+// every platform regardless of local FP
+void strtoencf32(unsigned char * restrict encptr,
+                      const char * restrict nptr,
+                      char ** restrict endptr) {
+  _ffpp_strtoencf32_l(encptr, nptr, endptr, strtofp_current_locale());
+}
+#endif
+
+#if ENABLE_BINARY32_SUPPORT && FLOAT_IS_BINARY32
+// ISO C17 `strtof` API
+float strtof(const char * restrict nptr,
+                   char ** restrict endptr) {
+  union { float d; unsigned char raw[4]; } result;
+  _ffpp_strtoencf32_l(result.raw, nptr, endptr, strtofp_current_locale());
+  return result.d;
+}
+#endif
+
+#if ENABLE_BINARY32_SUPPORT && FLOAT_IS_BINARY32 && ENABLE_LOCALE_SUPPORT
+// ISO C17 `strtof_l` API
+float strtof_l(const char * restrict nptr,
+                     char ** restrict endptr,
+                     strtofp_locale_t loc) {
+  union { float d; unsigned char raw[4]; } result;
+  _ffpp_strtoencf32_l(result.raw, nptr, endptr, loc);
+  return result.d;
+}
+#endif
+
+// ================================================================
+// Wrappers for Binary64
+
+#if ENABLE_BINARY64_SUPPORT
+// TS 18661-3 `strtoencf64` API that can be supported on
+// every platform regardless of local FP
+void strtoencf64(unsigned char * restrict encptr,
+                      const char * restrict nptr,
+                      char ** restrict endptr) {
+  _ffpp_strtoencf64_l(encptr, nptr, endptr, strtofp_current_locale());
+}
+#endif
+
+#if ENABLE_BINARY64_SUPPORT && LONG_DOUBLE_IS_BINARY64
+// TS 18661-3 `strtoencf64x` API
+// If `long double` is binary64, we assume Float64x is binary64
+void strtoencf64x(unsigned char * restrict encptr,
+                       const char * restrict nptr,
+                       char ** restrict endptr) {
+  _ffpp_strtoencf64_l(encptr, nptr, endptr, strtofp_current_locale());
+}
+#endif
+
+#if ENABLE_BINARY64_SUPPORT && DOUBLE_IS_BINARY64
+// ISO C17 `strtod` API
+double strtod(const char * restrict nptr,
+                   char ** restrict endptr) {
+  union { double d; unsigned char raw[8]; } result;
+  _ffpp_strtoencf64_l(result.raw, nptr, endptr, strtofp_current_locale());
+  return result.d;
+}
+#endif
+
+#if ENABLE_BINARY64_SUPPORT && DOUBLE_IS_BINARY64 && ENABLE_LOCALE_SUPPORT
+// ISO C17 `strtod_l` API
+double strtod_l(const char * restrict nptr,
+                     char ** restrict endptr,
+                     strtofp_locale_t loc) {
+  union { double d; unsigned char raw[8]; } result;
+  _ffpp_strtoencf64_l(result.raw, nptr, endptr, loc);
+  return result.d;
+}
+#endif
+
+#if ENABLE_BINARY64_SUPPORT && LONG_DOUBLE_IS_BINARY64
+// ISO C17 `strtold` API
+long double strtold(const char * restrict nptr,
+                         char ** restrict endptr) {
+  union { long double d; unsigned char raw[8]; } result;
+  _ffpp_strtoencf64_l(result.raw, nptr, endptr, strtofp_current_locale());
+  return result.d;
+}
+#endif
+
+#if ENABLE_BINARY64_SUPPORT && LONG_DOUBLE_IS_BINARY64 && ENABLE_LOCALE_SUPPORT
+// ISO C17 `strtold_l` API
+long double strtold_l(const char * restrict nptr,
+                           char ** restrict endptr,
+                           strtofp_locale_t loc) {
+  union { long double d; unsigned char raw[8]; } result;
+  _ffpp_strtoencf64_l(result.raw, nptr, endptr, loc);
+  return result.d;
+}
+#endif
+
+// ================================================================
+// Wrappers for Float80
+
+#if ENABLE_FLOAT80_SUPPORT && ENABLE_LOCALE_SUPPORT
+// Non-standard but helpful for testing.
+void strtoencf80_l(unsigned char * restrict encptr,
+                        const char * restrict nptr,
+                        char ** restrict endptr,
+                        strtofp_locale_t loc) {
+  _ffpp_strtoencf80_l(encptr, nptr, endptr, loc);
+}
+#endif
+
+#if ENABLE_FLOAT80_SUPPORT && LONG_DOUBLE_IS_FLOAT80
+// TS 18661-3 `strtoencf64x` API
+// If `long double` is float80, assume `Float64x` is float80
+void strtoencf64x(unsigned char * restrict encptr,
+                       const char * restrict nptr,
+                       char ** restrict endptr) {
+  _ffpp_strtoencf80_l(encptr, nptr, endptr, strtofp_current_locale());
+}
+#endif
+
+#if ENABLE_FLOAT80_SUPPORT && LONG_DOUBLE_IS_FLOAT80
+// ISO C17 `strtold` API
+long double strtold(const char * restrict nptr,
+                         char ** restrict endptr) {
+  union { long double d; unsigned char raw[10]; } result;
+  _ffpp_strtoencf80_l(result.raw, nptr, endptr, strtofp_current_locale());
+  return result.d;
+}
+#endif
+
+#if ENABLE_FLOAT80_SUPPORT && LONG_DOUBLE_IS_FLOAT80 && ENABLE_LOCALE_SUPPORT
+// ISO C17 `strtold` API
+long double strtold_l(const char * restrict nptr,
+                           char ** restrict endptr,
+                           strtofp_locale_t loc) {
+  union { long double d; unsigned char raw[10]; } result;
+  _ffpp_strtoencf80_l(result.raw, nptr, endptr, loc);
+  return result.d;
+}
+#endif
+
+// ================================================================
+// Wrappers for Binary128
+
+#if ENABLE_BINARY128_SUPPORT
+// TS 18661-3 `strtoencf128` API that can be supported on
+// every platform regardless of local FP
+void strtoencf128(unsigned char * restrict encptr,
+                       const char * restrict nptr,
+                       char ** restrict endptr) {
+  _ffpp_strtoencf128_l(encptr, nptr, endptr, strtofp_current_locale());
+}
+#endif
+
+#if ENABLE_BINARY128_SUPPORT && LONG_DOUBLE_IS_BINARY128
+// ISO C17 `strtold` API
+long double strtold(const char * restrict nptr,
+                         char ** restrict endptr) {
+  union { long double d; unsigned char raw[16]; } result;
+  _ffpp_strtoencf128_l(result.raw, nptr, endptr, strtofp_current_locale());
+  return result.d;
+}
+#endif
+
+#if ENABLE_BINARY128_SUPPORT && LONG_DOUBLE_IS_BINARY128 && ENABLE_LOCALE_SUPPORT
+// ISO C17 `strtold_l` API
+long double strtold_l(const char * restrict nptr,
+                           char ** restrict endptr,
+                           strtofp_locale_t loc) {
+  union { long double d; unsigned char raw[16]; } result;
+  _ffpp_strtoencf128_l(result.raw, nptr, endptr, loc);
+  return result.d;
+}
+#endif