An optimal algorithm for bounded random integers #39143

Draft · wants to merge 1 commit into base: main
137 changes: 125 additions & 12 deletions stdlib/public/core/Random.swift
@@ -104,19 +104,132 @@ extension RandomNumberGenerator {
upperBound: T
) -> T {
_precondition(upperBound != 0, "upperBound cannot be zero.")
// We use Lemire's "nearly divisionless" method for generating random
// integers in an interval. For a detailed explanation, see:
// https://arxiv.org/abs/1805.10941
var random: T = next()
var m = random.multipliedFullWidth(by: upperBound)
if m.low < upperBound {
let t = (0 &- upperBound) % upperBound
while m.low < t {
random = next()
m = random.multipliedFullWidth(by: upperBound)
}
// Everyone knows that generating an unbiased random integer in a range
// 0 ..< upperBound, where upperBound is not a power of two, requires
// rejection sampling. What if I told you that Big Random Number has
// lied to us for decades, and we have been played for absolute fools?
//
// Previously Swift used Lemire's "nearly divisionless" method
// (https://arxiv.org/abs/1805.10941) for this operation. We instead
// now use a novel method that:
//
// - never divides
// - avoids rejection sampling entirely
// - achieves a theoretically optimal bound on the amount of randomness
// consumed to generate a sample
// - delivers actual performance improvements for most real cases
//
// Lemire interprets each word from the random source as a fixed-point
// number in [0, 1), multiplies by upperBound, and takes the floor. Up
// to this point, this is the algorithm suggested by Knuth in TAoCP vol 2,
// and as observed by Knuth, it is slightly biased. Lemire cleverly
// corrects this bias via rejection sampling, which requires one division
// in the general case (hence, "nearly divisionless").
//
// Our new algorithm takes a different approach. Rather than using
// rejection sampling, we observe that the bias decreases exponentially
// in the number of bits used for the computation. In the limit we are
// interpreting the bitstream from the random source as a uniform real
// number r in [0, 1) and ⌊r * upperBound⌋ provides an unbiased sample
// in 0 ..< upperBound. The only challenge, then, is to know when we
// have computed enough bits of the product to know what the result is.
//
// Observe that we can split the random stream at any bit position i,
// yielding r = r₀ + r₁ with r₀ a fixed-point number in [0,1) and
// 0 ≤ r₁ < 2⁻ⁱ. Further observe that:
//
// result = ⌊r * upperBound⌋
// = ⌊r₀ * upperBound⌋ + ⌊frac(r₀*upperBound) + r₁*upperBound⌋
//
// The first term of this expression is Knuth's biased sample, which is
// computed with just a full-width multiply.
//
// If i > log₂(upperBound), both summands in the second term are smaller
// than 1, so the second term is either 0 or 1. Applying the bound on r₁,
// we see that if frac(r₀ * upperBound) <= 1 - upperBound * 2⁻ⁱ, the
// second term is necessarily zero, and the first term is the unbiased
// result. Happily, this is _also_ a trivial computation on the low-order
// part of the full-width multiply.
//
// If the test fails, we do not reject the sample, throwing away the bits
// we have already consumed from the random source; instead we increase i
// by a convenient amount, computing more bits of the product. This is the
// critical improvement; while Lemire's method rejects with probability 1/2
// per word consumed in the worst case, we terminate with probability 1/2
// per _bit_ consumed. This reduces the worst-case
// expected number of random bits required from O(log₂(upperBound)) to
// log₂(upperBound) + O(1), which is optimal[1].
//
// Of more practical interest, this new algorithm opens an intriguing
// possibility: we can compute just 64 extra bits, and have a probability
// of 1 - 2⁻⁶⁴ of terminating. This is so close to certainty that we can
// simply stop without introducing any measurable bias (detecting any
// difference would require about 2¹²⁸ samples, which is prohibitive).
// This is a significant performance improvement for slow random
// generators, since it asymptotically reduces the number of bits
// required by a factor of two for bignums, while matching or reducing
// the expected number of bits required for smaller numbers. This is the
// algorithm implemented below (the formally-uniform method is not
// much more complex to implement and is only a little bit slower, but
// there's no reason to prefer it).
//
// More intriguing still, this algorithm can be made unconditional by
// removing the early out, so that every value computed requires word
// size + 64 bits from the stream, which breaks the loop-carried
// dependency for fast generators, unlocking vectorization and
// parallelization where it was previously impossible. This is an
// especially powerful advantage when paired with bitstream generators
// that allow skip-ahead such as newer counter-based generators used
// in simulations and ML.
//
// Note that it is _possible_ to employ Lemire's tighter early-out
// check that involves a division with this algorithm as well; this
// is beneficial in some cases when upperBound is a constant and the
// generator is slow, but we do not think it necessary with the new
// algorithm and other planned improvements.
//
// [1] We can actually achieve log₂(upperBound) + ε for any ε > 0 by
// generating multiple random samples at once, but that is only of
// theoretical interest--it is still interesting, however, since I
// don't think anyone has described how to attain it previously.
if T.bitWidth < 64 {
// Ensure T is at least 64 bits wide, as this simplifies things
// somewhat later on, and doesn't cost us anything with the existing
// RNG protocol (we might investigate buffered RNGs that let you
// draw less than 64 bits at a time at some future point).
return T(truncatingIfNeeded: next(upperBound: UInt64(upperBound)))
}
return m.high
let (result, fraction) = upperBound.multipliedFullWidth(by: next())
// Optional early out: this is a performance win when the generator is
// very slow (as with an unbuffered SystemRandomNumberGenerator), but
// the unpredictable branch and the variable consumption from the bitstream
// are performance hazards for fast generators. We'll keep it for now,
// but future changes to allow buffering slow generators should lead
// to its removal.
if T(fraction) <= 0 &- upperBound { return result }
// Compute the product with 64 additional bits of randomness, add that
// to the fraction, and adjust result upwards if that sum carries out.
let (_, carry) = T(fraction).addingReportingOverflow(
multiplyHigh(upperBound, next())
)
return result + (carry ? 1 : 0)
}
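
To make the shape of the new method concrete outside the generic stdlib code, here is a small self-contained sketch specialized to UInt64. It is an illustration only, not part of this diff: the name boundedRandom and the parameter rng are invented for the example, but the steps mirror the comment above: one full-width multiply yields Knuth's biased sample, and only when the low half of that product exceeds 2⁶⁴ − upperBound can 64 further bits of randomness carry into the high half.

// Illustrative sketch; `boundedRandom` is not part of this PR.
func boundedRandom<G: RandomNumberGenerator>(
  upperBound: UInt64,
  using rng: inout G
) -> UInt64 {
  precondition(upperBound != 0, "upperBound cannot be zero.")
  // Knuth's biased sample: the high word of w * upperBound, where the
  // 64-bit word w is read as a fixed-point value in [0, 1).
  let (result, fraction) = upperBound.multipliedFullWidth(by: rng.next())
  // Early out: if the low word can never carry, `result` is already
  // unbiased. For intuition with a 4-bit prefix: upperBound = 6 and a
  // prefix of 0.1011₂ give 11/16 * 6 = 4.125; the fraction 0.125 cannot
  // reach 1 no matter what the remaining bits add (less than 6/16), so
  // 4 is already the unbiased answer.
  if fraction <= 0 &- upperBound { return result }
  // Otherwise read 64 more bits, form the next partial product, and bump
  // `result` if adding it to `fraction` overflows the low word.
  let extra = upperBound.multipliedFullWidth(by: rng.next()).high
  let overflow = fraction.addingReportingOverflow(extra).overflow
  return result + (overflow ? 1 : 0)
}

Used with the system generator, it is called the same way as the entry point modified above:

var g = SystemRandomNumberGenerator()
let roll = boundedRandom(upperBound: 6, using: &g)  // uniform in 0 ..< 6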

/// Computes a*b >> 64.
///
/// Requires T.bitWidth >= 64.
@_transparent @usableFromInline
internal func multiplyHigh<T:FixedWidthInteger & UnsignedInteger>(
@xwu (Collaborator) commented on Sep 2, 2021:

Could delete the ABI footprint of this by making it @_alwaysEmitIntoClient, I think. (Would also underscore the name.)

_ a: T, _ b: UInt64
) -> T {
// This should eventually be made more efficient for user-defined
// bignum types, via an explicit nx1 word product operation, but
// is great for all standard library types, and still generally
// better than the old implementation for bignums.
let (phi, plo) = a.multipliedFullWidth(by: T(b))
// Return the low 64 bits of phi (the bits above are all zero)
// grafted on to the high bitWidth - 64 bits of plo.
return phi &<< (T.bitWidth - 64) | T(plo >> 64)
}
}
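
As a quick check of the contract documented for the multiplyHigh helper above, (a * b) >> 64, here is a small added example that is not part of the diff. mulHigh is a free-standing copy of the same grafting; for a 64-bit T it must agree with the high word of the full-width product (for wider types, the plo >> 64 term supplies the low bits of the result).

// Added example: a free-standing copy of the grafting used by multiplyHigh.
func mulHigh<T: FixedWidthInteger & UnsignedInteger>(_ a: T, _ b: UInt64) -> T {
  let (phi, plo) = a.multipliedFullWidth(by: T(b))
  // The low 64 bits of phi become the high bits of the result; the high
  // bitWidth - 64 bits of plo become the low bits.
  return phi &<< (T.bitWidth - 64) | T(plo >> 64)
}

let a: UInt64 = 0x1234_5678_9ABC_DEF0
let b: UInt64 = 0xFEDC_BA98_7654_3210
// For T == UInt64, (a * b) >> 64 is exactly the high word of the product.
assert(mulHigh(a, b) == a.multipliedFullWidth(by: b).high)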
