~ chicken-core (chicken-5) 1d6b133c488867b640460ef72855e8b8fe580e6b
commit 1d6b133c488867b640460ef72855e8b8fe580e6b Author: Peter Bex <peter@more-magic.net> AuthorDate: Sun Feb 1 13:31:32 2015 +0100 Commit: Peter Bex <peter@more-magic.net> CommitDate: Sun May 31 14:15:51 2015 +0200 Restore optimized algorithms for multiplication, division and number->string conversion. diff --git a/chicken.h b/chicken.h index 65878c84..86407329 100644 --- a/chicken.h +++ b/chicken.h @@ -418,6 +418,29 @@ static inline int isinf_ld (long double x) # define C_HALF_WORD_SIZE 16 #endif +/* Tunable performance-related constants */ +#ifndef C_KARATSUBA_THRESHOLD +/* This defines when we'll switch from schoolbook to Karatsuba + * multiplication. The smallest of the two numbers determines the + * switch. It is pretty high right now because it generates a bit + * more garbage and GC overhead dominates the algorithmic performance + * gains. If the GC is improved, this can be readjusted. + */ +# define C_KARATSUBA_THRESHOLD 70 +#endif +#ifndef C_BURNIKEL_ZIEGLER_THRESHOLD +/* This defines when to switch from schoolbook to Burnikel-Ziegler + * division. It creates even more garbage than Karatsuba :( + */ +# define C_BURNIKEL_ZIEGLER_THRESHOLD 300 +#endif +#ifndef C_RECURSIVE_TO_STRING_THRESHOLD +/* This threshold is in terms of the expected string length. It + * depends on division speed: if you change the above, change this too. + */ +# define C_RECURSIVE_TO_STRING_THRESHOLD 750 +#endif + /* These might fit better in runtime.c? */ #define C_fitsinbignumhalfdigitp(n) (C_BIGNUM_DIGIT_HI_HALF(n) == 0) #define C_BIGNUM_DIGIT_LENGTH C_WORD_SIZE @@ -1911,6 +1934,7 @@ C_fctexport void C_ccall C_basic_divrem(C_word c, C_word self, C_word k, C_word C_fctexport void C_ccall C_u_integer_divrem(C_word c, C_word self, C_word k, C_word x, C_word y) C_noret; C_fctexport void C_ccall C_u_flo_to_int(C_word c, C_word self, C_word k, C_word x) C_noret; C_fctexport void C_ccall C_u_integer_shift(C_word c, C_word self, C_word k, C_word x, C_word y) C_noret; +C_fctexport void C_ccall C_u_bignum_extract_digits(C_word c, C_word self, C_word k, C_word x, C_word start, C_word end) C_noret; C_fctexport void C_ccall C_u_2_integer_bitwise_and(C_word c, C_word self, C_word k, C_word x, C_word y) C_noret; C_fctexport void C_ccall C_u_2_integer_bitwise_ior(C_word c, C_word self, C_word k, C_word x, C_word y) C_noret; C_fctexport void C_ccall C_u_2_integer_bitwise_xor(C_word c, C_word self, C_word k, C_word x, C_word y) C_noret; diff --git a/library.scm b/library.scm index 16d58c3d..72f9ce57 100644 --- a/library.scm +++ b/library.scm @@ -38,7 +38,7 @@ maximal-string-length find-ratio-between find-ratio make-complex flonum->ratnum ratnum rat+/- minimum-denorm-flonum-expt +maximum-allowed-exponent+ mantexp->dbl ldexp round-quotient - ##sys#string->compnum) + ##sys#string->compnum ##sys#bignum-extract-digits ##sys#internal-gcd) (not inline ##sys#user-read-hook ##sys#error-hook ##sys#signal-hook ##sys#schedule ##sys#default-read-info-hook ##sys#infix-list-hook ##sys#sharp-number-hook ##sys#user-print-hook ##sys#user-interrupt-hook ##sys#step-hook) @@ -746,6 +746,10 @@ EOF ;;; Numeric routines: +;; Abbreviations of paper and book titles used in comments are: +;; [Knuth] Donald E. Knuth, "The Art of Computer Programming", Volume 2 +;; [MpNT] Tiplea at al., "MpNT: A Multi-Precision Number Theory Package" +;; [MCA] Richard P. Brent & Paul Zimmermann, "Modern Computer Arithmetic" (define most-positive-fixnum (foreign-value "C_MOST_POSITIVE_FIXNUM" int)) (define most-negative-fixnum (foreign-value "C_MOST_NEGATIVE_FIXNUM" int)) @@ -1308,6 +1312,48 @@ EOF (loop (##sys#slot args 1) (##sys#*-2 x (##sys#slot args 0))) ) ) ) ) ) ) +(define-inline (%bignum-digit-count b) (##core#inline "C_u_i_bignum_size" b)) +(define ##sys#bignum-extract-digits + (##core#primitive "C_u_bignum_extract_digits")) + +;; Karatsuba multiplication: invoked from C when the two numbers are +;; large enough to make it worthwhile. Complexity is O(n^log2(3)), +;; where n is max(len(x), len(y)). The description in [Knuth, 4.3.3] +;; leaves a lot to be desired. [MCA, 1.3.2] and [MpNT, 3.2] are a bit +;; easier to understand. We assume that length(x) <= length(y). +(define (##sys#bignum-times-karatsuba x y) + (let* ((same? (eqv? x y)) ; Check before calling (abs) + (rs (fx* (##core#inline "C_u_i_integer_signum" x) + (##core#inline "C_u_i_integer_signum" y))) + (x (##sys#integer-abs x)) + (n (%bignum-digit-count y)) + (n/2 (fxshr n 1)) + (bits (fx* n/2 (foreign-value "C_BIGNUM_DIGIT_LENGTH" int))) + (x-hi (##sys#bignum-extract-digits x n/2 #f)) + (x-lo (##sys#bignum-extract-digits x 0 n/2))) + (if same? ; This looks pointless, but reduces garbage + (let* ((a (##sys#*-2 x-hi x-hi)) + (b (##sys#*-2 x-lo x-lo)) + (ab (##sys#--2 x-hi x-lo)) + (c (##sys#*-2 ab ab))) + (##sys#+-2 (##sys#integer-shift a (fxshl bits 1)) + (##sys#+-2 (##sys#integer-shift + (##sys#+-2 b (##sys#--2 a c)) + bits) + b))) + (let* ((y (##sys#integer-abs y)) + (y-hi (##sys#bignum-extract-digits y n/2 #f)) + (y-lo (##sys#bignum-extract-digits y 0 n/2)) + (a (##sys#*-2 x-hi y-hi)) + (b (##sys#*-2 x-lo y-lo)) + (c (##sys#*-2 (##sys#--2 x-hi x-lo) + (##sys#--2 y-hi y-lo)))) + (##sys#*-2 rs (##sys#+-2 (##sys#integer-shift a (fxshl bits 1)) + (##sys#+-2 (##sys#integer-shift + (##sys#+-2 b (##sys#--2 a c)) + bits) + b))))))) + (define (##sys#extended-times x y) (define (nonrat*rat x y) ;; a/b * c/d = a*c / b*d [with b = 1] @@ -1404,6 +1450,102 @@ EOF ((not (number? x)) (##sys#error-bad-number x '/)) (else (##sys#error-bad-number y '/))) ) +;; Burnikel-Ziegler recursive division: Split high number (x) in three +;; or four parts and divide by the lowest number (y), split in two +;; parts. There are descriptions in [MpNT, 4.2], [MCA, 1.4.3] and the +;; paper "Fast Recursive Division" by Christoph Burnikel & Joachim +;; Ziegler is freely available. There is also a description in Karl +;; Hasselstrom's thesis "Fast Division of Integers". +;; +;; The complexity of this is supposedly O(r*s^{log(3)-1} + r*log(s)), +;; where s is the length of a, and r is the length of b (in digits). +;; +;; TODO: See if it's worthwhile to implement "division without remainder" +;; from the Burnikel-Ziegler paper. +(define (##sys#bignum-divrem-burnikel-ziegler x y return-quot? return-rem?) + (define-inline (digit-bits n) + (fx* (foreign-value "C_BIGNUM_DIGIT_LENGTH" int) n)) + (define DIV-LIMIT (foreign-value "C_BURNIKEL_ZIEGLER_THRESHOLD" int)) + + ;; Here and in 2n/1n we pass both b and [b1, b2] to avoid splitting + ;; up the number more than once. + (define (burnikel-ziegler-3n/2n a12 a3 b b1 b2 n) + (receive (q^ r1) + (if (##sys#<-2 (##sys#integer-shift a12 (fxneg (digit-bits n))) b1) + (let* ((n/2 (fxshr n 1)) + (b11 (##sys#bignum-extract-digits b1 n/2 #f)) + (b12 (##sys#bignum-extract-digits b1 0 n/2))) + (burnikel-ziegler-2n/1n a12 b1 b11 b12 n)) + (let ((base*n (digit-bits n))) + (values (##sys#--2 (##sys#integer-shift 1 base*n) 1) ; B^n-1 + (##sys#+-2 (##sys#--2 a12 (##sys#integer-shift b1 base*n)) + b1)))) + (let ((r1a3 (##sys#+-2 (##sys#integer-shift r1 (digit-bits n)) a3))) + (let lp ((r^ (##sys#--2 r1a3 (##sys#*-2 q^ b2))) + (q^ q^)) + (if (negative? r^) + (lp (##sys#+-2 r^ b) (##sys#--2 q^ 1)) + (values q^ r^)))))) + + (define (burnikel-ziegler-2n/1n a b b1 b2 n) + (if (or (fxodd? n) (fx< n DIV-LIMIT)) + (##sys#integer-quotient&remainder a b) + (let* ((n/2 (fxshr n 1)) + ;; Split a and b into n-sized parts a1,..,a4 and b1,b2 + (a12 (##sys#bignum-extract-digits a n #f)) + (a3 (##sys#bignum-extract-digits a n/2 n)) + (a4 (##sys#bignum-extract-digits a 0 n/2))) + (receive (q1 r1) (burnikel-ziegler-3n/2n a12 a3 b b1 b2 n/2) + (receive (q2 r) (burnikel-ziegler-3n/2n r1 a4 b b1 b2 n/2) + (values (##sys#+-2 (##sys#integer-shift q1 (digit-bits n/2)) + q2) + r)))))) + + ;; The caller will ensure that abs(x) > abs(y) + (let* ((q-neg? (if (negative? y) (not (negative? x)) (negative? x))) + (r-neg? (negative? x)) + (x (abs x)) + (y (abs y)) + (s (%bignum-digit-count y)) + ;; Define m as min{2^k|(2^k)*BURNIKEL_ZIEGLER_THRESHOLD > s} + ;; This ensures we shift as little as possible (less pressure + ;; on the GC) while maintaining a power of two until we drop + ;; below the threshold, so we can always split N in half. + (m (fxshl 1 (integer-length (fx/ s DIV-LIMIT)))) + (j (fx/ (fx+ s (fx- m 1)) m)) ; j = s/m, rounded up + (n (fx* j m)) + (norm-shift (fx- (digit-bits n) (integer-length y))) + (x (##sys#integer-shift x norm-shift)) + (y (##sys#integer-shift y norm-shift)) + ;; l needs to be the smallest value so that a < base^{l*n}/2 + (l (fx/ (fx+ (%bignum-digit-count x) (fx- n 1)) n)) + (l (if (fx= (digit-bits l) (integer-length x)) (fx+ l 1) l)) + (t (fxmax l 2)) + (y-hi (##sys#bignum-extract-digits y (fxshr n 1) #f)) + (y-lo (##sys#bignum-extract-digits y 0 (fxshr n 1)))) + (let lp ((zi (##sys#integer-shift x (fxneg (digit-bits (fx* (fx- t 2) n))))) + (i (fx- t 2)) + (quot 0)) + (receive (qi ri) (burnikel-ziegler-2n/1n zi y y-hi y-lo n) + (let ((quot (##sys#+-2 (##sys#integer-shift quot (digit-bits n)) qi))) + (if (fx> i 0) + (let ((zi-1 (let* ((base*n*i-1 (fx* n (fx- i 1))) + (base*n*i (fx* n i)) + (xi-1 (##sys#bignum-extract-digits + x base*n*i-1 base*n*i))) + (##sys#+-2 (##sys#integer-shift ri (digit-bits n)) + xi-1)))) + (lp zi-1 (fx- i 1) quot)) + (let ((rem (if (or (not return-rem?) (eq? 0 norm-shift)) + ri + (##sys#integer-shift ri (fxneg norm-shift))))) + ;; Return requested values (quot, rem or both) with correct sign: + (cond ((and return-quot? return-rem?) + (values (if q-neg? (- quot) quot) + (if r-neg? (- rem) rem))) + (return-quot? (if q-neg? (- quot) quot)) + (else (if r-neg? (- rem) rem)))))))))) + (define (floor x) (cond ((exact-integer? x) x) ((##core#inline "C_i_flonump" x) (fpfloor x)) @@ -1770,14 +1912,56 @@ EOF (##sys#integer-power a b)))) ) (define (##sys#integer-gcd a b) - ;; Currently this is only Euclidean GCD algorithm. TODO: Restore - ;; Lehmer's algorithm when everything else has been implemented. + (define k fixnum-precision) ; Can be anything between 2 and min(F, B). + (define k/2 (fx/ k 2)) ; F is fixnum precision and B bits in a big digit + + ;; This is Lehmer's GCD algorithm with Jebelean's quotient test, as + ;; it is presented in the paper "An Analysis of Lehmer’s Euclidean + ;; GCD Algorithm", by J. Sorenson. Fuck the ACM and their goddamn + ;; paywall; you can currently find the paper here: + ;; http://www.csie.nuk.edu.tw/~cychen/gcd/An%20analysis%20of%20Lehmer%27s%20Euclidean%20GCD%20algorithm.pdf + ;; If that URI fails, it's also explained in [MpNT, 5.2] + ;; + ;; The basic idea is to avoid divisions which yield only small + ;; quotients, in which the remainder won't reduce the numbers by + ;; much. This can be detected by dividing only the leading k bits. + (define (lehmer-gcd u v) + (let ((-h (fxneg (fx- (integer-length u) k)))) + (let lp ((i-even? #t) + (u^ (arithmetic-shift u -h)) + (v^ (arithmetic-shift v -h)) + (x-prev 1) (y-prev 0) + (x-curr 0) (y-curr 1)) + (let* ((q^ (fx/ u^ v^)) ; Estimated quotient for this step + (x-next (fx- x-prev (fx* q^ x-curr))) + (y-next (fx- y-prev (fx* q^ y-curr)))) + ;; Euclidian GCD swap on u^ and v^ + (let ((u^ v^) + (v^ (fx- u^ (fx* q^ v^)))) + (let ((done? (if i-even? + (or (fx< v^ (fxneg y-next)) + (fx< (fx- u^ v^) (fx- x-next x-curr))) + (or (fx< v^ (fxneg x-next)) + (fx< (fx- u^ v^) (fx- y-next y-curr)))))) + (if done? + (values (+ (* x-prev u) (* y-prev v)) + (+ (* x-curr u) (* y-curr v))) + (lp (not i-even?) u^ v^ x-curr y-curr x-next y-next)))))))) + + ;; This implements the basic Euclidian GCD algorithm, with a + ;; conditional call to Lehmer's GCD algorithm when the length + ;; difference between a and b is at most one halfdigit. + ;; The complexity of the whole thing is supposedly O(n^2/log n) + ;; where n is the number of bits in a and b. (let* ((a (abs a)) (b (abs b)) ; Enforce loop invariant on input: (swap? (##sys#<-2 a b))) ; both must be positive, and a >= b (let lp ((a (if swap? b a)) (b (if swap? a b))) (cond ((eq? b 0) a) - ((fixnum? a) (fxgcd a b)) ; b MUST be fixnum due to loop invariant + ((fx< (fx- (integer-length a) (integer-length b)) k/2) + (receive (a b) (lehmer-gcd a b) + (if (eq? b 0) a (lp b (##sys#integer-remainder a b))))) + ((fixnum? a) (fxgcd a b)) ; b MUST be fixnum due to loop invariant (else (lp b (##sys#integer-remainder a b))))))) ;; Useful for sane error messages @@ -1825,6 +2009,19 @@ EOF (##sys#internal-gcd 'lcm head n2)) (##sys#slot next 1)) ) ) ) ) ) +;; This simple enough idea is from +;; http://www.numberworld.org/y-cruncher/algorithms/radix-conversion.html +(define (##sys#integer->string/recursive n base expected-string-size) + (let*-values (((halfsize) (fxshr (fx+ expected-string-size 1) 1)) + ((b^M/2) (##sys#integer-power base halfsize)) + ((hi lo) (quotient&remainder n b^M/2)) + ((strhi) (number->string hi base)) + ((strlo) (number->string (abs lo) base))) + (string-append strhi + ;; Fix up any leading zeroes that were stripped from strlo + (make-string (fx- halfsize (string-length strlo)) #\0) + strlo))) + (define ##sys#extended-number->string (let ((string-append string-append)) (lambda (n base) diff --git a/runtime.c b/runtime.c index 7dfdc322..41c4091e 100644 --- a/runtime.c +++ b/runtime.c @@ -502,6 +502,7 @@ static C_ccall void values_continuation(C_word c, C_word closure, C_word dummy, static C_word add_symbol(C_word **ptr, C_word key, C_word string, C_SYMBOL_TABLE *stable); static C_regparm int C_fcall C_in_new_heapp(C_word x); static void bignum_negate_2(C_word c, C_word self, C_word new_big); +static void bignum_actual_extraction(C_word c, C_word self, C_word result) C_noret; static void bignum_bitwise_and_2(C_word c, C_word self, C_word result) C_noret; static void bignum_bitwise_ior_2(C_word c, C_word self, C_word result) C_noret; static void bignum_bitwise_xor_2(C_word c, C_word self, C_word result) C_noret; @@ -5641,6 +5642,57 @@ C_regparm C_word C_fcall C_i_integer_length(C_word x) } } +/* This is currently only used by Karatsuba multiplication and + * Burnikel-Ziegler division. It is not intended as a public API! + */ +void C_ccall +C_u_bignum_extract_digits(C_word c, C_word self, C_word k, C_word x, C_word start, C_word end) +{ + if (x & C_FIXNUM_BIT) { /* Needed? */ + if (C_unfix(start) == 0 && (end == C_SCHEME_FALSE || C_unfix(end) > 0)) + C_kontinue(k, x); + else + C_kontinue(k, C_fix(0)); + } else { + C_word negp, size; + + negp = C_mk_bool(C_bignum_negativep(x)); /* Always false */ + + start = C_unfix(start); + /* We might get passed larger values than actually fits; pad w/ zeroes */ + if (end == C_SCHEME_FALSE) end = C_bignum_size(x); + else end = nmin(C_unfix(end), C_bignum_size(x)); + assert(start >= 0); + + size = end - start; + + if (size == 0 || start >= C_bignum_size(x)) { + C_kontinue(k, C_fix(0)); + } else { + C_word k2, kab[C_SIZEOF_CLOSURE(5)], *ka = kab; + k2 = C_closure(&ka, 5, (C_word)bignum_actual_extraction, + k, x, C_fix(start), C_fix(end)); + C_allocate_bignum(5, (C_word)NULL, k2, C_fix(size), negp, C_SCHEME_FALSE); + } + } +} + +static void bignum_actual_extraction(C_word c, C_word self, C_word result) +{ + C_word k = C_block_item(self, 1), + x = C_block_item(self, 2), + start = C_unfix(C_block_item(self, 3)), + end = C_unfix(C_block_item(self, 4)); + C_uword *result_digits = C_bignum_digits(result), + *x_digits = C_bignum_digits(x); + + /* Can't use bignum_digits_destructive_copy because that assumes + * target is at least as big as source. + */ + C_memcpy(result_digits, x_digits + start, C_wordstobytes(end-start)); + C_kontinue(k, C_bignum_simplify(result)); +} + /* This returns a tmp bignum negated copy of X (must be freed!) when * the number is negative, or #f if it doesn't need to be negated. * The size can be larger or smaller than X (it may be 1-padded). @@ -7111,15 +7163,14 @@ bignum_times_bignum_unsigned(C_word k, C_word x, C_word y, C_word negp) y = z; } - /* TODO: Activate later */ - /* if (C_bignum_size(x) < C_KARATSUBA_THRESHOLD) { /\* Gradebook *\/ */ + if (C_bignum_size(x) < C_KARATSUBA_THRESHOLD) { /* Gradebook */ C_word kab[C_SIZEOF_CLOSURE(4)], *ka = kab, k2, size; k2 = C_closure(&ka, 4, (C_word)bignum_times_bignum_unsigned_2, k, x, y); size = C_fix(C_bignum_size(x) + C_bignum_size(y)); C_allocate_bignum(5, (C_word)NULL, k2, size, negp, C_SCHEME_TRUE); - /* } else { */ - /* try_extended_number("numbers#@bignum-2-times-karatsuba", 3, k, x, y); */ - /* } */ + } else { + try_extended_number("\003sysbignum-times-karatsuba", 3, k, x, y); + } } static void bignum_times_bignum_unsigned_2(C_word c, C_word self, C_word result) @@ -7962,13 +8013,12 @@ bignum_divrem(C_word c, C_word self, C_word k, C_word x, C_word y, C_word return case 1: default: size = C_bignum_size(y); - /* TODO: Activate later */ - /* if (size > C_BURNIKEL_ZIEGLER_THRESHOLD && */ - /* /\* This avoids endless recursion for odd Ns just above threshold *\/ */ - /* !(size & 1 && size < (C_BURNIKEL_ZIEGLER_THRESHOLD << 1))) { */ - /* try_extended_number("\003sysbignum-divrem-burnikel-ziegler", 5, */ - /* k, x, y, return_q, return_r); */ - /* } else */ if (C_truep(return_q)) { + if (size > C_BURNIKEL_ZIEGLER_THRESHOLD && + /* This avoids endless recursion for odd Ns just above threshold */ + !(size & 1 && size < (C_BURNIKEL_ZIEGLER_THRESHOLD << 1))) { + try_extended_number("\003sysbignum-divrem-burnikel-ziegler", 5, + k, x, y, return_q, return_r); + } else if (C_truep(return_q)) { C_word kab[C_SIZEOF_FIX_BIGNUM+C_SIZEOF_CLOSURE(9)], *ka = kab, k2; k2 = C_closure(&ka, 9, (C_word)bignum_divide_2_unsigned, k, x, y, return_q, return_r, r_negp, @@ -10028,20 +10078,19 @@ C_integer_to_string(C_word c, C_word self, C_word k, C_word num, C_word radix) len += C_bignum_negativep(num) ? 1 : 0; /* Add space for negative sign */ radix_shift = C_ilen(C_unfix(radix)) - 1; - /* TODO: Activate later */ - /* if (len > C_RECURSIVE_TO_STRING_THRESHOLD && */ - /* /\* The power of two fast path is much faster than recursion *\/ */ - /* ((C_uword)1 << radix_shift) != C_unfix(radix)) { */ - /* try_extended_number("numbers#@integer->string/recursive", */ - /* 4, k, num, radix, C_fix(len)); */ - /* } else { */ + if (len > C_RECURSIVE_TO_STRING_THRESHOLD && + /* The power of two fast path is much faster than recursion */ + ((C_uword)1 << radix_shift) != C_unfix(radix)) { + try_extended_number("\003sysinteger->string/recursive", + 4, k, num, radix, C_fix(len)); + } else { C_word k2, negp = C_mk_bool(C_bignum_negativep(num)), *ka; ka = C_alloc(C_SIZEOF_CLOSURE(4)); k2 = C_closure(&ka, 4, (C_word)bignum_to_str_2, k, num, radix); C_allocate_vector(6, (C_word)NULL, k2, C_fix(len), /* Byte vec, no initialization, align at 8 bytes */ C_SCHEME_TRUE, C_SCHEME_FALSE, C_SCHEME_FALSE); - /* } */ + } } }Trap