~ chicken-core (chicken-5) 3044620ac1a20c22a0f501cf01592c09396fb89f
commit 3044620ac1a20c22a0f501cf01592c09396fb89f Author: Peter Bex <peter@more-magic.net> AuthorDate: Sun Apr 19 13:38:49 2015 +0200 Commit: Peter Bex <peter@more-magic.net> CommitDate: Sun May 31 14:55:25 2015 +0200 Restore Burnikel-Ziegler division, which makes a big difference for division of huge numbers diff --git a/chicken.h b/chicken.h index 4a278ff4..2ffd81d7 100644 --- a/chicken.h +++ b/chicken.h @@ -428,6 +428,12 @@ static inline int isinf_ld (long double x) */ # define C_KARATSUBA_THRESHOLD 70 #endif +#ifndef C_BURNIKEL_ZIEGLER_THRESHOLD +/* This defines when to switch from schoolbook to Burnikel-Ziegler + * division. It creates even more garbage than Karatsuba :( + */ +# define C_BURNIKEL_ZIEGLER_THRESHOLD 300 +#endif #ifndef C_RECURSIVE_TO_STRING_THRESHOLD /* This threshold is in terms of the expected string length. */ # define C_RECURSIVE_TO_STRING_THRESHOLD 750 diff --git a/runtime.c b/runtime.c index e1f5c00c..4c86e228 100644 --- a/runtime.c +++ b/runtime.c @@ -527,6 +527,9 @@ static C_word bignum_minus_unsigned(C_word **ptr, C_word x, C_word y); static C_regparm void integer_divrem(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r); static C_regparm C_word bignum_remainder_unsigned_halfdigit(C_word x, C_word y); static C_regparm void bignum_divrem(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r); +static C_regparm C_word bignum_divide_burnikel_ziegler(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r); +static C_regparm void burnikel_ziegler_3n_div_2n(C_word **ptr, C_word a12, C_word a3, C_word b, C_word b1, C_word b2, C_word n, C_word *q, C_word *r); +static C_regparm void burnikel_ziegler_2n_div_1n(C_word **ptr, C_word a, C_word b, C_word b1, C_word b2, C_word n, C_word *q, C_word *r); static C_word rat_cmp(C_word x, C_word y); static void flo_to_int_2(C_word c, C_word self, C_word result) C_noret; static void fabs_frexp_to_digits(C_uword exp, double sign, C_uword *start, C_uword *scan); @@ -5897,7 +5900,8 @@ C_regparm C_word C_fcall C_i_integer_length(C_word x) } } -/* This is currently only used by Karatsuba multiplication. */ +/* This is currently only used by Karatsuba multiplication and + * Burnikel-Ziegler division. */ static C_regparm C_word bignum_extract_digits(C_word **ptr, C_word n, C_word x, C_word start, C_word end) { @@ -8571,7 +8575,7 @@ static C_regparm void bignum_divrem(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r) { C_word q_negp = C_mk_bool(C_bignum_negativep(y) != C_bignum_negativep(x)), - r_negp = C_mk_bool(C_bignum_negativep(x)); + r_negp = C_mk_bool(C_bignum_negativep(x)), res, size; switch(bignum_cmp_unsigned(x, y)) { case 0: @@ -8584,13 +8588,279 @@ bignum_divrem(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r) break; case 1: default: - bignum_divide_unsigned(ptr, x, y, q, q_negp, r, r_negp); - if (q != NULL) *q = C_bignum_simplify(*q); - if (r != NULL) *r = C_bignum_simplify(*r); + res = C_SCHEME_FALSE; + size = C_bignum_size(x); + if (size >= C_BURNIKEL_ZIEGLER_THRESHOLD && + /* This avoids endless recursion for odd Ns just above threshold */ + !(size & 1 && size < (C_BURNIKEL_ZIEGLER_THRESHOLD << 1))) { + res = bignum_divide_burnikel_ziegler(ptr, x, y, q, r); + } + + if (!C_truep(res)) { + bignum_divide_unsigned(ptr, x, y, q, q_negp, r, r_negp); + if (q != NULL) *q = C_bignum_simplify(*q); + if (r != NULL) *r = C_bignum_simplify(*r); + } break; } } +C_inline int integer_length_abs(C_word x) +{ + if (x & C_FIXNUM_BIT) { + return C_ilen(labs(C_unfix(x))); + } else { + C_uword result = (C_bignum_size(x) - 1) * C_BIGNUM_DIGIT_LENGTH, + *last_digit = C_bignum_digits(x) + C_bignum_size(x) - 1, + last_digit_length = C_ilen(*last_digit); + return result + last_digit_length; + } +} + +/* Burnikel-Ziegler recursive division: Split high number (x) in three + * or four parts and divide by the lowest number (y), split in two + * parts. There are descriptions in [MpNT, 4.2], [MCA, 1.4.3] and the + * paper "Fast Recursive Division" by Christoph Burnikel & Joachim + * Ziegler is freely available. There is also a description in Karl + * Hasselstrom's thesis "Fast Division of Integers". + * + * The complexity of this is supposedly O(r*s^{log(3)-1} + r*log(s)), + * where s is the length of x, and r is the length of y (in digits). + * + * TODO: See if it's worthwhile to implement "division without remainder" + * from the Burnikel-Ziegler paper. + */ +static C_regparm C_word +bignum_divide_burnikel_ziegler(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r) +{ + C_word ab[C_SIZEOF_FIX_BIGNUM*9], *a = ab, + lab[2][C_SIZEOF_FIX_BIGNUM*10], *la, + q_negp = (C_bignum_negativep(y) ? C_mk_nbool(C_bignum_negativep(x)) : + C_mk_bool(C_bignum_negativep(x))), + r_negp = C_mk_bool(C_bignum_negativep(x)), s, m, n, i, j, l, shift, + yhi, ylo, zi, zi_orig, newx, newy, quot, qi, ri; + + /* Ran out of stack? Fall back to non-recursive division */ + C_stack_check1(return C_SCHEME_FALSE); + + x = C_s_a_u_i_integer_abs(&a, 1, x); + y = C_s_a_u_i_integer_abs(&a, 1, y); + + /* Define m as min{2^k|(2^k)*BURNIKEL_ZIEGLER_THRESHOLD > s} + * This ensures we shift as little as possible (less pressure + * on the GC) while maintaining a power of two until we drop + * below the threshold, so we can always split N in half. + */ + s = C_bignum_size(y); + m = 1 << C_ilen(s / C_BURNIKEL_ZIEGLER_THRESHOLD); + j = (s+m-1) / m; /* j = s/m, rounded up */ + n = j * m; + + shift = (C_BIGNUM_DIGIT_LENGTH * n) - integer_length_abs(y); + newx = C_s_a_i_arithmetic_shift(&a, 2, x, C_fix(shift)); + newy = C_s_a_i_arithmetic_shift(&a, 2, y, C_fix(shift)); + if (shift != 0) { + clear_buffer_object(ab, x); + clear_buffer_object(ab, y); + } + x = newx; + y = newy; + + /* l needs to be the smallest value so that a < base^{l*n}/2 */ + l = (C_bignum_size(x) + n) / n; + if ((C_BIGNUM_DIGIT_LENGTH * l) == integer_length_abs(x)) l++; + l = nmax(l, 2); + + yhi = bignum_extract_digits(&a, 3, y, C_fix(n >> 1), C_SCHEME_FALSE); + ylo = bignum_extract_digits(&a, 3, y, C_fix(0), C_fix(n >> 1)); + + s = (l - 2) * n * C_BIGNUM_DIGIT_LENGTH; + zi_orig = zi = C_s_a_i_arithmetic_shift(&a, 2, x, C_fix(-s)); + quot = C_fix(0); + + for(i = l - 2; i >= 0; --i) { + la = lab[i&1]; + + burnikel_ziegler_2n_div_1n(&la, zi, y, yhi, ylo, C_fix(n), &qi, &ri); + + newx = C_s_a_i_arithmetic_shift(&la, 2, quot, C_fix(n*C_BIGNUM_DIGIT_LENGTH)); + clear_buffer_object(lab, quot); + quot = C_s_a_u_i_integer_plus(&la, 2, newx, qi); + move_buffer_object(&la, lab[(i+1)&1], quot); + clear_buffer_object(lab, newx); + clear_buffer_object(lab, qi); + + if (i > 0) { /* Set z_{i-1} = [r{i}, x{i-1}] */ + newx = bignum_extract_digits(&la, 3, x, C_fix(n * (i-1)), C_fix(n * i)); + newy = C_s_a_i_arithmetic_shift(&la, 2, ri, C_fix(n*C_BIGNUM_DIGIT_LENGTH)); + clear_buffer_object(lab, zi); + zi = C_s_a_u_i_integer_plus(&la, 2, newx, newy); + move_buffer_object(&la, lab[(i+1)&1], zi); + move_buffer_object(&la, lab[(i+1)&1], quot); + clear_buffer_object(lab, newx); + clear_buffer_object(lab, newy); + clear_buffer_object(lab, ri); + } + } + clear_buffer_object(ab, x); + clear_buffer_object(ab, y); + clear_buffer_object(ab, yhi); + clear_buffer_object(ab, ylo); + clear_buffer_object(ab, zi_orig); + clear_buffer_object(lab, zi); + + if (q != NULL) { + if (C_truep(q_negp)) { + newx = C_s_a_u_i_integer_negate(&la, 1, quot); + clear_buffer_object(lab, quot); + quot = newx; + } + *q = move_buffer_object(ptr, lab, quot); + } + clear_buffer_object(lab, quot); + + if (r != NULL) { + newx = C_s_a_i_arithmetic_shift(&la, 2, ri, C_fix(-shift)); + if (C_truep(r_negp)) { + newy = C_s_a_u_i_integer_negate(ptr, 1, newx); + clear_buffer_object(lab, newx); + newx = newy; + } + *r = move_buffer_object(ptr, lab, newx); + } + clear_buffer_object(lab, ri); + + return C_SCHEME_TRUE; +} + +static C_regparm void +burnikel_ziegler_3n_div_2n(C_word **ptr, C_word a12, C_word a3, C_word b, C_word b1, C_word b2, C_word n, C_word *q, C_word *r) +{ + C_word kab[C_SIZEOF_FIX_BIGNUM*6 + C_SIZEOF_BIGNUM(2)], *ka = kab, + lab[2][C_SIZEOF_FIX_BIGNUM*4], *la, + size, tmp, less, qhat, rhat, r1, r1a3, i = 0; + + size = C_unfix(n) * C_BIGNUM_DIGIT_LENGTH; + tmp = C_s_a_i_arithmetic_shift(&ka, 2, a12, C_fix(-size)); + less = C_i_integer_lessp(tmp, b1); /* a1 < b1 ? */ + clear_buffer_object(kab, tmp); + + if (C_truep(less)) { + C_word atmpb[C_SIZEOF_FIX_BIGNUM*2], *atmp = atmpb, b11, b12, halfn; + + halfn = C_fix(C_unfix(n) >> 1); + b11 = bignum_extract_digits(&atmp, 3, b1, halfn, C_SCHEME_FALSE); + b12 = bignum_extract_digits(&atmp, 3, b1, C_fix(0), halfn); + + burnikel_ziegler_2n_div_1n(&ka, a12, b1, b11, b12, n, &qhat, &r1); + qhat = move_buffer_object(&ka, atmpb, qhat); + r1 = move_buffer_object(&ka, atmpb, r1); + + clear_buffer_object(atmpb, b11); + clear_buffer_object(atmpb, b12); + } else { + C_word atmpb[C_SIZEOF_FIX_BIGNUM*5], *atmp = atmpb, tmp2; + + tmp = C_s_a_i_arithmetic_shift(&atmp, 2, C_fix(1), C_fix(size)); + qhat = C_s_a_u_i_integer_minus(&ka, 2, tmp, C_fix(1)); /* B^n - 1 */ + qhat = move_buffer_object(&ka, atmpb, qhat); + clear_buffer_object(atmpb, tmp); + + /* r1 = (a12 - b1*B^n) + b1 */ + tmp = C_s_a_i_arithmetic_shift(&atmp, 2, b1, C_fix(size)); + tmp2 = C_s_a_u_i_integer_minus(&atmp, 2, a12, tmp); + r1 = C_s_a_u_i_integer_plus(&ka, 2, tmp2, b1); + r1 = move_buffer_object(&ka, atmpb, r1); + clear_buffer_object(atmpb, tmp); + clear_buffer_object(atmpb, tmp2); + } + + tmp = C_s_a_i_arithmetic_shift(&ka, 2, r1, C_fix(size)); + clear_buffer_object(kab, r1); + r1a3 = C_s_a_u_i_integer_plus(&ka, 2, tmp, a3); + b2 = C_s_a_u_i_integer_times(&ka, 2, qhat, b2); + + la = lab[0]; + rhat = C_s_a_u_i_integer_minus(&la, 2, r1a3, b2); + rhat = move_buffer_object(&la, kab, rhat); + qhat = move_buffer_object(&la, kab, qhat); + + clear_buffer_object(kab, tmp); + clear_buffer_object(kab, r1a3); + clear_buffer_object(kab, b2); + + while(C_truep(C_i_negativep(rhat))) { + la = lab[(++i)&1]; + /* rhat += b */ + r1 = C_s_a_u_i_integer_plus(&la, 2, rhat, b); + tmp = move_buffer_object(&la, lab[(i-1)&1], r1); + clear_buffer_object(lab[(i-1)&1], r1); + clear_buffer_object(lab[(i-1)&1], rhat); + clear_buffer_object(kab, rhat); + rhat = tmp; + + /* qhat -= 1 */ + r1 = C_s_a_u_i_integer_minus(&la, 2, qhat, C_fix(1)); + tmp = move_buffer_object(&la, lab[(i-1)&1], r1); + clear_buffer_object(lab[(i-1)&1], r1); + clear_buffer_object(lab[(i-1)&1], qhat); + clear_buffer_object(kab, qhat); + qhat = tmp; + } + + if (q != NULL) *q = move_buffer_object(ptr, lab, qhat); + if (r != NULL) *r = move_buffer_object(ptr, lab, rhat); + clear_buffer_object(lab, qhat); + clear_buffer_object(lab, rhat); +} + +static C_regparm void +burnikel_ziegler_2n_div_1n(C_word **ptr, C_word a, C_word b, C_word b1, C_word b2, C_word n, C_word *q, C_word *r) +{ + C_word kab[2][C_SIZEOF_FIX_BIGNUM*7], *ka, a12, a3, a4, + q1 = C_fix(0), r1, q2 = C_fix(0), r2, *qp; + int stack_full = 0; + + C_stack_check1(stack_full = 1); + + n = C_unfix(n); + if (stack_full || (n & 1) || (n < C_BURNIKEL_ZIEGLER_THRESHOLD)) { + integer_divrem(ptr, a, b, q, r); + } else { + ka = kab[0]; + a12 = bignum_extract_digits(&ka, 3, a, C_fix(n), C_SCHEME_FALSE); + a3 = bignum_extract_digits(&ka, 3, a, C_fix(n >> 1), C_fix(n)); + + qp = (q == NULL) ? NULL : &q1; + ka = kab[1]; + burnikel_ziegler_3n_div_2n(&ka, a12, a3, b, b1, b2, C_fix(n >> 1), qp, &r1); + q1 = move_buffer_object(&ka, kab[0], q1); + r1 = move_buffer_object(&ka, kab[0], r1); + clear_buffer_object(kab[0], a12); + clear_buffer_object(kab[0], a3); + + a4 = bignum_extract_digits(&ka, 3, a, C_fix(0), C_fix(n >> 1)); + + qp = (q == NULL) ? NULL : &q2; + ka = kab[0]; + burnikel_ziegler_3n_div_2n(&ka, r1, a4, b, b1, b2, C_fix(n >> 1), qp, r); + if (r != NULL) *r = move_buffer_object(ptr, kab[0], *r); + clear_buffer_object(kab[1], r1); + + if (q != NULL) { + C_word halfn_bits = (n >> 1) * C_BIGNUM_DIGIT_LENGTH; + r1 = C_s_a_i_arithmetic_shift(&ka, 2, q1, C_fix(halfn_bits)); + *q = C_s_a_i_plus(ptr, 2, r1, q2); /* q = [q1, q2] */ + *q = move_buffer_object(ptr, kab[0], *q); + clear_buffer_object(kab[0], r1); + clear_buffer_object(kab[1], q1); + clear_buffer_object(kab[0], q2); + } + clear_buffer_object(kab[1], a4); + } +} + + static C_regparm C_word bignum_remainder_unsigned_halfdigit(C_word x, C_word y) { C_uword *start = C_bignum_digits(x), @@ -10046,18 +10316,6 @@ void C_ccall C_string_to_symbol(C_word c, C_word closure, C_word k, C_word strin C_kontinue(k, s); } -C_inline int integer_length_abs(C_word x) -{ - if (x & C_FIXNUM_BIT) { - return C_ilen(labs(C_unfix(x))); - } else { - C_uword result = (C_bignum_size(x) - 1) * C_BIGNUM_DIGIT_LENGTH, - *last_digit = C_bignum_digits(x) + C_bignum_size(x) - 1, - last_digit_length = C_ilen(*last_digit); - return result + last_digit_length; - } -} - /* This will usually return a flonum, but it may also return a cplxnum * consisting of two flonums, making for a total of 12 words. */Trap