~ chicken-core (chicken-5) 3044620ac1a20c22a0f501cf01592c09396fb89f


commit 3044620ac1a20c22a0f501cf01592c09396fb89f
Author:     Peter Bex <peter@more-magic.net>
AuthorDate: Sun Apr 19 13:38:49 2015 +0200
Commit:     Peter Bex <peter@more-magic.net>
CommitDate: Sun May 31 14:55:25 2015 +0200

    Restore Burnikel-Ziegler division, which makes a big difference for division of huge numbers

diff --git a/chicken.h b/chicken.h
index 4a278ff4..2ffd81d7 100644
--- a/chicken.h
+++ b/chicken.h
@@ -428,6 +428,12 @@ static inline int isinf_ld (long double x)
  */
 # define C_KARATSUBA_THRESHOLD        70
 #endif
+#ifndef C_BURNIKEL_ZIEGLER_THRESHOLD
+/* This defines when to switch from schoolbook to Burnikel-Ziegler
+ * division.  It creates even more garbage than Karatsuba :(
+ */
+# define C_BURNIKEL_ZIEGLER_THRESHOLD 300
+#endif
 #ifndef C_RECURSIVE_TO_STRING_THRESHOLD
 /* This threshold is in terms of the expected string length. */
 # define C_RECURSIVE_TO_STRING_THRESHOLD 750
diff --git a/runtime.c b/runtime.c
index e1f5c00c..4c86e228 100644
--- a/runtime.c
+++ b/runtime.c
@@ -527,6 +527,9 @@ static C_word bignum_minus_unsigned(C_word **ptr, C_word x, C_word y);
 static C_regparm void integer_divrem(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r);
 static C_regparm C_word bignum_remainder_unsigned_halfdigit(C_word x, C_word y);
 static C_regparm void bignum_divrem(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r);
+static C_regparm C_word bignum_divide_burnikel_ziegler(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r);
+static C_regparm void burnikel_ziegler_3n_div_2n(C_word **ptr, C_word a12, C_word a3, C_word b, C_word b1, C_word b2, C_word n, C_word *q, C_word *r);
+static C_regparm void burnikel_ziegler_2n_div_1n(C_word **ptr, C_word a, C_word b, C_word b1, C_word b2, C_word n, C_word *q, C_word *r);
 static C_word rat_cmp(C_word x, C_word y);
 static void flo_to_int_2(C_word c, C_word self, C_word result) C_noret;
 static void fabs_frexp_to_digits(C_uword exp, double sign, C_uword *start, C_uword *scan);
@@ -5897,7 +5900,8 @@ C_regparm C_word C_fcall C_i_integer_length(C_word x)
   }
 }
 
-/* This is currently only used by Karatsuba multiplication. */
+/* This is currently only used by Karatsuba multiplication and
+ * Burnikel-Ziegler division. */
 static C_regparm C_word
 bignum_extract_digits(C_word **ptr, C_word n, C_word x, C_word start, C_word end)
 {
@@ -8571,7 +8575,7 @@ static C_regparm void
 bignum_divrem(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r)
 {
   C_word q_negp = C_mk_bool(C_bignum_negativep(y) != C_bignum_negativep(x)),
-         r_negp = C_mk_bool(C_bignum_negativep(x));
+         r_negp = C_mk_bool(C_bignum_negativep(x)), res, size;
 
   switch(bignum_cmp_unsigned(x, y)) {
   case 0:
@@ -8584,13 +8588,279 @@ bignum_divrem(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r)
     break;
   case 1:
   default:
-    bignum_divide_unsigned(ptr, x, y, q, q_negp, r, r_negp);
-    if (q != NULL) *q = C_bignum_simplify(*q);
-    if (r != NULL) *r = C_bignum_simplify(*r);
+    res = C_SCHEME_FALSE;
+    size = C_bignum_size(x);
+    if (size >= C_BURNIKEL_ZIEGLER_THRESHOLD &&
+        /* This avoids endless recursion for odd Ns just above threshold */
+        !(size & 1 && size < (C_BURNIKEL_ZIEGLER_THRESHOLD << 1))) {
+      res = bignum_divide_burnikel_ziegler(ptr, x, y, q, r);
+    }
+
+    if (!C_truep(res)) {
+      bignum_divide_unsigned(ptr, x, y, q, q_negp, r, r_negp);
+      if (q != NULL) *q = C_bignum_simplify(*q);
+      if (r != NULL) *r = C_bignum_simplify(*r);
+    }
     break;
   }
 }
 
+C_inline int integer_length_abs(C_word x)
+{
+  if (x & C_FIXNUM_BIT) {
+    return C_ilen(labs(C_unfix(x)));
+  } else {
+    C_uword result = (C_bignum_size(x) - 1) * C_BIGNUM_DIGIT_LENGTH,
+            *last_digit = C_bignum_digits(x) + C_bignum_size(x) - 1,
+            last_digit_length = C_ilen(*last_digit);
+    return result + last_digit_length;
+  }
+}
+
+/* Burnikel-Ziegler recursive division: Split high number (x) in three
+ * or four parts and divide by the lowest number (y), split in two
+ * parts.  There are descriptions in [MpNT, 4.2], [MCA, 1.4.3] and the
+ * paper "Fast Recursive Division" by Christoph Burnikel & Joachim
+ * Ziegler is freely available.  There is also a description in Karl
+ * Hasselstrom's thesis "Fast Division of Integers".
+ *
+ * The complexity of this is supposedly O(r*s^{log(3)-1} + r*log(s)),
+ * where s is the length of x, and r is the length of y (in digits).
+ *
+ * TODO: See if it's worthwhile to implement "division without remainder"
+ * from the Burnikel-Ziegler paper.
+ */
+static C_regparm C_word
+bignum_divide_burnikel_ziegler(C_word **ptr, C_word x, C_word y, C_word *q, C_word *r)
+{
+  C_word ab[C_SIZEOF_FIX_BIGNUM*9], *a = ab,
+         lab[2][C_SIZEOF_FIX_BIGNUM*10], *la,
+         q_negp = (C_bignum_negativep(y) ? C_mk_nbool(C_bignum_negativep(x)) :
+                   C_mk_bool(C_bignum_negativep(x))),
+         r_negp = C_mk_bool(C_bignum_negativep(x)), s, m, n, i, j, l, shift,
+         yhi, ylo, zi, zi_orig, newx, newy, quot, qi, ri;
+
+  /* Ran out of stack?  Fall back to non-recursive division */
+  C_stack_check1(return C_SCHEME_FALSE);
+
+  x = C_s_a_u_i_integer_abs(&a, 1, x);
+  y = C_s_a_u_i_integer_abs(&a, 1, y);
+
+  /* Define m as min{2^k|(2^k)*BURNIKEL_ZIEGLER_THRESHOLD > s}
+   * This ensures we shift as little as possible (less pressure
+   * on the GC) while maintaining a power of two until we drop
+   * below the threshold, so we can always split N in half.
+   */
+  s = C_bignum_size(y);
+  m = 1 << C_ilen(s / C_BURNIKEL_ZIEGLER_THRESHOLD);
+  j = (s+m-1) / m;              /* j = s/m, rounded up */
+  n = j * m;
+
+  shift = (C_BIGNUM_DIGIT_LENGTH * n) - integer_length_abs(y);
+  newx = C_s_a_i_arithmetic_shift(&a, 2, x, C_fix(shift));
+  newy = C_s_a_i_arithmetic_shift(&a, 2, y, C_fix(shift));
+  if (shift != 0) {
+    clear_buffer_object(ab, x);
+    clear_buffer_object(ab, y);
+  }
+  x = newx;
+  y = newy;
+
+  /* l needs to be the smallest value so that a < base^{l*n}/2 */
+  l = (C_bignum_size(x) + n) / n;
+  if ((C_BIGNUM_DIGIT_LENGTH * l) == integer_length_abs(x)) l++;
+  l = nmax(l, 2);
+
+  yhi = bignum_extract_digits(&a, 3, y, C_fix(n >> 1), C_SCHEME_FALSE);
+  ylo = bignum_extract_digits(&a, 3, y, C_fix(0), C_fix(n >> 1));
+
+  s = (l - 2) * n * C_BIGNUM_DIGIT_LENGTH;
+  zi_orig = zi = C_s_a_i_arithmetic_shift(&a, 2, x, C_fix(-s));
+  quot = C_fix(0);
+
+  for(i = l - 2; i >= 0; --i) {
+    la = lab[i&1];
+
+    burnikel_ziegler_2n_div_1n(&la, zi, y, yhi, ylo, C_fix(n), &qi, &ri);
+
+    newx = C_s_a_i_arithmetic_shift(&la, 2, quot, C_fix(n*C_BIGNUM_DIGIT_LENGTH));
+    clear_buffer_object(lab, quot);
+    quot = C_s_a_u_i_integer_plus(&la, 2, newx, qi);
+    move_buffer_object(&la, lab[(i+1)&1], quot);
+    clear_buffer_object(lab, newx);
+    clear_buffer_object(lab, qi);
+
+    if (i > 0) {  /* Set z_{i-1} = [r{i}, x{i-1}] */
+      newx = bignum_extract_digits(&la, 3, x, C_fix(n * (i-1)), C_fix(n * i));
+      newy = C_s_a_i_arithmetic_shift(&la, 2, ri, C_fix(n*C_BIGNUM_DIGIT_LENGTH));
+      clear_buffer_object(lab, zi);
+      zi = C_s_a_u_i_integer_plus(&la, 2, newx, newy);
+      move_buffer_object(&la, lab[(i+1)&1], zi);
+      move_buffer_object(&la, lab[(i+1)&1], quot);
+      clear_buffer_object(lab, newx);
+      clear_buffer_object(lab, newy);
+      clear_buffer_object(lab, ri);
+    }
+  }
+  clear_buffer_object(ab, x);
+  clear_buffer_object(ab, y);
+  clear_buffer_object(ab, yhi);
+  clear_buffer_object(ab, ylo);
+  clear_buffer_object(ab, zi_orig);
+  clear_buffer_object(lab, zi);
+
+  if (q != NULL) {
+    if (C_truep(q_negp)) {
+      newx = C_s_a_u_i_integer_negate(&la, 1, quot);
+      clear_buffer_object(lab, quot);
+      quot = newx;
+    }
+    *q = move_buffer_object(ptr, lab, quot);
+  }
+  clear_buffer_object(lab, quot);
+
+  if (r != NULL) {
+    newx = C_s_a_i_arithmetic_shift(&la, 2, ri, C_fix(-shift));
+    if (C_truep(r_negp)) {
+      newy = C_s_a_u_i_integer_negate(ptr, 1, newx);
+      clear_buffer_object(lab, newx);
+      newx = newy;
+    }
+    *r = move_buffer_object(ptr, lab, newx);
+  }
+  clear_buffer_object(lab, ri);
+
+  return C_SCHEME_TRUE;
+}
+
+static C_regparm void
+burnikel_ziegler_3n_div_2n(C_word **ptr, C_word a12, C_word a3, C_word b, C_word b1, C_word b2, C_word n, C_word *q, C_word *r)
+{
+  C_word kab[C_SIZEOF_FIX_BIGNUM*6 + C_SIZEOF_BIGNUM(2)], *ka = kab,
+         lab[2][C_SIZEOF_FIX_BIGNUM*4], *la,
+         size, tmp, less, qhat, rhat, r1, r1a3, i = 0;
+
+  size = C_unfix(n) * C_BIGNUM_DIGIT_LENGTH;
+  tmp = C_s_a_i_arithmetic_shift(&ka, 2, a12, C_fix(-size));
+  less = C_i_integer_lessp(tmp, b1); /* a1 < b1 ? */
+  clear_buffer_object(kab, tmp);
+
+  if (C_truep(less)) {
+    C_word atmpb[C_SIZEOF_FIX_BIGNUM*2], *atmp = atmpb, b11, b12, halfn;
+
+    halfn = C_fix(C_unfix(n) >> 1);
+    b11 = bignum_extract_digits(&atmp, 3, b1, halfn, C_SCHEME_FALSE);
+    b12 = bignum_extract_digits(&atmp, 3, b1, C_fix(0), halfn);
+
+    burnikel_ziegler_2n_div_1n(&ka, a12, b1, b11, b12, n, &qhat, &r1);
+    qhat = move_buffer_object(&ka, atmpb, qhat);
+    r1 = move_buffer_object(&ka, atmpb, r1);
+
+    clear_buffer_object(atmpb, b11);
+    clear_buffer_object(atmpb, b12);
+  } else {
+    C_word atmpb[C_SIZEOF_FIX_BIGNUM*5], *atmp = atmpb, tmp2;
+
+    tmp = C_s_a_i_arithmetic_shift(&atmp, 2, C_fix(1), C_fix(size));
+    qhat = C_s_a_u_i_integer_minus(&ka, 2, tmp, C_fix(1));  /* B^n - 1 */
+    qhat = move_buffer_object(&ka, atmpb, qhat);
+    clear_buffer_object(atmpb, tmp);
+
+    /* r1 = (a12 - b1*B^n) + b1 */
+    tmp = C_s_a_i_arithmetic_shift(&atmp, 2, b1, C_fix(size));
+    tmp2 = C_s_a_u_i_integer_minus(&atmp, 2, a12, tmp);
+    r1 = C_s_a_u_i_integer_plus(&ka, 2, tmp2, b1);
+    r1 = move_buffer_object(&ka, atmpb, r1);
+    clear_buffer_object(atmpb, tmp);
+    clear_buffer_object(atmpb, tmp2);
+  }
+
+  tmp = C_s_a_i_arithmetic_shift(&ka, 2, r1, C_fix(size));
+  clear_buffer_object(kab, r1);
+  r1a3 = C_s_a_u_i_integer_plus(&ka, 2, tmp, a3);
+  b2 = C_s_a_u_i_integer_times(&ka, 2, qhat, b2);
+
+  la = lab[0];
+  rhat = C_s_a_u_i_integer_minus(&la, 2, r1a3, b2);
+  rhat = move_buffer_object(&la, kab, rhat);
+  qhat = move_buffer_object(&la, kab, qhat);
+
+  clear_buffer_object(kab, tmp);
+  clear_buffer_object(kab, r1a3);
+  clear_buffer_object(kab, b2);
+
+  while(C_truep(C_i_negativep(rhat))) {
+    la = lab[(++i)&1];
+    /* rhat += b */
+    r1 = C_s_a_u_i_integer_plus(&la, 2, rhat, b);
+    tmp = move_buffer_object(&la, lab[(i-1)&1], r1);
+    clear_buffer_object(lab[(i-1)&1], r1);
+    clear_buffer_object(lab[(i-1)&1], rhat);
+    clear_buffer_object(kab, rhat);
+    rhat = tmp;
+
+    /* qhat -= 1 */
+    r1 = C_s_a_u_i_integer_minus(&la, 2, qhat, C_fix(1));
+    tmp = move_buffer_object(&la, lab[(i-1)&1], r1);
+    clear_buffer_object(lab[(i-1)&1], r1);
+    clear_buffer_object(lab[(i-1)&1], qhat);
+    clear_buffer_object(kab, qhat);
+    qhat = tmp;
+  }
+
+  if (q != NULL) *q = move_buffer_object(ptr, lab, qhat);
+  if (r != NULL) *r = move_buffer_object(ptr, lab, rhat);
+  clear_buffer_object(lab, qhat);
+  clear_buffer_object(lab, rhat);
+}
+
+static C_regparm void
+burnikel_ziegler_2n_div_1n(C_word **ptr, C_word a, C_word b, C_word b1, C_word b2, C_word n, C_word *q, C_word *r)
+{
+  C_word kab[2][C_SIZEOF_FIX_BIGNUM*7], *ka, a12, a3, a4,
+         q1 = C_fix(0), r1, q2 = C_fix(0), r2, *qp;
+  int stack_full = 0;
+
+  C_stack_check1(stack_full = 1);
+
+  n = C_unfix(n);
+  if (stack_full || (n & 1) || (n < C_BURNIKEL_ZIEGLER_THRESHOLD)) {
+    integer_divrem(ptr, a, b, q, r);
+  } else {
+    ka = kab[0];
+    a12 = bignum_extract_digits(&ka, 3, a, C_fix(n), C_SCHEME_FALSE);
+    a3 = bignum_extract_digits(&ka, 3, a, C_fix(n >> 1), C_fix(n));
+
+    qp = (q == NULL) ? NULL : &q1;
+    ka = kab[1];
+    burnikel_ziegler_3n_div_2n(&ka, a12, a3, b, b1, b2, C_fix(n >> 1), qp, &r1);
+    q1 = move_buffer_object(&ka, kab[0], q1);
+    r1 = move_buffer_object(&ka, kab[0], r1);
+    clear_buffer_object(kab[0], a12);
+    clear_buffer_object(kab[0], a3);
+
+    a4 = bignum_extract_digits(&ka, 3, a, C_fix(0), C_fix(n >> 1));
+
+    qp = (q == NULL) ? NULL : &q2;
+    ka = kab[0];
+    burnikel_ziegler_3n_div_2n(&ka, r1, a4, b, b1, b2, C_fix(n >> 1), qp, r);
+    if (r != NULL) *r = move_buffer_object(ptr, kab[0], *r);
+    clear_buffer_object(kab[1], r1);
+
+    if (q != NULL) {
+      C_word halfn_bits = (n >> 1) * C_BIGNUM_DIGIT_LENGTH;
+      r1 = C_s_a_i_arithmetic_shift(&ka, 2, q1, C_fix(halfn_bits));
+      *q = C_s_a_i_plus(ptr, 2, r1, q2); /* q = [q1, q2] */
+      *q = move_buffer_object(ptr, kab[0], *q);
+      clear_buffer_object(kab[0], r1);
+      clear_buffer_object(kab[1], q1);
+      clear_buffer_object(kab[0], q2);
+    }
+    clear_buffer_object(kab[1], a4);
+  }
+}
+
+
 static C_regparm C_word bignum_remainder_unsigned_halfdigit(C_word x, C_word y)
 {
   C_uword *start = C_bignum_digits(x),
@@ -10046,18 +10316,6 @@ void C_ccall C_string_to_symbol(C_word c, C_word closure, C_word k, C_word strin
   C_kontinue(k, s);
 }
 
-C_inline int integer_length_abs(C_word x)
-{
-  if (x & C_FIXNUM_BIT) {
-    return C_ilen(labs(C_unfix(x)));
-  } else {
-    C_uword result = (C_bignum_size(x) - 1) * C_BIGNUM_DIGIT_LENGTH,
-            *last_digit = C_bignum_digits(x) + C_bignum_size(x) - 1,
-            last_digit_length = C_ilen(*last_digit);
-    return result + last_digit_length;
-  }
-}
-
 /* This will usually return a flonum, but it may also return a cplxnum
  * consisting of two flonums, making for a total of 12 words.
  */
Trap