~ chicken-core (chicken-5) 3d833218dae4e9d9bdce7a8fe4b64ba5d00c2124


commit 3d833218dae4e9d9bdce7a8fe4b64ba5d00c2124
Author:     Peter Bex <peter@more-magic.net>
AuthorDate: Thu Dec 17 21:44:14 2015 +0100
Commit:     Evan Hanson <evhan@foldling.org>
CommitDate: Sun Dec 20 18:24:57 2015 +1300

    Update irregex to upstream version 0.9.4
    
    This fixes a pathological performance problem with {n,m} patterns.
    
    Signed-off-by: Evan Hanson <evhan@foldling.org>

diff --git a/LICENSE b/LICENSE
index 7b78e394..97cc5bee 100644
--- a/LICENSE
+++ b/LICENSE
@@ -60,7 +60,7 @@ synrules.scm:
 
 irregex.scm:
 
-  Copyright (c) 2005-2011, Alex Shinn
+  Copyright (c) 2005-2015, Alex Shinn
   All rights reserved.
   
   Redistribution and use in source and binary forms, with or without
diff --git a/NEWS b/NEWS
index 3d06be89..f5d0e256 100644
--- a/NEWS
+++ b/NEWS
@@ -52,7 +52,10 @@
      last to resolve ambiguities (#1214).
   - Compiler rewrites for char{<,>,<=,>=,=}? are now safe (#1122).
 
-- Unit "posix": The following posix procedures now work on port
+- Core libraries
+  - Irregex has been updated to 0.9.4, which fixes severe performance
+    problems with {n,m} repeating patterns (thanks to Caolan McMahon).
+  - Unit "posix": The following posix procedures now work on port
     objects: file-stat, file-size, file-owner, file-permissions,
     file-modification-time, file-access-time, file-change-time,
     file-type and all procedures using file-type. These are:
diff --git a/irregex-core.scm b/irregex-core.scm
index c4dbea2e..1f62cd64 100644
--- a/irregex-core.scm
+++ b/irregex-core.scm
@@ -1,6 +1,6 @@
 ;;;; irregex.scm -- IrRegular Expressions
 ;;
-;; Copyright (c) 2005-2011 Alex Shinn.  All rights reserved.
+;; Copyright (c) 2005-2015 Alex Shinn.  All rights reserved.
 ;; BSD-style license: http://synthcode.com/license.txt
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -27,12 +27,11 @@
 ;; performance tuning, but you can only go so far while staying
 ;; portable.  AND-LET*, SRFI-9 records and custom macros would've been
 ;; nice.
-;;
-;; Version 1.0 will be released as a portable R7RS library.
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;; History
-;;
+;; 0.9.4: 2015/12/14 - performance improvement for {n,m} matches
+;; 0.9.3: 2014/07/01 - R7RS library
 ;; 0.9.2: 2012/11/29 - fixed a bug in -fold on conditional bos patterns
 ;; 0.9.1: 2012/11/27 - various accumulated bugfixes
 ;; 0.9.0: 2012/06/03 - Using tags for match extraction from Peter Bex.
@@ -506,8 +505,8 @@
 (define (find-tail pred ls)
   (let lp ((ls ls))
     (cond ((null? ls) #f)
-	  ((pred (car ls)) ls)
-	  (else (lp (cdr ls))))))
+          ((pred (car ls)) ls)
+          (else (lp (cdr ls))))))
 
 (define (last ls)
   (if (not (pair? ls))
@@ -557,7 +556,7 @@
 
 (define (bit-shl n i)
   (* n (expt 2 i)))
-  
+
 (define (bit-not n) (- #xFFFF n))
 
 (define (bit-ior a b)
@@ -2876,19 +2875,26 @@
                       (cons (list dfa-state finalizer dfa-trans) marked-states)
                       (+ dfa-size 1)))
                 (let* ((closure (nfa-epsilon-closure nfa (cdar trans)))
-                       (reordered (find-reorder-commands nfa closure marked-states))
+                       (reordered
+                        (find-reorder-commands nfa closure marked-states))
                        (copy-cmds (if reordered (cdr reordered) '()))
                        ;; Laurikari doesn't mention what "k" is, but it seems it
                        ;; must be the mappings of the state's reach
-                       (set-cmds (tag-set-commands-for-closure nfa (cdar trans) closure copy-cmds))
+                       (set-cmds (tag-set-commands-for-closure
+                                  nfa (cdar trans) closure copy-cmds))
                        (trans-closure (if reordered (car reordered) closure)))
                   (lp2 (cdr trans)
                        (if reordered
                            unmarked-states
                            (cons trans-closure unmarked-states))
-                       (cons `(,trans-closure ,(caar trans) ,copy-cmds . ,set-cmds)
+                       (cons `(,trans-closure
+                               ,(caar trans) ,copy-cmds . ,set-cmds)
                              dfa-trans)))))))))))
 
+;; When the conversion is complete we renumber the DFA sets-of-states
+;; in order and convert the result to a vector for fast lookup.
+;; Charsets containing single characters are converted to those characters
+;; for quick matching of the literal parts in a regex.
 (define (dfa-renumber states)
   (let ((indexes (let lp ((i 0) (states states) (indexes '()))
                    (if (null? states)
@@ -2912,7 +2918,6 @@
 ;; Extract all distinct ranges and the potential states they can transition
 ;; to from a given set of states.  Any ranges that would overlap with
 ;; distinct characters are split accordingly.
-
 ;; This function is like "reach" in Laurikari's papers, but for each
 ;; possible distinct range of characters rather than per character.
 (define (get-distinct-transitions nfa annotated-states)
@@ -2935,7 +2940,8 @@
                ;; but takes longer to compile.
                (cons (cons cs (nfa-state->mst nfa state mappings))
                      res))
-              ((cset=? cs (caar ls)) ; Add state to existing set for this charset
+              ((cset=? cs (caar ls))
+               ;; Add state to existing set for this charset
                (mst-add! nfa (cdar ls) state mappings)
                (append ls res))
               ((csets-intersect? cs (caar ls)) =>
@@ -2943,8 +2949,9 @@
                  (let* ((only-in-new (cset-difference cs (caar ls)))
                         (only-in-old (cset-difference (caar ls) cs))
                         (states-in-both (cdar ls))
-                        (states-for-old (and (not (cset-empty? only-in-old))
-                                             (mst-copy states-in-both)))
+                        (states-for-old
+                         (and (not (cset-empty? only-in-old))
+                              (mst-copy states-in-both)))
                         (res (if states-for-old
                                  (cons (cons only-in-old states-for-old) res)
                                  res)))
@@ -2990,15 +2997,16 @@
                      ((cdar trans) =>   ; tagged transition?
                       (lambda (tag)
                        (let* ((index (next-index-for-tag! nfa tag closure))
-                              (new-mappings (mst-add-tagged!
-                                             nfa closure state mappings tag index)))
-                         (lp2 (cdr trans) (cons (cons state new-mappings) stack)))))
+                              (new-mappings
+                               (mst-add-tagged!
+                                nfa closure state mappings tag index)))
+                         (lp2 (cdr trans)
+                              (cons (cons state new-mappings) stack)))))
                      (else
                       (mst-add/fast! nfa closure state mappings)
                       (lp2 (cdr trans) (cons (cons state mappings) stack)))))
                    (else (lp2 (cdr trans) stack))))))))))
 
-
 (define (nfa-epsilon-closure nfa states)
   (or (nfa-get-closure nfa states)
       (let ((res (nfa-epsilon-closure-internal nfa states)))
@@ -3079,7 +3087,6 @@
         (nfa-set-reorder-commands! nfa closure res)
         res)))
 
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;; Closure Compilation
 ;;
@@ -3099,7 +3106,7 @@
                    (irregex-match-start-index-set! matches 0 (cdr init))
                    (irregex-match-end-chunk-set! matches 0 src)
                    (irregex-match-end-index-set! matches 0 i)
-		   (%irregex-match-fail-set! matches fail)
+                   (%irregex-match-fail-set! matches fail)
                    matches)))
     ;; XXXX this should be inlined
     (define (rec sre) (lp sre n flags next))
@@ -3209,7 +3216,7 @@
              (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre))))
             ((>=)
              (rec `(** ,(cadr sre) #f ,@(cddr sre))))
-            ((** **?)
+            ((**)
              (cond
               ((or (and (number? (cadr sre))
                         (number? (caddr sre))
@@ -3217,27 +3224,67 @@
                    (and (not (cadr sre)) (caddr sre)))
                (lambda (cnk init src str i end matches fail) (fail)))
               (else
-               (let* ((from (cadr sre))
-                      (to (caddr sre))
-                      (? (if (eq? '** (car sre)) '? '??))
-                      (* (if (eq? '** (car sre)) '* '*?))
-                      (sre (sre-sequence (cdddr sre)))
-                      (x-sre (sre-strip-submatches sre))
-                      (next (if to
-                                (if (= from to)
-                                    next
-                                    (fold (lambda (x next)
-                                            (lp `(,? ,sre) n flags next))
-                                          next
-                                          (zero-to (- to from))))
-                                (rec `(,* ,sre)))))
-                 (if (zero? from)
+               (letrec
+                   ((from (cadr sre))
+                    (to (caddr sre))
+                    (body-contents (sre-sequence (cdddr sre)))
+                    (body
+                     (lambda (count)
+                       (lp body-contents
+                           n
+                           flags
+                           (lambda (cnk init src str i end matches fail)
+                             (if (and to (= count to))
+                                 (next cnk init src str i end matches fail)
+                                 ((body (+ 1 count))
+                                  cnk init src str i end matches
+                                  (lambda ()
+                                    (if (>= count from)
+                                        (next cnk init src str i end matches fail)
+                                        (fail))))))))))
+                 (if (and (zero? from) to (zero? to))
                      next
-                     (lp `(seq ,@(map (lambda (x) x-sre) (zero-to (- from 1)))
-                               ,sre)
-                         n
-                         flags
-                         next))))))
+                     (lambda (cnk init src str i end matches fail)
+                       ((body 1) cnk init src str i end matches
+                        (lambda ()
+                          (if (zero? from)
+                              (next cnk init src str i end matches fail)
+                              (fail))))))))))
+            ((**?)
+             (cond
+              ((or (and (number? (cadr sre))
+                        (number? (caddr sre))
+                        (> (cadr sre) (caddr sre)))
+                   (and (not (cadr sre)) (caddr sre)))
+               (lambda (cnk init src str i end matches fail) (fail)))
+              (else
+               (letrec
+                   ((from (cadr sre))
+                    (to (caddr sre))
+                    (body-contents (sre-sequence (cdddr sre)))
+                    (body
+                     (lambda (count)
+                       (lp body-contents
+                           n
+                           flags
+                           (lambda (cnk init src str i end matches fail)
+                             (if (< count from)
+                                 ((body (+ 1 count)) cnk init
+                                  src str i end matches fail)
+                                 (next cnk init src str i end matches
+                                       (lambda ()
+                                         (if (and to (= count to))
+                                             (fail)
+                                             ((body (+ 1 count)) cnk init
+                                              src str i end matches fail))))))))))
+                 (if (and (zero? from) to (zero? to))
+                     next
+                     (lambda (cnk init src str i end matches fail)
+                       (if (zero? from)
+                           (next cnk init src str i end matches
+                                 (lambda ()
+                                   ((body 1) cnk init src str i end matches fail)))
+                           ((body 1) cnk init src str i end matches fail))))))))
             ((word)
              (rec `(seq bow ,@(cdr sre) eow)))
             ((word+)
diff --git a/tests/re-tests.txt b/tests/re-tests.txt
index 37e951a7..7a56edb7 100644
--- a/tests/re-tests.txt
+++ b/tests/re-tests.txt
@@ -73,6 +73,27 @@ a**	-	c	-	-
 (a+|b)*	ab	y	&-\1	ab-b
 (a+|b)+	ab	y	&-\1	ab-b
 (a+|b)?	ab	y	&-\1	a-a
+(a+|b){0,0}	ab	y	&-\1	-
+(a+|b){0,2}	ab	y	&-\1	ab-b
+(a+|b){1,2}	ab	y	&-\1	ab-b
+^(a+|b){0,0}$	a	n	-	-
+^(a+|b){1,2}$	ab	y	&-\1	ab-b
+^(a+|b){1,2}$	abc	n	-	-
+^(a+|b){0,1}$	ab	n	-	-
+(a+|b){0,2}b	ab	y	&-\1	ab-a
+(a+|b){0,2}b	aab	y	&-\1	aab-aa
+(a+|b){0,2}b	abb	y	&-\1	abb-b
+(a+|b){0,2}?b	ab	y	&-\1	ab-a
+(a+|b){0,2}?b	aab	y	&-\1	aab-aa
+(a+|b){0,2}?b	abb	y	&-\1	ab-a
+^(a+|b){0,2}?b$	abb	y	&-\1	abb-b
+^(a+|b){0,2}?$	aab	y	&-\1	aab-b
+^((a+)|(b)){0,2}?$	aaab	y	&-\1-\2-\3	aaab-b-aaa-b
+^(a+|b){0,0}?$	a	n	-	-
+(a+|b){0,0}?	ab	y	&-\1	-
+(a+|b){1,2}?b	b	n	-	-
+(a+|b){0,2}?ab	ab	y	&-\1	ab-
+(a+|b){2,3}?b	ab	n	-	-
 [^ab]*	cde	y	&	cde
 (^)*	-	c	-	-
 (ab|)*	-	c	-	-
@@ -149,3 +170,4 @@ multiple words	multiple words, yeah	y	&	multiple words
 (we|wee|week)(knights|night)	weeknights	y	&-\1-\2	weeknights-wee-knights
 (a([^a])*)*	abcaBC	y	&-\1-\2	abcaBC-aBC-C
 ([Aa]b).*\1	abxyzab	y	&-\1	abxyzab-ab
+a([\/\\]*)b	a//\\b	y	&-\1	a//\\b-//\\
Trap