~ chicken-core (chicken-5) e207a1f6543fb235d7bd9334efcf858d744f3d07


commit e207a1f6543fb235d7bd9334efcf858d744f3d07
Author:     Peter Bex <Peter.Bex@xs4all.nl>
AuthorDate: Fri Sep 24 22:39:04 2010 +0200
Commit:     Peter Bex <Peter.Bex@xs4all.nl>
CommitDate: Fri Sep 24 22:39:04 2010 +0200

    Apply upstream changeset b0e9bf428748 (Simplify NFA and DFA code by using csets instead of custom pairs/chars)

diff --git a/irregex-core.scm b/irregex-core.scm
index 78802bb6..4e299496 100644
--- a/irregex-core.scm
+++ b/irregex-core.scm
@@ -31,7 +31,7 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;; History
 ;;
-;; 0.8.2: 2010/08/19 - (...)? submatch extraction fix and alternate
+;; 0.8.2: 2010/08/28 - (...)? submatch extraction fix and alternate
 ;;                     named submatches from Peter Bex
 ;;                     Added irregex-split, irregex-extract,
 ;;                     irregex-match-names and irregex-match-valid-index?
@@ -2167,11 +2167,9 @@
             ((< i end)
              (let* ((ch (string-ref str i))
                     (next (find (lambda (x)
-                                  (if (eqv? ch (car x))
-                                      #t
-                                      (and (pair? (car x))
-                                           (char<=? (caar x) ch)
-                                           (char<=? ch (cdar x)))))
+                                  (or (eqv? ch (car x))
+                                      (and (not (char? (car x)))
+                                           (cset-contains? (car x) ch))))
                                 (cdr state))))
                (and next (lp2 (+ i 1) (dfa-next-state dfa next)))))
             (else
@@ -2215,11 +2213,9 @@
            (else
             (let* ((ch (string-ref str i))
                    (cell (find (lambda (x)
-                                 (if (eqv? ch (car x))
-                                     #t
-                                     (and (pair? (car x))
-                                          (char<=? (caar x) ch)
-                                          (char<=? ch (cdar x)))))
+                                 (or (eqv? ch (car x))
+                                     (and (not (char? (car x)))
+                                          (cset-contains? (car x) ch))))
                                (cdr state))))
               (cond
                (cell
@@ -2354,11 +2350,11 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;; SRE->NFA compilation
 ;;
-;; An NFA state is a numbered node with a list of patter->number
-;; transitions, where pattern is either a character, (lo . hi)
-;; character range, or epsilon (indicating an empty transition).
-;; There may be duplicate characters and overlapping ranges - since
-;; it's an NFA we process it by considering all possible transitions.
+;; An NFA state is a numbered node with a list of pattern->number
+;; transitions, where pattern is character set range, or epsilon
+;; (indicating an empty transition).
+;; There may be overlapping ranges - since it's an NFA we process it
+;; by considering all possible transitions.
 
 (define *nfa-presize* 128)  ;; constant
 (define *nfa-num-fields* 4) ;; constant
@@ -2370,8 +2366,6 @@
   (vector-ref nfa (* i *nfa-num-fields*)))
 (define (nfa-set-state-trans! nfa i x)
   (vector-set! nfa (* i *nfa-num-fields*) x))
-(define (nfa-push-state-trans! nfa i x)
-  (nfa-set-state-trans! nfa i (cons x (nfa-get-state-trans nfa i))))
 
 (define (nfa-get-epsilons nfa i)
   (vector-ref nfa (+ (* i *nfa-num-fields*) 1)))
@@ -2419,15 +2413,15 @@
               (set! buf tmp)))
         (nfa-set-state-trans! buf n2 trans-ls)
         n2)
-      (define (extend-state! next . trans)
+      (define (extend-state! next trans-cs)
         (and next
-             (add-state! (new-state-number next)
-                         (map (lambda (x) (cons x next)) trans))))
+             (add-state! (new-state-number next) (cons trans-cs next))))
       (define (add-char-state! next ch)
         (let ((alt (char-altcase ch)))
-          (if (and (flag-set? flags ~case-insensitive?) (not (eqv? ch alt)))
-              (extend-state! next ch alt)
-              (extend-state! next ch))))
+          (if (flag-set? flags ~case-insensitive?)
+              (extend-state! next (cset-union (range->cset ch ch)
+                                              (range->cset alt alt)))
+              (extend-state! next (range->cset ch ch)))))
       (if (null? ls)
           next
           (cond
@@ -2463,12 +2457,8 @@
                        next))))
            ((pair? (car ls))
             (cond
-             ((string? (caar ls))
-              ;; enumerated character set
-              (lp (cons (sre-alternate (string->list (caar ls))) (cdr ls))
-                  n
-                  flags
-                  next))
+             ((string? (caar ls))       ; Enumerated character set
+              (extend-state! (lp (cdr ls) n flags next) (string->cset (caar ls))))
              (else
               (case (caar ls)
                 ((seq :)
@@ -2493,7 +2483,7 @@
                    (case (length pl) ;; TODO could be expensive
                      ((2)
                       (extend-state! (lp (cdr ls) n flags next)
-                                     (cons (car pl) (cadr pl))))
+                                     (range->cset (car pl) (cadr pl))))
                      (else
                       (let ((next (lp (cdr ls) n flags next)))
                         (and
@@ -2521,7 +2511,8 @@
                                            flags
                                            next))))
                           (and a
-                               (let ((c (add-state! (new-state-number a) '())))
+                               (let ((c (add-state! (new-state-number a)
+                                                    '())))
                                  (nfa-add-epsilon! buf c a)
                                  (nfa-add-epsilon! buf c b)
                                  c)))))))
@@ -2783,6 +2774,8 @@
 
 ;; When the conversion is complete we renumber the DFA sets-of-states
 ;; in order and convert the result to a vector for fast lookup.
+;; Charsets containing single characters are converted to those characters
+;; for quick matching of the literal parts in a regex.
 (define (dfa-renumber nfa dfa)
   (let* ((len (length dfa))
          (states (make-vector (nfa-num-states nfa) '()))
@@ -2798,127 +2791,66 @@
     (let lp ((ls dfa) (i 0))
       (cond ((pair? ls)
              (for-each
-              (lambda (x) (set-cdr! x (renumber (cdr x))))
+              (lambda (x)
+                (set-car! x (maybe-cset->char (car x)))
+                (set-cdr! x (renumber (cdr x))))
               (cddar ls))
              (vector-set! res i (cdar ls))
              (lp (cdr ls) (+ i 1)))))
     res))
 
-;; Extract all distinct characters or ranges and the potential states
-;; they can transition to from a given set of states.  Any ranges that
-;; would overlap with distinct characters are split accordingly.
+;; Extract all distinct ranges and the potential states they can transition
+;; to from a given set of states.  Any ranges that would overlap with
+;; distinct characters are split accordingly.
 (define (nfa-state-transitions nfa states)
   (let ((res (nfa-multi-state-fold
               states
               (lambda (st res)
-                (fold (lambda (trans res)
-                        (nfa-join-transitions! nfa res (car trans) (cdr trans)))
+                (let ((trans (nfa-get-state-trans nfa st)))
+                  (if (null? trans)
                       res
-                      (nfa-get-state-trans nfa st)))
+                      (nfa-join-transitions! nfa res (car trans) (cdr trans)))))
               '())))
     (for-each (lambda (x) (set-cdr! x (nfa-closure nfa (cdr x)))) res)
     res))
 
 (define (nfa-join-transitions! nfa existing elt state)
-  (define (join! ls elt state)
-    (if (not elt)
-        ls
-        (nfa-join-transitions! nfa ls elt state)))
-  (cond
-   ((char? elt)
-    (let lp ((ls existing) (res '()))
-      (cond
-       ((null? ls)
-        ;; done, just cons this on to the original list
-        (cons (cons elt (nfa-state->multi-state nfa state)) existing))
-       ((eq? elt (caar ls))
-        ;; add a new state to an existing char
-        (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
-        existing)
-       ((and (pair? (caar ls))
-             (char<=? (caaar ls) elt)
-             (char<=? elt (cdaar ls)))
-        ;; split a range
-        (apply
-         (lambda (left right)
-           (let ((left-copy (nfa-multi-state-copy (cdar ls)))
-                 (right-copy (nfa-multi-state-copy (cdar ls))))
-             (cons (cons elt (nfa-multi-state-add! (cdar ls) state))
-                   (append (if left (list (cons left left-copy)) '())
-                           (if right (list (cons right right-copy)) '())
-                           res
-                           (cdr ls)))))
-         (split-char-range (caar ls) elt)))
-       (else
-        ;; keep looking
-        (lp (cdr ls) (cons (car ls) res))))))
-   (else
-    (let ((lo (car elt))
-          (hi (cdr elt)))
-      (let lp ((ls existing) (res '()))
-        (cond
-         ((null? ls)
-          ;; done, just cons this on to the original list
-          (cons (cons elt (nfa-state->multi-state nfa state)) existing))
-         ((and (char? (caar ls)) (char<=? lo (caar ls)) (char<=? (caar ls) hi))
-          ;; range enclosing a character
-          (apply
-           (lambda (left right)
-             (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
-             (join! (join! existing left state) right state))
-           (split-char-range elt (caar ls))))
-         ((and (pair? (caar ls))
-               (or (and (char<=? (caaar ls) hi) (char<=? lo (cdaar ls)))
-                   (and (char<=? hi (caaar ls)) (char<=? (cdaar ls) lo))))
-          ;; overlapping ranges
-          (apply
-           (lambda (left1 left2 same right1 right2) ;; 5 regions
-             (let ((right1-copy (nfa-multi-state-copy (cdar ls)))
-                   (right2-copy (nfa-multi-state-copy (cdar ls))))
-               (set-car! (car ls) same)
-               (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
-               (let* ((res (if right1
-                               (cons (cons right1 right1-copy) existing)
-                               existing))
-                      (res (if right2
-                               (cons (cons right2 right2-copy) res)
-                               res)))
-                 (join! (join! res left1 state) left2 state))))
-           (intersect-char-ranges elt (caar ls))))
-         (else
-          (lp (cdr ls) (cons (car ls) res)))))))))
-
-(define (char-range c1 c2)
-  (if (eqv? c1 c2) c1 (cons c1 c2)))
-
-;; assumes ch is included in the range
-(define (split-char-range range ch)
-  (list
-   (and (not (eqv? ch (car range)))
-        (char-range (car range) (integer->char (- (char->integer ch) 1))))
-   (and (not (eqv? ch (cdr range)))
-        (char-range (integer->char (+ (char->integer ch) 1)) (cdr range)))))
-
-;; returns 5 (possibly #f) char ranges:
-;;    a-only-1  a-only-2  a-and-b  b-only-1  b-only-2
-;;
-;; TODO: Is this odd procedure really necessary?
-(define (intersect-char-ranges a b)
-  (if (char>? (car a) (car b))
-      (reverse (intersect-char-ranges b a))
-      (let ((a-lo (car a))
-            (a-hi (cdr a))
-            (b-lo (car b))
-            (b-hi (cdr b)))
-        (list
-         (and (char<? a-lo b-lo)
-              (char-range a-lo (prev-char b-lo)))
-         (and (char>? a-hi b-hi)
-              (char-range (next-char b-hi) a-hi))
-         (char-range b-lo (if (char<? b-hi a-hi) b-hi a-hi))
-         #f
-         (and (char>? b-hi a-hi)
-              (char-range (next-char a-hi) b-hi))))))
+  (define (csets-intersect? a b)
+    (let ((i (cset-intersection a b)))
+      (and (not (cset-empty? i)) i)))
+  (let lp ((ls existing) (res '()))
+    (cond
+     ((null? ls)
+      (cond       ; First try to find a group that includes this state
+       ((find (lambda (x) (nfa-multi-state-contains? (cdr x) state)) existing) =>
+        (lambda (existing-state)    ; If found, merge charsets with it
+          (set-car! existing-state (cset-union (car existing-state) elt))
+          existing))
+       ;; State not seen yet?  Add a new state transition
+       (else (cons (cons elt (nfa-state->multi-state nfa state)) existing))))
+     ((cset=? elt (caar ls)) ; Add state to existing set for this charset
+      (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
+      existing)
+     ((csets-intersect? elt (caar ls)) => ; overlapping charset, but diff state
+      (lambda (intersection)
+        (let* ((only-in-old (cset-difference (caar ls) elt))
+               (states-for-old (and (not (cset-empty? only-in-old))
+                                    (nfa-multi-state-copy (cdar ls))))
+               (only-in-new (cset-difference elt (caar ls)))
+               (states-for-new (and (not (cset-empty? only-in-new))
+                                    (nfa-multi-state-copy (cdar ls)))))
+          ;; Add this state to the states already here and restrict to
+          ;; the overlapping charset
+          (set-car! (car ls) intersection)
+          (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
+          ;; Continue with the remaining subset of the new cset that we got
+          (nfa-join-transitions! nfa (if states-for-old
+                                         (cons (cons only-in-old states-for-old)
+                                               existing)
+                                         existing)
+                                 only-in-new state))))
+     (else
+      (lp (cdr ls) (cons (car ls) res))))))
 
 (define (nfa-cache-state-closure! nfa state)
   (let ((cached (nfa-get-state-closure nfa state)))
@@ -3619,6 +3551,15 @@
 
 (define (make-cset) (vector))
 (define (range->cset from to) (vector (cons from to)))
+(define (cset-empty? cs) (zero? (vector-length cs)))
+(define (maybe-cset->char cs)
+  (if (and (= (vector-length cs) 1)
+           (char=? (car (vector-ref cs 0)) (cdr (vector-ref cs 0))))
+      (car (vector-ref cs 0))
+      cs))
+
+;; Since csets are sorted, there's only one possible representation of any cset
+(define cset=? equal?)
 
 (define (cset-size cs)
   (let ((len (vector-length cs)))
@@ -3696,16 +3637,19 @@
 
 (define (cset-contains? cset ch)
   (let ((len (vector-length cset)))
-    (and (> len 0)
-         (let lp ((lower 0) (upper len))
-           (let* ((middle (+ lower (quotient (- upper lower) 2)))
-                  (range (vector-ref cset middle)))
-             (cond
-              ((char<? ch (car range))
-               (and (< lower middle) (lp lower middle)))
-              ((char>? ch (cdr range))
-               (and (< (+ middle 1) upper) (lp (+ middle 1) upper)))
-              (else #t)))))))
+    (case len
+      ((0) #f)
+      ((1) (let ((range (vector-ref cset 0)))
+             (and (char<=? ch (cdr range)) (char<=? (car range) ch))))
+      (else (let lp ((lower 0) (upper len))
+              (let* ((middle (quotient (+ upper lower) 2))
+                     (range (vector-ref cset middle)))
+                (cond ((char<? (cdr range) ch)
+                       (let ((next (+ middle 1)))
+                         (and (< next upper) (lp next upper))))
+                      ((char<? ch (car range))
+                       (and (< lower middle) (lp lower middle)))
+                      (else #t))))))))
 
 (define (char-ranges-union a b)
   (cons (if (char<=? (car a) (car b)) (car a) (car b))
Trap