~ chicken-core (chicken-5) 6fe652ab24d6ef8c4cdb0fe5ebaccb30c779c401


commit 6fe652ab24d6ef8c4cdb0fe5ebaccb30c779c401
Author:     Peter Bex <peter.bex@xs4all.nl>
AuthorDate: Sun Sep 16 15:43:26 2012 +0200
Commit:     felix <felix@call-with-current-continuation.org>
CommitDate: Tue Sep 18 23:00:42 2012 +0200

    Convert irregex's NFA representation to support tags (becoming tNFAs). (upstream changeset ed694ba7adff)

diff --git a/irregex-core.scm b/irregex-core.scm
index 25a9c1c5..017e0903 100644
--- a/irregex-core.scm
+++ b/irregex-core.scm
@@ -80,6 +80,11 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;; Data Structures
 
+(define (vector-copy v)
+  (let ((v2 (make-vector (vector-length v))))
+    (vector-copy! v v2)
+    v2))
+
 (cond-expand
   (chicken-bootstrap
    (begin
@@ -126,10 +131,7 @@
 	    (internal
 	     "##sys#make-structure"
 	     'regexp-match
-	     (let* ((v (internal "##sys#slot" m 1))
-		    (v2 (make-vector (internal "##sys#size" v))))
-	       (vector-copy! v v2)
-	       v2)
+	     (vector-copy (internal "##sys#slot" m 1))
 	     (internal "##sys#slot" m 2)
 	     (internal "##sys#slot" m 3)
 	     (internal "##sys#slot" m 4))))
@@ -195,11 +197,7 @@
 	   ((<= i 3) m)
 	 (vector-set! m i #f)))
      (define (irregex-copy-matches m)
-       (and (vector? m)
-	    (let ((r (make-vector (vector-length m))))
-	      (do ((i (- (vector-length m) 1) (- i 1)))
-		  ((< i 0) r)
-		(vector-set! r i (vector-ref m i))))))
+       (and (vector? m) (vector-copy m)))
      (define irregex-match-tag '*irregex-match-tag*)
      (define (irregex-match-data? obj)
        (and (vector? obj)
@@ -1597,6 +1595,9 @@
          (searcher? (sre-searcher? sre))
          (sre-dfa (if searcher? (sre-remove-initial-bos sre) sre))
          (dfa-limit (cond ((memq 'small o) 1) ((memq 'fast o) 50) (else 10)))
+         ;; TODO: Maybe make these two promises; if we only want to search,
+         ;; it's wasteful to compile the matcher, and vice versa
+         ;; Maybe provide a flag to compile eagerly, to help benchmarking etc.
          (dfa/search
           (cond ((memq 'backtrack o) #f)
                 (searcher? #t)
@@ -2308,11 +2309,15 @@
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; SRE->NFA compilation
+;;;; SRE->tNFA compilation
+;;
+;; A tagged NFA (tNFA) state is a numbered node with a list of
+;; pattern->number transitions, where pattern is character set range,
+;; or epsilon (indicating an empty transition).
+;;
+;; (Only) epsilon transitions may be *tagged*.  Each tag represents
+;; either the start or the end of a submatch.
 ;;
-;; An NFA state is a numbered node with a list of pattern->number
-;; transitions, where pattern is character set range, or epsilon
-;; (indicating an empty transition).
 ;; There may be overlapping ranges - since it's an NFA we process it
 ;; by considering all possible transitions.
 
@@ -2322,8 +2327,11 @@
 (define (nfa-num-states nfa) (quotient (vector-length nfa) *nfa-num-fields*))
 (define (nfa-start-state nfa) (- (nfa-num-states nfa) 1))
 
+(define (nfa-num-tags nfa)
+  (vector-ref nfa 0))
+
 (define (nfa-get-state-trans nfa i)
-  (vector-ref nfa (* i *nfa-num-fields*)))
+  (if (= i 0) '() (vector-ref nfa (* i *nfa-num-fields*))))
 (define (nfa-set-state-trans! nfa i x)
   (vector-set! nfa (* i *nfa-num-fields*) x))
 
@@ -2331,10 +2339,10 @@
   (vector-ref nfa (+ (* i *nfa-num-fields*) 1)))
 (define (nfa-set-epsilons! nfa i x)
   (vector-set! nfa (+ (* i *nfa-num-fields*) 1) x))
-(define (nfa-add-epsilon! nfa i x)
+(define (nfa-add-epsilon! nfa i x t)
   (let ((eps (nfa-get-epsilons nfa i)))
-    (if (not (memq x eps))
-        (nfa-set-epsilons! nfa i (cons x eps)))))
+    (if (not (assv x eps))
+        (nfa-set-epsilons! nfa i (cons (cons x t) eps)))))
 
 (define (nfa-get-state-closure nfa i)
   (vector-ref nfa (+ (* i *nfa-num-fields*) 2)))
@@ -2359,7 +2367,25 @@
 ;; descending numeric order, with state 0 being the unique accepting
 ;; state.
 (define (sre->nfa sre init-flags)
-  (let ((buf (make-vector (* *nfa-presize* *nfa-num-fields*) '())))
+  (let* ((buf (make-vector (* *nfa-presize* *nfa-num-fields*) '()))
+         ;; Get cons cells and map them to numeric submatch indexes.
+         ;; Doing it here is slightly easier than integrating into the loop below
+         (match-index
+          (let lp ((sre (list sre)) (max 0) (res '()))
+            (cond
+             ((not (pair? sre))
+              ;; We abuse the transitions slot for state 0 (the final state,
+              ;; which can have no transitions) to store the number of tags.
+              (vector-set! buf 0 (* max 2))
+              res)
+             ((pair? (car sre))
+              ;; The appends here should be safe (are they?)
+              (case (caar sre)
+                (($ submatch => submatch-named)
+                 (lp (append (cdar sre) (cdr sre)) (+ max 1)
+                     (cons (cons (car sre) max) res)))
+                (else (lp (append (car sre) (cdr sre)) max res))))
+             (else (lp (cdr sre) max res))))))
     ;; we loop over an implicit sequence list
     (define (lp ls n flags next)
       (define (new-state-number state)
@@ -2389,7 +2415,7 @@
             (let ((next (lp (cdr ls) n flags next)))
               (and next
                    (let ((new (add-state! (new-state-number next) '())))
-                     (nfa-add-epsilon! buf new next)
+                     (nfa-add-epsilon! buf new next #f)
                      new))))
            ((string? (car ls))
             ;; process literal strings a char at a time
@@ -2463,8 +2489,8 @@
                           (and a
                                (let ((c (add-state! (new-state-number a)
                                                     '())))
-                                 (nfa-add-epsilon! buf c a)
-                                 (nfa-add-epsilon! buf c b)
+                                 (nfa-add-epsilon! buf c a #f)
+                                 (nfa-add-epsilon! buf c b #f)
                                  c)))))))
                 ((?)
                  (let ((next (lp (cdr ls) n flags next)))
@@ -2473,7 +2499,7 @@
                     next
                     (let ((a (lp (cdar ls) (new-state-number next) flags next)))
                       (if a
-                          (nfa-add-epsilon! buf a next))
+                          (nfa-add-epsilon! buf a next #f))
                       a))))
                 ((+ *)
                  (let ((next (lp (cdr ls) n flags next)))
@@ -2488,9 +2514,9 @@
                        (a
                         ;; for *, insert an epsilon transition as in ? above
                         (if (eq? '* (caar ls))
-                            (nfa-add-epsilon! buf a new))
+                            (nfa-add-epsilon! buf a new #f))
                         ;; for both, insert a loop back to self
-                        (nfa-add-epsilon! buf new a)))
+                        (nfa-add-epsilon! buf new a #f)))
                       a))))
                 ;; need to add these to the match extractor first,
                 ;; but they tend to generate large DFAs
@@ -2519,9 +2545,31 @@
                 ;;     n flags next))
                 ;; ignore submatches altogether
                 (($ submatch)
-                 (lp (cons (sre-sequence (cdar ls)) (cdr ls)) n flags next))
+                 (let* ((pre-tag (* (cdr (assq (car ls) match-index)) 2))
+                        (post-tag (+ pre-tag 1))
+                        (next (lp (cdr ls) n flags next)))
+                   (and next
+                        (let* ((after (add-state! (new-state-number next) '()))
+                               (sub (lp (list (sre-sequence (cdar ls)))
+                                        (new-state-number after) flags after))
+                               (before (and sub (add-state! (new-state-number sub) '()))))
+                          (cond (before
+                                 (nfa-add-epsilon! buf before sub pre-tag)
+                                 (nfa-add-epsilon! buf after next post-tag)))
+                          before))))
                 ((=> submatch-named)
-                 (lp (cons (sre-sequence (cddar ls)) (cdr ls)) n flags next))
+                 (let* ((pre-tag (* (cdr (assq (car ls) match-index)) 2))
+                        (post-tag (+ pre-tag 1))
+                        (next (lp (cdr ls) n flags next)))
+                   (and next
+                        (let* ((after (add-state! (new-state-number next) '()))
+                               (sub (lp (list (sre-sequence (cddar ls)))
+                                        (new-state-number after) flags after))
+                               (before (and sub (add-state! (new-state-number sub) '()))))
+                          (cond (before
+                                 (nfa-add-epsilon! buf before sub pre-tag)
+                                 (nfa-add-epsilon! buf after next post-tag)))
+                          before))))
                 (else
                  (cond
                   ((assq (caar ls) sre-named-definitions)
@@ -2546,19 +2594,23 @@
 ;; sre->nfa conversion.
 
 ;; (define (nfa-match nfa str)
-;;   (let lp ((ls (string->list str)) (state (car nfa)) (epsilons '()))
-;;     (if (null? ls)
-;;         (zero? (car state))
-;;         (any (lambda (m)
-;;                (if (eq? 'epsilon (car m))
-;;                    (and (not (memv (cdr m) epsilons))
-;;                         (lp ls (assv (cdr m) nfa) (cons (cdr m) epsilons)))
-;;                    (and (or (eqv? (car m) (car ls))
-;;                             (and (pair? (car m))
-;;                                  (char<=? (caar m) (car ls))
-;;                                  (char<=? (car ls) (cdar m))))
-;;                         (lp (cdr ls) (assv (cdr m) nfa) '()))))
-;;              (cdr state)))))
+;;   (let ((matches (make-vector (nfa-num-tags nfa) #f)))
+;;     (let lp ((pos 0) (ls (string->list str)) (state (nfa-start-state nfa)) (epsilons '()))
+;;       (and (or (and (null? ls) (zero? state))
+;;                (let ((t (nfa-get-state-trans nfa state)))
+;;                  (and (not (null? t)) (not (null? ls))
+;;                       (cset-contains? (car t) (car ls))
+;;                       (lp (+ pos 1) (cdr ls) (cdr t) '())))
+;;                (any (lambda (e)
+;;                       (let ((old-matches (vector-copy matches)))
+;;                         (cond ((cdr e)
+;;                                (vector-set! matches (cdr e) pos)))
+;;                         (or (and (not (memv (car e) epsilons))
+;;                                  (lp pos ls (car e) (cons (car e) epsilons)))
+;;                             ;; reset match, apparently this branch failed
+;;                             (begin (set! matches old-matches) #f))))
+;;                     (nfa-get-epsilons nfa state)))
+;;            matches))))
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;; NFA multi-state representation
@@ -2834,7 +2886,8 @@
      ((nfa-multi-state-contains? res (car ls))
       (lp (cdr ls) res))
      (else
-      (lp (append (nfa-get-epsilons nfa (car ls)) (cdr ls))
+      ;; Ignore any epsilon tags for now
+      (lp (append (map car (nfa-get-epsilons nfa (car ls))) (cdr ls))
           (nfa-multi-state-add! res (car ls)))))))
 
 (define (nfa-closure-internal nfa states)
Trap