~ chicken-core (chicken-5) e207a1f6543fb235d7bd9334efcf858d744f3d07
commit e207a1f6543fb235d7bd9334efcf858d744f3d07 Author: Peter Bex <Peter.Bex@xs4all.nl> AuthorDate: Fri Sep 24 22:39:04 2010 +0200 Commit: Peter Bex <Peter.Bex@xs4all.nl> CommitDate: Fri Sep 24 22:39:04 2010 +0200 Apply upstream changeset b0e9bf428748 (Simplify NFA and DFA code by using csets instead of custom pairs/chars) diff --git a/irregex-core.scm b/irregex-core.scm index 78802bb6..4e299496 100644 --- a/irregex-core.scm +++ b/irregex-core.scm @@ -31,7 +31,7 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; History ;; -;; 0.8.2: 2010/08/19 - (...)? submatch extraction fix and alternate +;; 0.8.2: 2010/08/28 - (...)? submatch extraction fix and alternate ;; named submatches from Peter Bex ;; Added irregex-split, irregex-extract, ;; irregex-match-names and irregex-match-valid-index? @@ -2167,11 +2167,9 @@ ((< i end) (let* ((ch (string-ref str i)) (next (find (lambda (x) - (if (eqv? ch (car x)) - #t - (and (pair? (car x)) - (char<=? (caar x) ch) - (char<=? ch (cdar x))))) + (or (eqv? ch (car x)) + (and (not (char? (car x))) + (cset-contains? (car x) ch)))) (cdr state)))) (and next (lp2 (+ i 1) (dfa-next-state dfa next))))) (else @@ -2215,11 +2213,9 @@ (else (let* ((ch (string-ref str i)) (cell (find (lambda (x) - (if (eqv? ch (car x)) - #t - (and (pair? (car x)) - (char<=? (caar x) ch) - (char<=? ch (cdar x))))) + (or (eqv? ch (car x)) + (and (not (char? (car x))) + (cset-contains? (car x) ch)))) (cdr state)))) (cond (cell @@ -2354,11 +2350,11 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; SRE->NFA compilation ;; -;; An NFA state is a numbered node with a list of patter->number -;; transitions, where pattern is either a character, (lo . hi) -;; character range, or epsilon (indicating an empty transition). -;; There may be duplicate characters and overlapping ranges - since -;; it's an NFA we process it by considering all possible transitions. +;; An NFA state is a numbered node with a list of pattern->number +;; transitions, where pattern is character set range, or epsilon +;; (indicating an empty transition). +;; There may be overlapping ranges - since it's an NFA we process it +;; by considering all possible transitions. (define *nfa-presize* 128) ;; constant (define *nfa-num-fields* 4) ;; constant @@ -2370,8 +2366,6 @@ (vector-ref nfa (* i *nfa-num-fields*))) (define (nfa-set-state-trans! nfa i x) (vector-set! nfa (* i *nfa-num-fields*) x)) -(define (nfa-push-state-trans! nfa i x) - (nfa-set-state-trans! nfa i (cons x (nfa-get-state-trans nfa i)))) (define (nfa-get-epsilons nfa i) (vector-ref nfa (+ (* i *nfa-num-fields*) 1))) @@ -2419,15 +2413,15 @@ (set! buf tmp))) (nfa-set-state-trans! buf n2 trans-ls) n2) - (define (extend-state! next . trans) + (define (extend-state! next trans-cs) (and next - (add-state! (new-state-number next) - (map (lambda (x) (cons x next)) trans)))) + (add-state! (new-state-number next) (cons trans-cs next)))) (define (add-char-state! next ch) (let ((alt (char-altcase ch))) - (if (and (flag-set? flags ~case-insensitive?) (not (eqv? ch alt))) - (extend-state! next ch alt) - (extend-state! next ch)))) + (if (flag-set? flags ~case-insensitive?) + (extend-state! next (cset-union (range->cset ch ch) + (range->cset alt alt))) + (extend-state! next (range->cset ch ch))))) (if (null? ls) next (cond @@ -2463,12 +2457,8 @@ next)))) ((pair? (car ls)) (cond - ((string? (caar ls)) - ;; enumerated character set - (lp (cons (sre-alternate (string->list (caar ls))) (cdr ls)) - n - flags - next)) + ((string? (caar ls)) ; Enumerated character set + (extend-state! (lp (cdr ls) n flags next) (string->cset (caar ls)))) (else (case (caar ls) ((seq :) @@ -2493,7 +2483,7 @@ (case (length pl) ;; TODO could be expensive ((2) (extend-state! (lp (cdr ls) n flags next) - (cons (car pl) (cadr pl)))) + (range->cset (car pl) (cadr pl)))) (else (let ((next (lp (cdr ls) n flags next))) (and @@ -2521,7 +2511,8 @@ flags next)))) (and a - (let ((c (add-state! (new-state-number a) '()))) + (let ((c (add-state! (new-state-number a) + '()))) (nfa-add-epsilon! buf c a) (nfa-add-epsilon! buf c b) c))))))) @@ -2783,6 +2774,8 @@ ;; When the conversion is complete we renumber the DFA sets-of-states ;; in order and convert the result to a vector for fast lookup. +;; Charsets containing single characters are converted to those characters +;; for quick matching of the literal parts in a regex. (define (dfa-renumber nfa dfa) (let* ((len (length dfa)) (states (make-vector (nfa-num-states nfa) '())) @@ -2798,127 +2791,66 @@ (let lp ((ls dfa) (i 0)) (cond ((pair? ls) (for-each - (lambda (x) (set-cdr! x (renumber (cdr x)))) + (lambda (x) + (set-car! x (maybe-cset->char (car x))) + (set-cdr! x (renumber (cdr x)))) (cddar ls)) (vector-set! res i (cdar ls)) (lp (cdr ls) (+ i 1))))) res)) -;; Extract all distinct characters or ranges and the potential states -;; they can transition to from a given set of states. Any ranges that -;; would overlap with distinct characters are split accordingly. +;; Extract all distinct ranges and the potential states they can transition +;; to from a given set of states. Any ranges that would overlap with +;; distinct characters are split accordingly. (define (nfa-state-transitions nfa states) (let ((res (nfa-multi-state-fold states (lambda (st res) - (fold (lambda (trans res) - (nfa-join-transitions! nfa res (car trans) (cdr trans))) + (let ((trans (nfa-get-state-trans nfa st))) + (if (null? trans) res - (nfa-get-state-trans nfa st))) + (nfa-join-transitions! nfa res (car trans) (cdr trans))))) '()))) (for-each (lambda (x) (set-cdr! x (nfa-closure nfa (cdr x)))) res) res)) (define (nfa-join-transitions! nfa existing elt state) - (define (join! ls elt state) - (if (not elt) - ls - (nfa-join-transitions! nfa ls elt state))) - (cond - ((char? elt) - (let lp ((ls existing) (res '())) - (cond - ((null? ls) - ;; done, just cons this on to the original list - (cons (cons elt (nfa-state->multi-state nfa state)) existing)) - ((eq? elt (caar ls)) - ;; add a new state to an existing char - (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) - existing) - ((and (pair? (caar ls)) - (char<=? (caaar ls) elt) - (char<=? elt (cdaar ls))) - ;; split a range - (apply - (lambda (left right) - (let ((left-copy (nfa-multi-state-copy (cdar ls))) - (right-copy (nfa-multi-state-copy (cdar ls)))) - (cons (cons elt (nfa-multi-state-add! (cdar ls) state)) - (append (if left (list (cons left left-copy)) '()) - (if right (list (cons right right-copy)) '()) - res - (cdr ls))))) - (split-char-range (caar ls) elt))) - (else - ;; keep looking - (lp (cdr ls) (cons (car ls) res)))))) - (else - (let ((lo (car elt)) - (hi (cdr elt))) - (let lp ((ls existing) (res '())) - (cond - ((null? ls) - ;; done, just cons this on to the original list - (cons (cons elt (nfa-state->multi-state nfa state)) existing)) - ((and (char? (caar ls)) (char<=? lo (caar ls)) (char<=? (caar ls) hi)) - ;; range enclosing a character - (apply - (lambda (left right) - (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) - (join! (join! existing left state) right state)) - (split-char-range elt (caar ls)))) - ((and (pair? (caar ls)) - (or (and (char<=? (caaar ls) hi) (char<=? lo (cdaar ls))) - (and (char<=? hi (caaar ls)) (char<=? (cdaar ls) lo)))) - ;; overlapping ranges - (apply - (lambda (left1 left2 same right1 right2) ;; 5 regions - (let ((right1-copy (nfa-multi-state-copy (cdar ls))) - (right2-copy (nfa-multi-state-copy (cdar ls)))) - (set-car! (car ls) same) - (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) - (let* ((res (if right1 - (cons (cons right1 right1-copy) existing) - existing)) - (res (if right2 - (cons (cons right2 right2-copy) res) - res))) - (join! (join! res left1 state) left2 state)))) - (intersect-char-ranges elt (caar ls)))) - (else - (lp (cdr ls) (cons (car ls) res))))))))) - -(define (char-range c1 c2) - (if (eqv? c1 c2) c1 (cons c1 c2))) - -;; assumes ch is included in the range -(define (split-char-range range ch) - (list - (and (not (eqv? ch (car range))) - (char-range (car range) (integer->char (- (char->integer ch) 1)))) - (and (not (eqv? ch (cdr range))) - (char-range (integer->char (+ (char->integer ch) 1)) (cdr range))))) - -;; returns 5 (possibly #f) char ranges: -;; a-only-1 a-only-2 a-and-b b-only-1 b-only-2 -;; -;; TODO: Is this odd procedure really necessary? -(define (intersect-char-ranges a b) - (if (char>? (car a) (car b)) - (reverse (intersect-char-ranges b a)) - (let ((a-lo (car a)) - (a-hi (cdr a)) - (b-lo (car b)) - (b-hi (cdr b))) - (list - (and (char<? a-lo b-lo) - (char-range a-lo (prev-char b-lo))) - (and (char>? a-hi b-hi) - (char-range (next-char b-hi) a-hi)) - (char-range b-lo (if (char<? b-hi a-hi) b-hi a-hi)) - #f - (and (char>? b-hi a-hi) - (char-range (next-char a-hi) b-hi)))))) + (define (csets-intersect? a b) + (let ((i (cset-intersection a b))) + (and (not (cset-empty? i)) i))) + (let lp ((ls existing) (res '())) + (cond + ((null? ls) + (cond ; First try to find a group that includes this state + ((find (lambda (x) (nfa-multi-state-contains? (cdr x) state)) existing) => + (lambda (existing-state) ; If found, merge charsets with it + (set-car! existing-state (cset-union (car existing-state) elt)) + existing)) + ;; State not seen yet? Add a new state transition + (else (cons (cons elt (nfa-state->multi-state nfa state)) existing)))) + ((cset=? elt (caar ls)) ; Add state to existing set for this charset + (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) + existing) + ((csets-intersect? elt (caar ls)) => ; overlapping charset, but diff state + (lambda (intersection) + (let* ((only-in-old (cset-difference (caar ls) elt)) + (states-for-old (and (not (cset-empty? only-in-old)) + (nfa-multi-state-copy (cdar ls)))) + (only-in-new (cset-difference elt (caar ls))) + (states-for-new (and (not (cset-empty? only-in-new)) + (nfa-multi-state-copy (cdar ls))))) + ;; Add this state to the states already here and restrict to + ;; the overlapping charset + (set-car! (car ls) intersection) + (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) + ;; Continue with the remaining subset of the new cset that we got + (nfa-join-transitions! nfa (if states-for-old + (cons (cons only-in-old states-for-old) + existing) + existing) + only-in-new state)))) + (else + (lp (cdr ls) (cons (car ls) res)))))) (define (nfa-cache-state-closure! nfa state) (let ((cached (nfa-get-state-closure nfa state))) @@ -3619,6 +3551,15 @@ (define (make-cset) (vector)) (define (range->cset from to) (vector (cons from to))) +(define (cset-empty? cs) (zero? (vector-length cs))) +(define (maybe-cset->char cs) + (if (and (= (vector-length cs) 1) + (char=? (car (vector-ref cs 0)) (cdr (vector-ref cs 0)))) + (car (vector-ref cs 0)) + cs)) + +;; Since csets are sorted, there's only one possible representation of any cset +(define cset=? equal?) (define (cset-size cs) (let ((len (vector-length cs))) @@ -3696,16 +3637,19 @@ (define (cset-contains? cset ch) (let ((len (vector-length cset))) - (and (> len 0) - (let lp ((lower 0) (upper len)) - (let* ((middle (+ lower (quotient (- upper lower) 2))) - (range (vector-ref cset middle))) - (cond - ((char<? ch (car range)) - (and (< lower middle) (lp lower middle))) - ((char>? ch (cdr range)) - (and (< (+ middle 1) upper) (lp (+ middle 1) upper))) - (else #t))))))) + (case len + ((0) #f) + ((1) (let ((range (vector-ref cset 0))) + (and (char<=? ch (cdr range)) (char<=? (car range) ch)))) + (else (let lp ((lower 0) (upper len)) + (let* ((middle (quotient (+ upper lower) 2)) + (range (vector-ref cset middle))) + (cond ((char<? (cdr range) ch) + (let ((next (+ middle 1))) + (and (< next upper) (lp next upper)))) + ((char<? ch (car range)) + (and (< lower middle) (lp lower middle))) + (else #t)))))))) (define (char-ranges-union a b) (cons (if (char<=? (car a) (car b)) (car a) (car b))Trap