~ chicken-core (chicken-5) 7d4eb862e99e3c6914a10e1ec843f35a92a06553
commit 7d4eb862e99e3c6914a10e1ec843f35a92a06553 Author: Peter Bex <Peter.Bex@xs4all.nl> AuthorDate: Fri Sep 24 23:13:42 2010 +0200 Commit: Peter Bex <Peter.Bex@xs4all.nl> CommitDate: Fri Sep 24 23:13:42 2010 +0200 Apply upstream changeset 40233db2c0cb (Convert all character-set matching stuff to use the cset API, simplifying the code considerably) diff --git a/irregex-core.scm b/irregex-core.scm index ca4d327d..f688421b 100644 --- a/irregex-core.scm +++ b/irregex-core.scm @@ -1195,65 +1195,45 @@ (let* ((end (string-length str)) (invert? (and (< start end) (eqv? #\^ (string-ref str start)))) (utf8? (flag-set? flags ~utf8?))) - (define (go i chars ranges) + (define (go i prev-char cset) (if (>= i end) (%irregex-error "incomplete char set" str i end) (let ((c (string-ref str i))) (case c ((#\]) - (if (and (null? chars) (null? ranges)) - (go (+ i 1) (cons #\] chars) ranges) - (let ((ci? (flag-set? flags ~case-insensitive?)) - (hi-chars (if utf8? (filter high-char? chars) '())) - (chars (if utf8? (remove high-char? chars) chars))) + (if (cset-empty? cset) + (go (+ i 1) #\] (cset-adjoin cset #\])) + (let ((ci? (flag-set? flags ~case-insensitive?))) (list - ((lambda (res) - (if invert? (cons '~ res) (sre-alternate res))) - (append - hi-chars - (if (pair? chars) - (list - (list (list->string - ((if ci? - cset-case-insensitive - (lambda (x) x)) - (reverse chars))))) - '()) - (if (pair? ranges) - (let ((res (if ci? - (cset-case-insensitive - (plist->cset ranges)) - (plist->cset ranges)))) - (list (cons '/ (cset->plist res)))) - '()))) + (let ((res (if ci? (cset-case-insensitive cset) cset))) + (cset->sre (if invert? (cset-complement res) res))) i)))) ((#\-) (cond ((or (= i start) (and (= i (+ start 1)) (eqv? #\^ (string-ref str start))) (eqv? #\] (string-ref str (+ i 1)))) - (go (+ i 1) (cons c chars) ranges)) - ((null? chars) + (go (+ i 1) c (cset-adjoin cset c))) + ((not prev-char) (%irregex-error "bad char-set")) (else - (let* ((c1 (car chars)) - (c2 (string-ref str (+ i 1)))) + (let ((char (string-ref str (+ i 1)))) (apply - (lambda (c2 j) - (if (char<? c2 c1) - (%irregex-error "inverted range in char-set" c1 c2) - (go j (cdr chars) (cons c1 (cons c2 ranges))))) + (lambda (c j) + (if (char<? c prev-char) + (error "inverted range in char-set" prev-char c) + (go j #f (cset-union cset (range->cset prev-char c))))) (cond - ((and (eqv? #\\ c2) (assv c2 posix-escape-sequences)) + ((and (eqv? #\\ char) (assv char posix-escape-sequences)) => (lambda (x) (list (cdr x) (+ i 3)))) - ((and (eqv? #\\ c2) + ((and (eqv? #\\ char) (eqv? (string-ref str (+ i 2)) #\x)) (string-parse-hex-escape str (+ i 3) end)) - ((and utf8? (<= #x80 (char->integer c2) #xFF)) - (let ((len (utf8-start-char->length c2))) + ((and utf8? (<= #x80 (char->integer char) #xFF)) + (let ((len (utf8-start-char->length char))) (list (utf8-string-ref str (+ i 1) len) (+ i 1 len)))) (else - (list c2 (+ i 2))))))))) + (list char (+ i 2))))))))) ((#\[) (let* ((inv? (eqv? #\^ (string-ref str (+ i 1)))) (i2 (if inv? (+ i 2) (+ i 1)))) @@ -1262,76 +1242,45 @@ (let ((j (string-scan-char str #\: (+ i2 1)))) (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1))))) (%irregex-error "incomplete character class" str) - (let* ((cset (sre->cset - (string->symbol - (substring str (+ i2 1) j)))) - (cset (if inv? (cset-complement cset) cset))) - ;; REFACTORME - (let split ((cset-as-plist (cset->plist cset)) - (cset-single-chars '()) - (cset-ranges '())) - (cond - ((null? cset-as-plist) - (go (+ j 2) - (append cset-single-chars chars) - (append cset-ranges ranges))) - ((char=? (car cset-as-plist) (cadr cset-as-plist)) - (split (cddr cset-as-plist) - (cons (car cset-as-plist) cset-single-chars) - cset-ranges)) - (else (split (cddr cset-as-plist) - cset-single-chars - (cons (car cset-as-plist) - (cons (cadr cset-as-plist) - cset-ranges)))))))))) + (let* ((class (sre->cset + (string->symbol + (substring str (+ i2 1) j)))) + (class (if inv? (cset-complement class) class))) + (go (+ j 2) #f (cset-union cset class)))))) ((#\= #\.) (%irregex-error "collating sequences not supported" str)) (else - (go (+ i 1) (cons #\[ chars) ranges))))) + (go (+ i 1) #\[ (cset-adjoin cset #\[)))))) ((#\\) (let ((c (string-ref str (+ i 1)))) (case c ((#\d #\D #\s #\S #\w #\W) - (let ((cset (sre->cset (string->sre (string #\\ c))))) - ;; REFACTORME - (let split ((cset-as-plist (cset->plist cset)) - (cset-single-chars '()) - (cset-ranges '())) - (cond - ((null? cset-as-plist) - (go (+ i 2) - (append cset-single-chars chars) - (append cset-ranges ranges))) - ((char=? (car cset-as-plist) (cadr cset-as-plist)) - (split (cddr cset-as-plist) - (cons (car cset-as-plist) cset-single-chars) - cset-ranges)) - (else (split (cddr cset-as-plist) - cset-single-chars - (cons (car cset-as-plist) - (cons (cadr cset-as-plist) - cset-ranges)))))))) + (go (+ i 2) #f + (cset-union cset + (sre->cset (string->sre (string #\\ c)))))) ((#\x) (apply (lambda (ch j) - (go j (cons ch chars) ranges)) + (go j ch (cset-adjoin cset ch))) (string-parse-hex-escape str (+ i 2) end))) (else (let ((c (cond ((assv c posix-escape-sequences) => cdr) (else c)))) - (go (+ i 2) (cons c chars) ranges)))))) + (go (+ i 2) c (cset-adjoin cset c))))))) (else (if (and utf8? (<= #x80 (char->integer c) #xFF)) (let ((len (utf8-start-char->length c))) (go (+ i len) - (cons (utf8-string-ref str i len) chars) - ranges)) - (go (+ i 1) (cons c chars) ranges))))))) + (utf8-string-ref str i len) + (cset-adjoin cset (utf8-string-ref str i len)))) + (go (+ i 1) c (cset-adjoin cset c)))))))) (if invert? (go (+ start 1) - (if (flag-set? flags ~multi-line?) '(#\newline) '()) - '()) - (go start '() '())))) + #f + (if (flag-set? flags ~multi-line?) + (char->cset #\newline) + (make-cset))) + (go start #f (make-cset))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; UTF-8 Utilities @@ -2419,9 +2368,8 @@ (define (add-char-state! next ch) (let ((alt (char-altcase ch))) (if (flag-set? flags ~case-insensitive?) - (extend-state! next (cset-union (range->cset ch ch) - (range->cset alt alt))) - (extend-state! next (range->cset ch ch))))) + (extend-state! next (cset-union (char->cset ch) (char->cset alt))) + (extend-state! next (char->cset ch))))) (if (null? ls) next (cond @@ -3543,6 +3491,7 @@ (define (make-cset) (vector)) (define (range->cset from to) (vector (cons from to))) +(define (char->cset ch) (vector (cons ch ch))) (define (cset-empty? cs) (zero? (vector-length cs))) (define (maybe-cset->char cs) (if (and (= (vector-length cs) 1) @@ -3578,7 +3527,7 @@ (define (string->cset s) (fold (lambda (ch cs) - (cset-union (range->cset ch ch) cs)) + (cset-adjoin cs ch)) (make-cset) (string->list s))) @@ -3672,6 +3621,8 @@ (cons (char-ranges-union a-range b-range) (cdr b)) res)))))))) +(define (cset-adjoin cs ch) (cset-union cs (char->cset ch))) + (define (next-char c) (integer->char (+ (char->integer c) 1)))Trap