~ chicken-core (chicken-5) 7b654fb64acf430af6cbbff25b74fc7c37bc5075
commit 7b654fb64acf430af6cbbff25b74fc7c37bc5075 Author: Peter Bex <peter@more-magic.net> AuthorDate: Thu Jan 2 20:46:44 2020 +0100 Commit: felix <felix@call-with-current-continuation.org> CommitDate: Sun Jan 5 20:28:27 2020 +0100 Update irregex to the 0.9.7 release (upstream commit 353b8db8) This makes the behaviour of irregex-fold and irregex-split more in line of expectations (also as compared to other regex engines). Signed-off-by: felix <felix@call-with-current-continuation.org> diff --git a/NEWS b/NEWS index 004baf15..6c30dece 100644 --- a/NEWS +++ b/NEWS @@ -17,6 +17,8 @@ letter is present in the supplied path string. - irregex-replace[/all] have been fixed for empty matches, so they will no longer drop characters and ignore the replacement (#1661). + - Irregex has been updated to upstream 0.9.7, which also improves + how empty matches are treated in irregex-fold and irregex-split. - Runtime system - Quoted empty keywords like ||: and :|| are now read like prescribed diff --git a/irregex-core.scm b/irregex-core.scm index badc11c0..9bcf7e0b 100644 --- a/irregex-core.scm +++ b/irregex-core.scm @@ -30,6 +30,8 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; History +;; 0.9.7: 2019/12/31 - more intuitive handling of empty matches in -fold, +;; -replace and -split ;; 0.9.6: 2016/12/05 - fixed exponential memory use of + in compilation ;; of backtracking matcher (CVE-2016-9954). 
;; 0.9.5: 2016/09/10 - fixed a bug in irregex-fold handling of bow @@ -4024,9 +4026,9 @@ irx (lambda (i m a) (cond - ;; ((= i (%irregex-match-end-index m 0)) - ;; ;; empty match, just include the char - ;; (cons (substring str i (+ i 1)) a)) + ((= i (%irregex-match-end-index m 0)) + ;; empty match, include the skipped char to rejoin in finish + (cons (string-ref str i) a)) ((= i (%irregex-match-start-index m 0)) a) (else @@ -4034,6 +4036,18 @@ '() str (lambda (i a) - (reverse (if (= i end) a (cons (substring str i end) a)))) + (let lp ((ls (if (= i end) a (cons (substring str i end) a))) + (res '()) + (was-char? #f)) + (cond + ((null? ls) res) + ((char? (car ls)) + (lp (cdr ls) + (if (or was-char? (null? res)) + (cons (string (car ls)) res) + (cons (string-append (string (car ls)) (car res)) + (cdr res))) + #t)) + (else (lp (cdr ls) (cons (car ls) res) #f))))) start end))) diff --git a/manual/Module (chicken irregex) b/manual/Module (chicken irregex) index e09e3c37..2be9fd92 100644 --- a/manual/Module (chicken irregex) +++ b/manual/Module (chicken irregex) @@ -222,6 +222,8 @@ Examples: (irregex-replace/all "[aeiou]" "hello world" "*") => "h*ll* w*rld" +(irregex-replace/all '(* "poo ") "poo poo platter" "*") => "**p*l*a*t*t*e*r" + (irregex-replace "(.)(.)" "ab" 2 1 "*") => "ba*" (irregex-replace "...bar" "xxfoobar" (lambda (m) @@ -242,6 +244,16 @@ by the pattern in {{<irx>}}. {{irregex-extract}} does the opposite, returning a list of each instance of the pattern matched disregarding the substrings in between. +Empty matches will result in subsequent single-character strings in +{{irregex-split}}, or empty strings in {{irregex-extract}}. 
+ +<enscript highlight="scheme"> +(irregex-split "[aeiou]*" "foobarbaz") => '("f" "b" "r" "b" "z") + +(irregex-extract "[aeiou]*" "foobarbaz") => '("" "oo" "" "a" "" "" "a" "") +</enscript> + + ==== irregex-fold <procedure>(irregex-fold <irx> <kons> <knil> <str> [<finish> <start> <end>])</procedure> @@ -289,6 +301,12 @@ To extract all instances of a match out of a string, you can use (lambda (i s) (reverse s)))) </enscript> +Note that if an empty match is found, {{<kons>}} will be called on that +empty string and, to avoid an infinite loop, matching will resume at +the next char. It is up to the programmer to do something sensible +with the skipped char in this case. + + === Extended SRE Syntax Irregex provides the first native implementation of SREs (Scheme diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm index 1bb63a58..59268364 100644 --- a/tests/test-irregex.scm +++ b/tests/test-irregex.scm @@ -397,8 +397,14 @@ (irregex-replace/all '(* "poo ") "poo poo platter" "*")) (test-equal '("foo" " " "foo" " " "b" "a" "r" " " "foo") (irregex-extract '(or (: bow "foo" eow) any) "foo foo bar foo")) - ;; (test-equal '("f" "o" "o" "b" "a" "r" "b" "a" "z") - ;; (irregex-split (irregex "") "foobarbaz")) + (test-equal '("f" "o" "o" "b" "a" "r" "b" "a" "z") + (irregex-split (irregex "") "foobarbaz")) + (test-equal '("f" "b" "r" "b" "z") + (irregex-split (irregex "[aeiou]*") "foobarbaz")) + (test-equal '("" "oo" "" "a" "" "" "a" "") + (irregex-extract (irregex "[aeiou]*") "foobarbaz")) + (test-equal '("Line 1\n" "Line 2\n" "Line 3") + (irregex-split 'bol "Line 1\nLine 2\nLine 3")) )Trap