~ chicken-core (chicken-5) 7b654fb64acf430af6cbbff25b74fc7c37bc5075
commit 7b654fb64acf430af6cbbff25b74fc7c37bc5075
Author: Peter Bex <peter@more-magic.net>
AuthorDate: Thu Jan 2 20:46:44 2020 +0100
Commit: felix <felix@call-with-current-continuation.org>
CommitDate: Sun Jan 5 20:28:27 2020 +0100
Update irregex to the 0.9.7 release (upstream commit 353b8db8)
This makes the behaviour of irregex-fold and irregex-split more in
line with expectations (also as compared to other regex engines).
Signed-off-by: felix <felix@call-with-current-continuation.org>
diff --git a/NEWS b/NEWS
index 004baf15..6c30dece 100644
--- a/NEWS
+++ b/NEWS
@@ -17,6 +17,8 @@
letter is present in the supplied path string.
- irregex-replace[/all] have been fixed for empty matches, so they
will no longer drop characters and ignore the replacement (#1661).
+ - Irregex has been updated to upstream 0.9.7, which also improves
+ how empty matches are treated in irregex-fold and irregex-split.
- Runtime system
- Quoted empty keywords like ||: and :|| are now read like prescribed
diff --git a/irregex-core.scm b/irregex-core.scm
index badc11c0..9bcf7e0b 100644
--- a/irregex-core.scm
+++ b/irregex-core.scm
@@ -30,6 +30,8 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;; History
+;; 0.9.7: 2019/12/31 - more intuitive handling of empty matches in -fold,
+;; -replace and -split
;; 0.9.6: 2016/12/05 - fixed exponential memory use of + in compilation
;; of backtracking matcher (CVE-2016-9954).
;; 0.9.5: 2016/09/10 - fixed a bug in irregex-fold handling of bow
@@ -4024,9 +4026,9 @@
irx
(lambda (i m a)
(cond
- ;; ((= i (%irregex-match-end-index m 0))
- ;; ;; empty match, just include the char
- ;; (cons (substring str i (+ i 1)) a))
+ ((= i (%irregex-match-end-index m 0))
+ ;; empty match, include the skipped char to rejoin in finish
+ (cons (string-ref str i) a))
((= i (%irregex-match-start-index m 0))
a)
(else
@@ -4034,6 +4036,18 @@
'()
str
(lambda (i a)
- (reverse (if (= i end) a (cons (substring str i end) a))))
+ (let lp ((ls (if (= i end) a (cons (substring str i end) a)))
+ (res '())
+ (was-char? #f))
+ (cond
+ ((null? ls) res)
+ ((char? (car ls))
+ (lp (cdr ls)
+ (if (or was-char? (null? res))
+ (cons (string (car ls)) res)
+ (cons (string-append (string (car ls)) (car res))
+ (cdr res)))
+ #t))
+ (else (lp (cdr ls) (cons (car ls) res) #f)))))
start
end)))
diff --git a/manual/Module (chicken irregex) b/manual/Module (chicken irregex)
index e09e3c37..2be9fd92 100644
--- a/manual/Module (chicken irregex)
+++ b/manual/Module (chicken irregex)
@@ -222,6 +222,8 @@ Examples:
(irregex-replace/all "[aeiou]" "hello world" "*") => "h*ll* w*rld"
+(irregex-replace/all '(* "poo ") "poo poo platter" "*") => "**p*l*a*t*t*e*r"
+
(irregex-replace "(.)(.)" "ab" 2 1 "*") => "ba*"
(irregex-replace "...bar" "xxfoobar" (lambda (m)
@@ -242,6 +244,16 @@ by the pattern in {{<irx>}}. {{irregex-extract}} does the opposite,
returning a list of each instance of the pattern matched disregarding
the substrings in between.
+Empty matches will result in subsequent single-character strings in
+{{irregex-split}}, or empty strings in {{irregex-extract}}.
+
+<enscript highlight="scheme">
+(irregex-split "[aeiou]*" "foobarbaz") => '("f" "b" "r" "b" "z")
+
+(irregex-extract "[aeiou]*" "foobarbaz") => '("" "oo" "" "a" "" "" "a" "")
+</enscript>
+
+
==== irregex-fold
<procedure>(irregex-fold <irx> <kons> <knil> <str> [<finish> <start> <end>])</procedure>
@@ -289,6 +301,12 @@ To extract all instances of a match out of a string, you can use
(lambda (i s) (reverse s))))
</enscript>
+Note that if an empty match is found, {{<kons>}} will be called on that
+empty string and, to avoid an infinite loop, matching will resume at
+the next char. It is up to the programmer to do something sensible
+with the skipped char in this case.
+
+
=== Extended SRE Syntax
Irregex provides the first native implementation of SREs (Scheme
diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm
index 1bb63a58..59268364 100644
--- a/tests/test-irregex.scm
+++ b/tests/test-irregex.scm
@@ -397,8 +397,14 @@
(irregex-replace/all '(* "poo ") "poo poo platter" "*"))
(test-equal '("foo" " " "foo" " " "b" "a" "r" " " "foo")
(irregex-extract '(or (: bow "foo" eow) any) "foo foo bar foo"))
- ;; (test-equal '("f" "o" "o" "b" "a" "r" "b" "a" "z")
- ;; (irregex-split (irregex "") "foobarbaz"))
+ (test-equal '("f" "o" "o" "b" "a" "r" "b" "a" "z")
+ (irregex-split (irregex "") "foobarbaz"))
+ (test-equal '("f" "b" "r" "b" "z")
+ (irregex-split (irregex "[aeiou]*") "foobarbaz"))
+ (test-equal '("" "oo" "" "a" "" "" "a" "")
+ (irregex-extract (irregex "[aeiou]*") "foobarbaz"))
+ (test-equal '("Line 1\n" "Line 2\n" "Line 3")
+ (irregex-split 'bol "Line 1\nLine 2\nLine 3"))
)
Trap