~ chicken-core (chicken-5) 84968ca35ae057cf9243ee66201829843ea5d6fa
commit 84968ca35ae057cf9243ee66201829843ea5d6fa
Author: Peter Bex <peter@more-magic.net>
AuthorDate: Mon Dec 30 18:31:35 2019 +0100
Commit: felix <felix@call-with-current-continuation.org>
CommitDate: Mon Dec 30 19:34:13 2019 +0100
Update irregex to latest upstream (b194cab) to fix #1661
irregex-replace and irregex-replace/all now work properly for empty
matches: they don't drop characters and will actually insert the
replacement in the output string.
Signed-off-by: felix <felix@call-with-current-continuation.org>
diff --git a/NEWS b/NEWS
index 05b4dfe9..004baf15 100644
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,8 @@
`define-record-printer` which isn't a "real" definition (see #1294).
- On Windows, `decompose-directory` no longer crashes when a drive
letter is present in the supplied path string.
+ - irregex-replace[/all] have been fixed for empty matches, so they
+ will no longer drop characters and ignore the replacement (#1661).
- Runtime system
- Quoted empty keywords like ||: and :|| are now read like prescribed
diff --git a/irregex-core.scm b/irregex-core.scm
index f26e8de6..badc11c0 100644
--- a/irregex-core.scm
+++ b/irregex-core.scm
@@ -1653,25 +1653,36 @@
(null? (cddr sre))
(sre-repeater? (cadr sre))))))
-(define (sre-searcher? sre)
+(define (sre-bos? sre)
(if (pair? sre)
(case (car sre)
- ((* +) (sre-any? (sre-sequence (cdr sre))))
((seq : $ submatch => submatch-named)
- (and (pair? (cdr sre)) (sre-searcher? (cadr sre))))
- ((or) (every sre-searcher? (cdr sre)))
+ (and (pair? (cdr sre)) (sre-bos? (cadr sre))))
+ ((or) (every sre-bos? (cdr sre)))
(else #f))
(eq? 'bos sre)))
+;; a searcher doesn't need explicit iteration to find the first match
+(define (sre-searcher? sre)
+ (or (sre-bos? sre)
+ (and (pair? sre)
+ (case (car sre)
+ ((* +) (sre-any? (sre-sequence (cdr sre))))
+ ((seq : $ submatch => submatch-named)
+ (and (pair? (cdr sre)) (sre-searcher? (cadr sre))))
+ ((or) (every sre-searcher? (cdr sre)))
+ (else #f)))))
+
+;; a consumer doesn't need to match more than once
(define (sre-consumer? sre)
- (if (pair? sre)
- (case (car sre)
- ((* +) (sre-any? (sre-sequence (cdr sre))))
- ((seq : $ submatch => submatch-named)
- (and (pair? (cdr sre)) (sre-consumer? (last sre))))
- ((or) (every sre-consumer? (cdr sre)))
- (else #f))
- (eq? 'eos sre)))
+ (or (sre-bos? sre)
+ (and (pair? sre)
+ (case (car sre)
+ ((* +) (sre-any? (sre-sequence (cdr sre))))
+ ((seq : $ submatch => submatch-named)
+ (and (pair? (cdr sre)) (sre-consumer? (last sre))))
+ ((or) (every sre-consumer? (cdr sre)))
+ (else #f)))))
(define (sre-has-submatches? sre)
(and (pair? sre)
@@ -3877,18 +3888,17 @@
matches)))
(if (not m)
(finish i acc)
- (let ((j (%irregex-match-end-index m 0)))
- (if (= j i)
- ;; skip one char forward if we match the empty string
- (lp (list str (+ j 1) end) (+ j 1) acc)
- (let ((acc (kons i m acc)))
- (irregex-reset-matches! matches)
- ;; no need to continue looping if this is a
- ;; searcher - it's already consumed the only
- ;; available match
- (if (flag-set? (irregex-flags irx) ~searcher?)
- (finish j acc)
- (lp (list str j end) j acc)))))))))))
+ (let ((j (%irregex-match-end-index m 0))
+ (acc (kons i m acc)))
+ (irregex-reset-matches! matches)
+ (cond
+ ((flag-set? (irregex-flags irx) ~consumer?)
+ (finish j acc))
+ ((= j i)
+ ;; skip one char forward if we match the empty string
+ (lp (list str (+ j 1) end) (+ j 1) acc))
+ (else
+ (lp (list str j end) j acc))))))))))
(define (irregex-fold irx kons . args)
(if (not (procedure? kons)) (error 'irregex-fold "not a procedure" kons))
@@ -3920,10 +3930,7 @@
(lp end-src (+ end-index 1) acc))
(let ((acc (kons start i m acc)))
(irregex-reset-matches! matches)
- ;; no need to continue looping if this is a
- ;; searcher - it's already consumed the only
- ;; available match
- (if (flag-set? (irregex-flags irx) ~searcher?)
+ (if (flag-set? (irregex-flags irx) ~consumer?)
(finish end-src end-index acc)
(lp end-src end-index acc)))))))))))
@@ -3948,11 +3955,15 @@
(irregex-fold/fast
irx
(lambda (i m acc)
- (let ((m-start (%irregex-match-start-index m 0)))
- (append (irregex-apply-match m o)
- (if (>= i m-start)
- acc
- (cons (substring str i m-start) acc)))))
+ (let* ((m-start (%irregex-match-start-index m 0))
+ (res (if (>= i m-start)
+ (append (irregex-apply-match m o) acc)
+ (append (irregex-apply-match m o)
+ (cons (substring str i m-start) acc)))))
+ ;; include the skipped char on empty matches
+ (if (= i (%irregex-match-end-index m 0))
+ (cons (substring str i (+ i 1)) res)
+ res)))
'()
str
(lambda (i acc)
@@ -4012,9 +4023,14 @@
(irregex-fold/fast
irx
(lambda (i m a)
- (if (= i (%irregex-match-start-index m 0))
- a
- (cons (substring str i (%irregex-match-start-index m 0)) a)))
+ (cond
+ ;; ((= i (%irregex-match-end-index m 0))
+ ;; ;; empty match, just include the char
+ ;; (cons (substring str i (+ i 1)) a))
+ ((= i (%irregex-match-start-index m 0))
+ a)
+ (else
+ (cons (substring str i (%irregex-match-start-index m 0)) a))))
'()
str
(lambda (i a)
diff --git a/irregex-utils.scm b/irregex-utils.scm
index a2195a91..291b03ea 100644
--- a/irregex-utils.scm
+++ b/irregex-utils.scm
@@ -121,10 +121,10 @@
(display "]" out))
((- & / ~)
(cond
- ((or (eq? #\~ (car x))
+ ((or (eqv? #\~ (car x))
(and (eq? '- (car x)) (pair? (cdr x)) (eq? 'any (cadr x))))
(display "[^" out)
- (display (cset->string (if (eq? #\~ (car x)) (cdr x) (cddr x))) out)
+ (display (cset->string (if (eqv? #\~ (car x)) (cdr x) (cddr x))) out)
(display "]" out))
(else
(lp `(cset ,@(sre->cset x))))))
diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm
index 8626b82c..1bb63a58 100644
--- a/tests/test-irregex.scm
+++ b/tests/test-irregex.scm
@@ -361,7 +361,10 @@
(lambda (src i s) (reverse s))))
(test-equal '("poo poo ")
(irregex-fold '(* "poo ")
- (lambda (i m s) (cons (irregex-match-substring m) s))
+ (lambda (i m s)
+ (if (< i (irregex-match-end-index m 0))
+ (cons (irregex-match-substring m) s)
+ s))
'()
"poo poo platter"))
(test-equal "* x "
@@ -388,8 +391,14 @@
(test-equal "xaac"
(irregex-replace/all (irregex '(or (seq bos "a") "b") 'dfa)
"aaac" "x"))
+ (test-equal "*Line 1\n*Line 2"
+ (irregex-replace/all 'bol "Line 1\nLine 2" "*"))
+ (test-equal "**p*l*a*t*t*e*r"
+ (irregex-replace/all '(* "poo ") "poo poo platter" "*"))
(test-equal '("foo" " " "foo" " " "b" "a" "r" " " "foo")
(irregex-extract '(or (: bow "foo" eow) any) "foo foo bar foo"))
+ ;; (test-equal '("f" "o" "o" "b" "a" "r" "b" "a" "z")
+ ;; (irregex-split (irregex "") "foobarbaz"))
)
Trap