~ chicken-core (chicken-5) 84968ca35ae057cf9243ee66201829843ea5d6fa


commit 84968ca35ae057cf9243ee66201829843ea5d6fa
Author:     Peter Bex <peter@more-magic.net>
AuthorDate: Mon Dec 30 18:31:35 2019 +0100
Commit:     felix <felix@call-with-current-continuation.org>
CommitDate: Mon Dec 30 19:34:13 2019 +0100

    Update irregex to latest upstream (b194cab) to fix #1661
    
    irregex-replace and irregex-replace/all now work properly for empty
    matches: they don't drop characters and will actually insert the
    replacement in the output string.
    
    Signed-off-by: felix <felix@call-with-current-continuation.org>

diff --git a/NEWS b/NEWS
index 05b4dfe9..004baf15 100644
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,8 @@
     `define-record-printer` which isn't a "real" definition (see #1294).
   - On Windows, `decompose-directory` no longer crashes when a drive
     letter is present in the supplied path string.
+  - irregex-replace[/all] have been fixed for empty matches, so they
+    will no longer drop characters and ignore the replacement (#1661).
 
 - Runtime system
   - Quoted empty keywords like ||: and :|| are now read like prescribed
diff --git a/irregex-core.scm b/irregex-core.scm
index f26e8de6..badc11c0 100644
--- a/irregex-core.scm
+++ b/irregex-core.scm
@@ -1653,25 +1653,36 @@
                 (null? (cddr sre))
                 (sre-repeater? (cadr sre))))))
 
-(define (sre-searcher? sre)
+(define (sre-bos? sre)
   (if (pair? sre)
       (case (car sre)
-        ((* +) (sre-any? (sre-sequence (cdr sre))))
         ((seq : $ submatch => submatch-named)
-         (and (pair? (cdr sre)) (sre-searcher? (cadr sre))))
-        ((or) (every sre-searcher? (cdr sre)))
+         (and (pair? (cdr sre)) (sre-bos? (cadr sre))))
+        ((or) (every sre-bos? (cdr sre)))
         (else #f))
       (eq? 'bos sre)))
 
+;; a searcher doesn't need explicit iteration to find the first match
+(define (sre-searcher? sre)
+  (or (sre-bos? sre)
+      (and (pair? sre)
+           (case (car sre)
+             ((* +) (sre-any? (sre-sequence (cdr sre))))
+             ((seq : $ submatch => submatch-named)
+              (and (pair? (cdr sre)) (sre-searcher? (cadr sre))))
+             ((or) (every sre-searcher? (cdr sre)))
+             (else #f)))))
+
+;; a consumer doesn't need to match more than once
 (define (sre-consumer? sre)
-  (if (pair? sre)
-      (case (car sre)
-        ((* +) (sre-any? (sre-sequence (cdr sre))))
-        ((seq : $ submatch => submatch-named)
-         (and (pair? (cdr sre)) (sre-consumer? (last sre))))
-        ((or) (every sre-consumer? (cdr sre)))
-        (else #f))
-      (eq? 'eos sre)))
+  (or (sre-bos? sre)
+      (and (pair? sre)
+           (case (car sre)
+             ((* +) (sre-any? (sre-sequence (cdr sre))))
+             ((seq : $ submatch => submatch-named)
+              (and (pair? (cdr sre)) (sre-consumer? (last sre))))
+             ((or) (every sre-consumer? (cdr sre)))
+             (else #f)))))
 
 (define (sre-has-submatches? sre)
   (and (pair? sre)
@@ -3877,18 +3888,17 @@
                     matches)))
             (if (not m)
                 (finish i acc)
-                (let ((j (%irregex-match-end-index m 0)))
-                  (if (= j i)
-                      ;; skip one char forward if we match the empty string
-                      (lp (list str (+ j 1) end) (+ j 1) acc)
-                      (let ((acc (kons i m acc)))
-                        (irregex-reset-matches! matches)
-                        ;; no need to continue looping if this is a
-                        ;; searcher - it's already consumed the only
-                        ;; available match
-                        (if (flag-set? (irregex-flags irx) ~searcher?)
-                            (finish j acc)
-                            (lp (list str j end) j acc)))))))))))
+                (let ((j (%irregex-match-end-index m 0))
+                      (acc (kons i m acc)))
+                  (irregex-reset-matches! matches)
+                  (cond
+                   ((flag-set? (irregex-flags irx) ~consumer?)
+                    (finish j acc))
+                   ((= j i)
+                    ;; skip one char forward if we match the empty string
+                    (lp (list str (+ j 1) end) (+ j 1) acc))
+                   (else
+                    (lp (list str j end) j acc))))))))))
 
 (define (irregex-fold irx kons . args)
   (if (not (procedure? kons)) (error 'irregex-fold "not a procedure" kons))
@@ -3920,10 +3930,7 @@
                           (lp end-src (+ end-index 1) acc))
                       (let ((acc (kons start i m acc)))
                         (irregex-reset-matches! matches)
-                        ;; no need to continue looping if this is a
-                        ;; searcher - it's already consumed the only
-                        ;; available match
-                        (if (flag-set? (irregex-flags irx) ~searcher?)
+                        (if (flag-set? (irregex-flags irx) ~consumer?)
                             (finish end-src end-index acc)
                             (lp end-src end-index acc)))))))))))
 
@@ -3948,11 +3955,15 @@
   (irregex-fold/fast
    irx
    (lambda (i m acc)
-     (let ((m-start (%irregex-match-start-index m 0)))
-       (append (irregex-apply-match m o)
-               (if (>= i m-start)
-                   acc
-                   (cons (substring str i m-start) acc)))))
+     (let* ((m-start (%irregex-match-start-index m 0))
+            (res (if (>= i m-start)
+                     (append (irregex-apply-match m o) acc)
+                     (append (irregex-apply-match m o)
+                             (cons (substring str i m-start) acc)))))
+       ;; include the skipped char on empty matches
+       (if (= i (%irregex-match-end-index m 0))
+           (cons (substring str i (+ i 1)) res)
+           res)))
    '()
    str
    (lambda (i acc)
@@ -4012,9 +4023,14 @@
     (irregex-fold/fast
      irx
      (lambda (i m a)
-       (if (= i (%irregex-match-start-index m 0))
-           a
-           (cons (substring str i (%irregex-match-start-index m 0)) a)))
+       (cond
+        ;; ((= i (%irregex-match-end-index m 0))
+        ;;  ;; empty match, just include the char
+        ;;  (cons (substring str i (+ i 1)) a))
+        ((= i (%irregex-match-start-index m 0))
+         a)
+        (else
+         (cons (substring str i (%irregex-match-start-index m 0)) a))))
      '()
      str
      (lambda (i a)
diff --git a/irregex-utils.scm b/irregex-utils.scm
index a2195a91..291b03ea 100644
--- a/irregex-utils.scm
+++ b/irregex-utils.scm
@@ -121,10 +121,10 @@
            (display "]" out))
           ((- & / ~)
            (cond
-            ((or (eq? #\~ (car x))
+            ((or (eqv? #\~ (car x))
                  (and (eq? '- (car x)) (pair? (cdr x)) (eq? 'any (cadr x))))
              (display "[^" out)
-             (display (cset->string (if (eq? #\~ (car x)) (cdr x) (cddr x))) out)
+             (display (cset->string (if (eqv? #\~ (car x)) (cdr x) (cddr x))) out)
              (display "]" out))
             (else
              (lp `(cset ,@(sre->cset x))))))
diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm
index 8626b82c..1bb63a58 100644
--- a/tests/test-irregex.scm
+++ b/tests/test-irregex.scm
@@ -361,7 +361,10 @@
        (lambda (src i s) (reverse s))))
   (test-equal '("poo poo ")
       (irregex-fold '(* "poo ")
-                    (lambda (i m s) (cons (irregex-match-substring m) s))
+                    (lambda (i m s)
+                      (if (< i (irregex-match-end-index m 0))
+                          (cons (irregex-match-substring m) s)
+                          s))
                     '()
                     "poo poo platter"))
   (test-equal "*  x   "
@@ -388,8 +391,14 @@
   (test-equal "xaac"
       (irregex-replace/all (irregex '(or (seq bos "a") "b") 'dfa)
                            "aaac" "x"))
+  (test-equal "*Line 1\n*Line 2"
+      (irregex-replace/all 'bol "Line 1\nLine 2" "*"))
+  (test-equal "**p*l*a*t*t*e*r"
+      (irregex-replace/all '(* "poo ") "poo poo platter" "*"))
   (test-equal '("foo" " " "foo" " " "b" "a" "r" " " "foo")
       (irregex-extract '(or (: bow "foo" eow) any) "foo foo bar foo"))
+  ;; (test-equal '("f" "o" "o" "b" "a" "r" "b" "a" "z")
+  ;;     (irregex-split (irregex "") "foobarbaz"))
   )
 
 
Trap