~ chicken-core (chicken-5) 01b254c2c8746a9cd88b0fd273329f5b94514b6c


commit 01b254c2c8746a9cd88b0fd273329f5b94514b6c
Author:     Peter Bex <peter@more-magic.net>
AuthorDate: Fri May 14 08:32:48 2021 +0200
Commit:     Mario Domenech Goulart <mario@parenteses.org>
CommitDate: Sun May 16 15:35:41 2021 +0200

    Update irregex to upstream 0.9.9 (rev 3c367082)
    
    This fixes an edge case from upstream bug #24 where replacing empty
    strings with replace/all at the start of a string would incorrectly
    duplicate the starting character, found by Sandra Snan.
    
    Signed-off-by: Mario Domenech Goulart <mario@parenteses.org>

diff --git a/NEWS b/NEWS
index 1e852d90..c5eb0f03 100644
--- a/NEWS
+++ b/NEWS
@@ -6,9 +6,13 @@
   - Fixed a bug where optimisations for `irregex-match?` would cause
     runtime errors due to the inlined specialisations not being
     fully-expanded (see #1690).
-  - Irregex has been updated to upstream 0.9.8, which fixes behaviour
+  - Irregex has been updated to upstream 0.9.9, which fixes behaviour
     of irregex-replace/all with positive lookbehind so all matches are
-    replaced instead of only the first (reported by Kay Rhodes).
+    replaced instead of only the first (reported by Kay Rhodes), and
+    a regression regarding replacing empty matches which was introduced
+    by the fixes in 0.9.7 (reported by Sandra Snan).  Finally, the
+    http-url shorthand now allows any top-level domain and the old
+    "top-level-domain" now also supports "edu" (fixed by Sandra Snan).
   - current-milliseconds has been deprecated in favor of the name
     current-process-milliseconds, to avoid confusion due to naming
     of current-milliseconds versus current-seconds, which do something
diff --git a/irregex-core.scm b/irregex-core.scm
index 42f2a806..8f672333 100644
--- a/irregex-core.scm
+++ b/irregex-core.scm
@@ -1,6 +1,6 @@
 ;;;; irregex.scm -- IrRegular Expressions
 ;;
-;; Copyright (c) 2005-2020 Alex Shinn.  All rights reserved.
+;; Copyright (c) 2005-2021 Alex Shinn.  All rights reserved.
 ;; BSD-style license: http://synthcode.com/license.txt
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -30,6 +30,7 @@
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;; History
+;; 0.9.9: 2021/05/14 - more comprehensive fix for repeated empty matches
 ;; 0.9.8: 2020/07/13 - fix irregex-replace/all with look-behind patterns
 ;; 0.9.7: 2019/12/31 - more intuitive handling of empty matches in -fold,
 ;;                     -replace and -split
@@ -2348,8 +2349,8 @@
     (domain . (seq domain-atom (+ #\. domain-atom)))
     ;; XXXX now anything can be a top-level domain, but this is still handy
     (top-level-domain . (w/nocase (or "arpa" "com" "gov" "mil" "net" "org"
-                                      "aero" "biz" "coop" "info" "museum"
-                                      "name" "pro" (= 2 alpha))))
+                                      "edu" "aero" "biz" "coop" "info"
+				      "museum" "name" "pro" (= 2 alpha))))
     (domain/common . (seq (+ domain-atom #\.) top-level-domain))
     ;;(email-local-part . (seq (+ (or (~ #\") string))))
     (email-local-part . (+ (or alphanumeric #\_ #\- #\. #\+)))
@@ -2360,7 +2361,7 @@
                           (seq "%" hex-digit hex-digit)))
     (http-url . (w/nocase
                  "http" (? "s") "://"
-                 (or domain/common ipv4-address) ;; (seq "[" ipv6-address "]")
+                 (or domain ipv4-address) ;; (seq "[" ipv6-address "]")
                  (? ":" (+ numeric)) ;; port
                  ;; path
                  (? "/" (* (or url-char "/"))
@@ -3889,9 +3890,9 @@
     (if (not (and (integer? end) (exact? end)))
         (error 'irregex-fold "not an exact integer" end))
     (irregex-match-chunker-set! matches irregex-basic-string-chunker)
-    (let lp ((src init-src) (i start) (acc knil))
+    (let lp ((src init-src) (from start) (i start) (acc knil))
       (if (>= i end)
-          (finish i acc)
+          (finish from acc)
           (let ((m (irregex-search/matches
                     irx
                     irregex-basic-string-chunker
@@ -3900,18 +3901,18 @@
                     i
                     matches)))
             (if (not m)
-                (finish i acc)
+                (finish from acc)
                 (let ((j (%irregex-match-end-index m 0))
-                      (acc (kons i m acc)))
+                      (acc (kons from m acc)))
                   (irregex-reset-matches! matches)
                   (cond
                    ((flag-set? (irregex-flags irx) ~consumer?)
                     (finish j acc))
                    ((= j i)
                     ;; skip one char forward if we match the empty string
-                    (lp (list str (+ j 1) end) (+ j 1) acc))
+                    (lp (list str j end) j (+ j 1) acc))
                    (else
-                    (lp (list str j end) j acc))))))))))
+                    (lp (list str j end) j j acc))))))))))
 
 (define (irregex-fold irx kons . args)
   (if (not (procedure? kons)) (error 'irregex-fold "not a procedure" kons))
@@ -3968,15 +3969,11 @@
   (irregex-fold/fast
    irx
    (lambda (i m acc)
-     (let* ((m-start (%irregex-match-start-index m 0))
-            (res (if (>= i m-start)
-                     (append (irregex-apply-match m o) acc)
-                     (append (irregex-apply-match m o)
-                             (cons (substring str i m-start) acc)))))
-       ;; include the skipped char on empty matches
-       (if (= i (%irregex-match-end-index m 0))
-           (cons (substring str i (+ i 1)) res)
-           res)))
+     (let ((m-start (%irregex-match-start-index m 0)))
+       (if (>= i m-start)
+           (append (irregex-apply-match m o) acc)
+           (append (irregex-apply-match m o)
+                   (cons (substring str i m-start) acc)))))
    '()
    str
    (lambda (i acc)
@@ -4037,9 +4034,6 @@
      irx
      (lambda (i m a)
        (cond
-        ((= i (%irregex-match-end-index m 0))
-         ;; empty match, include the skipped char to rejoin in finish
-         (cons (string-ref str i) a))
         ((= i (%irregex-match-start-index m 0))
          a)
         (else
diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm
index 18582809..f1aefc21 100644
--- a/tests/test-irregex.scm
+++ b/tests/test-irregex.scm
@@ -400,7 +400,7 @@
        (rope "bob@test.com and fred@example.com")
        (lambda (src i s) (reverse s))))
   (test-equal '("poo poo ")
-      (irregex-fold '(* "poo ")
+      (irregex-fold '(+ "poo ")
                     (lambda (i m s)
                       (if (< i (irregex-match-end-index m 0))
                           (cons (irregex-match-substring m) s)
@@ -438,6 +438,9 @@
   (test-equal "x- y- z-"
       (irregex-replace/all '(: (look-behind (or "x" "y" "z")) "a")
                            "xa ya za"  "-"))
+  (test-equal "any gosh darned string"
+	(irregex-replace/all '(: bos (* whitespace))
+                             "any gosh darned string"  ""))
   (test-equal '("foo" " " "foo" " " "b" "a" "r" " " "foo")
       (irregex-extract '(or (: bow "foo" eow) any) "foo foo bar foo"))
   (test-equal '("f" "o" "o" "b" "a" "r" "b" "a" "z")
Trap