~ chicken-core (chicken-5) a2dad11f9fbeaa2505bc123cc4e7a707d9190174
commit a2dad11f9fbeaa2505bc123cc4e7a707d9190174
Author: Peter Bex <peter@more-magic.net>
AuthorDate: Tue Apr 23 08:26:07 2024 +0200
Commit: felix <felix@call-with-current-continuation.org>
CommitDate: Tue Apr 23 17:51:01 2024 +0200
Bump irregex to upstream commit 923cfc39, which is 0.9.11 plus a bugfix
Signed-off-by: felix <felix@call-with-current-continuation.org>
diff --git a/NEWS b/NEWS
index 6b09db47..b1bf9e1c 100644
--- a/NEWS
+++ b/NEWS
@@ -37,6 +37,11 @@
an `errno' property.
- Deprecated "chicken-home" and added "include-path" in the
chicken.platform module.
+ - Irregex has been updated to upstream 0.9.11 plus an additional fix
+ for sre->string. The 0.9.11 release fixes a few problems related to
+ utf-8 handling (which should not affect CHICKEN) and expands the
+ definition for the 'whitespace character set to include vertical tab,
+ carriage return and form feed.
- Tools
- The -R option for csi and csc now accepts list-notation like
diff --git a/irregex-core.scm b/irregex-core.scm
index 55e9a6c0..5550ace8 100644
--- a/irregex-core.scm
+++ b/irregex-core.scm
@@ -1,6 +1,6 @@
;;;; irregex.scm -- IrRegular Expressions
;;
-;; Copyright (c) 2005-2021 Alex Shinn. All rights reserved.
+;; Copyright (c) 2005-2024 Alex Shinn. All rights reserved.
;; BSD-style license: http://synthcode.com/license.txt
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -30,6 +30,7 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;; History
+;; 0.9.11: 2024/02/23 - Guile test and packaging support from Tomas Volf.
;; 0.9.10: 2021/07/06 - fixes for submatches under kleene star, empty seqs
;; in alternations, and bol in folds for backtracking
;; matcher (thanks John Clements and snan for reporting
@@ -425,7 +426,12 @@
;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223))))
;; set to #f to ignore even an explicit request for utf8 handling
-(define *allow-utf8-mode?* #t)
+;; The utf8-mode is undesired on any implementation with native unicode support.
+;; It is a workaround for those that treat strings as raw byte sequences, and
+;; does not work well otherwise. So disable it on implementations known to
+;; handle unicode natively.
+(define *allow-utf8-mode?* (cond-expand ((and chicken (not full-unicode)) #t) ; #t only when the `chicken' feature is present and `full-unicode' is not
+ (else #f))) ; every other feature combination yields #f
;; (define *named-char-properties* '())
@@ -1568,8 +1574,8 @@
(cons (car sre) (map rec (cdr sre))))))
(else
(case sre
- ((any) 'utf8-any)
- ((nonl) 'utf8-nonl)
+ ((any) (if utf8? 'utf8-any 'any))
+ ((nonl) (if utf8? 'utf8-nonl 'nonl))
(else
(if (and utf8? (char? sre) (high-char? sre))
(sre-sequence (map integer->char (char->utf8-list sre)))
@@ -2292,10 +2298,11 @@
. (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~))
(graph . graphic)
(blank . (or #\space ,(integer->char (- (char->integer #\space) 23))))
- (whitespace . (or blank #\newline))
+ ;; 0B - vertical tab, 0C - form feed
+ (whitespace . (or blank #\newline #\x0C #\return #\x0B))
(space . whitespace)
(white . whitespace)
- (printing or graphic whitespace)
+ (printing . (or graphic whitespace))
(print . printing)
;; XXXX we assume a (possibly shifted) ASCII-based ordering
diff --git a/irregex-utils.scm b/irregex-utils.scm
index 291b03ea..37313666 100644
--- a/irregex-utils.scm
+++ b/irregex-utils.scm
@@ -104,7 +104,7 @@
(display ")" out))
((* + ? *? ??)
(cond
- ((pair? (cddr x))
+ ((or (pair? (cddr x)) (and (string? (cadr x)) (not (= 1 (string-length (cadr x))))))
(display "(?:" out) (for-each lp (cdr x)) (display ")" out))
(else (lp (cadr x))))
(display (car x) out))
diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm
index 0888f09b..8c0464ad 100644
--- a/tests/test-irregex.scm
+++ b/tests/test-irregex.scm
@@ -419,6 +419,12 @@
(test-equal "***x***"
(irregex-replace/all
(irregex '(: #\space) 'dfa) " x " "*"))
+ (test-equal "A:42"
+ (irregex-replace/all "^" "42" "A:"))
+ (test-equal "A:42"
+ (irregex-replace/all 'bos "42" "A:"))
+ (test-equal "A:42"
+ (irregex-replace/all 'bol "42" "A:"))
(test-equal "xaac"
(irregex-replace/all
(irregex '(or (seq bos "a") (seq bos "b")) 'backtrack) "aaac" "x"))
@@ -458,6 +464,15 @@
)
+(test-group "parsing" ; conversions between SRE forms and regex strings, in both directions
+ (test-equal "c+" (sre->string '(+ "c"))) ; one-character string operand: expected output has no group
+ (test-equal "(?:abc)+" (sre->string '(+ "abc"))) ; longer string operand: expected output wraps it in (?:...) before the +
+ (test-equal "(?:abc|def)+" (sre->string '(+ (or "abc" "def")))) ; alternation operand is expected to be grouped as well
+ (test-equal '(+ #\c) (string->sre "c+")) ; reverse direction: the single character is expected as a char object, not a string
+ (test-equal '(+ "abc") (string->sre "(?:abc)+")) ; the non-capturing group is expected to leave no trace in the SRE
+ (test-equal '(+ (or "abc" "def")) (string->sre "(?:abc|def)+")) ; alternation inside the group is expected to become an `or' form
+ )
+
;; Test helper: run irregex-match with IRX on STR and pass the result, together
;; with NAME, to irregex-match-substring; that call's value is returned.
(define (extract name irx str)
(irregex-match-substring (irregex-match irx str) name))
(define (valid? name irx str)
Trap