~ chicken-core (chicken-5) a2dad11f9fbeaa2505bc123cc4e7a707d9190174
commit a2dad11f9fbeaa2505bc123cc4e7a707d9190174 Author: Peter Bex <peter@more-magic.net> AuthorDate: Tue Apr 23 08:26:07 2024 +0200 Commit: felix <felix@call-with-current-continuation.org> CommitDate: Tue Apr 23 17:51:01 2024 +0200 Bump irregex to upstream commit 923cfc39, which is 0.9.11 plus a bugfix Signed-off-by: felix <felix@call-with-current-continuation.org> diff --git a/NEWS b/NEWS index 6b09db47..b1bf9e1c 100644 --- a/NEWS +++ b/NEWS @@ -37,6 +37,11 @@ an `errno' property. - Deprecated "chicken-home" and added "include-path" in the chicken.platform module. + - Irregex has been updated to upstream 0.9.11 plus an additional fix + for sre->string. The 0.9.11 release fixes a few problems related to + utf-8 handling (which should not affect CHICKEN) and expands the + definition for the 'whitespace character set to include vertical tab, + carriage return and form feed. - Tools - The -R option for csi and csc now accepts list-notation like diff --git a/irregex-core.scm b/irregex-core.scm index 55e9a6c0..5550ace8 100644 --- a/irregex-core.scm +++ b/irregex-core.scm @@ -1,6 +1,6 @@ ;;;; irregex.scm -- IrRegular Expressions ;; -;; Copyright (c) 2005-2021 Alex Shinn. All rights reserved. +;; Copyright (c) 2005-2024 Alex Shinn. All rights reserved. ;; BSD-style license: http://synthcode.com/license.txt ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -30,6 +30,7 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; History +;; 0.9.11: 2024/02/23 - Guile test and packaging support from Tomas Volf. 
;; 0.9.10: 2021/07/06 - fixes for submatches under kleene star, empty seqs ;; in alternations, and bol in folds for backtracking ;; matcher (thanks John Clements and snan for reporting @@ -425,7 +426,12 @@ ;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223)))) ;; set to #f to ignore even an explicit request for utf8 handling -(define *allow-utf8-mode?* #t) +;; The utf8-mode is undesired on any implementation with native unicode support. +;; It is a workaround for those that treat strings as raw byte sequences, and +;; does not work well otherwise. So disable it on implementations known to +;; handle unicode natively. +(define *allow-utf8-mode?* (cond-expand ((and chicken (not full-unicode)) #t) + (else #f))) ;; (define *named-char-properties* '()) @@ -1568,8 +1574,8 @@ (cons (car sre) (map rec (cdr sre)))))) (else (case sre - ((any) 'utf8-any) - ((nonl) 'utf8-nonl) + ((any) (if utf8? 'utf8-any 'any)) + ((nonl) (if utf8? 'utf8-nonl 'nonl)) (else (if (and utf8? (char? sre) (high-char? sre)) (sre-sequence (map integer->char (char->utf8-list sre))) @@ -2292,10 +2298,11 @@ . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~)) (graph . graphic) (blank . (or #\space ,(integer->char (- (char->integer #\space) 23)))) - (whitespace . (or blank #\newline)) + ;; 0B - vertical tab, 0C - form feed + (whitespace . (or blank #\newline #\x0C #\return #\x0B)) (space . whitespace) (white . whitespace) - (printing or graphic whitespace) + (printing . (or graphic whitespace)) (print . printing) ;; XXXX we assume a (possibly shifted) ASCII-based ordering diff --git a/irregex-utils.scm b/irregex-utils.scm index 291b03ea..37313666 100644 --- a/irregex-utils.scm +++ b/irregex-utils.scm @@ -104,7 +104,7 @@ (display ")" out)) ((* + ? *? ??) (cond - ((pair? (cddr x)) + ((or (pair? (cddr x)) (and (string?
(cadr x)) (not (= 1 (string-length (cadr x)))))) (display "(?:" out) (for-each lp (cdr x)) (display ")" out)) (else (lp (cadr x)))) (display (car x) out)) diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm index 0888f09b..8c0464ad 100644 --- a/tests/test-irregex.scm +++ b/tests/test-irregex.scm @@ -419,6 +419,12 @@ (test-equal "***x***" (irregex-replace/all (irregex '(: #\space) 'dfa) " x " "*")) + (test-equal "A:42" + (irregex-replace/all "^" "42" "A:")) + (test-equal "A:42" + (irregex-replace/all 'bos "42" "A:")) + (test-equal "A:42" + (irregex-replace/all 'bol "42" "A:")) (test-equal "xaac" (irregex-replace/all (irregex '(or (seq bos "a") (seq bos "b")) 'backtrack) "aaac" "x")) @@ -458,6 +464,15 @@ ) +(test-group "parsing" + (test-equal "c+" (sre->string '(+ "c"))) + (test-equal "(?:abc)+" (sre->string '(+ "abc"))) + (test-equal "(?:abc|def)+" (sre->string '(+ (or "abc" "def")))) + (test-equal '(+ #\c) (string->sre "c+")) + (test-equal '(+ "abc") (string->sre "(?:abc)+")) + (test-equal '(+ (or "abc" "def")) (string->sre "(?:abc|def)+")) + ) + (define (extract name irx str) (irregex-match-substring (irregex-match irx str) name)) (define (valid? name irx str)