~ chicken-core (master) d9284d5795e5d360348ea77bb2362e544ede2435
commit d9284d5795e5d360348ea77bb2362e544ede2435
Author: felix <felix@call-with-current-continuation.org>
AuthorDate: Sun May 10 21:57:22 2026 +0200
Commit: felix <felix@call-with-current-continuation.org>
CommitDate: Sun May 10 21:57:22 2026 +0200
make utf8->string R7RS conforming, add bytes->string as non-validating variant.
Also ensure validation in the latter primitive will validate only the selected region.
(thanks to "nmeum" for reporting this)
diff --git a/chicken.h b/chicken.h
index 6945fcd1..73651e93 100644
--- a/chicken.h
+++ b/chicken.h
@@ -1943,7 +1943,7 @@ C_fctexport int C_utf_isdigit(int c) C_regparm;
C_fctexport int C_utf_isalpha(int c) C_regparm;
C_fctexport int C_utf_isupper(int c) C_regparm;
C_fctexport int C_utf_islower(int c) C_regparm;
-C_fctexport C_word C_utf_validate(C_word bv, C_word blen) C_regparm;
+C_fctexport C_word C_utf_validate(C_word bv, C_word blen, C_word start, C_word end) C_regparm;
C_fctexport C_word C_latin_to_utf(C_word from, C_word to, C_word start, C_word len) C_regparm;
C_fctexport C_word C_utf_to_latin(C_word from, C_word to, C_word start, C_word len) C_regparm;
C_fctexport C_word C_utf_char_foldcase(C_word c) C_regparm;
diff --git a/library.scm b/library.scm
index f2b87fed..2436f7ef 100644
--- a/library.scm
+++ b/library.scm
@@ -3453,7 +3453,7 @@ EOF
make-bytevector bytevector bytevector-u8-ref
bytevector-u8-set! bytevector-copy bytevector-copy!
bytevector-append utf8->string string->utf8
- latin1->string string->latin1)
+ latin1->string string->latin1 bytes->string)
(import scheme (chicken foreign))
@@ -3484,12 +3484,24 @@ EOF
(##core#inline "C_copy_memory" bv sbv n)
bv) )
-(define (utf8->string bv #!optional (validate #t))
+(define (utf8->string bv #!optional (start 0) end)
(##sys#check-bytevector bv 'utf8->string)
- (if (and validate (not (##core#inline "C_utf_validate" bv (##sys#size bv))))
- (##sys#error-hook (foreign-value "C_DECODING_ERROR" int)
- 'utf8->string bv))
- (##sys#buffer->string bv 0 (##sys#size bv)))
+ (let* ((n (##sys#size bv))
+ (to (or end n)))
+ (if end
+ (##sys#check-range/including end 0 n 'utf8->string))
+ (if (not (##core#inline "C_utf_validate" bv (##sys#size bv) start to))
+ (##sys#error-hook (foreign-value "C_DECODING_ERROR" int)
+ 'utf8->string bv))
+ (##sys#buffer->string bv start (##core#inline "C_fixnum_difference" end start))))
+
+(define (bytes->string bv #!optional (start 0) end)
+ (##sys#check-bytevector bv 'bytes->string)
+ (let* ((n (##sys#size bv))
+ (to (or end n)))
+ (if end
+ (##sys#check-range/including end 0 n 'bytes->string))
+ (##sys#buffer->string bv start (##core#inline "C_fixnum_difference" end start))))
(define (string->latin1 s)
(##sys#check-string s 'string->latin1)
diff --git a/manual/Acknowledgements b/manual/Acknowledgements
index 063093d6..03e05cfc 100644
--- a/manual/Acknowledgements
+++ b/manual/Acknowledgements
@@ -49,7 +49,7 @@ Spencer Schumann, Ivan Shcheklein, Alexander Shendi, Alex Shinn, Ivan
Shmakov, "Shmul", "siiky", Tony Sidaway, Jeffrey B. Siegal, Andrey Sidorenko,
Michele Simionato, Iruata Souza, Volker Stolz, Jon Strait, Dorai Sitaram,
Robert Skeels, Sandra Snan, Jason Songhurst, Clifford Stein, David Steiner,
-"Sunnan", Zbigniew Szadkowski, Rick Taube, Nathan Thern, Mike Thomas, Minh
+"Sunnan", Zbigniew Szadkowski, Rick Taube, Sören Tempel, Nathan Thern, Mike Thomas, Minh
Thu, Christian Tismer, Andre van Tonder, John Tobey, Henrik Tramberend,
Vladimir Tsichevsky, James Ursetto, Neil van Dyke, Sam Varner,
Taylor Venable, Sander Vesik, Jaques Vidrine, Panagiotis Vossos,
diff --git a/manual/Module (chicken bytevector) b/manual/Module (chicken bytevector)
index e314d090..886e9f2b 100644
--- a/manual/Module (chicken bytevector)
+++ b/manual/Module (chicken bytevector)
@@ -47,11 +47,15 @@ should be a fixnum.
=== utf8->string
-<procedure>(utf8->string BYTEVECTOR [VALIDATE])</procedure>
+<procedure>(utf8->string BYTEVECTOR [START END])</procedure>
-Returns a string with the contents of {{BYTEVECTOR}}. if {{VALIDATE}}
-is given and false, then invalidly
-encoded characters do not signal an error - byte-sequences that are not representing
+Returns a string with the contents of {{BYTEVECTOR}}. Invalid encodings
+for characters signal an error.
+
+<procedure>(bytes->string BYTEVECTOR [START END])</procedure>
+
+Similar to {{utf8->string}}, but accepts any byte sequence without
+validating the contents. Byte-sequences that are not representing
valid UTF-8 characters are retained and, if extracted with {{string-ref}}
are converted to a trailing surrogate pair half in the range U+DC80 to U+DCFF.
diff --git a/modules.scm b/modules.scm
index 5edda53d..801e8a04 100644
--- a/modules.scm
+++ b/modules.scm
@@ -1285,6 +1285,7 @@
(error-object-irritants . scheme#error-object-irritants)
(string->utf8 . chicken.bytevector#string->utf8)
(utf8->string . chicken.bytevector#utf8->string)
+ (bytes->string . chicken.bytevector#bytes->string)
(write-bytevector . chicken.io#write-bytevector)
(bytevector . chicken.bytevector#bytevector)
(bytevector-length . chicken.bytevector#bytevector-length)
diff --git a/tests/unicode-tests.scm b/tests/unicode-tests.scm
index 2f56d6cd..3872a4bc 100644
--- a/tests/unicode-tests.scm
+++ b/tests/unicode-tests.scm
@@ -103,8 +103,9 @@
'("列" "字" "文" "→" "ト" "ス" "リ" "逆"))))
(test-error (utf8->string #u8(255 1 2)))
-(test-assert (utf8->string #u8(255 1 2) #f))
-(test-equal (string-length (utf8->string #u8(255 1 2) #f)) 3)
+(test-equal "BC" (utf8->string #u8(65 66 67) 1 3))
+(test-assert (bytes->string #u8(255 1 2)))
+(test-equal (string-length (bytes->string #u8(255 1 2))) 3)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; extras
diff --git a/types.db b/types.db
index b9a68401..b657e6a5 100644
--- a/types.db
+++ b/types.db
@@ -1174,7 +1174,8 @@
(chicken.bytevector#bytevector=? (#(procedure #:clean #:enforce #:foldable) chicken.bytevector#bytevector=? (bytevector bytevector) boolean))
(chicken.bytevector#bytevector-length (#(procedure #:clean #:enforce #:foldable) chicken.bytevector#bytevector-size (bytevector) fixnum)
((bytevector) (##sys#size #(1))))
-(chicken.bytevector#utf8->string (#(procedure #:clean #:enforce) chicken.bytevector#utf8->string (bytevector #!optional boolean) string))
+(chicken.bytevector#utf8->string (#(procedure #:clean #:enforce) chicken.bytevector#utf8->string (bytevector #!optional fixnum fixnum) string))
+(chicken.bytevector#bytes->string (#(procedure #:clean #:enforce) chicken.bytevector#bytes->string (bytevector #!optional fixnum fixnum) string))
(chicken.bytevector#latin1->string (#(procedure #:clean #:enforce) chicken.bytevector#latin1->string (bytevector) string))
(chicken.bytevector#make-bytevector (#(procedure #:clean #:enforce) chicken.bytevector#make-bytevector (fixnum #!optional fixnum) bytevector)
((fixnum) (##sys#make-bytevector #(1))))
diff --git a/utf.c b/utf.c
index 19b317f4..73fe8d1d 100644
--- a/utf.c
+++ b/utf.c
@@ -3509,13 +3509,13 @@ C_regparm int C_utf_count(C_char *s, int len)
}
/* Count characters - slow variant, detects invalid sequences */
-C_regparm C_word C_utf_validate(C_word bv, C_word blen)
+C_regparm C_word C_utf_validate(C_word bv, C_word blen, C_word start, C_word end)
{
int i = 0;
C_u32 c;
int e;
- C_char *s = C_c_string(bv), *s2;
- int len = C_unfix(blen);
+ C_char *s = C_c_string(bv) + C_unfix(start), *s2;
+ int len = C_unfix(end) - C_unfix(start);
while (len > 0) {
s2 = utf8_decode(s, &c, &e);
if(e) return C_SCHEME_FALSE;
Trap