~ chicken-core (chicken-5) d720a5d15cc0fb0619c70ccc26eb888171744a7f
commit d720a5d15cc0fb0619c70ccc26eb888171744a7f
Author: felix <felix@call-with-current-continuation.org>
AuthorDate: Tue Aug 3 01:16:11 2010 +0200
Commit: felix <felix@call-with-current-continuation.org>
CommitDate: Tue Aug 3 01:16:11 2010 +0200
Revert "Merge branch 'total-irregex' of ssh://chicken@galinha.ucpel.tche.br/~/chicken-core into experimental"
This reverts commit c53dcbfd42c6baf325538aa312c3364bd5d3b88f.
Well, this can happen to the best of us ... (that was a joke)
diff --git a/chicken-install.scm b/chicken-install.scm
index eb67238c..6150ef27 100644
--- a/chicken-install.scm
+++ b/chicken-install.scm
@@ -25,13 +25,13 @@
(require-library setup-download setup-api)
-(require-library srfi-1 posix data-structures utils irregex ports extras srfi-13 files)
+(require-library srfi-1 posix data-structures utils regex ports extras srfi-13 files)
(require-library chicken-syntax) ; in case an import library reexports chicken syntax
(require-library chicken-ffi-syntax) ; same reason, also for filling modules.db
(module main ()
- (import scheme chicken srfi-1 posix data-structures utils irregex ports extras
+ (import scheme chicken srfi-1 posix data-structures utils regex ports extras
srfi-13 files)
(import setup-download setup-api)
@@ -51,6 +51,7 @@
"srfi-13.import.so"
"srfi-69.import.so"
"extras.import.so"
+ "regex.import.so"
"srfi-14.import.so"
"tcp.import.so"
"foreign.import.so"
@@ -490,17 +491,17 @@
(let* ((files (glob (make-pathname (repository-path) "*.import.*")))
(tmpdir (create-temporary-directory))
(dbfile (make-pathname tmpdir +module-db+))
- (rx (irregex ".*/([^/]+)\\.import\\.(scm|so)")))
+ (rx (regexp ".*/([^/]+)\\.import\\.(scm|so)")))
(print "loading import libraries ...")
(fluid-let ((##sys#warnings-enabled #f))
(for-each
(lambda (f)
- (let ((m (irregex-match rx f)))
+ (let ((m (string-match rx f)))
(handle-exceptions ex
(print-error-message
ex (current-error-port)
(sprintf "Failed to import from `~a'" f))
- (eval `(import ,(string->symbol (irregex-match-substring m 1)))))))
+ (eval `(import ,(string->symbol (cadr m)))))))
files))
(print "generating database")
(let ((db
@@ -586,10 +587,10 @@ EOF
(define (setup-proxy uri)
(if (string? uri)
- (cond ((irregex-match "(.+)\\:([0-9]+)" uri) =>
+ (cond ((string-match "(.+)\\:([0-9]+)" uri) =>
(lambda (m)
- (set! *proxy-host* (irregex-match-substring m 1))
- (set! *proxy-port* (string->number (irregex-match-substring m 2))))
+ (set! *proxy-host* (cadr m))
+ (set! *proxy-port* (string->number (caddr m))))
(else
(set! *proxy-host* uri)
(set! *proxy-port* 80))))))
@@ -598,7 +599,7 @@ EOF
(define (main args)
(let ((update #f)
- (rx (irregex "([^:]+):(.+)")))
+ (rx (regexp "([^:]+):(.+)")))
(setup-proxy (get-environment-variable "http_proxy"))
(let loop ((args args) (eggs '()))
(cond ((null? args)
@@ -740,14 +741,9 @@ EOF
"")
*eggs+dirs+vers*))
(loop (cdr args) (cons egg eggs))))
- ((irregex-match rx arg) =>
+ ((string-match rx arg) =>
(lambda (m)
- (loop
- (cdr args)
- (alist-cons
- (irregex-match-substring m 1)
- (irregex-match-substring m 2)
- eggs))))
+ (loop (cdr args) (alist-cons (cadr m) (caddr m) eggs))))
(else (loop (cdr args) (cons arg eggs))))))))))
(register-feature! 'chicken-install)
diff --git a/chicken-status.scm b/chicken-status.scm
index cbd246c3..5222ebb0 100644
--- a/chicken-status.scm
+++ b/chicken-status.scm
@@ -24,13 +24,13 @@
; POSSIBILITY OF SUCH DAMAGE.
-(require-library setup-api srfi-1 posix data-structures utils ports irregex files)
+(require-library setup-api srfi-1 posix data-structures utils ports regex files)
(module main ()
(import scheme chicken foreign)
- (import srfi-1 posix data-structures utils ports irregex
+ (import srfi-1 posix data-structures utils ports regex
files setup-api)
(define-foreign-variable C_TARGET_LIB_HOME c-string)
@@ -45,9 +45,6 @@
(make-pathname C_TARGET_LIB_HOME (sprintf "chicken/~a" C_BINARY_VERSION))
(repository-path)))
- (define (grep rx lst)
- (filter (cut irregex-search rx <>) lst))
-
(define (gather-eggs patterns)
(let ((eggs (map pathname-file
(glob (make-pathname (repo-path) "*" "setup-info")))))
@@ -125,10 +122,11 @@ EOF
(lambda ()
(let* ((patterns
(map
- irregex
+ regexp
(cond ((null? pats) '(".*"))
+ ;;XXX change for total-irregex branch:
(exact (map (lambda (p)
- (string-append "^" (irregex-quote p) "$"))
+ (string-append "^" (regexp-escape p) "$"))
pats))
(else pats))))
(eggs (gather-eggs patterns)))
diff --git a/chicken-uninstall.scm b/chicken-uninstall.scm
index bd6b35ee..8a074b4d 100644
--- a/chicken-uninstall.scm
+++ b/chicken-uninstall.scm
@@ -26,14 +26,14 @@
(require-library
setup-api
- srfi-1 posix data-structures utils ports irregex srfi-13 files)
+ srfi-1 posix data-structures utils ports regex srfi-13 files)
(module main ()
(import scheme chicken foreign)
(import setup-api)
- (import srfi-1 posix data-structures utils ports irregex srfi-13 files)
+ (import srfi-1 posix data-structures utils ports regex srfi-13 files)
(define-foreign-variable C_TARGET_LIB_HOME c-string)
(define-foreign-variable C_BINARY_VERSION int)
@@ -49,9 +49,6 @@
(define *force* #f)
- (define (grep rx lst)
- (filter (cut irregex-search rx <>) lst))
-
(define (gather-eggs patterns)
(let ((eggs (map pathname-file
(glob (make-pathname (repo-path) "*" "setup-info")))))
@@ -120,8 +117,8 @@ EOF
(map
(lambda (p)
(if exact
- (irregex (string-append "^" (irregex-quote p) "$"))
- (##sys#glob->regexp p)))
+ (regexp (string-append "^" (regexp-escape p) "$"))
+ (glob->regexp p)))
pats))))
(else
(let ((arg (car args)))
diff --git a/compiler.scm b/compiler.scm
index 5d064d2f..435909b0 100644
--- a/compiler.scm
+++ b/compiler.scm
@@ -1251,12 +1251,10 @@
(##sys#strip-syntax x se))
(define stripu ##sys#strip-syntax)
(define (globalize sym)
- (if (symbol? sym)
- (let loop ((se se)) ; ignores syntax bindings
- (cond ((null? se) (##sys#alias-global-hook sym #f))
- ((and (eq? sym (caar se)) (symbol? (cdar se))) (cdar se))
- (else (loop (cdr se)))))
- sym))
+ (let loop ((se se)) ; ignores syntax bindings
+ (cond ((null? se) (##sys#alias-global-hook sym #f))
+ ((and (eq? sym (caar se)) (symbol? (cdar se))) (cdar se))
+ (else (loop (cdr se))))))
(define (globalize-all syms) (map globalize syms))
(call-with-current-continuation
(lambda (return)
diff --git a/defaults.make b/defaults.make
index 4564b952..e4e8537b 100644
--- a/defaults.make
+++ b/defaults.make
@@ -293,7 +293,7 @@ CSI ?= csi$(EXE)
# Scheme compiler flags
-CHICKEN_OPTIONS = -optimize-level 2 -include-path . -include-path $(SRCDIR) -inline -ignore-repository -feature building-chicken
+CHICKEN_OPTIONS = -optimize-level 2 -include-path . -include-path $(SRCDIR) -inline -ignore-repository
ifdef DEBUGBUILD
CHICKEN_OPTIONS += -feature debugbuild -scrutinize -types $(SRCDIR)types.db
else
@@ -321,9 +321,12 @@ CHICKEN_INSTALL_PROGRAM = $(PROGRAM_PREFIX)chicken-install$(PROGRAM_SUFFIX)
CHICKEN_UNINSTALL_PROGRAM = $(PROGRAM_PREFIX)chicken-uninstall$(PROGRAM_SUFFIX)
CHICKEN_STATUS_PROGRAM = $(PROGRAM_PREFIX)chicken-status$(PROGRAM_SUFFIX)
CHICKEN_BUG_PROGRAM = $(PROGRAM_PREFIX)chicken-bug$(PROGRAM_SUFFIX)
-IMPORT_LIBRARIES = chicken lolevel srfi-1 srfi-4 data-structures ports files posix srfi-13 srfi-69 extras srfi-14 tcp foreign scheme srfi-18 utils csi irregex
+IMPORT_LIBRARIES = chicken lolevel srfi-1 srfi-4 data-structures ports files posix srfi-13 srfi-69 extras \
+ regex srfi-14 tcp foreign scheme srfi-18 utils csi irregex
IMPORT_LIBRARIES += setup-api setup-download
-SCRUTINIZED_LIBRARIES = library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 srfi-14 srfi-18 srfi-69 $(POSIXFILE) irregex scheduler profiler stub expand chicken-syntax chicken-ffi-syntax
+SCRUTINIZED_LIBRARIES = library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 \
+ srfi-14 srfi-18 srfi-69 $(POSIXFILE) regex scheduler \
+ profiler stub expand chicken-syntax chicken-ffi-syntax
ifdef STATICBUILD
CHICKEN_STATIC_EXECUTABLE = $(CHICKEN_PROGRAM)$(EXE)
diff --git a/distribution/manifest b/distribution/manifest
index 01ce9249..affd61e9 100644
--- a/distribution/manifest
+++ b/distribution/manifest
@@ -28,7 +28,7 @@ optimizer.c
compiler-syntax.c
scrutinizer.c
unboxing.c
-irregex.c
+regex.c
posixunix.c
posixwin.c
profiler.c
@@ -77,9 +77,8 @@ optimizer.scm
compiler-syntax.scm
scrutinizer.scm
unboxing.scm
+regex.scm
irregex.scm
-irregex-core.scm
-irregex-utils.scm
posixunix.scm
posixwin.scm
posix-common.scm
@@ -209,6 +208,7 @@ posix.import.scm
srfi-13.import.scm
srfi-69.import.scm
extras.import.scm
+regex.import.scm
irregex.import.scm
srfi-14.import.scm
tcp.import.scm
@@ -222,6 +222,7 @@ posix.import.c
srfi-13.import.c
srfi-69.import.c
extras.import.c
+regex.import.c
irregex.import.c
srfi-14.import.c
tcp.import.c
@@ -287,7 +288,7 @@ manual/Unit library
manual/Unit lolevel
manual/Unit ports
manual/Unit posix
-manual/Unit irregex
+manual/Unit regex
manual/Unit srfi-1
manual/Unit srfi-13
manual/Unit srfi-14
diff --git a/eval.scm b/eval.scm
index 69e2bd03..ff35cfaf 100644
--- a/eval.scm
+++ b/eval.scm
@@ -55,7 +55,7 @@
(define-foreign-variable binary-version int "C_BINARY_VERSION")
(define ##sys#core-library-modules
- '(extras lolevel utils files tcp irregex posix srfi-1 srfi-4 srfi-13
+ '(extras lolevel utils files tcp regex posix srfi-1 srfi-4 srfi-13
srfi-14 srfi-18 srfi-69 data-structures ports chicken-syntax
chicken-ffi-syntax))
diff --git a/files.scm b/files.scm
index 1ef2eda2..2c1c167f 100644
--- a/files.scm
+++ b/files.scm
@@ -36,7 +36,7 @@
(declare
(unit files)
- (uses irregex data-structures)
+ (uses regex data-structures)
(fixnum)
(hide chop-pds absolute-pathname-root root-origin root-directory split-directory)
(disable-interrupts)
@@ -172,19 +172,20 @@ EOF
(define root-origin)
(define root-directory)
-(if ##sys#windows-platform
- (let ((rx (irregex "([A-Za-z]:)?([\\/\\\\]).*")))
- (set! absolute-pathname-root (lambda (pn) (irregex-match rx pn)))
- (set! root-origin (lambda (rt) (and rt (irregex-match-substring rt 1))))
- (set! root-directory (lambda (rt) (and rt (irregex-match-substring rt 2)))) )
- (let ((rx (irregex "([\\/\\\\]).*")))
- (set! absolute-pathname-root (lambda (pn) (irregex-match rx pn)))
- (set! root-origin (lambda (rt) #f))
- (set! root-directory (lambda (rt) (and rt (irregex-match-substring rt 1)))) ) )
+(let ((string-match string-match))
+ (if ##sys#windows-platform
+ (let ((rx (regexp "([A-Za-z]:)?([\\/\\\\]).*")))
+ (set! absolute-pathname-root (lambda (pn) (string-match rx pn)))
+ (set! root-origin (lambda (rt) (and rt (cadr rt))))
+ (set! root-directory (lambda (rt) (and rt (caddr rt)))) )
+ (let ((rx (regexp "([\\/\\\\]).*")))
+ (set! absolute-pathname-root (lambda (pn) (string-match rx pn)))
+ (set! root-origin (lambda (rt) #f))
+ (set! root-directory (lambda (rt) (and rt (cadr rt)))) ) ) )
(define (absolute-pathname? pn)
(##sys#check-string pn 'absolute-pathname?)
- (irregex-match-data? (absolute-pathname-root pn)) )
+ (pair? (absolute-pathname-root pn)) )
(define-inline (*char-pds? ch) (memq ch '(#\\ #\/)))
@@ -261,33 +262,28 @@ EOF
file ext def-pds) ) ) )
(define decompose-pathname
- (let* ([patt1 "^(.*[\\/\\\\])?([^\\/\\\\]+)(\\.([^\\/\\\\.]+))$"]
- [patt2 "^(.*[\\/\\\\])?((\\.)?[^\\/\\\\]+)$"]
- [rx1 (irregex patt1)]
- [rx2 (irregex patt2)]
- [strip-pds
- (lambda (dir)
- (and dir
- (if (member dir '("/" "\\"))
- dir
- (chop-pds dir #f) ) ) )] )
- (lambda (pn)
- (##sys#check-string pn 'decompose-pathname)
- (if (fx= 0 (##sys#size pn))
- (values #f #f #f)
- (let ([ms (irregex-search rx1 pn)])
- (if ms
- (values
- (strip-pds (irregex-match-substring ms 1))
- (irregex-match-substring ms 2)
- (irregex-match-substring ms 4))
- (let ([ms (irregex-search rx2 pn)])
- (if ms
- (values
- (strip-pds (irregex-match-substring ms 1))
- (irregex-match-substring ms 2)
- #f)
- (values (strip-pds pn) #f #f) ) ) ) ) ) ) ) )
+ (let ((string-match string-match))
+ (let* ([patt1 "^(.*[\\/\\\\])?([^\\/\\\\]+)(\\.([^\\/\\\\.]+))$"]
+ [patt2 "^(.*[\\/\\\\])?((\\.)?[^\\/\\\\]+)$"]
+ [rx1 (regexp patt1)]
+ [rx2 (regexp patt2)]
+ [strip-pds
+ (lambda (dir)
+ (and dir
+ (if (member dir '("/" "\\"))
+ dir
+ (chop-pds dir #f) ) ) )] )
+ (lambda (pn)
+ (##sys#check-string pn 'decompose-pathname)
+ (if (fx= 0 (##sys#size pn))
+ (values #f #f #f)
+ (let ([ms (string-match rx1 pn)])
+ (if ms
+ (values (strip-pds (cadr ms)) (caddr ms) (car (cddddr ms)))
+ (let ([ms (string-match rx2 pn)])
+ (if ms
+ (values (strip-pds (cadr ms)) (caddr ms) #f)
+ (values (strip-pds pn) #f #f) ) ) ) ) ) ) ) ) )
(define pathname-directory)
(define pathname-file)
diff --git a/irregex-core.scm b/irregex-core.scm
deleted file mode 100644
index 7cd57d81..00000000
--- a/irregex-core.scm
+++ /dev/null
@@ -1,3874 +0,0 @@
-;;;; irregex.scm -- IrRegular Expressions
-;;
-;; Copyright (c) 2005-2010 Alex Shinn. All rights reserved.
-;; BSD-style license: http://synthcode.com/license.txt
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; At this moment there was a loud ring at the bell, and I could
-;; hear Mrs. Hudson, our landlady, raising her voice in a wail of
-;; expostulation and dismay.
-;;
-;; "By heaven, Holmes," I said, half rising, "I believe that
-;; they are really after us."
-;;
-;; "No, it's not quite so bad as that. It is the unofficial
-;; force, -- the Baker Street irregulars."
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Notes
-;;
-;; This code should not require any porting - it should work out of
-;; the box in any R[45]RS Scheme implementation. Slight modifications
-;; are needed for R6RS (a separate R6RS-compatible version is included
-;; in the distribution as irregex-r6rs.scm).
-;;
-;; The goal of portability makes this code a little clumsy and
-;; inefficient. Future versions will include both cleanup and
-;; performance tuning, but you can only go so far while staying
-;; portable. AND-LET*, SRFI-9 records and custom macros would've been
-;; nice.
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; History
-;;
-;; 0.8.2: 2010/07/30 - (...)? submatch extraction fix and alternate
-;; named submatches from Peter Bex
-;; 0.8.1: 2010/03/09 - backtracking irregex-match fix and other small fixes
-;; 0.8.0: 2010/01/20 - optimizing DFA compilation, adding SRE escapes
-;; inside PCREs, adding utility SREs
-;; 0.7.5: 2009/08/31 - adding irregex-extract and irregex-split
-;; *-fold copies match data (use *-fold/fast for speed)
-;; irregex-opt now returns an SRE
-;; 0.7.4: 2009/05/14 - empty alternates (or) and empty csets always fail,
-;; bugfix in default finalizer for irregex-fold/chunked
-;; 0.7.3: 2009/04/14 - adding irregex-fold/chunked, minor doc fixes
-;; 0.7.2: 2009/02/11 - some bugfixes, much improved documentation
-;; 0.7.1: 2008/10/30 - several bugfixes (thanks to Derick Eddington)
-;; 0.7.0: 2008/10/20 - support abstract chunked strings
-;; 0.6.2: 2008/07/26 - minor bugfixes, allow global disabling of utf8 mode,
-;; friendlier error messages in parsing, \Q..\E support
-;; 0.6.1: 2008/07/21 - added utf8 mode, more utils, bugfixes
-;; 0.6: 2008/05/01 - most of PCRE supported
-;; 0.5: 2008/04/24 - fully portable R4RS, many PCRE features implemented
-;; 0.4: 2008/04/17 - rewriting NFA to use efficient closure compilation,
-;; normal strings only, but all of the spencer tests pass
-;; 0.3: 2008/03/10 - adding DFA converter (normal strings only)
-;; 0.2: 2005/09/27 - adding irregex-opt (like elisp's regexp-opt) utility
-;; 0.1: 2005/08/18 - simple NFA interpreter over abstract chunked strings
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Data Structures
-
-(cond-expand
- (building-chicken
- (begin
- (define-syntax (internal x r c)
- `(,(with-input-from-string (cadr x) read) ,@(cddr x)))
- ;; make-irregex defined elsewhere
- (define (irregex? x)
- (internal "##sys#structure?" x 'regexp))
- (define (irregex-dfa x)
- (internal "##sys#check-structure" x 'regexp 'irregex-dfa)
- (internal "##sys#slot" x 1))
- (define (irregex-dfa/search x)
- (internal "##sys#check-structure" x 'regexp 'irregex-dfa/search)
- (internal "##sys#slot" x 2))
- (define (irregex-dfa/extract x)
- (internal "##sys#check-structure" x 'regexp 'irregex-dfa/extract)
- (internal "##sys#slot" x 3))
- (define (irregex-nfa x)
- (internal "##sys#check-structure" x 'regexp 'irregex-nfa)
- (internal "##sys#slot" x 4))
- (define (irregex-flags x)
- (internal "##sys#check-structure" x 'regexp 'irregex-flags)
- (internal "##sys#slot" x 5))
- (define (irregex-num-submatches x)
- (internal "##sys#check-structure" x 'regexp 'irregex-num-submatches)
- (internal "##sys#slot" x 6))
- (define (irregex-lengths x)
- (internal "##sys#check-structure" x 'regexp 'irregex-lengths)
- (internal "##sys#slot" x 7))
- (define (irregex-names x)
- (internal "##sys#check-structure" x 'regexp 'irregex-names)
- (internal "##sys#slot" x 8))))
- (else
- (begin
- (define irregex-tag '*irregex-tag*)
- (define (make-irregex dfa dfa/search dfa/extract nfa flags
- submatches lengths names)
- (vector irregex-tag dfa dfa/search dfa/extract nfa flags
- submatches lengths names))
- (define (irregex? obj)
- (and (vector? obj)
- (= 9 (vector-length obj))
- (eq? irregex-tag (vector-ref obj 0))))
- (define (irregex-dfa x) (vector-ref x 1))
- (define (irregex-dfa/search x) (vector-ref x 2))
- (define (irregex-dfa/extract x) (vector-ref x 3))
- (define (irregex-nfa x) (vector-ref x 4))
- (define (irregex-flags x) (vector-ref x 5))
- (define (irregex-num-submatches x) (vector-ref x 6))
- (define (irregex-lengths x) (vector-ref x 7))
- (define (irregex-names x) (vector-ref x 8)))))
-
-(cond-expand
- (building-chicken
- (begin
- ;; make-irregex-match defined elsewhere
- (define (irregex-new-matches irx)
- (make-irregex-match (irregex-num-submatches irx) (irregex-names irx)))
- (define (irregex-reset-matches! m)
- (let ((v (internal "##sys#slot" m 1)))
- (vector-fill! v #f)
- m))
- (define (irregex-copy-matches m)
- (and (internal "##sys#structure?" m 'regexp-match)
- (internal
- "##sys#make-structure"
- 'regexp-match
- (let* ((v (internal "##sys#slot" m 1))
- (v2 (make-vector (internal "##sys#size" v))))
- (vector-copy! v v2)
- v2)
- (internal "##sys#slot" m 2)
- (internal "##sys#slot" m 3)
- (internal "##sys#slot" m 4))))
- (define (irregex-match-data? obj)
- (internal "##sys#structure?" obj 'regexp-match))
- (define (irregex-match-num-submatches m)
- (internal "##sys#check-structure" m 'regexp-match 'irregex-match-num-submatches)
- (- (fx/ (internal "##sys#size" (internal "##sys#slot" m 1)) 4) 2))
- (define (irregex-match-chunker m)
- (internal "##sys#slot" m 3))
- (define (irregex-match-names m)
- (internal "##sys#check-structure" m 'regexp-match 'irregex-match-names)
- (internal "##sys#slot" m 2))
- (define (irregex-match-chunker-set! m str)
- (internal "##sys#setslot" m 3 str))
- (define-inline (%irregex-match-start-chunk m n)
- (internal "##sys#slot" (internal "##sys#slot" m 1) (* n 4)))
- (define-inline (%irregex-match-start-index m n)
- (internal "##sys#slot" (internal "##sys#slot" m 1) (+ 1 (* n 4))))
- (define-inline (%irregex-match-end-chunk m n)
- (internal "##sys#slot" (internal "##sys#slot" m 1) (+ 2 (* n 4))))
- (define (%irregex-match-end-index m n)
- (internal "##sys#slot" (internal "##sys#slot" m 1) (+ 3 (* n 4))))
- (define (%irregex-match-fail m) (internal "##sys#slot" m 4))
- (define (%irregex-match-fail-set! m x) (internal "##sys#setslot" m 4 x))
- (define-record-printer (regexp-match m out)
- (let ((n (irregex-match-num-submatches m)))
- (display "#<regexp-match (" out)
- (display n out)
- (display " submatches)>" out)))))
- (else
- (begin
- (define (irregex-new-matches irx)
- (make-irregex-match (irregex-num-submatches irx) (irregex-names irx)))
- (define (irregex-reset-matches! m)
- (do ((i (- (vector-length m) 1) (- i 1)))
- ((<= i 3) m)
- (vector-set! m i #f)))
- (define (irregex-copy-matches m)
- (and (vector? m)
- (let ((r (make-vector (vector-length m))))
- (do ((i (- (vector-length m) 1) (- i 1)))
- ((< i 0) r)
- (vector-set! r i (vector-ref m i))))))
- (define irregex-match-tag '*irregex-match-tag*)
- (define (irregex-match-data? obj)
- (and (vector? obj)
- (>= (vector-length obj) 11)
- (eq? irregex-match-tag (vector-ref obj 0))))
- (define (make-irregex-match count names)
- (let ((res (make-vector (+ (* 4 (+ 2 count)) 4) #f)))
- (vector-set! res 0 irregex-match-tag)
- (vector-set! res 2 names)
- res))
- (define (irregex-match-num-submatches m)
- (- (quotient (- (vector-length m) 3) 4) 2))
- (define (irregex-match-chunker m)
- (vector-ref m 1))
- (define (irregex-match-names m)
- (vector-ref m 2))
- (define (irregex-match-chunker-set! m str)
- (vector-set! m 1 str))
- (define (%irregex-match-start-chunk m n) (vector-ref m (+ 3 (* n 4))))
- (define (%irregex-match-start-index m n) (vector-ref m (+ 4 (* n 4))))
- (define (%irregex-match-end-chunk m n) (vector-ref m (+ 5 (* n 4))))
- (define (%irregex-match-end-index m n) (vector-ref m (+ 6 (* n 4))))
- (define (%irregex-match-fail m) (vector-ref m (- (vector-length m) 1)))
- (define (%irregex-match-fail-set! m x) (vector-set! m (- (vector-length m) 1) x)))))
-
-;; public interface with error checking
-(define (irregex-match-start-chunk m n)
- (if (not (irregex-match-valid-index? m n))
- (error "irregex-match-start-chunk: not a valid index" m n))
- (%irregex-match-start-chunk m n))
-(define (irregex-match-start-index m n)
- (if (not (irregex-match-valid-index? m n))
- (error "irregex-match-start-index: not a valid index" m n))
- (%irregex-match-start-index m n))
-(define (irregex-match-end-chunk m n)
- (if (not (irregex-match-valid-index? m n))
- (error "irregex-match-end-chunk: not a valid index" m n))
- (%irregex-match-end-chunk m n))
-(define (irregex-match-end-index m n)
- (if (not (irregex-match-valid-index? m n))
- (error "irregex-match-end-index: not a valid index" m n))
- (%irregex-match-end-index m n))
-
-(define (irregex-match-start-chunk-set! m n start)
- (vector-set! m (+ 3 (* n 4)) start))
-(define (irregex-match-start-index-set! m n start)
- (vector-set! m (+ 4 (* n 4)) start))
-(define (irregex-match-end-chunk-set! m n end)
- (vector-set! m (+ 5 (* n 4)) end))
-(define (irregex-match-end-index-set! m n end)
- (vector-set! m (+ 6 (* n 4)) end))
-
-(define (irregex-match-index m opt)
- (if (pair? opt)
- (if (number? (car opt))
- (car opt)
- (let lp ((ls (irregex-match-names m))
- (exists #f))
- (cond ((null? ls)
- (if exists #f (error "unknown match name" (car opt))))
- ((eq? (car opt) (caar ls))
- (if (%irregex-match-start-chunk m (cdar ls))
- (cdar ls)
- (lp (cdr ls) #t)))
- (else (lp (cdr ls) exists)))))
- 0))
-
-(cond-expand
- (building-chicken
- (define-inline (%irregex-match-valid-index? m n)
- (let ((v (internal "##sys#slot" m 1)))
- (and (< (* n 4) (internal "##sys#size" v))
- (internal "##sys#slot" v (+ 1 (* n 4)))))))
- (else
- (define (%irregex-match-valid-index? m n)
- (and (< (+ 3 (* n 4)) (vector-length m))
- (vector-ref m (+ 4 (* n 4)))))))
-
-(define (irregex-match-valid-index? m n)
- (if (not (irregex-match-data? m))
- (error "irregex-match-valid-index?: not match data" m))
- (if (not (integer? n))
- (error "irregex-match-valid-index?: not an integer" n))
- (%irregex-match-valid-index? m n))
-
-(define (irregex-match-substring m . opt)
- (if (not (irregex-match-data? m))
- (error "irregex-match-substring: not match data" m))
- (let* ((cnk (irregex-match-chunker m))
- (n (irregex-match-index m opt)))
- (and (%irregex-match-valid-index? m n)
- ((chunker-get-substring cnk)
- (%irregex-match-start-chunk m n)
- (%irregex-match-start-index m n)
- (%irregex-match-end-chunk m n)
- (%irregex-match-end-index m n)))))
-
-(define (irregex-match-subchunk m . opt)
- (if (not (irregex-match-data? m))
- (error "irregex-match-subchunk: not match data" m))
- (let* ((cnk (irregex-match-chunker m))
- (n (irregex-match-index m opt))
- (get-subchunk (chunker-get-subchunk cnk)))
- (if (not get-subchunk)
- (error "this chunk type does not support match subchunks")
- (and n
- (%irregex-match-valid-index? m n)
- (get-subchunk
- (%irregex-match-start-chunk m n)
- (%irregex-match-start-index m n)
- (%irregex-match-end-chunk m n)
- (%irregex-match-end-index m n))))))
-
-;; chunkers tell us how to navigate through chained chunks of strings
-
-(define (make-irregex-chunker get-next get-str . o)
- (let* ((get-start (or (and (pair? o) (car o)) (lambda (cnk) 0)))
- (o (if (pair? o) (cdr o) o))
- (get-end (or (and (pair? o) (car o))
- (lambda (cnk) (string-length (get-str cnk)))))
- (o (if (pair? o) (cdr o) o))
- (get-substr
- (or (and (pair? o) (car o))
- (lambda (cnk1 start cnk2 end)
- (if (eq? cnk1 cnk2)
- (substring (get-str cnk1) start end)
- (let loop ((cnk (get-next cnk1))
- (res (list (substring (get-str cnk1)
- start
- (get-end cnk1)))))
- (if (eq? cnk cnk2)
- (string-cat-reverse
- (cons (substring (get-str cnk)
- (get-start cnk)
- end)
- res))
- (loop (get-next cnk)
- (cons (substring (get-str cnk)
- (get-start cnk)
- (get-end cnk))
- res))))))))
- (o (if (pair? o) (cdr o) o))
- (get-subchunk (and (pair? o) (car o))))
- (if (not (and (procedure? get-next) (procedure? get-str)
- (procedure? get-start) (procedure? get-substr)))
- (error "make-irregex-chunker: expected a procdure"))
- (vector get-next get-str get-start get-end get-substr get-subchunk)))
-
-(define (chunker-get-next cnk) (vector-ref cnk 0))
-(define (chunker-get-str cnk) (vector-ref cnk 1))
-(define (chunker-get-start cnk) (vector-ref cnk 2))
-(define (chunker-get-end cnk) (vector-ref cnk 3))
-(define (chunker-get-substring cnk) (vector-ref cnk 4))
-(define (chunker-get-subchunk cnk) (vector-ref cnk 5))
-
-(define (chunker-prev-chunk cnk start end)
- (if (eq? start end)
- #f
- (let ((get-next (chunker-get-next cnk)))
- (let lp ((start start))
- (let ((next (get-next start)))
- (if (eq? next end)
- start
- (and next (lp next))))))))
-
-(define (chunker-prev-char cnk start end)
- (let ((prev (chunker-prev-chunk cnk start end)))
- (and prev
- (string-ref ((chunker-get-str cnk) prev)
- (- ((chunker-get-end cnk) prev) 1)))))
-
-(define (chunker-next-char cnk src)
- (let ((next ((chunker-get-next cnk) src)))
- (and next
- (string-ref ((chunker-get-str cnk) next)
- ((chunker-get-start cnk) next)))))
-
-(define (chunk-before? cnk a b)
- (and (not (eq? a b))
- (let ((next ((chunker-get-next cnk) a)))
- (and next
- (if (eq? next b)
- #t
- (chunk-before? cnk next b))))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; String Utilities
-
-;; Unicode version (skip surrogates)
-(define *all-chars*
- `(/ ,(integer->char 0) ,(integer->char #xD7FF)
- ,(integer->char #xE000) ,(integer->char #x10FFFF)))
-
-;; ASCII version, offset to not assume 0-255
-;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223))))
-
-;; set to #f to ignore even an explicit request for utf8 handling
-(define *allow-utf8-mode?* #t)
-
-;; (define *named-char-properties* '())
-
-(define (string-scan-char str c . o)
- (let ((end (string-length str)))
- (let scan ((i (if (pair? o) (car o) 0)))
- (cond ((= i end) #f)
- ((eqv? c (string-ref str i)) i)
- (else (scan (+ i 1)))))))
-
-(define (string-scan-char-escape str c . o)
- (let ((end (string-length str)))
- (let scan ((i (if (pair? o) (car o) 0)))
- (cond ((= i end) #f)
- ((eqv? c (string-ref str i)) i)
- ((eqv? c #\\) (scan (+ i 2)))
- (else (scan (+ i 1)))))))
-
-(define (string-scan-pred str pred . o)
- (let ((end (string-length str)))
- (let scan ((i (if (pair? o) (car o) 0)))
- (cond ((= i end) #f)
- ((pred (string-ref str i)) i)
- (else (scan (+ i 1)))))))
-
-(define (string-split-char str c)
- (let ((end (string-length str)))
- (let lp ((i 0) (from 0) (res '()))
- (define (collect) (cons (substring str from i) res))
- (cond ((>= i end) (reverse (collect)))
- ((eqv? c (string-ref str i)) (lp (+ i 1) (+ i 1) (collect)))
- (else (lp (+ i 1) from res))))))
-
-(define (char-alphanumeric? c)
- (or (char-alphabetic? c) (char-numeric? c)))
-
-(define (%substring=? a b start1 start2 len)
- (let lp ((i 0))
- (cond ((>= i len)
- #t)
- ((char=? (string-ref a (+ start1 i)) (string-ref b (+ start2 i)))
- (lp (+ i 1)))
- (else
- #f))))
-
-;; SRFI-13 extracts
-
-(define (%%string-copy! to tstart from fstart fend)
- (do ((i fstart (+ i 1))
- (j tstart (+ j 1)))
- ((>= i fend))
- (string-set! to j (string-ref from i))))
-
-(define (string-cat-reverse string-list)
- (string-cat-reverse/aux
- (fold (lambda (s a) (+ (string-length s) a)) 0 string-list)
- string-list))
-
-(define (string-cat-reverse/aux len string-list)
- (let ((res (make-string len)))
- (let lp ((i len) (ls string-list))
- (if (pair? ls)
- (let* ((s (car ls))
- (slen (string-length s))
- (i (- i slen)))
- (%%string-copy! res i s 0 slen)
- (lp i (cdr ls)))))
- res))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; List Utilities
-
-;; like the one-arg IOTA case
-(define (zero-to n)
- (if (<= n 0)
- '()
- (let lp ((i (- n 1)) (res '()))
- (if (zero? i) (cons 0 res) (lp (- i 1) (cons i res))))))
-
-;; take the head of list FROM up to but not including TO, which must
-;; be a tail of the list
-(define (take-up-to from to)
- (let lp ((ls from) (res '()))
- (if (and (pair? ls) (not (eq? ls to)))
- (lp (cdr ls) (cons (car ls) res))
- (reverse res))))
-
-;; SRFI-1 extracts (simplified 1-ary versions)
-
-(define (find pred ls)
- (let lp ((ls ls))
- (cond ((null? ls) #f)
- ((pred (car ls)) (car ls))
- (else (lp (cdr ls))))))
-
-(define (find-tail pred ls)
- (let lp ((ls ls))
- (cond ((null? ls) #f)
- ((pred (car ls)) ls)
- (else (lp (cdr ls))))))
-
-(define (last ls)
- (if (not (pair? ls))
- (error "can't take last of empty list" ls)
- (let lp ((ls ls))
- (if (pair? (cdr ls))
- (lp (cdr ls))
- (car ls)))))
-
-(define (any pred ls)
- (and (pair? ls)
- (let lp ((head (car ls)) (tail (cdr ls)))
- (if (null? tail)
- (pred head)
- (or (pred head) (lp (car tail) (cdr tail)))))))
-
-(define (every pred ls)
- (or (null? ls)
- (let lp ((head (car ls)) (tail (cdr ls)))
- (if (null? tail)
- (pred head)
- (and (pred head) (lp (car tail) (cdr tail)))))))
-
-(define (fold kons knil ls)
- (let lp ((ls ls) (res knil))
- (if (null? ls)
- res
- (lp (cdr ls) (kons (car ls) res)))))
-
-(define (filter pred ls)
- (let lp ((ls ls) (res '()))
- (if (null? ls)
- (reverse res)
- (lp (cdr ls) (if (pred (car ls)) (cons (car ls) res) res)))))
-
-(define (remove pred ls)
- (let lp ((ls ls) (res '()))
- (if (null? ls)
- (reverse res)
- (lp (cdr ls) (if (pred (car ls)) res (cons (car ls) res))))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Flags
-
-(define (bit-shr n i)
- (quotient n (expt 2 i)))
-
-(define (bit-shl n i)
- (* n (expt 2 i)))
-
-(define (bit-not n) (- #xFFFF n))
-
-(define (bit-ior a b)
- (cond
- ((zero? a) b)
- ((zero? b) a)
- (else
- (+ (if (or (odd? a) (odd? b)) 1 0)
- (* 2 (bit-ior (quotient a 2) (quotient b 2)))))))
-
-(define (bit-and a b)
- (cond
- ((zero? a) 0)
- ((zero? b) 0)
- (else
- (+ (if (and (odd? a) (odd? b)) 1 0)
- (* 2 (bit-and (quotient a 2) (quotient b 2)))))))
-
-(define (integer-log n)
- (define (b8 n r)
- (if (>= n (bit-shl 1 8)) (b4 (bit-shr n 8) (+ r 8)) (b4 n r)))
- (define (b4 n r)
- (if (>= n (bit-shl 1 4)) (b2 (bit-shr n 4) (+ r 4)) (b2 n r)))
- (define (b2 n r)
- (if (>= n (bit-shl 1 2)) (b1 (bit-shr n 2) (+ r 2)) (b1 n r)))
- (define (b1 n r) (if (>= n (bit-shl 1 1)) (+ r 1) r))
- (if (>= n (bit-shl 1 16)) (b8 (bit-shr n 16) 16) (b8 n 0)))
-
-(define (flag-set? flags i)
- (= i (bit-and flags i)))
-(define (flag-join a b)
- (if b (bit-ior a b) a))
-(define (flag-clear a b)
- (bit-and a (bit-not b)))
-
-(define ~none 0)
-(define ~searcher? 1)
-(define ~consumer? 2)
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Parsing Embedded SREs in PCRE Strings
-
-;; (define (with-read-from-string str i proc)
-;; (define (port-size in)
-;; (let lp ((i 0)) (if (eof-object? (read-char in)) i (lp (+ i 1)))))
-;; (let* ((len (string-length str))
-;; (tail-len (- len i))
-;; (in (open-input-string (substring str i len)))
-;; (sre (read in))
-;; (unused-len (port-size in)))
-;; (close-input-port in)
-;; (proc sre (- tail-len unused-len))))
-
-(define close-token (list 'close))
-(define dot-token (string->symbol "."))
-
-(define (with-read-from-string str i proc)
- (define end (string-length str))
- (define (read i k)
- (cond
- ((>= i end) (error "unterminated embedded SRE" str))
- (else
- (case (string-ref str i)
- ((#\()
- (let lp ((i (+ i 1)) (ls '()))
- (read
- i
- (lambda (x j)
- (cond
- ((eq? x close-token)
- (k (reverse ls) j))
- ((eq? x dot-token)
- (if (null? ls)
- (error "bad dotted form" str)
- (read j (lambda (y j2)
- (read j2 (lambda (z j3)
- (if (not (eq? z close-token))
- (error "bad dotted form" str)
- (k (append (reverse (cdr ls))
- (cons (car ls) y))
- j3))))))))
- (else
- (lp j (cons x ls))))))))
- ((#\))
- (k close-token (+ i 1)))
- ((#\;)
- (let skip ((i (+ i 1)))
- (if (or (>= i end) (eqv? #\newline (string-ref str i)))
- (read (+ i 1) k)
- (skip (+ i 1)))))
- ((#\' #\`)
- (read (+ i 1)
- (lambda (sexp j)
- (let ((q (if (eqv? #\' (string-ref str i)) 'quote 'quasiquote)))
- (k (list q sexp) j)))))
- ((#\,)
- (let* ((at? (and (< (+ i 1) end) (eqv? #\@ (string-ref str (+ i 1)))))
- (u (if at? 'uquote-splicing 'unquote))
- (j (if at? (+ i 2) (+ i 1))))
- (read j (lambda (sexp j) (k (list u sexp) j)))))
- ((#\")
- (let scan ((from (+ i 1)) (i (+ i 1)) (res '()))
- (define (collect)
- (if (= from i) res (cons (substring str from i) res)))
- (if (>= i end)
- (error "unterminated string in embedded SRE" str)
- (case (string-ref str i)
- ((#\") (k (string-cat-reverse (collect)) (+ i 1)))
- ((#\\) (scan (+ i 1) (+ i 2) (collect)))
- (else (scan from (+ i 1) res))))))
- ((#\#)
- (case (string-ref str (+ i 1))
- ((#\;)
- (read (+ i 2) (lambda (sexp j) (read j k))))
- ((#\\)
- (read (+ i 2)
- (lambda (sexp j)
- (k (case sexp
- ((space) #\space)
- ((newline) #\newline)
- (else (let ((s (if (number? sexp)
- (number->string sexp)
- (symbol->string sexp))))
- (string-ref s 0))))
- j))))
- ((#\t #\f)
- (k (eqv? #\t (string-ref str (+ i 1))) (+ i 2)))
- (else
- (error "bad # syntax in simplified SRE" i))))
- (else
- (cond
- ((char-whitespace? (string-ref str i))
- (read (+ i 1) k))
- (else ;; symbol/number
- (let scan ((j (+ i 1)))
- (cond
- ((or (>= j end)
- (let ((c (string-ref str j)))
- (or (char-whitespace? c)
- (memv c '(#\; #\( #\) #\" #\# #\\)))))
- (let ((str2 (substring str i j)))
- (k (or (string->number str2) (string->symbol str2)) j)))
- (else (scan (+ j 1))))))))))))
- (read i (lambda (res j)
- (if (eq? res 'close-token)
- (error "unexpected ')' in SRE" str j)
- (proc res j)))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Parsing PCRE Strings
-
-(define ~save? 1)
-(define ~case-insensitive? 2)
-(define ~multi-line? 4)
-(define ~single-line? 8)
-(define ~ignore-space? 16)
-(define ~utf8? 32)
-
-(define (symbol-list->flags ls)
- (let lp ((ls ls) (res ~none))
- (if (not (pair? ls))
- res
- (lp (cdr ls)
- (flag-join
- res
- (case (car ls)
- ((i ci case-insensitive) ~case-insensitive?)
- ((m multi-line) ~multi-line?)
- ((s single-line) ~single-line?)
- ((x ignore-space) ~ignore-space?)
- ((u utf8) (if *allow-utf8-mode?* ~utf8? ~none))
- (else #f)))))))
-
-(define (maybe-string->sre obj)
- (if (string? obj) (string->sre obj) obj))
-
-(define (string->sre str . o)
- (if (not (string? str)) (error "string->sre: expected a string" str))
- (let ((end (string-length str))
- (flags (symbol-list->flags o)))
-
- (let lp ((i 0) (from 0) (flags flags) (res '()) (st '()))
-
- ;; handle case sensitivity at the literal char/string level
- (define (cased-char ch)
- (if (and (flag-set? flags ~case-insensitive?)
- (char-alphabetic? ch))
- `(or ,ch ,(char-altcase ch))
- ch))
- (define (cased-string str)
- (if (flag-set? flags ~case-insensitive?)
- (sre-sequence (map cased-char (string->list str)))
- str))
- ;; accumulate the substring from..i as literal text
- (define (collect)
- (if (= i from) res (cons (cased-string (substring str from i)) res)))
- ;; like collect but breaks off the last single character when
- ;; collecting literal data, as the argument to ?/*/+ etc.
- (define (collect/single)
- (let* ((utf8? (flag-set? flags ~utf8?))
- (j (if (and utf8? (> i 1))
- (utf8-backup-to-initial-char str (- i 1))
- (- i 1))))
- (cond
- ((< j from)
- res)
- (else
- (let ((c (cased-char (if utf8?
- (utf8-string-ref str j (- i j))
- (string-ref str j)))))
- (cond
- ((= j from)
- (cons c res))
- (else
- (cons c
- (cons (cased-string (substring str from j))
- res)))))))))
- ;; collects for use as a result, reversing and grouping OR
- ;; terms, and some ugly tweaking of `function-like' groups and
- ;; conditionals
- (define (collect/terms)
- (let* ((ls (collect))
- (func
- (and (pair? ls)
- (memq (last ls)
- '(atomic if look-ahead neg-look-ahead
- look-behind neg-look-behind
- => submatch-named
- w/utf8 w/noutf8))))
- (prefix (if (and func (memq (car func) '(=> submatch-named)))
- (list 'submatch-named (cadr (reverse ls)))
- (and func (list (car func)))))
- (ls (if func
- (if (memq (car func) '(=> submatch-named))
- (reverse (cddr (reverse ls)))
- (reverse (cdr (reverse ls))))
- ls)))
- (let lp ((ls ls) (term '()) (res '()))
- (define (shift)
- (cons (sre-sequence term) res))
- (cond
- ((null? ls)
- (let* ((res (sre-alternate (shift)))
- (res (if (flag-set? flags ~save?)
- (list 'submatch res)
- res)))
- (if prefix
- (if (eq? 'if (car prefix))
- (cond
- ((not (pair? res))
- 'epsilon)
- ((memq (car res)
- '(look-ahead neg-look-ahead
- look-behind neg-look-behind))
- res)
- ((eq? 'seq (car res))
- `(if ,(cadr res)
- ,(if (pair? (cdr res))
- (sre-sequence (cddr res))
- 'epsilon)))
- (else
- `(if ,(cadadr res)
- ,(if (pair? (cdr res))
- (sre-sequence (cddadr res))
- 'epsilon)
- ,(sre-alternate
- (if (pair? (cdr res)) (cddr res) '())))))
- `(,@prefix ,res))
- res)))
- ((eq? 'or (car ls)) (lp (cdr ls) '() (shift)))
- (else (lp (cdr ls) (cons (car ls) term) res))))))
- (define (save)
- (cons (cons flags (collect)) st))
-
- ;; main parsing
- (if (>= i end)
- (if (pair? st)
- (error "unterminated parenthesis in regexp" str)
- (collect/terms))
- (let ((c (string-ref str i)))
- (case c
- ((#\.)
- (lp (+ i 1) (+ i 1) flags
- (cons (if (flag-set? flags ~single-line?) 'any 'nonl)
- (collect))
- st))
- ((#\?)
- (let ((res (collect/single)))
- (if (null? res)
- (error "? can't follow empty pattern" str res)
- (let ((x (car res)))
- (lp (+ i 1)
- (+ i 1)
- flags
- (cons
- (if (pair? x)
- (case (car x)
- ((*) `(*? ,@(cdr x)))
- ((+) `(**? 1 #f ,@(cdr x)))
- ((?) `(?? ,@(cdr x)))
- ((**) `(**? ,@(cdr x)))
- ((=) `(**? ,(cadr x) ,@(cdr x)))
- ((>=) `(**? ,(cadr x) #f ,@(cddr x)))
- (else `(? ,x)))
- `(? ,x))
- (cdr res))
- st)))))
- ((#\+ #\*)
- (let* ((res (collect/single))
- (x (if (pair? res) (car res) 'epsilon))
- (op (string->symbol (string c))))
- (cond
- ((sre-repeater? x)
- (error "duplicate repetition (e.g. **) in pattern" str res))
- ((sre-empty? x)
- (error "can't repeat empty pattern (e.g. ()*)" str res))
- (else
- (lp (+ i 1) (+ i 1) flags
- (cons (list op x) (cdr res))
- st)))))
- ((#\()
- (cond
- ((>= (+ i 1) end)
- (error "unterminated parenthesis in regexp" str))
- ((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case
- (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save)))
- ((>= (+ i 2) end)
- (error "unterminated parenthesis in regexp" str))
- ((eqv? (string-ref str (+ i 1)) #\*)
- (if (eqv? #\' (string-ref str (+ i 2)))
- (with-read-from-string str (+ i 3)
- (lambda (sre j)
- (if (or (>= j end) (not (eqv? #\) (string-ref str j))))
- (error "unterminated (*'...) SRE escape" str)
- (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))))
- (error "bad regexp syntax: (*FOO) not supported" str)))
- (else ;; (?...) case
- (case (string-ref str (+ i 2))
- ((#\#)
- (let ((j (string-scan-char str #\) (+ i 3))))
- (lp (+ j i) (+ j 1) flags (collect) st)))
- ((#\:)
- (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save)))
- ((#\=)
- (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
- '(look-ahead) (save)))
- ((#\!)
- (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
- '(neg-look-ahead) (save)))
- ((#\<)
- (cond
- ((>= (+ i 3) end)
- (error "unterminated parenthesis in regexp" str))
- (else
- (case (string-ref str (+ i 3))
- ((#\=)
- (lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
- '(look-behind) (save)))
- ((#\!)
- (lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
- '(neg-look-behind) (save)))
- (else
- (let ((j (and (char-alphabetic?
- (string-ref str (+ i 3)))
- (string-scan-char str #\> (+ i 4)))))
- (if j
- (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
- `(,(string->symbol (substring str (+ i 3) j))
- submatch-named)
- (save))
- (error "invalid (?< sequence" str))))))))
- ((#\>)
- (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
- '(atomic) (save)))
- ;;((#\' #\P) ; named subpatterns
- ;; )
- ;;((#\R) ; recursion
- ;; )
- ((#\()
- (cond
- ((>= (+ i 3) end)
- (error "unterminated parenthesis in regexp" str))
- ((char-numeric? (string-ref str (+ i 3)))
- (let* ((j (string-scan-char str #\) (+ i 3)))
- (n (string->number (substring str (+ i 3) j))))
- (if (not n)
- (error "invalid conditional reference" str)
- (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
- `(,n if) (save)))))
- ((char-alphabetic? (string-ref str (+ i 3)))
- (let* ((j (string-scan-char str #\) (+ i 3)))
- (s (string->symbol (substring str (+ i 3) j))))
- (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
- `(,s if) (save))))
- (else
- (lp (+ i 2) (+ i 2) (flag-clear flags ~save?)
- '(if) (save)))))
- ((#\{)
- (error "unsupported Perl-style cluster" str))
- (else
- (let ((old-flags flags))
- (let lp2 ((j (+ i 2)) (flags flags) (invert? #f))
- (define (join x)
- ((if invert? flag-clear flag-join) flags x))
- (define (new-res res)
- (let ((before (flag-set? old-flags ~utf8?))
- (after (flag-set? flags ~utf8?)))
- (if (eq? before after)
- res
- (cons (if after 'w/utf8 'w/noutf8) res))))
- (cond
- ((>= j end)
- (error "incomplete cluster" str i))
- (else
- (case (string-ref str j)
- ((#\i)
- (lp2 (+ j 1) (join ~case-insensitive?) invert?))
- ((#\m)
- (lp2 (+ j 1) (join ~multi-line?) invert?))
- ((#\x)
- (lp2 (+ j 1) (join ~ignore-space?) invert?))
- ((#\u)
- (if *allow-utf8-mode?*
- (lp2 (+ j 1) (join ~utf8?) invert?)
- (lp2 (+ j 1) flags invert?)))
- ((#\-)
- (lp2 (+ j 1) flags (not invert?)))
- ((#\))
- (lp (+ j 1) (+ j 1) flags (new-res (collect))
- st))
- ((#\:)
- (lp (+ j 1) (+ j 1) flags (new-res '())
- (cons (cons old-flags (collect)) st)))
- (else
- (error "unknown regex cluster modifier" str)
- )))))))))))
- ((#\))
- (if (null? st)
- (error "too many )'s in regexp" str)
- (lp (+ i 1)
- (+ i 1)
- (caar st)
- (cons (collect/terms) (cdar st))
- (cdr st))))
- ((#\[)
- (apply
- (lambda (sre j)
- (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))
- (string-parse-cset str (+ i 1) flags)))
- ((#\{)
- (cond
- ((or (>= (+ i 1) end)
- (not (or (char-numeric? (string-ref str (+ i 1)))
- (eqv? #\, (string-ref str (+ i 1))))))
- (lp (+ i 1) from flags res st))
- (else
- (let ((res (collect/single)))
- (cond
- ((null? res)
- (error "{ can't follow empty pattern"))
- (else
- (let* ((x (car res))
- (tail (cdr res))
- (j (string-scan-char str #\} (+ i 1)))
- (s2 (string-split-char (substring str (+ i 1) j)
- #\,))
- (n (string->number (car s2)))
- (m (and (pair? (cdr s2))
- (string->number (cadr s2)))))
- (cond
- ((or (not n)
- (and (pair? (cdr s2))
- (not (equal? "" (cadr s2)))
- (not m)))
- (error "invalid {n} repetition syntax" s2))
- ((null? (cdr s2))
- (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st))
- (m
- (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st))
- (else
- (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st)
- )))))))))
- ((#\\)
- (cond
- ((>= (+ i 1) end)
- (error "incomplete escape sequence" str))
- (else
- (let ((c (string-ref str (+ i 1))))
- (case c
- ((#\d)
- (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st))
- ((#\D)
- (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st))
- ((#\s)
- (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st))
- ((#\S)
- (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st))
- ((#\w)
- (lp (+ i 2) (+ i 2) flags
- `((or alphanumeric ("_")) ,@(collect)) st))
- ((#\W)
- (lp (+ i 2) (+ i 2) flags
- `((~ (or alphanumeric ("_"))) ,@(collect)) st))
- ((#\b)
- (lp (+ i 2) (+ i 2) flags
- `((or bow eow) ,@(collect)) st))
- ((#\B)
- (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st))
- ((#\A)
- (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st))
- ((#\Z)
- (lp (+ i 2) (+ i 2) flags
- `((? #\newline) eos ,@(collect)) st))
- ((#\z)
- (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st))
- ((#\R)
- (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st))
- ((#\K)
- (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st))
- ;; these two are from Emacs and TRE, but not in PCRE
- ((#\<)
- (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st))
- ((#\>)
- (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st))
- ((#\x)
- (apply
- (lambda (ch j)
- (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st))
- (string-parse-hex-escape str (+ i 2) end)))
- ((#\k)
- (let ((c (string-ref str (+ i 2))))
- (if (not (memv c '(#\< #\{ #\')))
- (error "bad \\k usage, expected \\k<...>" str)
- (let* ((terminal (char-mirror c))
- (j (string-scan-char str terminal (+ i 2)))
- (s (and j (substring str (+ i 3) j)))
- (backref
- (if (flag-set? flags ~case-insensitive?)
- 'backref-ci
- 'backref)))
- (if (not j)
- (error "unterminated named backref" str)
- (lp (+ j 1) (+ j 1) flags
- `((,backref ,(string->symbol s))
- ,@(collect))
- st))))))
- ((#\Q) ;; \Q..\E escapes
- (let ((res (collect)))
- (let lp2 ((j (+ i 2)))
- (cond
- ((>= j end)
- (lp j (+ i 2) flags res st))
- ((eqv? #\\ (string-ref str j))
- (cond
- ((>= (+ j 1) end)
- (lp (+ j 1) (+ i 2) flags res st))
- ((eqv? #\E (string-ref str (+ j 1)))
- (lp (+ j 2) (+ j 2) flags
- (cons (substring str (+ i 2) j) res) st))
- (else
- (lp2 (+ j 2)))))
- (else
- (lp2 (+ j 1)))))))
- ((#\')
- (with-read-from-string str (+ i 2)
- (lambda (sre j)
- (lp j j flags (cons sre (collect)) st))))
- ;;((#\p) ; XXXX unicode properties
- ;; )
- ;;((#\P)
- ;; )
- (else
- (cond
- ((char-numeric? c)
- (let* ((j (or (string-scan-pred
- str
- (lambda (c) (not (char-numeric? c)))
- (+ i 2))
- end))
- (backref
- (if (flag-set? flags ~case-insensitive?)
- 'backref-ci
- 'backref))
- (res `((,backref ,(string->number
- (substring str (+ i 1) j)))
- ,@(collect))))
- (lp j j flags res st)))
- ((char-alphabetic? c)
- (let ((cell (assv c posix-escape-sequences)))
- (if cell
- (lp (+ i 2) (+ i 2) flags
- (cons (cdr cell) (collect)) st)
- (error "unknown escape sequence" str c))))
- (else
- (lp (+ i 2) (+ i 1) flags (collect) st)))))))))
- ((#\|)
- (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st))
- ((#\^)
- (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos)))
- (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
- ((#\$)
- (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos)))
- (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
- ((#\space)
- (if (flag-set? flags ~ignore-space?)
- (lp (+ i 1) (+ i 1) flags (collect) st)
- (lp (+ i 1) from flags res st)))
- ((#\#)
- (if (flag-set? flags ~ignore-space?)
- (let ((j (or (string-scan-char str #\newline (+ i 1))
- (- end 1))))
- (lp (+ j 1) (+ j 1) flags (collect) st))
- (lp (+ i 1) from flags res st)))
- (else
- (lp (+ i 1) from flags res st))))))))
-
-(define posix-escape-sequences
- `((#\n . #\newline)
- (#\r . ,(integer->char (+ (char->integer #\newline) 3)))
- (#\t . ,(integer->char (- (char->integer #\newline) 1)))
- (#\a . ,(integer->char (- (char->integer #\newline) 3)))
- (#\e . ,(integer->char (+ (char->integer #\newline) #x11)))
- (#\f . ,(integer->char (+ (char->integer #\newline) 2)))
- ))
-
-(define (char-altcase c)
- (if (char-upper-case? c) (char-downcase c) (char-upcase c)))
-
-(define (char-mirror c)
- (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c)))
-
-(define (string-parse-hex-escape str i end)
- (cond
- ((>= i end)
- (error "incomplete hex escape" str i))
- ((eqv? #\{ (string-ref str i))
- (let ((j (string-scan-char-escape str #\} (+ i 1))))
- (if (not j)
- (error "incomplete hex brace escape" str i)
- (let* ((s (substring str (+ i 1) j))
- (n (string->number s 16)))
- (if n
- (list (integer->char n) j)
- (error "bad hex brace escape" s))))))
- ((>= (+ i 1) end)
- (error "incomplete hex escape" str i))
- (else
- (let* ((s (substring str i (+ i 2)))
- (n (string->number s 16)))
- (if n
- (list (integer->char n) (+ i 2))
- (error "bad hex escape" s))))))
-
-(define (string-parse-cset str start flags)
- (let* ((end (string-length str))
- (invert? (and (< start end) (eqv? #\^ (string-ref str start))))
- (utf8? (flag-set? flags ~utf8?)))
- (define (go i chars ranges)
- (if (>= i end)
- (error "incomplete char set" str i end)
- (let ((c (string-ref str i)))
- (case c
- ((#\])
- (if (and (null? chars) (null? ranges))
- (go (+ i 1) (cons #\] chars) ranges)
- (let ((ci? (flag-set? flags ~case-insensitive?))
- (hi-chars (if utf8? (filter high-char? chars) '()))
- (chars (if utf8? (remove high-char? chars) chars)))
- (list
- ((lambda (res)
- (if invert? (cons '~ res) (sre-alternate res)))
- (append
- hi-chars
- (if (pair? chars)
- (list
- (list (list->string
- ((if ci?
- cset-case-insensitive
- (lambda (x) x))
- (reverse chars)))))
- '())
- (if (pair? ranges)
- (let ((res (if ci?
- (cset-case-insensitive
- (reverse ranges))
- (reverse ranges))))
- (list (cons '/ (alist->plist res))))
- '())))
- i))))
- ((#\-)
- (cond
- ((or (= i start)
- (and (= i (+ start 1)) (eqv? #\^ (string-ref str start)))
- (eqv? #\] (string-ref str (+ i 1))))
- (go (+ i 1) (cons c chars) ranges))
- ((null? chars)
- (error "bad char-set"))
- (else
- (let* ((c1 (car chars))
- (c2 (string-ref str (+ i 1))))
- (apply
- (lambda (c2 j)
- (if (char<? c2 c1)
- (error "inverted range in char-set" c1 c2)
- (go j (cdr chars) (cons (cons c1 c2) ranges))))
- (cond
- ((and (eqv? #\\ c2) (assv c2 posix-escape-sequences))
- => (lambda (x) (list (cdr x) (+ i 3))))
- ((and (eqv? #\\ c2)
- (eqv? (string-ref str (+ i 2)) #\x))
- (string-parse-hex-escape str (+ i 3) end))
- ((and utf8? (<= #x80 (char->integer c2) #xFF))
- (let ((len (utf8-start-char->length c2)))
- (list (utf8-string-ref str (+ i 1) len) (+ i 1 len))))
- (else
- (list c2 (+ i 2)))))))))
- ((#\[)
- (let* ((inv? (eqv? #\^ (string-ref str (+ i 1))))
- (i2 (if inv? (+ i 2) (+ i 1))))
- (case (string-ref str i2)
- ((#\:)
- (let ((j (string-scan-char str #\: (+ i2 1))))
- (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1)))))
- (error "incomplete character class" str)
- (let* ((cset (sre->cset
- (string->symbol
- (substring str (+ i2 1) j))))
- (cset (if inv? (cset-complement cset) cset)))
- (go (+ j 2)
- (append (filter char? cset) chars)
- (append (filter pair? cset) ranges))))))
- ((#\= #\.)
- (error "collating sequences not supported" str))
- (else
- (go (+ i 1) (cons #\[ chars) ranges)))))
- ((#\\)
- (let ((c (string-ref str (+ i 1))))
- (case c
- ((#\d #\D #\s #\S #\w #\W)
- (let ((cset (sre->cset (string->sre (string #\\ c)))))
- (go (+ i 2)
- (append (filter char? cset) chars)
- (append (filter pair? cset) ranges))))
- ((#\x)
- (apply
- (lambda (ch j)
- (go j (cons ch chars) ranges))
- (string-parse-hex-escape str (+ i 2) end)))
- (else
- (let ((c (cond ((assv c posix-escape-sequences) => cdr)
- (else c))))
- (go (+ i 2)
- (cons (string-ref str (+ i 1)) (cons c chars))
- ranges))))))
- (else
- (if (and utf8? (<= #x80 (char->integer c) #xFF))
- (let ((len (utf8-start-char->length c)))
- (go (+ i len)
- (cons (utf8-string-ref str i len) chars)
- ranges))
- (go (+ i 1) (cons c chars) ranges)))))))
- (if invert?
- (go (+ start 1)
- (if (flag-set? flags ~multi-line?) '(#\newline) '())
- '())
- (go start '() '()))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; UTF-8 Utilities
-
-;; Here are some hairy optimizations that need to be documented
-;; better. Thanks to these, we never do any utf8 processing once the
-;; regexp is compiled.
-
-;; two chars: ab..ef
-;; a[b..xFF]|[b-d][x80..xFF]|e[x80..xFF]
-
-;; three chars: abc..ghi
-;; ab[c..xFF]|a[d..xFF][x80..xFF]|
-;; [b..f][x80..xFF][x80..xFF]|
-;; g[x80..g][x80..xFF]|gh[x80..i]
-
-;; four chars: abcd..ghij
-;; abc[d..xFF]|ab[d..xFF][x80..xFF]|a[c..xFF][x80..xFF][x80..xFF]|
-;; [b..f][x80..xFF][x80..xFF][x80..xFF]|
-;; g[x80..g][x80..xFF][x80..xFF]|gh[x80..h][x80..xFF]|ghi[x80..j]
-
-(define (high-char? c) (<= #x80 (char->integer c)))
-
-;; number of total bytes in a utf8 char given the 1st byte
-
-(define utf8-start-char->length
- (let ((table '#(
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx
-2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx
-2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx
-3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex
-4 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx
-)))
- (lambda (c) (vector-ref table (char->integer c)))))
-
-(define (utf8-string-ref str i len)
- (define (byte n) (char->integer (string-ref str n)))
- (case len
- ((1) ; shouldn't happen in this module
- (string-ref str i))
- ((2)
- (integer->char
- (+ (bit-shl (bit-and (byte i) #b00011111) 6)
- (bit-and (byte (+ i 1)) #b00111111))))
- ((3)
- (integer->char
- (+ (bit-shl (bit-and (byte i) #b00001111) 12)
- (bit-shl (bit-and (byte (+ i 1)) #b00111111) 6)
- (bit-and (byte (+ i 2)) #b00111111))))
- ((4)
- (integer->char
- (+ (bit-shl (bit-and (byte i) #b00000111) 18)
- (bit-shl (bit-and (byte (+ i 1)) #b00111111) 12)
- (bit-shl (bit-and (byte (+ i 2)) #b00111111) 6)
- (bit-and (byte (+ i 3)) #b00111111))))
- (else
- (error "invalid utf8 length" str len i))))
-
-(define (utf8-backup-to-initial-char str i)
- (let lp ((i i))
- (if (= i 0)
- 0
- (let ((c (char->integer (string-ref str i))))
- (if (or (< c #x80) (>= c #xC0))
- i
- (lp (- i 1)))))))
-
-(define (utf8-lowest-digit-of-length len)
- (case len
- ((1) 0) ((2) #xC0) ((3) #xE0) ((4) #xF0)
- (else (error "invalid utf8 length" len))))
-
-(define (utf8-highest-digit-of-length len)
- (case len
- ((1) #x7F) ((2) #xDF) ((3) #xEF) ((4) #xF7)
- (else (error "invalid utf8 length" len))))
-
-(define (char->utf8-list c)
- (let ((i (char->integer c)))
- (cond
- ((<= i #x7F) (list i))
- ((<= i #x7FF)
- (list (bit-ior #b11000000 (bit-shr i 6))
- (bit-ior #b10000000 (bit-and i #b111111))))
- ((<= i #xFFFF)
- (list (bit-ior #b11100000 (bit-shr i 12))
- (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111))
- (bit-ior #b10000000 (bit-and i #b111111))))
- ((<= i #x1FFFFF)
- (list (bit-ior #b11110000 (bit-shr i 18))
- (bit-ior #b10000000 (bit-and (bit-shr i 12) #b111111))
- (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111))
- (bit-ior #b10000000 (bit-and i #b111111))))
- (else (error "unicode codepoint out of range:" i)))))
-
-(define (unicode-range->utf8-pattern lo hi)
- (let ((lo-ls (char->utf8-list lo))
- (hi-ls (char->utf8-list hi)))
- (if (not (= (length lo-ls) (length hi-ls)))
- (sre-alternate (list (unicode-range-climb-digits lo-ls hi-ls)
- (unicode-range-up-to hi-ls)))
- (let lp ((lo-ls lo-ls) (hi-ls hi-ls))
- (cond
- ((null? lo-ls)
- '())
- ((= (car lo-ls) (car hi-ls))
- (sre-sequence
- (list (integer->char (car lo-ls))
- (lp (cdr lo-ls) (cdr hi-ls)))))
- ((= (+ (car lo-ls) 1) (car hi-ls))
- (sre-alternate (list (unicode-range-up-from lo-ls)
- (unicode-range-up-to hi-ls))))
- (else
- (sre-alternate (list (unicode-range-up-from lo-ls)
- (unicode-range-middle lo-ls hi-ls)
- (unicode-range-up-to hi-ls)))))))))
-
-(define (unicode-range-helper one ls prefix res)
- (if (null? ls)
- res
- (unicode-range-helper
- one
- (cdr ls)
- (cons (car ls) prefix)
- (cons (sre-sequence
- `(,@(map integer->char prefix)
- ,(one (car ls))
- ,@(map (lambda (_)
- `(/ ,(integer->char #x80)
- ,(integer->char #xFF)))
- (cdr ls))))
- res))))
-
-(define (unicode-range-up-from lo-ls)
- (sre-sequence
- (list (integer->char (car lo-ls))
- (sre-alternate
- (unicode-range-helper
- (lambda (c)
- `(/ ,(integer->char (+ (car lo-ls) 1)) ,(integer->char #xFF)))
- (cdr (reverse (cdr lo-ls)))
- '()
- (list
- (sre-sequence
- (append
- (map integer->char (reverse (cdr (reverse (cdr lo-ls)))))
- `((/ ,(integer->char (last lo-ls))
- ,(integer->char #xFF)))))))))))
-
-(define (unicode-range-up-to hi-ls)
- (sre-sequence
- (list (integer->char (car hi-ls))
- (sre-alternate
- (unicode-range-helper
- (lambda (c)
- `(/ ,(integer->char #x80) ,(integer->char (- (car hi-ls) 1))))
- (cdr (reverse (cdr hi-ls)))
- '()
- (list
- (sre-sequence
- (append
- (map integer->char (reverse (cdr (reverse (cdr hi-ls)))))
- `((/ ,(integer->char #x80)
- ,(integer->char (last hi-ls))))))))))))
-
-(define (unicode-range-climb-digits lo-ls hi-ls)
- (let ((lo-len (length lo-ls)))
- (sre-alternate
- (append
- (list
- (sre-sequence
- (cons `(/ ,(integer->char (car lo-ls))
- ,(integer->char (if (<= (car lo-ls) #x7F) #x7F #xFF)))
- (map (lambda (_)
- `(/ ,(integer->char #x80) ,(integer->char #xFF)))
- (cdr lo-ls)))))
- (map
- (lambda (i)
- (sre-sequence
- (cons
- `(/ ,(integer->char (utf8-lowest-digit-of-length (+ i lo-len 1)))
- ,(integer->char (utf8-highest-digit-of-length (+ i lo-len 1))))
- (map (lambda (_)
- `(/ ,(integer->char #x80) ,(integer->char #xFF)))
- (zero-to (+ i lo-len))))))
- (zero-to (- (length hi-ls) lo-len 1)))
- (list
- (sre-sequence
- (cons `(/ ,(integer->char
- (utf8-lowest-digit-of-length
- (utf8-start-char->length
- (integer->char (- (car hi-ls) 1)))))
- ,(integer->char (- (car hi-ls) 1)))
- (map (lambda (_)
- `(/ ,(integer->char #x80) ,(integer->char #xFF)))
- (cdr hi-ls)))))))))
-
-(define (unicode-range-middle lo-ls hi-ls)
- (let ((lo (integer->char (+ (car lo-ls) 1)))
- (hi (integer->char (- (car hi-ls) 1))))
- (sre-sequence
- (cons (if (char=? lo hi) lo `(/ ,lo ,hi))
- (map (lambda (_) `(/ ,(integer->char #x80) ,(integer->char #xFF)))
- (cdr lo-ls))))))
-
-(define (cset->utf8-pattern cset)
- (let lp ((ls cset) (alts '()) (lo-cset '()))
- (cond
- ((null? ls)
- (sre-alternate (append (reverse alts)
- (if (null? lo-cset)
- '()
- (list (cons '/ (reverse lo-cset)))))))
- ((char? (car ls))
- (if (high-char? (car ls))
- (lp (cdr ls) (cons (car ls) alts) lo-cset)
- (lp (cdr ls) alts (cons (car ls) lo-cset))))
- (else
- (if (or (high-char? (caar ls)) (high-char? (cdar ls)))
- (lp (cdr ls)
- (cons (unicode-range->utf8-pattern (caar ls) (cdar ls)) alts)
- lo-cset)
- (lp (cdr ls) alts (cons (cdar ls) (cons (caar ls) lo-cset))))))))
-
-(define (sre-adjust-utf8 sre flags)
- (let adjust ((sre sre)
- (utf8? (flag-set? flags ~utf8?))
- (ci? (flag-set? flags ~case-insensitive?)))
- (define (rec sre) (adjust sre utf8? ci?))
- (cond
- ((pair? sre)
- (case (car sre)
- ((w/utf8) (adjust (sre-sequence (cdr sre)) #t ci?))
- ((w/noutf8) (adjust (sre-sequence (cdr sre)) #f ci?))
- ((w/case)
- (cons (car sre) (map (lambda (s) (adjust s utf8? #f)) (cdr sre))))
- ((w/nocase)
- (cons (car sre) (map (lambda (s) (adjust s utf8? #t)) (cdr sre))))
- ((/ ~ & -)
- (if (not utf8?)
- sre
- (let ((cset (sre->cset sre ci?)))
- (if (any (lambda (x)
- (if (pair? x)
- (or (high-char? (car x)) (high-char? (cdr x)))
- (high-char? x)))
- cset)
- (if ci?
- (list 'w/case (cset->utf8-pattern cset))
- (cset->utf8-pattern cset))
- sre))))
- ((*)
- (case (sre-sequence (cdr sre))
- ;; special case optimization: .* w/utf8 == .* w/noutf8
- ((any) '(* any))
- ((nonl) '(* nonl))
- (else (cons '* (map rec (cdr sre))))))
- (else
- (cons (car sre) (map rec (cdr sre))))))
- (else
- (case sre
- ((any) 'utf8-any)
- ((nonl) 'utf8-nonl)
- (else
- (if (and utf8? (char? sre) (high-char? sre))
- (sre-sequence (map integer->char (char->utf8-list sre)))
- sre)))))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Compilation
-
-(cond-expand
- (building-chicken
- (define-syntax cached
- (syntax-rules ()
- ((_ arg fail) (build-cache 5 arg fail)))))
- (else
- (define-syntax cached
- (syntax-rules ()
- ((_ arg fail) fail)))))
-
-(define (irregex x . o)
- (cond ((irregex? x) x)
- ((null? o)
- (cached
- x
- (if (string? x)
- (string->irregex x)
- (sre->irregex x))))
- (else
- (if (string? x)
- (apply string->irregex x o)
- (apply sre->irregex x o)))))
-
-(define (string->irregex str . o)
- (apply sre->irregex (apply string->sre str o) o))
-
-(define (sre->irregex sre . o)
- (let* ((pat-flags (symbol-list->flags o))
- (sre (if *allow-utf8-mode?*
- (sre-adjust-utf8 sre pat-flags)
- sre))
- (searcher? (sre-searcher? sre))
- (sre-dfa (if searcher? (sre-remove-initial-bos sre) sre))
- (dfa-limit (cond ((memq 'small o) 1) ((memq 'fast o) 50) (else 10)))
- (dfa/search
- (cond ((memq 'backtrack o) #f)
- (searcher? #t)
- ((sre->nfa `(seq (* any) ,sre-dfa) pat-flags)
- => (lambda (nfa)
- (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa)))))
- (else #f)))
- (dfa (cond ((and dfa/search (sre->nfa sre-dfa pat-flags))
- => (lambda (nfa)
- (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa)))))
- (else #f)))
- (submatches (sre-count-submatches sre-dfa))
- (extractor
- (and dfa dfa/search (sre-match-extractor sre-dfa submatches)))
- (names (sre-names sre-dfa 1 '()))
- (lens (sre-length-ranges sre-dfa names))
- (flags (flag-join
- (flag-join ~none (and searcher? ~searcher?))
- (and (sre-consumer? sre) ~consumer?))))
- (cond
- (dfa
- (make-irregex dfa dfa/search extractor #f flags submatches lens names))
- (else
- (let ((f (sre->procedure sre pat-flags names)))
- (make-irregex #f #f #f f flags submatches lens names))))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; SRE Analysis
-
-;; returns #t if the sre can ever be empty
-(define (sre-empty? sre)
- (if (pair? sre)
- (case (car sre)
- ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t)
- ((**) (or (not (number? (cadr sre))) (zero? (cadr sre))))
- ((or) (any sre-empty? (cdr sre)))
- ((: seq $ submatch => submatch-named + atomic)
- (every sre-empty? (cdr sre)))
- (else #f))
- (memq sre '(epsilon bos eos bol eol bow eow commit))))
-
-(define (sre-any? sre)
- (or (eq? sre 'any)
- (and (pair? sre)
- (case (car sre)
- ((seq : $ submatch => submatch-named)
- (and (pair? (cdr sre)) (null? (cddr sre)) (sre-any? (cadr sre))))
- ((or) (every sre-any? (cdr sre)))
- (else #f)))))
-
-(define (sre-repeater? sre)
- (and (pair? sre)
- (or (memq (car sre) '(* +))
- (and (memq (car sre) '($ submatch => submatch-named seq :))
- (pair? (cdr sre))
- (null? (cddr sre))
- (sre-repeater? (cadr sre))))))
-
-(define (sre-searcher? sre)
- (if (pair? sre)
- (case (car sre)
- ((* +) (sre-any? (sre-sequence (cdr sre))))
- ((seq : $ submatch => submatch-named)
- (and (pair? (cdr sre)) (sre-searcher? (cadr sre))))
- ((or) (every sre-searcher? (cdr sre)))
- (else #f))
- (eq? 'bos sre)))
-
-(define (sre-consumer? sre)
- (if (pair? sre)
- (case (car sre)
- ((* +) (sre-any? (sre-sequence (cdr sre))))
- ((seq : $ submatch => submatch-named)
- (and (pair? (cdr sre)) (sre-consumer? (last sre))))
- ((or) (every sre-consumer? (cdr sre)))
- (else #f))
- (eq? 'eos sre)))
-
-(define (sre-has-submatches? sre)
- (and (pair? sre)
- (or (memq (car sre) '($ submatch => submatch-named))
- (if (eq? 'posix-string (car sre))
- (sre-has-submatches? (string->sre (cadr sre)))
- (any sre-has-submatches? (cdr sre))))))
-
-(define (sre-count-submatches sre)
- (let count ((sre sre) (sum 0))
- (if (pair? sre)
- (fold count
- (+ sum (case (car sre)
- (($ submatch => submatch-named) 1)
- ((dsm) (+ (cadr sre) (caddr sre)))
- ((posix-string)
- (sre-count-submatches (string->sre (cadr sre))))
- (else 0)))
- (cdr sre))
- sum)))
-
-(define (sre-length-ranges sre . o)
- (let ((names (if (pair? o) (car o) (sre-names sre 1 '())))
- (sublens (make-vector (+ 1 (sre-count-submatches sre)) #f)))
- (vector-set!
- sublens
- 0
- (let lp ((sre sre) (n 1) (lo 0) (hi 0) (return cons))
- (define (grow i) (return (+ lo i) (and hi (+ hi i))))
- (cond
- ((pair? sre)
- (if (string? (car sre))
- (grow 1)
- (case (car sre)
- ((/ ~ & -)
- (grow 1))
- ((posix-string)
- (lp (string->sre (cadr sre)) n lo hi return))
- ((seq : w/case w/nocase atomic)
- (let lp2 ((ls (cdr sre)) (n n) (lo2 0) (hi2 0))
- (if (null? ls)
- (return (+ lo lo2) (and hi hi2 (+ hi hi2)))
- (lp (car ls) n 0 0
- (lambda (lo3 hi3)
- (lp2 (cdr ls)
- (+ n (sre-count-submatches (car ls)))
- (+ lo2 lo3)
- (and hi2 hi3 (+ hi2 hi3))))))))
- ((or)
- (let lp2 ((ls (cdr sre)) (n n) (lo2 #f) (hi2 0))
- (if (null? ls)
- (return (+ lo (or lo2 1)) (and hi hi2 (+ hi hi2)))
- (lp (car ls) n 0 0
- (lambda (lo3 hi3)
- (lp2 (cdr ls)
- (+ n (sre-count-submatches (car ls)))
- (if lo2 (min lo2 lo3) lo3)
- (and hi2 hi3 (max hi2 hi3))))))))
- ((if)
- (cond
- ((or (null? (cdr sre)) (null? (cddr sre)))
- (return lo hi))
- (else
- (let ((n1 (sre-count-submatches (car sre)))
- (n2 (sre-count-submatches (cadr sre))))
- (lp (if (or (number? (cadr sre)) (symbol? (cadr sre)))
- 'epsilon
- (cadr sre))
- n lo hi
- (lambda (lo2 hi2)
- (lp (caddr sre) (+ n n1) 0 0
- (lambda (lo3 hi3)
- (lp (if (pair? (cdddr sre))
- (cadddr sre)
- 'epsilon)
- (+ n n1 n2) 0 0
- (lambda (lo4 hi4)
- (return (+ lo2 (min lo3 lo4))
- (and hi2 hi3 hi4
- (+ hi2 (max hi3 hi4))
- ))))))))))))
- ((dsm)
- (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) lo hi return))
- (($ submatch => submatch-named)
- (lp (sre-sequence
- (if (eq? 'submatch (car sre)) (cdr sre) (cddr sre)))
- (+ n 1) lo hi
- (lambda (lo2 hi2)
- (vector-set! sublens n (cons lo2 hi2))
- (return lo2 hi2))))
- ((backref backref-ci)
- (let ((n (cond
- ((number? (cadr sre)) (cadr sre))
- ((assq (cadr sre) names) => cdr)
- (else (error "unknown backreference" (cadr sre))))))
- (cond
- ((or (not (integer? n))
- (not (< 0 n (vector-length sublens))))
- (error "sre-length: invalid backreference" sre))
- ((not (vector-ref sublens n))
- (error "sre-length: invalid forward backreference" sre))
- (else
- (let ((lo2 (car (vector-ref sublens n)))
- (hi2 (cdr (vector-ref sublens n))))
- (return (+ lo lo2) (and hi hi2 (+ hi hi2))))))))
- ((* *?)
- (lp (sre-sequence (cdr sre)) n lo hi (lambda (lo hi) #f))
- (return lo #f))
- ((** **?)
- (cond
- ((or (and (number? (cadr sre))
- (number? (caddr sre))
- (> (cadr sre) (caddr sre)))
- (and (not (cadr sre)) (caddr sre)))
- (return lo hi))
- (else
- (if (caddr sre)
- (lp (sre-sequence (cdddr sre)) n 0 0
- (lambda (lo2 hi2)
- (return (+ lo (* (cadr sre) lo2))
- (and hi hi2 (+ hi (* (caddr sre) hi2))))))
- (lp (sre-sequence (cdddr sre)) n 0 0
- (lambda (lo2 hi2)
- (return (+ lo (* (cadr sre) lo2)) #f)))))))
- ((+)
- (lp (sre-sequence (cdr sre)) n lo hi
- (lambda (lo2 hi2)
- (return (+ lo lo2) #f))))
- ((? ??)
- (lp (sre-sequence (cdr sre)) n lo hi
- (lambda (lo2 hi2)
- (return lo (and hi hi2 (+ hi hi2))))))
- ((= =? >= >=?)
- (lp `(** ,(cadr sre)
- ,(if (memq (car sre) '(>= >=?)) #f (cadr sre))
- ,@(cddr sre))
- n lo hi return))
- ((look-ahead neg-look-ahead look-behind neg-look-behind)
- (return lo hi))
- (else
- (cond
- ((assq (car sre) sre-named-definitions)
- => (lambda (cell)
- (lp (apply (cdr cell) (cdr sre)) n lo hi return)))
- (else
- (error "sre-length-ranges: unknown sre operator" sre)))))))
- ((char? sre)
- (grow 1))
- ((string? sre)
- (grow (string-length sre)))
- ((memq sre '(any nonl))
- (grow 1))
- ((memq sre '(epsilon bos eos bol eol bow eow nwb commit))
- (return lo hi))
- (else
- (let ((cell (assq sre sre-named-definitions)))
- (if cell
- (lp (if (procedure? (cdr cell)) ((cdr cell)) (cdr cell))
- n lo hi return)
- (error "sre-length-ranges: unknown sre" sre)))))))
- sublens))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; SRE Manipulation
-
-;; build a (seq ls ...) sre from a list
-(define (sre-sequence ls)
- (cond
- ((null? ls) 'epsilon)
- ((null? (cdr ls)) (car ls))
- (else (cons 'seq ls))))
-
-;; build a (or ls ...) sre from a list
-(define (sre-alternate ls)
- (cond
- ((null? ls) '(or))
- ((null? (cdr ls)) (car ls))
- (else (cons 'or ls))))
-
-;; returns an equivalent SRE without any match information
-(define (sre-strip-submatches sre)
- (if (not (pair? sre))
- sre
- (case (car sre)
- (($ submatch) (sre-strip-submatches (sre-sequence (cdr sre))))
- ((=> submatch-named) (sre-strip-submatches (sre-sequence (cddr sre))))
- ((dsm) (sre-strip-submatches (sre-sequence (cdddr sre))))
- (else (map sre-strip-submatches sre)))))
-
-;; given a char-set list of chars and strings, flattens them into
-;; chars only
-(define (sre-flatten-ranges ls)
- (let lp ((ls ls) (res '()))
- (cond
- ((null? ls)
- (reverse res))
- ((string? (car ls))
- (lp (append (string->list (car ls)) (cdr ls)) res))
- (else
- (lp (cdr ls) (cons (car ls) res))))))
-
-(define (sre-names sre n names)
- (if (not (pair? sre))
- names
- (case (car sre)
- (($ submatch)
- (sre-names (sre-sequence (cdr sre)) (+ n 1) names))
- ((=> submatch-named)
- (sre-names (sre-sequence (cddr sre))
- (+ n 1)
- (cons (cons (cadr sre) n) names)))
- ((dsm)
- (sre-names (sre-sequence (cdddr sre)) (+ n (cadr sre)) names))
- ((seq : or * + ? *? ?? w/case w/nocase atomic
- look-ahead look-behind neg-look-ahead neg-look-behind)
- (sre-sequence-names (cdr sre) n names))
- ((= >=)
- (sre-sequence-names (cddr sre) n names))
- ((** **?)
- (sre-sequence-names (cdddr sre) n names))
- (else
- names))))
-
-(define (sre-sequence-names ls n names)
- (if (null? ls)
- names
- (sre-sequence-names (cdr ls)
- (+ n (sre-count-submatches (car ls)))
- (sre-names (car ls) n names))))
-
-(define (sre-remove-initial-bos sre)
- (cond
- ((pair? sre)
- (case (car sre)
- ((seq : $ submatch => submatch-named * +)
- (cond
- ((not (pair? (cdr sre)))
- sre)
- ((eq? 'bos (cadr sre))
- (cons (car sre) (cddr sre)))
- (else
- (cons (car sre)
- (cons (sre-remove-initial-bos (cadr sre)) (cddr sre))))))
- ((or)
- (sre-alternate (map sre-remove-initial-bos (cdr sre))))
- (else
- sre)))
- (else
- sre)))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Basic Matching
-
-(define irregex-basic-string-chunker
- (make-irregex-chunker (lambda (x) #f)
- car
- cadr
- caddr
- (lambda (src1 i src2 j)
- (substring (car src1) i j))))
-
-(define (irregex-search x str . o)
- (if (not (string? str)) (error "irregex-search: not a string" str))
- (let ((start (or (and (pair? o) (car o)) 0))
- (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str))))
- (irregex-search/chunked x
- irregex-basic-string-chunker
- (list str start end)
- start)))
-
-(define (irregex-search/chunked x cnk src . o)
- (let* ((irx (irregex x))
- (matches (irregex-new-matches irx))
- (i (if (pair? o) (car o) ((chunker-get-start cnk) src))))
- (irregex-match-chunker-set! matches cnk)
- (irregex-search/matches irx cnk src i matches)))
-
-;; internal routine, can be used in loops to avoid reallocating the
-;; match vector
-(define (irregex-search/matches irx cnk src i matches)
- (cond
- ((irregex-dfa irx)
- (cond
- ((flag-set? (irregex-flags irx) ~searcher?)
- (cond
- ((dfa-match/longest (irregex-dfa irx) cnk src i #f #f matches 0)
- (irregex-match-start-chunk-set! matches 0 src)
- (irregex-match-start-index-set! matches 0 i)
- ((irregex-dfa/extract irx)
- cnk src i
- (%irregex-match-end-chunk matches 0)
- (%irregex-match-end-index matches 0)
- matches)
- matches)
- (else
- #f)))
- ((dfa-match/shortest
- (irregex-dfa/search irx) cnk src i matches 0)
- (let ((dfa (irregex-dfa irx))
- (get-start (chunker-get-start cnk))
- (get-end (chunker-get-end cnk))
- (get-next (chunker-get-next cnk)))
- (let lp1 ((src src) (i i))
- (let ((end (get-end src)))
- (let lp2 ((i i))
- (cond
- ((dfa-match/longest dfa cnk src i #f #f matches 0)
- (irregex-match-start-chunk-set! matches 0 src)
- (irregex-match-start-index-set! matches 0 i)
- ((irregex-dfa/extract irx)
- cnk src i
- (%irregex-match-end-chunk matches 0)
- (%irregex-match-end-index matches 0)
- matches)
- matches)
- ((>= i end)
- (let ((next (get-next src)))
- (and next (lp1 next (get-start next)))))
- (else
- (lp2 (+ i 1)))))))))
- (else
- #f)))
- (else
- (let ((res (irregex-search/backtrack irx cnk src i matches)))
- (if res (%irregex-match-fail-set! res #f))
- res))))
-
-(define (irregex-search/backtrack irx cnk src i matches)
- (let ((matcher (irregex-nfa irx))
- (str ((chunker-get-str cnk) src))
- (end ((chunker-get-end cnk) src))
- (get-next (chunker-get-next cnk))
- (init (cons src i)))
- (if (flag-set? (irregex-flags irx) ~searcher?)
- (matcher cnk init src str i end matches (lambda () #f))
- (let lp ((src2 src)
- (str str)
- (i i)
- (end end))
- (cond
- ((matcher cnk init src2 str i end matches (lambda () #f))
- (irregex-match-start-chunk-set! matches 0 src2)
- (irregex-match-start-index-set! matches 0 i)
- matches)
- ((< i end)
- (lp src2 str (+ i 1) end))
- (else
- (let ((src2 (get-next src2)))
- (if src2
- (lp src2
- ((chunker-get-str cnk) src2)
- ((chunker-get-start cnk) src2)
- ((chunker-get-end cnk) src2))
- #f))))))))
-
-(define (irregex-match irx str . o)
- (if (not (string? str)) (error "irregex-match: not a string" str))
- (let ((start (or (and (pair? o) (car o)) 0))
- (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str))))
- (irregex-match/chunked irx
- irregex-basic-string-chunker
- (list str start end))))
-
-(define (irregex-match/chunked irx cnk src)
- (let* ((irx (irregex irx))
- (matches (irregex-new-matches irx)))
- (irregex-match-chunker-set! matches cnk)
- (cond
- ((irregex-dfa irx)
- (and
- (dfa-match/longest
- (irregex-dfa irx) cnk src ((chunker-get-start cnk) src) #f #f matches 0)
- (= ((chunker-get-end cnk) (%irregex-match-end-chunk matches 0))
- (%irregex-match-end-index matches 0))
- (begin
- (irregex-match-start-chunk-set! matches 0 src)
- (irregex-match-start-index-set! matches
- 0
- ((chunker-get-start cnk) src))
- ((irregex-dfa/extract irx)
- cnk src ((chunker-get-start cnk) src)
- (%irregex-match-end-chunk matches 0)
- (%irregex-match-end-index matches 0)
- matches)
- matches)))
- (else
- (let* ((matcher (irregex-nfa irx))
- (str ((chunker-get-str cnk) src))
- (i ((chunker-get-start cnk) src))
- (end ((chunker-get-end cnk) src))
- (init (cons src i)))
- (let lp ((m (matcher cnk init src str i end matches (lambda () #f))))
- (and m
- (cond
- ((and (not ((chunker-get-next cnk)
- (%irregex-match-end-chunk m 0)))
- (= ((chunker-get-end cnk)
- (%irregex-match-end-chunk m 0))
- (%irregex-match-end-index m 0)))
- (%irregex-match-fail-set! m #f)
- m)
- ((%irregex-match-fail m)
- (lp ((%irregex-match-fail m))))
- (else
- #f)))))))))
-
-(define (irregex-match? . args)
- (and (apply irregex-match args) #t))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; DFA Matching
-
-;; inline these
-(define (dfa-init-state dfa)
- (vector-ref dfa 0))
-(define (dfa-next-state dfa node)
- (vector-ref dfa (cdr node)))
-(define (dfa-final-state? dfa state)
- (car state))
-
-;; this searches for the first end index for which a match is possible
-(define (dfa-match/shortest dfa cnk src start matches index)
- (let ((get-str (chunker-get-str cnk))
- (get-start (chunker-get-start cnk))
- (get-end (chunker-get-end cnk))
- (get-next (chunker-get-next cnk)))
- (let lp1 ((src src) (start start) (state (dfa-init-state dfa)))
- (and
- src
- (let ((str (get-str src))
- (end (get-end src)))
- (let lp2 ((i start) (state state))
- (cond
- ((dfa-final-state? dfa state)
- (cond
- (index
- (irregex-match-end-chunk-set! matches index src)
- (irregex-match-end-index-set! matches index i)))
- #t)
- ((< i end)
- (let* ((ch (string-ref str i))
- (next (find (lambda (x)
- (if (eqv? ch (car x))
- #t
- (and (pair? (car x))
- (char<=? (caar x) ch)
- (char<=? ch (cdar x)))))
- (cdr state))))
- (and next (lp2 (+ i 1) (dfa-next-state dfa next)))))
- (else
- (let ((next (get-next src)))
- (and next (lp1 next (get-start next) state)))))))))))
-
-;; this finds the longest match starting at a given index
-(define (dfa-match/longest dfa cnk src start end-src end matches index)
- (let ((get-str (chunker-get-str cnk))
- (get-start (chunker-get-start cnk))
- (get-end (chunker-get-end cnk))
- (get-next (chunker-get-next cnk))
- (start-is-final? (dfa-final-state? dfa (dfa-init-state dfa))))
- (cond
- (index
- (irregex-match-end-chunk-set! matches index #f)
- (irregex-match-end-index-set! matches index #f)))
- (let lp1 ((src src)
- (start start)
- (state (dfa-init-state dfa))
- (res-src (and start-is-final? src))
- (res-index (and start-is-final? start)))
- (let ((str (get-str src))
- (end (if (eq? src end-src) end (get-end src))))
- (let lp2 ((i start)
- (state state)
- (res-src res-src)
- (res-index res-index))
- (cond
- ((>= i end)
- (cond
- ((and index res-src)
- (irregex-match-end-chunk-set! matches index res-src)
- (irregex-match-end-index-set! matches index res-index)))
- (let ((next (and (not (eq? src end-src)) (get-next src))))
- (if next
- (lp1 next (get-start next) state res-src res-index)
- (and index
- (%irregex-match-end-chunk matches index)
- #t))))
- (else
- (let* ((ch (string-ref str i))
- (cell (find (lambda (x)
- (if (eqv? ch (car x))
- #t
- (and (pair? (car x))
- (char<=? (caar x) ch)
- (char<=? ch (cdar x)))))
- (cdr state))))
- (cond
- (cell
- (let ((next (dfa-next-state dfa cell)))
- (if (dfa-final-state? dfa next)
- (lp2 (+ i 1) next src (+ i 1))
- (lp2 (+ i 1) next res-src res-index))))
- (res-src
- (cond
- (index
- (irregex-match-end-chunk-set! matches index res-src)
- (irregex-match-end-index-set! matches index res-index)))
- #t)
- ((and index (%irregex-match-end-chunk matches index))
- #t)
- (else
- #f))))))))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Named Definitions
-
-(define sre-named-definitions
- `((any . ,*all-chars*)
- (nonl . (- ,*all-chars* (,(string #\newline))))
- (alphabetic . (/ #\a #\z #\A #\Z))
- (alpha . alphabetic)
- (alphanumeric . (/ #\a #\z #\A #\Z #\0 #\9))
- (alphanum . alphanumeric)
- (alnum . alphanumeric)
- (lower-case . (/ #\a #\z))
- (lower . lower-case)
- (upper-case . (/ #\A #\Z))
- (upper . upper-case)
- (numeric . (/ #\0 #\9))
- (num . numeric)
- (digit . numeric)
- (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\.
- #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\}))
- (punct . punctuation)
- (graphic
- . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~))
- (graph . graphic)
- (blank . (or #\space ,(integer->char (- (char->integer #\space) 23))))
- (whitespace . (or blank #\newline))
- (space . whitespace)
- (white . whitespace)
- (printing or graphic whitespace)
- (print . printing)
-
- ;; XXXX we assume a (possibly shifted) ASCII-based ordering
- (control . (/ ,(integer->char (- (char->integer #\space) 32))
- ,(integer->char (- (char->integer #\space) 1))))
- (cntrl . control)
- (hex-digit . (or numeric (/ #\a #\f #\A #\F)))
- (xdigit . hex-digit)
- (ascii . (/ ,(integer->char (- (char->integer #\space) 32))
- ,(integer->char (+ (char->integer #\space) 95))))
- (ascii-nonl . (/ ,(integer->char (- (char->integer #\space) 32))
- ,(integer->char (- (char->integer #\newline) 1))
- ,(integer->char (+ (char->integer #\newline) 1))
- ,(integer->char (+ (char->integer #\space) 95))))
- (newline . (or (seq ,(integer->char (+ (char->integer #\newline) 3))
- #\newline)
- (/ #\newline
- ,(integer->char (+ (char->integer #\newline) 3)))))
-
- ;; ... it's really annoying to support old Scheme48
- (word . (seq bow (+ (or alphanumeric #\_)) eow))
- (utf8-tail-char . (/ ,(integer->char (+ (char->integer #\space) #x60))
- ,(integer->char (+ (char->integer #\space) #xA1))))
- (utf8-2-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xA2))
- ,(integer->char (+ (char->integer #\space) #xBF)))
- utf8-tail-char))
- (utf8-3-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xC0))
- ,(integer->char (+ (char->integer #\space) #xCF)))
- utf8-tail-char
- utf8-tail-char))
- (utf8-4-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xD0))
- ,(integer->char (+ (char->integer #\space) #xD7)))
- utf8-tail-char
- utf8-tail-char
- utf8-tail-char))
- (utf8-any . (or ascii utf8-2-char utf8-3-char utf8-4-char))
- (utf8-nonl . (or ascii-nonl utf8-2-char utf8-3-char utf8-4-char))
-
- ;; extended library patterns
- (integer . (seq (? (or #\+ #\-)) (+ numeric)))
- (real . (seq (+ numeric) (? #\. (+ numeric)) (? (or #\e #\E) integer)))
- ;; slightly more lax than R5RS, allow ->foo, etc.
- (symbol-initial . (or alpha ("!$%&*/:<=>?^_~")))
- (symbol-subsequent . (or symbol-initial digit ("+-.@")))
- (symbol . (or (seq symbol-initial (* symbol-subsequent))
- (seq ("+-") (? symbol-initial (* symbol-subsequent)))
- (seq ".." (* "."))))
- (sexp-space . (seq (* (* space) ";" (* nonl) newline) (+ space)))
- (string . (seq #\" (escape #\\ #\") #\"))
- (escape . ,(lambda (esc . o) `(* (or (~ ,esc ,@o) (seq ,esc any)))))
-
- (ipv4-digit . (seq (? (/ "12")) (? numeric) numeric))
- (ipv4-address . (seq ipv4-digit (= 3 #\. ipv4-digit)))
- ;; XXXX lax, allows multiple double-colons or < 8 terms w/o a ::
- (ipv6-address . (seq (** 0 4 hex-digit)
- (** 1 7 #\: (? #\:) (** 0 4 hex-digit))))
- (ip-address . (or ipv4-address ipv6-address))
- (domain-atom . (+ (or alphanumeric #\_ #\-)))
- (domain . (seq domain-atom (+ #\. domain-atom)))
- ;; XXXX now anything can be a top-level domain, but this is still handy
- (top-level-domain . (w/nocase (or "arpa" "com" "gov" "mil" "net" "org"
- "aero" "biz" "coop" "info" "museum"
- "name" "pro" (= 2 alpha))))
- (domain/common . (seq (+ domain-atom #\.) top-level-domain))
- ;;(email-local-part . (seq (+ (or (~ #\") string))))
- (email-local-part . (+ (or alphanumeric #\_ #\- #\. #\+)))
- (email . (seq email-local-part #\@ domain))
- (url-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\. #\, #\& #\;
- (seq "%" hex-digit hex-digit)))
- (url-final-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\&
- (seq "%" hex-digit hex-digit)))
- (http-url . (w/nocase
- "http" (? "s") "://"
- (or domain/common ipv4-address) ;; (seq "[" ipv6-address "]")
- (? ":" (+ numeric)) ;; port
- ;; path
- (? "/" (* url-char)
- (? "?" (* url-char)) ;; query
- (? "#" (? (* url-char) url-final-char)) ;; fragment
- )))
-
- ))
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; SRE->NFA compilation
-;;
-;; An NFA state is a numbered node with a list of patter->number
-;; transitions, where pattern is either a character, (lo . hi)
-;; character range, or epsilon (indicating an empty transition).
-;; There may be duplicate characters and overlapping ranges - since
-;; it's an NFA we process it by considering all possible transitions.
-
-(define *nfa-presize* 128) ;; constant
-(define *nfa-num-fields* 4) ;; constant
-
-(define (nfa-num-states nfa) (quotient (vector-length nfa) *nfa-num-fields*))
-(define (nfa-start-state nfa) (- (nfa-num-states nfa) 1))
-
-(define (nfa-get-state-trans nfa i)
- (vector-ref nfa (* i *nfa-num-fields*)))
-(define (nfa-set-state-trans! nfa i x)
- (vector-set! nfa (* i *nfa-num-fields*) x))
-(define (nfa-push-state-trans! nfa i x)
- (nfa-set-state-trans! nfa i (cons x (nfa-get-state-trans nfa i))))
-
-(define (nfa-get-epsilons nfa i)
- (vector-ref nfa (+ (* i *nfa-num-fields*) 1)))
-(define (nfa-set-epsilons! nfa i x)
- (vector-set! nfa (+ (* i *nfa-num-fields*) 1) x))
-(define (nfa-add-epsilon! nfa i x)
- (let ((eps (nfa-get-epsilons nfa i)))
- (if (not (memq x eps))
- (nfa-set-epsilons! nfa i (cons x eps)))))
-
-(define (nfa-get-state-closure nfa i)
- (vector-ref nfa (+ (* i *nfa-num-fields*) 2)))
-(define (nfa-set-state-closure! nfa i x)
- (vector-set! nfa (+ (* i *nfa-num-fields*) 2) x))
-
-(define (nfa-get-closure nfa mst)
- (cond ((assoc mst
- (vector-ref nfa (+ (* (nfa-multi-state-hash nfa mst)
- *nfa-num-fields*)
- (- *nfa-num-fields* 1))))
- => cdr)
- (else #f)))
-(define (nfa-add-closure! nfa mst x)
- (let ((i (+ (* (nfa-multi-state-hash nfa mst) *nfa-num-fields*)
- (- *nfa-num-fields* 1))))
- (vector-set! nfa i (cons (cons mst x) (vector-ref nfa i)))))
-
-;; Compile and return the vector of NFA states (in groups of
-;; *nfa-num-fields* packed elements). The start state will be the
-;; last element(s) of the vector, and all remaining states will be in
-;; descending numeric order, with state 0 being the unique accepting
-;; state.
-(define (sre->nfa sre init-flags)
- (let ((buf (make-vector (* *nfa-presize* *nfa-num-fields*) '())))
- ;; we loop over an implicit sequence list
- (define (lp ls n flags next)
- (define (new-state-number state)
- (max n (+ 1 state)))
- (define (add-state! n2 trans-ls)
- (if (>= (* n2 *nfa-num-fields*) (vector-length buf))
- (let ((tmp (make-vector (* 2 (vector-length buf)) '())))
- (do ((i (- (vector-length buf) 1) (- i 1)))
- ((< i 0))
- (vector-set! tmp i (vector-ref buf i)))
- (set! buf tmp)))
- (nfa-set-state-trans! buf n2 trans-ls)
- n2)
- (define (extend-state! next . trans)
- (and next
- (add-state! (new-state-number next)
- (map (lambda (x) (cons x next)) trans))))
- (define (add-char-state! next ch)
- (let ((alt (char-altcase ch)))
- (if (and (flag-set? flags ~case-insensitive?) (not (eqv? ch alt)))
- (extend-state! next ch alt)
- (extend-state! next ch))))
- (if (null? ls)
- next
- (cond
- ((or (eq? 'epsilon (car ls)) (equal? "" (car ls)))
- ;; chars and epsilons go directly into the transition table
- (let ((next (lp (cdr ls) n flags next)))
- (and next
- (let ((new (add-state! (new-state-number next) '())))
- (nfa-add-epsilon! buf new next)
- new))))
- ((string? (car ls))
- ;; process literal strings a char at a time
- (let ((next (lp (cdr ls) n flags next)))
- (and next
- (let lp2 ((i (- (string-length (car ls)) 1))
- (next next))
- (if (< i 0)
- next
- (lp2 (- i 1)
- (add-char-state! next (string-ref (car ls) i))))
- ))))
- ((char? (car ls))
- (add-char-state! (lp (cdr ls) n flags next) (car ls)))
- ((symbol? (car ls))
- (let ((cell (assq (car ls) sre-named-definitions)))
- (and cell
- (lp (cons (if (procedure? (cdr cell))
- ((cdr cell))
- (cdr cell))
- (cdr ls))
- n
- flags
- next))))
- ((pair? (car ls))
- (cond
- ((string? (caar ls))
- ;; enumerated character set
- (lp (cons (sre-alternate (string->list (caar ls))) (cdr ls))
- n
- flags
- next))
- (else
- (case (caar ls)
- ((seq :)
- ;; for an explicit sequence, just append to the list
- (lp (append (cdar ls) (cdr ls)) n flags next))
- ((w/case w/nocase w/utf8 w/noutf8)
- (let* ((next (lp (cdr ls) n flags next))
- (flags ((if (memq (caar ls) '(w/case w/utf8))
- flag-clear
- flag-join)
- flags
- (if (memq (caar ls) '(w/case w/nocase))
- ~case-insensitive?
- ~utf8?))))
- (and next
- (lp (cdar ls) (new-state-number next) flags next))))
- ((/ - & ~)
- (let ((ranges
- (sre->cset (car ls)
- (flag-set? flags ~case-insensitive?))))
- (case (length ranges)
- ((1)
- (extend-state! (lp (cdr ls) n flags next) (car ranges)))
- (else
- (let ((next (lp (cdr ls) n flags next)))
- (and
- next
- (lp (list (sre-alternate
- (map (lambda (x) (if (pair? x)
- (list '/ (car x) (cdr x))
- x))
- ranges)))
- (new-state-number next)
- (flag-clear flags ~case-insensitive?)
- next)))))))
- ((or)
- (let ((next (lp (cdr ls) n flags next)))
- (and
- next
- (if (null? (cdar ls))
- ;; empty (or) always fails
- (add-state! (new-state-number next) '())
- ;; compile both branches and insert epsilon
- ;; transitions to either
- (let* ((b (lp (list (sre-alternate (cddar ls)))
- (new-state-number next)
- flags
- next))
- (a (and b
- (lp (list (cadar ls))
- (new-state-number (max b next))
- flags
- next))))
- (and a
- (let ((c (add-state! (new-state-number a) '())))
- (nfa-add-epsilon! buf c a)
- (nfa-add-epsilon! buf c b)
- c)))))))
- ((?)
- (let ((next (lp (cdr ls) n flags next)))
- ;; insert an epsilon transition directly to next
- (and
- next
- (let ((a (lp (cdar ls) (new-state-number next) flags next)))
- (if a
- (nfa-add-epsilon! buf a next))
- a))))
- ((+ *)
- (let ((next (lp (cdr ls) n flags next)))
- (and
- next
- (let* ((new (lp '(epsilon)
- (new-state-number next)
- flags
- next))
- (a (lp (cdar ls) (new-state-number new) flags new)))
- (cond
- (a
- ;; for *, insert an epsilon transition as in ? above
- (if (eq? '* (caar ls))
- (nfa-add-epsilon! buf a new))
- ;; for both, insert a loop back to self
- (nfa-add-epsilon! buf new a)))
- a))))
- ;; need to add these to the match extractor first,
- ;; but they tend to generate large DFAs
- ;;((=)
- ;; (lp (append (vector->list
- ;; (make-vector (cadar ls)
- ;; (sre-sequence (cddar ls))))
- ;; (cdr ls))
- ;; n flags next))
- ;;((>=)
- ;; (lp (append (vector->list
- ;; (make-vector (- (cadar ls) 1)
- ;; (sre-sequence (cddar ls))))
- ;; (cons `(+ ,@(cddar ls)) (cdr ls)))
- ;; n flags next))
- ;;((**)
- ;; (lp (append (vector->list
- ;; (make-vector (cadar ls)
- ;; (sre-sequence (cdddar ls))))
- ;; (map
- ;; (lambda (x) `(? ,x))
- ;; (vector->list
- ;; (make-vector (- (caddar ls) (cadar ls))
- ;; (sre-sequence (cdddar ls)))))
- ;; (cdr ls))
- ;; n flags next))
- ;; ignore submatches altogether
- (($ submatch)
- (lp (cons (sre-sequence (cdar ls)) (cdr ls)) n flags next))
- ((=> submatch-named)
- (lp (cons (sre-sequence (cddar ls)) (cdr ls)) n flags next))
- (else
- (cond
- ((assq (caar ls) sre-named-definitions)
- => (lambda (cell)
- (if (procedure? (cdr cell))
- (lp (cons (apply (cdr cell) (cdar ls)) (cdr ls))
- n flags next)
- (error "non-procedure in op position" (caar ls)))))
- (else #f)))))))
- (else
- #f))))
- (let ((len (lp (list sre) 1 init-flags 0)))
- (and len
- (let ((nfa (make-vector (* *nfa-num-fields* (+ len 1)))))
- (do ((i (- (vector-length nfa) 1) (- i 1)))
- ((< i 0))
- (vector-set! nfa i (vector-ref buf i)))
- nfa)))))
-
-;; We don't really want to use this, we use the closure compilation
-;; below instead, but this is included for reference and testing the
-;; sre->nfa conversion.
-
-;; (define (nfa-match nfa str)
-;; (let lp ((ls (string->list str)) (state (car nfa)) (epsilons '()))
-;; (if (null? ls)
-;; (zero? (car state))
-;; (any (lambda (m)
-;; (if (eq? 'epsilon (car m))
-;; (and (not (memv (cdr m) epsilons))
-;; (lp ls (assv (cdr m) nfa) (cons (cdr m) epsilons)))
-;; (and (or (eqv? (car m) (car ls))
-;; (and (pair? (car m))
-;; (char<=? (caar m) (car ls))
-;; (char<=? (car ls) (cdar m))))
-;; (lp (cdr ls) (assv (cdr m) nfa) '()))))
-;; (cdr state)))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; NFA multi-state representation
-
-;; Cache closures in a simple hash-table keyed on the smallest state
-;; (define (nfa-multi-state-hash nfa mst)
-;; (car mst))
-
-;; Original sorted list-based representation
-
-;; (define (make-nfa-multi-state nfa)
-;; '())
-
-;; (define (nfa-state->multi-state nfa state)
-;; (list state))
-
-;; (define (nfa-multi-state-copy mst)
-;; (map (lambda (x) x) mst))
-
-;; (define (list->nfa-multi-state nfa ls)
-;; (nfa-multi-state-copy ls))
-
-;; (define (nfa-multi-state-contains? mst i)
-;; (memq i mst))
-
-;; (define (nfa-multi-state-fold mst kons knil)
-;; (fold kons knil mst))
-
-;; (define (nfa-multi-state-add! mst i)
-;; (insert-sorted i mst))
-
-;; (define (nfa-multi-state-add mst i)
-;; (insert-sorted i mst))
-
-;; (define (nfa-multi-state-union a b)
-;; (merge-sorted a b))
-
-;; Sorted List Utilities
-
-;; (define (insert-sorted n ls)
-;; (cond
-;; ((null? ls)
-;; (cons n '()))
-;; ((<= n (car ls))
-;; (if (= n (car ls))
-;; ls
-;; (cons n ls)))
-;; (else
-;; (cons (car ls) (insert-sorted n (cdr ls))))))
-
-;; (define (insert-sorted! n ls)
-;; (cond
-;; ((null? ls)
-;; (cons n '()))
-;; ((<= n (car ls))
-;; (if (= n (car ls))
-;; ls
-;; (cons n ls)))
-;; (else
-;; (let lp ((head ls) (tail (cdr ls)))
-;; (cond ((or (null? tail) (< n (car tail)))
-;; (set-cdr! head (cons n tail)))
-;; ((> n (car tail))
-;; (lp tail (cdr tail)))))
-;; ls)))
-
-;; (define (merge-sorted a b)
-;; (cond ((null? a) b)
-;; ((null? b) a)
-;; ((< (car a) (car b))
-;; (cons (car a) (merge-sorted (cdr a) b)))
-;; ((> (car a) (car b))
-;; (cons (car b) (merge-sorted a (cdr b))))
-;; (else (merge-sorted (cdr a) b))))
-
-;; ========================================================= ;;
-
-;; Presized bit-vector based
-
-(define (nfa-multi-state-hash nfa mst)
- (modulo (vector-ref mst 0) (nfa-num-states nfa)))
-
-(define (make-nfa-multi-state nfa)
- (make-vector (quotient (+ (nfa-num-states nfa) 24 -1) 24) 0))
-
-(define (nfa-state->multi-state nfa state)
- (nfa-multi-state-add! (make-nfa-multi-state nfa) state))
-
-(define (nfa-multi-state-copy mst)
- (let ((res (make-vector (vector-length mst))))
- (do ((i (- (vector-length mst) 1) (- i 1)))
- ((< i 0) res)
- (vector-set! res i (vector-ref mst i)))))
-
-(define (nfa-multi-state-contains? mst i)
- (let ((cell (quotient i 24))
- (bit (remainder i 24)))
- (not (zero? (bit-and (vector-ref mst cell) (bit-shl 1 bit))))))
-
-(define (nfa-multi-state-add! mst i)
- (let ((cell (quotient i 24))
- (bit (remainder i 24)))
- (vector-set! mst cell (bit-ior (vector-ref mst cell) (bit-shl 1 bit)))
- mst))
-
-(define (nfa-multi-state-add mst i)
- (nfa-multi-state-add! (nfa-multi-state-copy mst) i))
-
-(define (nfa-multi-state-union! a b)
- (do ((i (- (vector-length a) 1) (- i 1)))
- ((< i 0) a)
- (vector-set! a i (bit-ior (vector-ref a i) (vector-ref b i)))))
-
-(define (nfa-multi-state-union a b)
- (nfa-multi-state-union! (nfa-multi-state-copy a) b))
-
-(define (nfa-multi-state-fold mst kons knil)
- (let ((limit (vector-length mst)))
- (let lp1 ((i 0)
- (acc knil))
- (if (>= i limit)
- acc
- (let lp2 ((n (vector-ref mst i))
- (acc acc))
- (if (zero? n)
- (lp1 (+ i 1) acc)
- (let* ((n2 (bit-and n (- n 1)))
- (n-tail (- n n2))
- (bit (+ (* i 24) (integer-log n-tail))))
- (lp2 n2 (kons bit acc)))))))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; NFA->DFA compilation
-;;
-;; During processing, the DFA is a list of the form:
-;;
-;; ((NFA-states ...) accepting-state? transitions ...)
-;;
-;; where the transitions are as in the NFA, except there are no
-;; epsilons, duplicate characters or overlapping char-set ranges, and
-;; the states moved to are closures (sets of NFA states). Multiple
-;; DFA states may be accepting states.
-
-(define (nfa->dfa nfa . o)
- (let ((max-states (and (pair? o) (car o))))
- (let lp ((ls (list (nfa-cache-state-closure! nfa (nfa-start-state nfa))))
- (i 0)
- (res '()))
- (cond
- ((null? ls)
- (dfa-renumber nfa (reverse res)))
- ((assoc (car ls) res) ;; already seen this combination of states
- (lp (cdr ls) i res))
- ((and max-states (> i max-states)) ;; too many DFA states
- #f)
- (else
- (let* ((states (car ls))
- (trans (nfa-state-transitions nfa states))
- (accept? (and (nfa-multi-state-contains? states 0) #t)))
- (lp (append (map cdr trans) (cdr ls))
- (+ i 1)
- `((,states ,accept? ,@trans) ,@res))))))))
-
-;; When the conversion is complete we renumber the DFA sets-of-states
-;; in order and convert the result to a vector for fast lookup.
-(define (dfa-renumber nfa dfa)
- (let* ((len (length dfa))
- (states (make-vector (nfa-num-states nfa) '()))
- (res (make-vector len)))
- (define (renumber mst)
- (cdr (assoc mst (vector-ref states (nfa-multi-state-hash nfa mst)))))
- (let lp ((ls dfa) (i 0))
- (cond ((pair? ls)
- (let ((j (nfa-multi-state-hash nfa (caar ls))))
- (vector-set! states j (cons (cons (caar ls) i)
- (vector-ref states j))))
- (lp (cdr ls) (+ i 1)))))
- (let lp ((ls dfa) (i 0))
- (cond ((pair? ls)
- (for-each
- (lambda (x) (set-cdr! x (renumber (cdr x))))
- (cddar ls))
- (vector-set! res i (cdar ls))
- (lp (cdr ls) (+ i 1)))))
- res))
-
-;; Extract all distinct characters or ranges and the potential states
-;; they can transition to from a given set of states. Any ranges that
-;; would overlap with distinct characters are split accordingly.
-(define (nfa-state-transitions nfa states)
- (let ((res (nfa-multi-state-fold
- states
- (lambda (st res)
- (fold (lambda (trans res)
- (nfa-join-transitions! nfa res (car trans) (cdr trans)))
- res
- (nfa-get-state-trans nfa st)))
- '())))
- (for-each (lambda (x) (set-cdr! x (nfa-closure nfa (cdr x)))) res)
- res))
-
-(define (nfa-join-transitions! nfa existing elt state)
- (define (join! ls elt state)
- (if (not elt)
- ls
- (nfa-join-transitions! nfa ls elt state)))
- (cond
- ((char? elt)
- (let lp ((ls existing) (res '()))
- (cond
- ((null? ls)
- ;; done, just cons this on to the original list
- (cons (cons elt (nfa-state->multi-state nfa state)) existing))
- ((eq? elt (caar ls))
- ;; add a new state to an existing char
- (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
- existing)
- ((and (pair? (caar ls))
- (char<=? (caaar ls) elt)
- (char<=? elt (cdaar ls)))
- ;; split a range
- (apply
- (lambda (left right)
- (let ((left-copy (nfa-multi-state-copy (cdar ls)))
- (right-copy (nfa-multi-state-copy (cdar ls))))
- (cons (cons elt (nfa-multi-state-add! (cdar ls) state))
- (append (if left (list (cons left left-copy)) '())
- (if right (list (cons right right-copy)) '())
- res
- (cdr ls)))))
- (split-char-range (caar ls) elt)))
- (else
- ;; keep looking
- (lp (cdr ls) (cons (car ls) res))))))
- (else
- (let ((lo (car elt))
- (hi (cdr elt)))
- (let lp ((ls existing) (res '()))
- (cond
- ((null? ls)
- ;; done, just cons this on to the original list
- (cons (cons elt (nfa-state->multi-state nfa state)) existing))
- ((and (char? (caar ls)) (char<=? lo (caar ls)) (char<=? (caar ls) hi))
- ;; range enclosing a character
- (apply
- (lambda (left right)
- (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
- (join! (join! existing left state) right state))
- (split-char-range elt (caar ls))))
- ((and (pair? (caar ls))
- (or (and (char<=? (caaar ls) hi) (char<=? lo (cdaar ls)))
- (and (char<=? hi (caaar ls)) (char<=? (cdaar ls) lo))))
- ;; overlapping ranges
- (apply
- (lambda (left1 left2 same right1 right2) ;; 5 regions
- (let ((right1-copy (nfa-multi-state-copy (cdar ls)))
- (right2-copy (nfa-multi-state-copy (cdar ls))))
- (set-car! (car ls) same)
- (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
- (let* ((res (if right1
- (cons (cons right1 right1-copy) existing)
- existing))
- (res (if right2
- (cons (cons right2 right2-copy) res)
- res)))
- (join! (join! res left1 state) left2 state))))
- (intersect-char-ranges elt (caar ls))))
- (else
- (lp (cdr ls) (cons (car ls) res)))))))))
-
-(define (char-range c1 c2)
- (if (eqv? c1 c2) c1 (cons c1 c2)))
-
-;; assumes ch is included in the range
-(define (split-char-range range ch)
- (list
- (and (not (eqv? ch (car range)))
- (char-range (car range) (integer->char (- (char->integer ch) 1))))
- (and (not (eqv? ch (cdr range)))
- (char-range (integer->char (+ (char->integer ch) 1)) (cdr range)))))
-
-;; returns 5 (possibly #f) char ranges:
-;; a-only-1 a-only-2 a-and-b b-only-1 b-only-2
-(define (intersect-char-ranges a b)
- (if (char>? (car a) (car b))
- (reverse (intersect-char-ranges b a))
- (let ((a-lo (car a))
- (a-hi (cdr a))
- (b-lo (car b))
- (b-hi (cdr b)))
- (list
- (and (char<? a-lo b-lo)
- (char-range a-lo (integer->char (- (char->integer b-lo) 1))))
- (and (char>? a-hi b-hi)
- (char-range (integer->char (+ (char->integer b-hi) 1)) a-hi))
- (char-range b-lo (if (char<? b-hi a-hi) b-hi a-hi))
- #f
- (and (char>? b-hi a-hi)
- (char-range (integer->char (+ (char->integer a-hi) 1)) b-hi))))))
-
-(define (nfa-cache-state-closure! nfa state)
- (let ((cached (nfa-get-state-closure nfa state)))
- (cond
- ((not (null? cached))
- cached)
- (else
- (let ((res (nfa-state-closure-internal nfa state)))
- (nfa-set-state-closure! nfa state res)
- res)))))
-
-;; The `closure' of a list of NFA states - all states that can be
-;; reached from any of them using any number of epsilon transitions.
-(define (nfa-state-closure-internal nfa state)
- (let lp ((ls (list state))
- (res (make-nfa-multi-state nfa)))
- (cond
- ((null? ls)
- res)
- ((nfa-multi-state-contains? res (car ls))
- (lp (cdr ls) res))
- (else
- (lp (append (nfa-get-epsilons nfa (car ls)) (cdr ls))
- (nfa-multi-state-add! res (car ls)))))))
-
-(define (nfa-closure-internal nfa states)
- (nfa-multi-state-fold
- states
- (lambda (st res)
- (nfa-multi-state-union! res (nfa-cache-state-closure! nfa st)))
- (make-nfa-multi-state nfa)))
-
-(define (nfa-closure nfa states)
- (or (nfa-get-closure nfa states)
- (let ((res (nfa-closure-internal nfa states)))
- (nfa-add-closure! nfa states res)
- res)))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Match Extraction
-;;
-;; DFAs don't give us match information, so once we match and
-;; determine the start and end, we need to recursively break the
-;; problem into smaller DFAs to get each submatch.
-;;
-;; See http://compilers.iecc.com/comparch/article/07-10-026
-
-(define (match-vector-ref v i) (vector-ref v (+ 3 i)))
-
-(define (match-vector-set! v i x) (vector-set! v (+ 3 i) x))
-
-(define (sre-match-extractor sre num-submatches)
- (let* ((tmp (+ num-submatches 1))
- (tmp-end-src-offset (+ 2 (* tmp 4)))
- (tmp-end-index-offset (+ 3 (* tmp 4))))
- (let lp ((sre sre) (n 1) (submatch-deps? #f))
- (cond
- ((not (sre-has-submatches? sre))
- (if (not submatch-deps?)
- (lambda (cnk start i end j matches) #t)
- (let ((dfa (nfa->dfa (sre->nfa sre ~none))))
- (lambda (cnk start i end j matches)
- (dfa-match/longest dfa cnk start i end j matches tmp)))))
- ((pair? sre)
- (case (car sre)
- ((: seq)
- (let* ((right (sre-sequence (cddr sre)))
- (match-left (lp (cadr sre) n #t))
- (match-right
- (lp right (+ n (sre-count-submatches (cadr sre))) #t)))
- (lambda (cnk start i end j matches)
- (let lp1 ((end2 end) (j2 j) (best-src #f) (best-index #f))
- (let ((limit (if (eq? start end2)
- i
- ((chunker-get-start cnk) end2))))
- (let lp2 ((k j2) (best-src best-src) (best-index best-index))
- (if (< k limit)
- (cond
- ((not (eq? start end2))
- (let ((prev (chunker-prev-chunk cnk start end2)))
- (lp1 prev
- ((chunker-get-end cnk) prev)
- best-src
- best-index)))
- (best-src
- (match-vector-set! matches tmp-end-src-offset best-src)
- (match-vector-set! matches tmp-end-index-offset best-index)
- #t)
- (else
- #f))
- (if (and (match-left cnk start i end2 k matches)
- (eq? end2 (match-vector-ref matches
- tmp-end-src-offset))
- (eqv? k (match-vector-ref matches
- tmp-end-index-offset))
- (match-right cnk end2 k end j matches))
- (let ((right-src
- (match-vector-ref matches tmp-end-src-offset))
- (right
- (match-vector-ref matches tmp-end-index-offset)))
- (cond
- ((and (eq? end right-src) (eqv? j right))
- (match-vector-set! matches tmp-end-src-offset end)
- (match-vector-set! matches tmp-end-index-offset j)
- #t)
- ((or (not best-src)
- (if (eq? best-src right-src)
- (> right best-index)
- (chunk-before? cnk
- best-src
- right-src)))
- (lp2 (- k 1) right-src right))
- (else
- (lp2 (- k 1) best-src best-index))))
- (lp2 (- k 1) best-src best-index)))))))))
- ((or)
- (if (null? (cdr sre))
- (lambda (cnk start i end j matches) #f)
- (let* ((rest (sre-alternate (cddr sre)))
- (match-first
- (lp (cadr sre) n #t))
- (match-rest
- (lp rest
- (+ n (sre-count-submatches (cadr sre)))
- submatch-deps?)))
- (lambda (cnk start i end j matches)
- (or (and (match-first cnk start i end j matches)
- (eq? end (match-vector-ref matches tmp-end-src-offset))
- (eqv? j (match-vector-ref matches tmp-end-index-offset)))
- (match-rest cnk start i end j matches))))))
- ((* +)
- (letrec ((match-once
- (lp (sre-sequence (cdr sre)) n #t))
- (match-all
- (lambda (cnk start i end j matches)
- (if (match-once cnk start i end j matches)
- (let ((src (match-vector-ref matches tmp-end-src-offset))
- (k (match-vector-ref matches tmp-end-index-offset)))
- (if (and src (or (not (eq? start src)) (< i k)))
- (match-all cnk src k end j matches)
- #t))
- (begin
- (match-vector-set! matches tmp-end-src-offset start)
- (match-vector-set! matches tmp-end-index-offset i)
- #t)))))
- (if (eq? '* (car sre))
- match-all
- (lambda (cnk start i end j matches)
- (and (match-once cnk start i end j matches)
- (let ((src (match-vector-ref matches tmp-end-src-offset))
- (k (match-vector-ref matches tmp-end-index-offset)))
- (match-all cnk src k end j matches)))))))
- ((?)
- (let ((match-once (lp (sre-sequence (cdr sre)) n #t)))
- (lambda (cnk start i end j matches)
- (cond
- ((match-once cnk start i end j matches)
- #t)
- (else
- (match-vector-set! matches tmp-end-src-offset start)
- (match-vector-set! matches tmp-end-index-offset i)
- #t)))))
- (($ submatch => submatch-named)
- (let ((match-one
- (lp (sre-sequence (if (memq (car sre) '($ submatch))
- (cdr sre)
- (cddr sre)))
- (+ n 1)
- #t))
- (start-src-offset (* n 4))
- (start-index-offset (+ 1 (* n 4)))
- (end-src-offset (+ 2 (* n 4)))
- (end-index-offset (+ 3 (* n 4))))
- (lambda (cnk start i end j matches)
- (cond
- ((match-one cnk start i end j matches)
- (match-vector-set! matches start-src-offset start)
- (match-vector-set! matches start-index-offset i)
- (match-vector-set! matches end-src-offset
- (match-vector-ref matches tmp-end-src-offset))
- (match-vector-set! matches end-index-offset
- (match-vector-ref matches tmp-end-index-offset))
- #t)
- (else
- #f)))))
- (else
- (error "unknown regexp operator" (car sre)))))
- (else
- (error "unknown regexp" sre))))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Closure Compilation
-;;
-;; We use this for non-regular expressions instead of an interpreted
-;; NFA matcher. We use backtracking anyway, but this gives us more
-;; freedom of implementation, allowing us to support patterns that
-;; can't be represented in the above NFA representation.
-
-(define (sre->procedure sre . o)
- (define names
- (if (and (pair? o) (pair? (cdr o))) (cadr o) (sre-names sre 1 '())))
- (let lp ((sre sre)
- (n 1)
- (flags (if (pair? o) (car o) ~none))
- (next (lambda (cnk init src str i end matches fail)
- (irregex-match-start-chunk-set! matches 0 (car init))
- (irregex-match-start-index-set! matches 0 (cdr init))
- (irregex-match-end-chunk-set! matches 0 src)
- (irregex-match-end-index-set! matches 0 i)
- (%irregex-match-fail-set! matches fail)
- matches)))
- ;; XXXX this should be inlined
- (define (rec sre) (lp sre n flags next))
- (cond
- ((pair? sre)
- (if (string? (car sre))
- (sre-cset->procedure
- (sre->cset (car sre) (flag-set? flags ~case-insensitive?))
- next)
- (case (car sre)
- ((~ - & /)
- (sre-cset->procedure
- (sre->cset sre (flag-set? flags ~case-insensitive?))
- next))
- ((or)
- (case (length (cdr sre))
- ((0) (lambda (cnk init src str i end matches fail) (fail)))
- ((1) (rec (cadr sre)))
- (else
- (let* ((first (rec (cadr sre)))
- (rest (lp (sre-alternate (cddr sre))
- (+ n (sre-count-submatches (cadr sre)))
- flags
- next)))
- (lambda (cnk init src str i end matches fail)
- (first cnk init src str i end matches
- (lambda ()
- (rest cnk init src str i end matches fail))))))))
- ((w/case)
- (lp (sre-sequence (cdr sre))
- n
- (flag-clear flags ~case-insensitive?)
- next))
- ((w/nocase)
- (lp (sre-sequence (cdr sre))
- n
- (flag-join flags ~case-insensitive?)
- next))
- ((w/utf8)
- (lp (sre-sequence (cdr sre)) n (flag-join flags ~utf8?) next))
- ((w/noutf8)
- (lp (sre-sequence (cdr sre)) n (flag-clear flags ~utf8?) next))
- ((seq :)
- (case (length (cdr sre))
- ((0) next)
- ((1) (rec (cadr sre)))
- (else
- (let ((rest (lp (sre-sequence (cddr sre))
- (+ n (sre-count-submatches (cadr sre)))
- flags
- next)))
- (lp (cadr sre) n flags rest)))))
- ((?)
- (let ((body (rec (sre-sequence (cdr sre)))))
- (lambda (cnk init src str i end matches fail)
- (body cnk init src str i end matches
- (lambda () (next cnk init src str i end matches fail))))))
- ((??)
- (let ((body (rec (sre-sequence (cdr sre)))))
- (lambda (cnk init src str i end matches fail)
- (next cnk init src str i end matches
- (lambda () (body cnk init src str i end matches fail))))))
- ((*)
- (cond
- ((sre-empty? (sre-sequence (cdr sre)))
- (error "invalid sre: empty *" sre))
- (else
- (letrec
- ((body
- (lp (sre-sequence (cdr sre))
- n
- flags
- (lambda (cnk init src str i end matches fail)
- (body cnk init src str i end matches
- (lambda ()
- (next cnk init src str i end matches fail)
- ))))))
- (lambda (cnk init src str i end matches fail)
- (body cnk init src str i end matches
- (lambda ()
- (next cnk init src str i end matches fail))))))))
- ((*?)
- (cond
- ((sre-empty? (sre-sequence (cdr sre)))
- (error "invalid sre: empty *?" sre))
- (else
- (letrec
- ((body
- (lp (sre-sequence (cdr sre))
- n
- flags
- (lambda (cnk init src str i end matches fail)
- (next cnk init src str i end matches
- (lambda ()
- (body cnk init src str i end matches fail)
- ))))))
- (lambda (cnk init src str i end matches fail)
- (next cnk init src str i end matches
- (lambda ()
- (body cnk init src str i end matches fail))))))))
- ((+)
- (lp (sre-sequence (cdr sre))
- n
- flags
- (rec (list '* (sre-sequence (cdr sre))))))
- ((=)
- (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre))))
- ((>=)
- (rec `(** ,(cadr sre) #f ,@(cddr sre))))
- ((** **?)
- (cond
- ((or (and (number? (cadr sre))
- (number? (caddr sre))
- (> (cadr sre) (caddr sre)))
- (and (not (cadr sre)) (caddr sre)))
- (lambda (cnk init src str i end matches fail) (fail)))
- (else
- (let* ((from (cadr sre))
- (to (caddr sre))
- (? (if (eq? '** (car sre)) '? '??))
- (* (if (eq? '** (car sre)) '* '*?))
- (sre (sre-sequence (cdddr sre)))
- (x-sre (sre-strip-submatches sre))
- (next (if to
- (if (= from to)
- next
- (fold (lambda (x next)
- (lp `(,? ,sre) n flags next))
- next
- (zero-to (- to from))))
- (rec `(,* ,sre)))))
- (if (zero? from)
- next
- (lp `(seq ,@(map (lambda (x) x-sre) (zero-to (- from 1)))
- ,sre)
- n
- flags
- next))))))
- ((word)
- (rec `(seq bow ,@(cdr sre) eow)))
- ((word+)
- (rec `(seq bow (+ (& (or alphanumeric "_")
- (or ,@(cdr sre)))) eow)))
- ((posix-string)
- (rec (string->sre (cadr sre))))
- ((look-ahead)
- (let ((check
- (lp (sre-sequence (cdr sre))
- n
- flags
- (lambda (cnk init src str i end matches fail) i))))
- (lambda (cnk init src str i end matches fail)
- (if (check cnk init src str i end matches (lambda () #f))
- (next cnk init src str i end matches fail)
- (fail)))))
- ((neg-look-ahead)
- (let ((check
- (lp (sre-sequence (cdr sre))
- n
- flags
- (lambda (cnk init src str i end matches fail) i))))
- (lambda (cnk init src str i end matches fail)
- (if (check cnk init src str i end matches (lambda () #f))
- (fail)
- (next cnk init src str i end matches fail)))))
- ((look-behind neg-look-behind)
- (let ((check
- (lp (sre-sequence
- (cons '(* any) (append (cdr sre) '(eos))))
- n
- flags
- (lambda (cnk init src str i end matches fail) i))))
- (lambda (cnk init src str i end matches fail)
- (let* ((prev ((chunker-get-substring cnk)
- (car init)
- (cdr init)
- src
- i))
- (len (string-length prev))
- (src2 (list prev 0 len)))
- (if ((if (eq? (car sre) 'look-behind) (lambda (x) x) not)
- (check irregex-basic-string-chunker
- (cons src2 0) src2 prev 0 len matches (lambda () #f)))
- (next cnk init src str i end matches fail)
- (fail))))))
- ((atomic)
- (let ((once
- (lp (sre-sequence (cdr sre))
- n
- flags
- (lambda (cnk init src str i end matches fail) i))))
- (lambda (cnk init src str i end matches fail)
- (let ((j (once cnk init src str i end matches (lambda () #f))))
- (if j
- (next cnk init src str j end matches fail)
- (fail))))))
- ((if)
- (let* ((test-submatches (sre-count-submatches (cadr sre)))
- (pass (lp (caddr sre) flags (+ n test-submatches) next))
- (fail (if (pair? (cdddr sre))
- (lp (cadddr sre)
- (+ n test-submatches
- (sre-count-submatches (caddr sre)))
- flags
- next)
- (lambda (cnk init src str i end matches fail)
- (fail)))))
- (cond
- ((or (number? (cadr sre)) (symbol? (cadr sre)))
- (let ((index
- (if (symbol? (cadr sre))
- (cond
- ((assq (cadr sre) names) => cdr)
- (else
- (error "unknown named backref in SRE IF" sre)))
- (cadr sre))))
- (lambda (cnk init src str i end matches fail2)
- (if (%irregex-match-end-chunk matches index)
- (pass cnk init src str i end matches fail2)
- (fail cnk init src str i end matches fail2)))))
- (else
- (let ((test (lp (cadr sre) n flags pass)))
- (lambda (cnk init src str i end matches fail2)
- (test cnk init src str i end matches
- (lambda () (fail cnk init src str i end matches fail2)))
- ))))))
- ((backref backref-ci)
- (let ((n (cond ((number? (cadr sre)) (cadr sre))
- ((assq (cadr sre) names) => cdr)
- (else (error "unknown backreference" (cadr sre)))))
- (compare (if (or (eq? (car sre) 'backref-ci)
- (flag-set? flags ~case-insensitive?))
- string-ci=?
- string=?)))
- (lambda (cnk init src str i end matches fail)
- (let ((s (irregex-match-substring matches n)))
- (if (not s)
- (fail)
- ;; XXXX create an abstract subchunk-compare
- (let lp ((src src)
- (str str)
- (i i)
- (end end)
- (j 0)
- (len (string-length s)))
- (cond
- ((<= len (- end i))
- (cond
- ((compare (substring s j (string-length s))
- (substring str i (+ i len)))
- (next cnk init src str (+ i len) end matches fail))
- (else
- (fail))))
- (else
- (cond
- ((compare (substring s j (+ j (- end i)))
- (substring str i end))
- (let ((src2 ((chunker-get-next cnk) src)))
- (if src2
- (lp src2
- ((chunker-get-str cnk) src2)
- ((chunker-get-start cnk) src2)
- ((chunker-get-end cnk) src2)
- (+ j (- end i))
- (- len (- end i)))
- (fail))))
- (else
- (fail)))))))))))
- ((dsm)
- (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) flags next))
- (($ submatch)
- (let ((body
- (lp (sre-sequence (cdr sre))
- (+ n 1)
- flags
- (lambda (cnk init src str i end matches fail)
- (let ((old-source
- (%irregex-match-end-chunk matches n))
- (old-index
- (%irregex-match-end-index matches n)))
- (irregex-match-end-chunk-set! matches n src)
- (irregex-match-end-index-set! matches n i)
- (next cnk init src str i end matches
- (lambda ()
- (irregex-match-end-chunk-set!
- matches n old-source)
- (irregex-match-end-index-set!
- matches n old-index)
- (fail))))))))
- (lambda (cnk init src str i end matches fail)
- (let ((old-source (%irregex-match-start-chunk matches n))
- (old-index (%irregex-match-start-index matches n)))
- (irregex-match-start-chunk-set! matches n src)
- (irregex-match-start-index-set! matches n i)
- (body cnk init src str i end matches
- (lambda ()
- (irregex-match-start-chunk-set!
- matches n old-source)
- (irregex-match-start-index-set!
- matches n old-index)
- (fail)))))))
- ((=> submatch-named)
- (rec `(submatch ,@(cddr sre))))
- (else
- (error "unknown regexp operator" sre)))))
- ((symbol? sre)
- (case sre
- ((any)
- (lambda (cnk init src str i end matches fail)
- (if (< i end)
- (next cnk init src str (+ i 1) end matches fail)
- (let ((src2 ((chunker-get-next cnk) src)))
- (if src2
- (let ((str2 ((chunker-get-str cnk) src2))
- (i2 ((chunker-get-start cnk) src2))
- (end2 ((chunker-get-end cnk) src2)))
- (next cnk init src2 str2 (+ i2 1) end2 matches fail))
- (fail))))))
- ((nonl)
- (lambda (cnk init src str i end matches fail)
- (if (< i end)
- (if (not (eqv? #\newline (string-ref str i)))
- (next cnk init src str (+ i 1) end matches fail)
- (fail))
- (let ((src2 ((chunker-get-next cnk) src)))
- (if src2
- (let ((str2 ((chunker-get-str cnk) src2))
- (i2 ((chunker-get-start cnk) src2))
- (end2 ((chunker-get-end cnk) src2)))
- (if (not (eqv? #\newline (string-ref str2 i2)))
- (next cnk init src2 str2 (+ i2 1) end2 matches fail)
- (fail)))
- (fail))))))
- ((bos)
- (lambda (cnk init src str i end matches fail)
- (if (and (eq? src (car init)) (eqv? i (cdr init)))
- (next cnk init src str i end matches fail)
- (fail))))
- ((bol)
- (lambda (cnk init src str i end matches fail)
- (if (or (and (eq? src (car init)) (eqv? i (cdr init)))
- (and (> i ((chunker-get-start cnk) src))
- (eqv? #\newline (string-ref str (- i 1)))))
- (next cnk init src str i end matches fail)
- (fail))))
- ((bow)
- (lambda (cnk init src str i end matches fail)
- (if (and (or (if (> i ((chunker-get-start cnk) src))
- (not (char-alphanumeric? (string-ref str (- i 1))))
- (let ((ch (chunker-prev-char cnk src end)))
- (and ch (not (char-alphanumeric? ch)))))
- (and (eq? src (car init)) (eqv? i (cdr init))))
- (if (< i end)
- (char-alphanumeric? (string-ref str i))
- (let ((next ((chunker-get-next cnk) src)))
- (and next
- (char-alphanumeric?
- (string-ref ((chunker-get-str cnk) next)
- ((chunker-get-start cnk) next)))))))
- (next cnk init src str i end matches fail)
- (fail))))
- ((eos)
- (lambda (cnk init src str i end matches fail)
- (if (and (>= i end) (not ((chunker-get-next cnk) src)))
- (next cnk init src str i end matches fail)
- (fail))))
- ((eol)
- (lambda (cnk init src str i end matches fail)
- (if (if (< i end)
- (eqv? #\newline (string-ref str i))
- (let ((src2 ((chunker-get-next cnk) src)))
- (if (not src2)
- #t
- (eqv? #\newline
- (string-ref ((chunker-get-str cnk) src2)
- ((chunker-get-start cnk) src2))))))
- (next cnk init src str i end matches fail)
- (fail))))
- ((eow)
- (lambda (cnk init src str i end matches fail)
- (if (and (if (< i end)
- (not (char-alphanumeric? (string-ref str i)))
- (let ((ch (chunker-next-char cnk src)))
- (or (not ch) (not (char-alphanumeric? ch)))))
- (if (> i ((chunker-get-start cnk) src))
- (char-alphanumeric? (string-ref str (- i 1)))
- (let ((prev (chunker-prev-char cnk init src)))
- (or (not prev) (char-alphanumeric? prev)))))
- (next cnk init src str i end matches fail)
- (fail))))
- ((nwb) ;; non-word-boundary
- (lambda (cnk init src str i end matches fail)
- (let ((c1 (if (< i end)
- (string-ref str i)
- (chunker-next-char cnk src)))
- (c2 (if (> i ((chunker-get-start cnk) src))
- (string-ref str (- i 1))
- (chunker-prev-char cnk init src))))
- (if (and c1 c2
- (if (char-alphanumeric? c1)
- (char-alphanumeric? c2)
- (not (char-alphanumeric? c2))))
- (next cnk init src str i end matches fail)
- (fail)))))
- ((epsilon)
- next)
- (else
- (let ((cell (assq sre sre-named-definitions)))
- (if cell
- (rec (cdr cell))
- (error "unknown regexp" sre))))))
- ((char? sre)
- (if (flag-set? flags ~case-insensitive?)
- ;; case-insensitive
- (lambda (cnk init src str i end matches fail)
- (if (>= i end)
- (let lp ((src2 ((chunker-get-next cnk) src)))
- (if src2
- (let ((str2 ((chunker-get-str cnk) src2))
- (i2 ((chunker-get-start cnk) src2))
- (end2 ((chunker-get-end cnk) src2)))
- (if (>= i2 end2)
- (lp ((chunker-get-next cnk) src2))
- (if (char-ci=? sre (string-ref str2 i2))
- (next cnk init src2 str2 (+ i2 1) end2
- matches fail)
- (fail))))
- (fail)))
- (if (char-ci=? sre (string-ref str i))
- (next cnk init src str (+ i 1) end matches fail)
- (fail))))
- ;; case-sensitive
- (lambda (cnk init src str i end matches fail)
- (if (>= i end)
- (let lp ((src2 ((chunker-get-next cnk) src)))
- (if src2
- (let ((str2 ((chunker-get-str cnk) src2))
- (i2 ((chunker-get-start cnk) src2))
- (end2 ((chunker-get-end cnk) src2)))
- (if (>= i2 end2)
- (lp ((chunker-get-next cnk) src2))
- (if (char=? sre (string-ref str2 i2))
- (next cnk init src2 str2 (+ i2 1) end2
- matches fail)
- (fail))))
- (fail)))
- (if (char=? sre (string-ref str i))
- (next cnk init src str (+ i 1) end matches fail)
- (fail))))
- ))
- ((string? sre)
- (rec (sre-sequence (string->list sre)))
-;; XXXX reintroduce faster string matching on chunks
-;; (if (flag-set? flags ~case-insensitive?)
-;; (rec (sre-sequence (string->list sre)))
-;; (let ((len (string-length sre)))
-;; (lambda (cnk init src str i end matches fail)
-;; (if (and (<= (+ i len) end)
-;; (%substring=? sre str 0 i len))
-;; (next str (+ i len) matches fail)
-;; (fail)))))
- )
- (else
- (error "unknown regexp" sre)))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Character Sets
-;;
-;; Simple character sets as lists of ranges, as used in the NFA/DFA
-;; compilation. This is not especially efficient, but is portable and
-;; scalable for any range of character sets.
-
-(define (sre-cset->procedure cset next)
- (lambda (cnk init src str i end matches fail)
- (if (< i end)
- (if (cset-contains? cset (string-ref str i))
- (next cnk init src str (+ i 1) end matches fail)
- (fail))
- (let ((src2 ((chunker-get-next cnk) src)))
- (if src2
- (let ((str2 ((chunker-get-str cnk) src2))
- (i2 ((chunker-get-start cnk) src2))
- (end2 ((chunker-get-end cnk) src2)))
- (if (cset-contains? cset (string-ref str2 i2))
- (next cnk init src2 str2 (+ i2 1) end2 matches fail)
- (fail)))
- (fail))))))
-
-(define (plist->alist ls)
- (let lp ((ls ls) (res '()))
- (if (null? ls)
- (reverse res)
- (lp (cddr ls) (cons (cons (car ls) (cadr ls)) res)))))
-
-(define (alist->plist ls)
- (let lp ((ls ls) (res '()))
- (if (null? ls)
- (reverse res)
- (lp (cdr ls) (cons (cdar ls) (cons (caar ls) res))))))
-
-(define (sre->cset sre . o)
- (let lp ((sre sre) (ci? (and (pair? o) (car o))))
- (define (rec sre) (lp sre ci?))
- (cond
- ((pair? sre)
- (if (string? (car sre))
- (if ci?
- (cset-case-insensitive (string->list (car sre)))
- (string->list (car sre)))
- (case (car sre)
- ((~)
- (cset-complement
- (fold cset-union (rec (cadr sre)) (map rec (cddr sre)))))
- ((&)
- (fold cset-intersection (rec (cadr sre)) (map rec (cddr sre))))
- ((-)
- (fold (lambda (x res) (cset-difference res x))
- (rec (cadr sre))
- (map rec (cddr sre))))
- ((/)
- (let ((res (plist->alist (sre-flatten-ranges (cdr sre)))))
- (if ci?
- (cset-case-insensitive res)
- res)))
- ((or)
- (fold cset-union (rec (cadr sre)) (map rec (cddr sre))))
- ((w/case)
- (lp (sre-alternate (cdr sre)) #f))
- ((w/nocase)
- (lp (sre-alternate (cdr sre)) #t))
- (else
- (error "not a valid sre char-set operator" sre)))))
- ((char? sre) (rec (list (string sre))))
- ((string? sre) (rec (list sre)))
- (else
- (let ((cell (assq sre sre-named-definitions)))
- (if cell
- (rec (cdr cell))
- (error "not a valid sre char-set" sre)))))))
-
-;; another debugging utility
-;; (define (cset->sre cset)
-;; (let lp ((ls cset) (chars '()) (ranges '()))
-;; (cond
-;; ((null? ls)
-;; (sre-alternate
-;; (append
-;; (if (pair? chars) (list (list (list->string chars))) '())
-;; (if (pair? ranges) (list (cons '/ (alist->plist ranges))) '()))))
-;; ((char? (car ls)) (lp (cdr ls) (cons (car ls) chars) ranges))
-;; (else (lp (cdr ls) chars (cons (car ls) ranges))))))
-
-(define (cset-contains? cset ch)
- (find (lambda (x)
- (or (eqv? x ch)
- (and (pair? x) (char<=? (car x) ch) (char<=? ch (cdr x)))))
- cset))
-
-(define (cset-range x)
- (if (char? x) (cons x x) x))
-
-(define (char-ranges-overlap? a b)
- (if (pair? a)
- (if (pair? b)
- (or (and (char<=? (car a) (cdr b)) (char<=? (car b) (cdr a)))
- (and (char<=? (cdr b) (car a)) (char<=? (cdr a) (car b))))
- (and (char<=? (car a) b) (char<=? b (cdr a))))
- (if (pair? b)
- (char-ranges-overlap? b a)
- (eqv? a b))))
-
-(define (char-ranges-union a b)
- (cons (if (char<=? (car a) (car b)) (car a) (car b))
- (if (char>=? (cdr a) (cdr b)) (cdr a) (cdr b))))
-
-(define (cset-union a b)
- (cond ((null? b) a)
- ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a)
- => (lambda (ls)
- (cset-union
- (cset-union (append (take-up-to a ls) (cdr ls))
- (list (char-ranges-union (cset-range (car ls))
- (cset-range (car b)))))
- (cdr b))))
- (else (cset-union (cons (car b) a) (cdr b)))))
-
-(define (cset-difference a b)
- (cond ((null? b) a)
- ((not (car b)) (cset-difference a (cdr b)))
- ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a)
- => (lambda (ls)
- (apply
- (lambda (left1 left2 same right1 right2)
- (let* ((a (append (take-up-to a ls) (cdr ls)))
- (a (if left1 (cons left1 a) a))
- (a (if left2 (cons left2 a) a))
- (b (if right1 (cset-union b (list right1)) b))
- (b (if right2 (cset-union b (list right2)) b)))
- (cset-difference a b)))
- (intersect-char-ranges (cset-range (car ls))
- (cset-range (car b))))))
- (else (cset-difference a (cdr b)))))
-
-(define (cset-intersection a b)
- (let intersect ((a a) (b b) (res '()))
- (cond ((null? b) res)
- ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a)
- => (lambda (ls)
- (apply
- (lambda (left1 left2 same right1 right2)
- (let* ((a (append (take-up-to a ls) (cdr ls)))
- (a (if left1 (cons left1 a) a))
- (a (if left2 (cons left2 a) a))
- (b (if right1 (cset-union b (list right1)) b))
- (b (if right2 (cset-union b (list right2)) b)))
- (intersect a b (cset-union res (list same)))))
- (intersect-char-ranges (cset-range (car ls))
- (cset-range (car b))))))
- (else (intersect a (cdr b) res)))))
-
-(define (cset-complement a)
- (cset-difference (sre->cset *all-chars*) a))
-
-(define (cset-case-insensitive a)
- (let lp ((ls a) (res '()))
- (cond ((null? ls) (reverse res))
- ((and (char? (car ls)) (char-alphabetic? (car ls)))
- (let ((c2 (char-altcase (car ls)))
- (res (cons (car ls) res)))
- (lp (cdr ls) (if (cset-contains? res c2) res (cons c2 res)))))
- ((and (pair? (car ls))
- (char-alphabetic? (caar ls))
- (char-alphabetic? (cdar ls)))
- (lp (cdr ls)
- (cset-union (cset-union res (list (car ls)))
- (list (cons (char-altcase (caar ls))
- (char-altcase (cdar ls)))))))
- (else (lp (cdr ls) (cset-union res (list (car ls))))))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;; Match and Replace Utilities
-
-(define (irregex-fold/fast irx kons knil str . o)
- (if (not (string? str)) (error "irregex-fold: not a string" str))
- (if (not (procedure? kons)) (error "irregex-fold: not a procedure" kons))
- (let* ((irx (irregex irx))
- (matches (irregex-new-matches irx))
- (finish (or (and (pair? o) (car o)) (lambda (i acc) acc)))
- (start (if (and (pair? o) (pair? (cdr o))) (cadr o) 0))
- (end (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o)))
- (caddr o)
- (string-length str))))
- (irregex-match-chunker-set! matches irregex-basic-string-chunker)
- (let lp ((i start) (acc knil))
- (if (>= i end)
- (finish i acc)
- (let ((m (irregex-search/matches
- irx
- irregex-basic-string-chunker
- (list str i end)
- i
- matches)))
- (if (not m)
- (finish i acc)
- (let* ((end (%irregex-match-end-index m 0))
- (acc (kons i m acc)))
- (irregex-reset-matches! matches)
- (lp end acc))))))))
-
-(define (irregex-fold irx kons . args)
- (let ((kons2 (lambda (i m acc) (kons i (irregex-copy-matches m) acc))))
- (apply irregex-fold/fast irx kons2 args)))
-
-(define (irregex-fold/chunked/fast irx kons knil cnk start . o)
- (let* ((irx (irregex irx))
- (matches (irregex-new-matches irx))
- (finish (or (and (pair? o) (car o)) (lambda (src i acc) acc)))
- (i (if (and (pair? o) (pair? (cdr o)))
- (cadr o)
- ((chunker-get-start cnk) start))))
- (irregex-match-chunker-set! matches cnk)
- (let lp ((start start) (i i) (acc knil))
- (if (not start)
- (finish start i acc)
- (let ((m (irregex-search/matches irx cnk start i matches)))
- (if (not m)
- (finish start i acc)
- (let* ((acc (kons start i m acc))
- (end-src (%irregex-match-end-chunk m 0))
- (end-index (%irregex-match-end-index m 0)))
- (irregex-reset-matches! matches)
- (lp end-src end-index acc))))))))
-
-(define (irregex-fold/chunked irx kons . args)
- (if (not (procedure? kons)) (error "irregex-fold/chunked: not a procedure" kons))
- (let ((kons2 (lambda (s i m acc) (kons s i (irregex-copy-matches m) acc))))
- (apply irregex-fold/chunked/fast irx kons2 args)))
-
-(define (irregex-replace irx str . o)
- (if (not (string? str)) (error "irregex-replace: not a string" str))
- (let ((m (irregex-search irx str)))
- (and
- m
- (string-cat-reverse
- (cons (substring str (%irregex-match-end-index m 0) (string-length str))
- (append (irregex-apply-match m o)
- (list (substring str 0 (%irregex-match-start-index m 0)))
- ))))))
-
-(define (irregex-replace/all irx str . o)
- (if (not (string? str)) (error "irregex-replace/all: not a string" str))
- (irregex-fold/fast
- irx
- (lambda (i m acc)
- (let ((m-start (%irregex-match-start-index m 0)))
- (append (irregex-apply-match m o)
- (if (>= i m-start)
- acc
- (cons (substring str i m-start) acc)))))
- '()
- str
- (lambda (i acc)
- (let ((end (string-length str)))
- (string-cat-reverse (if (>= i end)
- acc
- (cons (substring str i end) acc)))))))
-
-(define (irregex-apply-match m ls)
- (let lp ((ls ls) (res '()))
- (if (null? ls)
- res
- (cond
- ((integer? (car ls))
- (lp (cdr ls)
- (cons (or (irregex-match-substring m (car ls)) "") res)))
- ((procedure? (car ls))
- (lp (cdr ls) (cons ((car ls) m) res)))
- ((symbol? (car ls))
- (case (car ls)
- ((pre)
- (lp (cdr ls)
- (cons (substring (car (%irregex-match-start-chunk m 0))
- 0
- (%irregex-match-start-index m 0))
- res)))
- ((post)
- (let ((str (car (%irregex-match-start-chunk m 0))))
- (lp (cdr ls)
- (cons (substring str
- (%irregex-match-end-index m 0)
- (string-length str))
- res))))
- (else
- (cond
- ((assq (car ls) (irregex-match-names m))
- => (lambda (x) (lp (cons (cdr x) (cdr ls)) res)))
- (else
- (error "unknown match replacement" (car ls)))))))
- (else
- (lp (cdr ls) (cons (car ls) res)))))))
-
-(define (irregex-extract irx str . o)
- (if (not (string? str)) (error "irregex-extract: not a string" str))
- (apply irregex-fold/fast
- irx
- (lambda (i m a) (cons (irregex-match-substring m) a))
- '()
- str
- (lambda (i a) (reverse a))
- o))
-
-(define (irregex-split irx str . o)
- (if (not (string? str)) (error "irregex-split: not a string" str))
- (let ((start (if (pair? o) (car o) 0))
- (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str))))
- (irregex-fold/fast
- irx
- (lambda (i m a)
- (if (= i (%irregex-match-start-index m 0))
- a
- (cons (substring str i (%irregex-match-start-index m 0)) a)))
- '()
- str
- (lambda (i a)
- (reverse (if (= i end) a (cons (substring str i end) a))))
- start
- end)))
diff --git a/irregex-utils.scm b/irregex-utils.scm
deleted file mode 100644
index 8332791d..00000000
--- a/irregex-utils.scm
+++ /dev/null
@@ -1,154 +0,0 @@
-;;;; irregex-utils.scm
-;;
-;; Copyright (c) 2010 Alex Shinn. All rights reserved.
-;; BSD-style license: http://synthcode.com/license.txt
-
-(define rx-special-chars
- "\\|[](){}.*+?^$#")
-
-(define (string-scan-char str c . o)
- (let ((end (string-length str)))
- (let scan ((i (if (pair? o) (car o) 0)))
- (cond ((= i end) #f)
- ((eqv? c (string-ref str i)) i)
- (else (scan (+ i 1)))))))
-
-(define (irregex-quote str)
- (list->string
- (let loop ((ls (string->list str)) (res '()))
- (if (null? ls)
- (reverse res)
- (let ((c (car ls)))
- (if (string-scan-char rx-special-chars c)
- (loop (cdr ls) (cons c (cons #\\ res)))
- (loop (cdr ls) (cons c res))))))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(define (irregex-opt ls)
- (define (make-alt ls)
- (cond ((null? (cdr ls)) (car ls))
- ((every char? ls) (list (list->string ls)))
- (else (cons 'or ls))))
- (define (make-seq ls)
- (cond ((null? (cdr ls)) (car ls))
- ((every (lambda (x) (or (string? x) (char? x))) ls)
- (apply string-append (map (lambda (x) (if (char? x) (string x) x)) ls)))
- (else (cons 'seq ls))))
- (cond
- ((null? ls) "")
- ((null? (cdr ls)) (car ls))
- (else
- (let ((chars (make-vector 256 '())))
- (let lp1 ((ls ls) (empty? #f))
- (if (null? ls)
- (let lp2 ((i 0) (res '()))
- (if (= i 256)
- (let ((res (make-alt (reverse res))))
- (if empty? `(? ,res) res))
- (let ((c (integer->char i))
- (opts (vector-ref chars i)))
- (lp2 (+ i 1)
- (cond
- ((null? opts) res)
- ((equal? opts '("")) `(,c ,@res))
- (else `(,(make-seq (list c (irregex-opt opts)))
- ,@res)))))))
- (let* ((str (car ls))
- (len (string-length str)))
- (if (zero? len)
- (lp1 (cdr ls) #t)
- (let ((i (char->integer (string-ref str 0))))
- (vector-set!
- chars
- i
- (cons (substring str 1 len) (vector-ref chars i)))
- (lp1 (cdr ls) empty?))))))))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(define (cset->string ls)
- (let ((out (open-output-string)))
- (let lp ((ls ls))
- (cond
- ((pair? ls)
- (cond
- ((pair? (car ls))
- (display (irregex-quote (string (caar ls))) out)
- (write-char #\- out)
- (display (irregex-quote (string (cdar ls))) out))
- (else (display (irregex-quote (string (car ls))) out)))
- (lp (cdr ls)))))
- (get-output-string out)))
-
-(define (sre->string obj)
- (let ((out (open-output-string)))
- (let lp ((x obj))
- (cond
- ((pair? x)
- (case (car x)
- ((: seq)
- (cond
- ((and (pair? (cddr x)) (pair? (cddr x)) (not (eq? x obj)))
- (display "(?:" out) (for-each lp (cdr x)) (display ")" out))
- (else (for-each lp (cdr x)))))
- ((submatch)
- (display "(" out) (for-each lp (cdr x)) (display ")" out))
- ((submatch-named)
- (display "(?<" out) (display (cadr x) out) (display ">" out)
- (for-each lp (cddr x)) (display ")" out))
- ((or)
- (display "(?:" out)
- (lp (cadr x))
- (for-each (lambda (x) (display "|" out) (lp x)) (cddr x))
- (display ")" out))
- ((* + ? *? ??)
- (cond
- ((pair? (cddr x))
- (display "(?:" out) (for-each lp (cdr x)) (display ")" out))
- (else (lp (cadr x))))
- (display (car x) out))
- ((not)
- (cond
- ((and (pair? (cadr x)) (eq? 'cset (caadr x)))
- (display "[^" out)
- (display (cset->string (cdadr x)) out)
- (display "]" out))
- (else (error "can't represent general 'not' in strings" x))))
- ((cset)
- (display "[" out)
- (display (cset->string (cdr x)) out)
- (display "]" out))
- ((- & / ~)
- (cond
- ((or (eq? #\~ (car x))
- (and (eq? '- (car x)) (pair? (cdr x)) (eq? 'any (cadr x))))
- (display "[^" out)
- (display (cset->string (if (eq? #\~ (car x)) (cdr x) (cddr x))) out)
- (display "]" out))
- (else
- (lp `(cset ,@(sre->cset x))))))
- ((w/case w/nocase)
- (display "(?" out)
- (if (eq? (car x) 'w/case) (display "-" out))
- (display ":" out)
- (for-each lp (cdr x))
- (display ")" out))
- (else
- (if (string? (car x))
- (lp `(cset ,@(string->list (car x))))
- (error "unknown sre operator" x)))))
- ((symbol? x)
- (case x
- ((bos bol) (display "^" out))
- ((eos eol) (display "$" out))
- ((any nonl) (display "." out))
- (else (error "unknown sre symbol" x))))
- ((string? x)
- (display (irregex-quote x) out))
- ((char? x)
- (display (irregex-quote (string x)) out))
- (else
- (error "unknown sre pattern" x))))
- (get-output-string out)))
-
diff --git a/irregex.import.scm b/irregex.import.scm
index 63bd1320..7fc3bde7 100644
--- a/irregex.import.scm
+++ b/irregex.import.scm
@@ -26,49 +26,11 @@
(##sys#register-primitive-module
'irregex
- '(irregex
- irregex-apply-match
- irregex-dfa
- irregex-dfa/extract
- irregex-dfa/search
- irregex-extract
- irregex-flags
- irregex-fold
- irregex-fold/chunked
- irregex-lengths
- irregex-match
- irregex-match?
- irregex-match-data?
- irregex-match-end
- irregex-match-end-chunk
- irregex-match-end-index
- irregex-match-names
- irregex-match-num-submatches
- irregex-match-start
- irregex-match-start-chunk
- irregex-match-start-index
- irregex-match-string
- irregex-match-subchunk
- irregex-match-substring
- irregex-match/chunked
- irregex-names
- irregex-new-matches
- irregex-nfa
- irregex-num-submatches
- irregex-opt
- irregex-quote
- irregex-replace
- irregex-replace/all
- irregex-reset-matches!
- irregex-search
- irregex-search/chunked
- irregex-search/matches
- irregex-split
- irregex?
- make-irregex-chunker
- maybe-string->sre
- sre->irregex
- sre->string
- string->irregex
- string->sre
- ))
+ '(irregex string->irregex sre->irregex string->sre
+ irregex? irregex-match-data?
+ irregex-new-matches irregex-reset-matches!
+ irregex-match-start irregex-match-end irregex-match-substring
+ irregex-search irregex-search/matches irregex-match irregex-match-string
+ irregex-fold irregex-replace irregex-replace/all irregex-apply-match
+ irregex-dfa irregex-dfa/search irregex-dfa/extract
+ irregex-nfa irregex-flags irregex-submatches irregex-lengths irregex-names))
diff --git a/irregex.scm b/irregex.scm
index 0447d3de..5d0f77e7 100644
--- a/irregex.scm
+++ b/irregex.scm
@@ -1,248 +1,2718 @@
-;;;; irregex.scm - container for irregex-core.scm
-;
-; Copyright (c) 2010, The Chicken Team
-; All rights reserved.
-;
-; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following
-; conditions are met:
-;
-; Redistributions of source code must retain the above copyright notice, this list of conditions and the following
-; disclaimer.
-; Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
-; disclaimer in the documentation and/or other materials provided with the distribution.
-; Neither the name of the author nor the names of its contributors may be used to endorse or promote
-; products derived from this software without specific prior written permission.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
-; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
-; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
-; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-; POSSIBILITY OF SUCH DAMAGE.
-
-
-
-(declare (unit irregex))
-
-(declare
- (no-procedure-checks)
- (fixnum)
- (export
- ##sys#glob->regexp
- irregex
- irregex-apply-match
- irregex-dfa
- irregex-dfa/extract
- irregex-dfa/search
- irregex-extract
- irregex-flags
- irregex-fold
- irregex-fold/chunked
- irregex-lengths
- irregex-match
- irregex-match?
- irregex-match-data?
- irregex-match-end
- irregex-match-end-chunk
- irregex-match-end-index
- irregex-match-names
- irregex-match-num-submatches
- irregex-match-start
- irregex-match-start-chunk
- irregex-match-start-index
- irregex-match-string
- irregex-match-subchunk
- irregex-match-substring
- irregex-match/chunked
- irregex-names
- irregex-new-matches
- irregex-nfa
- irregex-num-submatches
- irregex-opt
- irregex-quote
- irregex-replace
- irregex-replace/all
- irregex-reset-matches!
- irregex-search
- irregex-search/chunked
- irregex-search/matches
- irregex-split
- irregex-submatches
- irregex?
- make-irregex-chunker
- maybe-string->sre
- irregex-search/chunked
- sre->irregex
- sre->string
- string->irregex
- string->sre
- ))
-
-(include "common-declarations.scm")
-
-(register-feature! 'irregex)
-
-(define-syntax (build-cache x r c)
- ;; (build-cache N ARG FAIL)
- (let* ((n (cadr x))
- (n2 (* n 2))
- (arg (caddr x))
- (fail (cadddr x))
- (%cache (r 'cache))
- (%index (r 'index))
- (%arg (r 'arg))
- (%let (r 'let))
- (%let* (r 'let*))
- (%if (r 'if))
- (%fx+ (r 'fx+))
- (%fxmod (r 'fxmod))
- (%equal? (r 'equal?))
- (%quote (r 'quote))
- (%tmp (r 'tmp))
- (%begin (r 'begin))
- (cache (make-vector (add1 n2) #f)))
- (##sys#setslot cache n2 0) ; last slot: current index
- `(,%let* ((,%cache (,%quote ,cache)) ; we mutate a literal vector
- (,%arg ,arg))
- ,(let fold ((i 0))
- (if (fx>= i n)
- ;; this should be thread-safe: a context-switch can only
- ;; happen before this code and in the call to FAIL.
- `(,%let ((,%tmp ,fail)
- (,%index (##sys#slot ,%cache ,n2)))
- (##sys#setslot ,%cache ,%index ,%arg)
- (##sys#setslot ,%cache (,%fx+ ,%index 1) ,%tmp)
- (##sys#setislot
- ,%cache ,n2
- (##core#inline "C_u_fixnum_modulo" (,%fx+ ,%index 2) ,n2))
- ,%tmp)
- `(,%if (,%equal? (##sys#slot ,%cache ,(* i 2)) ,%arg)
- (##sys#slot ,%cache ,(add1 (* i 2)))
- ,(fold (add1 i))))))))
-
-(define-compiler-syntax %%string-copy!
- (syntax-rules ()
- ((_ to tstart from fstart fend)
- (let ((x to)
- (y tstart)
- (z from)
- (u fstart)
- (v fend))
- (##core#inline "C_substring_copy" z x u v y)))))
-
-(define-compiler-syntax %substring=?
- (syntax-rules ()
- ((_ a b start1 start2 len)
- (##core#inline "C_substring_compare" a b start1 start2 len))))
-
-(define-compiler-syntax make-irregex
- (syntax-rules ()
- ((_ dfa dfa/search dfa/extract nfa flags submatches lengths names)
- (##sys#make-structure
- 'regexp dfa dfa/search dfa/extract nfa flags submatches lengths names))))
-
-(define-compiler-syntax make-irregex-match
- (syntax-rules ()
- ((_ count names)
- (##sys#make-structure
- 'regexp-match
- (make-vector (+ (* 4 (+ 2 count)) 3) #f) ; #1: submatches
- names ; #2: (guess)
- #f ; #3: chunka
- #f)))) ; #4: fail
-
-(define-compiler-syntax bit-shl
- (syntax-rules ()
- ((_ n i) (fxshl n i))))
-
-(define-compiler-syntax bit-shr
- (syntax-rules ()
- ((_ n i) (fxshr n i))))
-
-(define-compiler-syntax bit-not
- (syntax-rules ()
- ((_ n) (fxnot n))))
-
-(define-compiler-syntax bit-ior
- (syntax-rules ()
- ((_ a b) (fxior a b))))
-
-(define-compiler-syntax bit-and
- (syntax-rules ()
- ((_ a b) (fxand a b))))
-
-(define-compiler-syntax match-vector-ref
- (syntax-rules ()
- ((_ m i) (##sys#slot (##sys#slot m 1) i))))
-
-(define-compiler-syntax match-vector-set!
- (syntax-rules ()
- ((_ m i x) (##sys#setslot (##sys#slot m 1) i x))))
-
-(define-compiler-syntax irregex-match-start-chunk-set!
- (syntax-rules ()
- ((_ m n start)
- (vector-set! (##sys#slot m 1) (* n 4) start))))
-
-(define-compiler-syntax irregex-match-start-index-set!
- (syntax-rules ()
- ((_ m n start)
- (vector-set! (##sys#slot m 1) (+ 1 (* n 4)) start))))
-
-(define-compiler-syntax irregex-match-end-chunk-set!
- (syntax-rules ()
- ((_ m n end)
- (vector-set! (##sys#slot m 1) (+ 2 (* n 4)) end))))
-
-(define-compiler-syntax irregex-match-end-index-set!
- (syntax-rules ()
- ((_ m n end)
- (vector-set! (##sys#slot m 1) (+ 3 (* n 4)) end))))
-
-(include "irregex-core.scm")
-(include "irregex-utils.scm")
-
-(define ##sys#glob->regexp
- (let ((list->string list->string)
- (string->list string->list))
- (lambda (s #!optional sre?)
- (##sys#check-string s 'glob->regexp)
- (let ((sre
- (cons
- ':
- (let loop ((cs (string->list s)) (dir #t))
- (if (null? cs)
- '()
- (let ((c (car cs))
- (rest (cdr cs)) )
- (cond ((char=? c #\*)
- (if dir
- `((or (: (~ ("./\\"))
- (* (~ ("/\\"))))
- (* (~ ("./\\"))))
- ,@(loop rest #f))
- `((* (~ ("/\\"))) ,@(loop rest #f))))
- ((char=? c #\?) (cons 'any (loop rest #f)))
- ((char=? c #\[)
- (let loop2 ((rest rest) (s '()))
- (cond ((not (pair? rest))
- (error 'glob->regexp
- "unexpected end of character class" s))
- ((char=? #\] (car rest))
- `((or ,@s) ,@(loop (cdr rest) #f)))
- ((and (pair? (cdr rest))
- (pair? (cddr rest))
- (char=? #\- (cadr rest)) )
- (loop2 (cdddr rest)
- (cons `(/ ,(car rest) ,(caddr rest)) s)))
- ((and (pair? (cdr rest))
- (char=? #\- (car rest)))
- (loop2 (cddr rest)
- (cons `(~ ,(cadr rest)) s)))
- (else
- (loop2 (cdr rest) (cons (car rest) s))))))
- (else (cons c (loop rest (memq c '(#\\ #\/))))))))))))
- (if sre? sre (irregex sre))))))
+;;;; irregex.scm -- IrRegular Expressions
+;;
+;; Copyright (c) 2005-2008 Alex Shinn. All rights reserved.
+;; BSD-style license: http://synthcode.com/license.txt
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; At this moment there was a loud ring at the bell, and I could
+;; hear Mrs. Hudson, our landlady, raising her voice in a wail of
+;; expostulation and dismay.
+;;
+;; "By heaven, Holmes," I said, half rising, "I believe that
+;; they are really after us."
+;;
+;; "No, it's not quite so bad as that. It is the unofficial
+;; force, -- the Baker Street irregulars."
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; History
+;;
+;; 0.6.2: 2008/07/26 - minor bugfixes, allow global disabling of utf8 mode,
+;; friendlier error messages in parsing, \Q..\E support
+;; 0.6.1: 2008/07/21 - added utf8 mode, more utils, bugfixes
+;; 0.6: 2008/05/01 - most of PCRE supported
+;; 0.5: 2008/04/24 - fully portable R4RS, many PCRE features implemented
+;; 0.4: 2008/04/17 - rewriting NFA to use efficient closure compilation,
+;; normal strings only, but all of the spencer tests pass
+;; 0.3: 2008/03/10 - adding DFA converter (normal strings only)
+;; 0.2: 2005/09/27 - adding irregex-opt (like elisp's regexp-opt) utility
+;; 0.1: 2005/08/18 - simple NFA interpreter over abstract chunked strings
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define irregex-tag '*irregex-tag*)
+
+(define (make-irregex dfa dfa/search dfa/extract nfa flags
+ submatches lengths names)
+ (vector irregex-tag dfa dfa/search dfa/extract nfa flags
+ submatches lengths names))
+
+(define (irregex? obj)
+ (and (vector? obj)
+ (= 9 (vector-length obj))
+ (eq? irregex-tag (vector-ref obj 0))))
+
+(define (irregex-dfa x) (vector-ref x 1))
+(define (irregex-dfa/search x) (vector-ref x 2))
+(define (irregex-dfa/extract x) (vector-ref x 3))
+(define (irregex-nfa x) (vector-ref x 4))
+(define (irregex-flags x) (vector-ref x 5))
+(define (irregex-submatches x) (vector-ref x 6))
+(define (irregex-lengths x) (vector-ref x 7))
+(define (irregex-names x) (vector-ref x 8))
+
+(define (irregex-new-matches irx)
+ (make-irregex-match #f (irregex-submatches irx) (irregex-names irx)))
+(define (irregex-reset-matches! m)
+ (do ((i (- (vector-length m) 1) (- i 1)))
+ ((<= i 3) m)
+ (vector-set! m i #f)))
+
+(define irregex-match-tag '*irregex-match-tag*)
+
+(define (irregex-match-data? obj)
+ (and (vector? obj)
+ (>= (vector-length obj) 5)
+ (eq? irregex-match-tag (vector-ref obj 0))))
+
+(define (make-irregex-match str count names)
+ (let ((res (make-vector (+ (* 2 (+ 1 count)) 3) #f)))
+ (vector-set! res 0 irregex-match-tag)
+ (vector-set! res 1 str)
+ (vector-set! res 2 names)
+ res))
+
+(define (irregex-match-num-submatches m)
+ (- (quotient (- (vector-length m) 3) 2) 1))
+
+(define (irregex-match-string m)
+ (vector-ref m 1))
+(define (irregex-match-names m)
+ (vector-ref m 2))
+(define (irregex-match-string-set! m str)
+ (vector-set! m 1 str))
+
+(define (irregex-match-start-index m n)
+ (vector-ref m (+ 3 (* n 2))))
+(define (irregex-match-end-index m n)
+ (vector-ref m (+ 4 (* n 2))))
+
+(define (irregex-match-start-index-set! m n start)
+ (vector-set! m (+ 3 (* n 2)) start))
+(define (irregex-match-end-index-set! m n end)
+ (vector-set! m (+ 4 (* n 2)) end))
+
+(define (irregex-match-index m opt)
+ (if (pair? opt)
+ (cond ((number? (car opt)) (car opt))
+ ((assq (car opt) (irregex-match-names m)) => cdr)
+ (else (error "unknown match name" (car opt))))
+ 0))
+
+(define (irregex-match-valid-index? m n)
+ (and (< (+ 3 (* n 2)) (vector-length m))
+ (vector-ref m (+ 4 (* n 2)))))
+
+(define (irregex-match-substring m . opt)
+ (let ((n (irregex-match-index m opt)))
+ (and (irregex-match-valid-index? m n)
+ (substring (irregex-match-string m)
+ (vector-ref m (+ 3 (* n 2)))
+ (vector-ref m (+ 4 (* n 2)))))))
+
+(define (irregex-match-start m . opt)
+ (let ((n (irregex-match-index m opt)))
+ (and (irregex-match-valid-index? m n)
+ (vector-ref m (+ 3 (* n 2))))))
+
+(define (irregex-match-end m . opt)
+ (irregex-match-valid-index? m (irregex-match-index m opt)))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; string utilities
+
+;;;; Unicode version (skip surrogates)
+(define *all-chars*
+ `(/ ,(integer->char 0) ,(integer->char #xD7FF)
+ ,(integer->char #xE000) ,(integer->char #x10FFFF)))
+
+;;;; ASCII version, offset to not assume 0-255
+;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223))))
+
+;; set to #f to ignore even an explicit request for utf8 handling
+(define *allow-utf8-mode?* #t)
+
+;; (define *named-char-properties* '())
+
+(define (string-scan-char str c . o)
+ (let ((end (string-length str)))
+ (let scan ((i (if (pair? o) (car o) 0)))
+ (cond ((= i end) #f)
+ ((eqv? c (string-ref str i)) i)
+ (else (scan (+ i 1)))))))
+
+(define (string-scan-char-escape str c . o)
+ (let ((end (string-length str)))
+ (let scan ((i (if (pair? o) (car o) 0)))
+ (cond ((= i end) #f)
+ ((eqv? c (string-ref str i)) i)
+ ((eqv? c #\\) (scan (+ i 2)))
+ (else (scan (+ i 1)))))))
+
+(define (string-scan-pred str pred . o)
+ (let ((end (string-length str)))
+ (let scan ((i (if (pair? o) (car o) 0)))
+ (cond ((= i end) #f)
+ ((pred (string-ref str i)) i)
+ (else (scan (+ i 1)))))))
+
+(define (string-split-char str c)
+ (let ((end (string-length str)))
+ (let lp ((i 0) (from 0) (res '()))
+ (define (collect) (cons (substring str from i) res))
+ (cond ((>= i end) (reverse (collect)))
+ ((eqv? c (string-ref str i)) (lp (+ i 1) (+ i 1) (collect)))
+ (else (lp (+ i 1) from res))))))
+
+(define (char-alphanumeric? c)
+ (or (char-alphabetic? c) (char-numeric? c)))
+
+;; SRFI-13 extracts
+
+(define (%%string-copy! to tstart from fstart fend)
+ (do ((i fstart (+ i 1))
+ (j tstart (+ j 1)))
+ ((>= i fend))
+ (string-set! to j (string-ref from i))))
+
+(define (string-cat-reverse string-list)
+ (string-cat-reverse/aux
+ (fold (lambda (s a) (+ (string-length s) a)) 0 string-list)
+ string-list))
+
+(define (string-cat-reverse/aux len string-list)
+ (let ((res (make-string len)))
+ (let lp ((i len) (ls string-list))
+ (if (pair? ls)
+ (let* ((s (car ls))
+ (slen (string-length s))
+ (i (- i slen)))
+ (%%string-copy! res i s 0 slen)
+ (lp i (cdr ls)))))
+ res))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; list utilities
+
+;; like the one-arg IOTA case
+(define (zero-to n)
+ (if (<= n 0)
+ '()
+ (let lp ((i (- n 1)) (res '()))
+ (if (zero? i) (cons 0 res) (lp (- i 1) (cons i res))))))
+
+;; take the head of list FROM up to but not including TO, which must
+;; be a tail of the list
+(define (take-up-to from to)
+ (let lp ((ls from) (res '()))
+ (if (and (pair? ls) (not (eq? ls to)))
+ (lp (cdr ls) (cons (car ls) res))
+ (reverse res))))
+
+;; SRFI-1 extracts (simplified 1-ary versions)
+
+(define (find pred ls)
+ (cond ((find-tail pred ls) => car)
+ (else #f)))
+
+(define (find-tail pred ls)
+ (let lp ((ls ls))
+ (cond ((null? ls) #f)
+ ((pred (car ls)) ls)
+ (else (lp (cdr ls))))))
+
+(define (last ls)
+ (if (not (pair? ls))
+ (error "can't take last of empty list" ls)
+ (let lp ((ls ls))
+ (if (pair? (cdr ls))
+ (lp (cdr ls))
+ (car ls)))))
+
+(define (any pred ls)
+ (and (pair? ls)
+ (let lp ((head (car ls)) (tail (cdr ls)))
+ (if (null? tail)
+ (pred head)
+ (or (pred head) (lp (car tail) (cdr tail)))))))
+
+(define (every pred ls)
+ (or (null? ls)
+ (let lp ((head (car ls)) (tail (cdr ls)))
+ (if (null? tail)
+ (pred head)
+ (and (pred head) (lp (car tail) (cdr tail)))))))
+
+(define (fold kons knil ls)
+ (let lp ((ls ls) (res knil))
+ (if (null? ls)
+ res
+ (lp (cdr ls) (kons (car ls) res)))))
+
+(define (filter pred ls)
+ (let lp ((ls ls) (res '()))
+ (if (null? ls)
+ (reverse res)
+ (lp (cdr ls) (if (pred (car ls)) (cons (car ls) res) res)))))
+
+(define (remove pred ls)
+ (let lp ((ls ls) (res '()))
+ (if (null? ls)
+ (reverse res)
+ (lp (cdr ls) (if (pred (car ls)) res (cons (car ls) res))))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; flags
+
+(define (bit-shr n i)
+ (quotient n (expt 2 i)))
+
+(define (bit-shl n i)
+ (* n (expt 2 i)))
+
+(define (bit-not n) (- #xFFFF n))
+
+(define (bit-ior a b)
+ (cond
+ ((zero? a) b)
+ ((zero? b) a)
+ (else
+ (+ (if (or (odd? a) (odd? b)) 1 0)
+ (* 2 (bit-ior (quotient a 2) (quotient b 2)))))))
+
+(define (bit-and a b)
+ (cond
+ ((zero? a) 0)
+ ((zero? b) 0)
+ (else
+ (+ (if (and (odd? a) (odd? b)) 1 0)
+ (* 2 (bit-and (quotient a 2) (quotient b 2)))))))
+
+(define (flag-set? flags i)
+ (= i (bit-and flags i)))
+(define (flag-join a b)
+ (if b (bit-ior a b) a))
+(define (flag-clear a b)
+ (bit-and a (bit-not b)))
+
+(define ~none 0)
+(define ~searcher? 1)
+(define ~consumer? 2)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; parsing
+
+(define ~save? 1)
+(define ~case-insensitive? 2)
+(define ~multi-line? 4)
+(define ~single-line? 8)
+(define ~ignore-space? 16)
+(define ~utf8? 32)
+
+(define (symbol-list->flags ls)
+ (let lp ((ls ls) (res ~none))
+ (if (not (pair? ls))
+ res
+ (lp (cdr ls)
+ (flag-join
+ res
+ (case (car ls)
+ ((i ci case-insensitive) ~case-insensitive?)
+ ((m multi-line) ~multi-line?)
+ ((s single-line) ~single-line?)
+ ((x ignore-space) ~ignore-space?)
+ ((u utf8) ~utf8?)
+ (else #f)))))))
+
+(define (string->sre str . o)
+ (let ((end (string-length str))
+ (flags (symbol-list->flags o)))
+
+ (let lp ((i 0) (from 0) (flags flags) (res '()) (st '()))
+
+ ;; handle case sensitivity at the literal char/string level
+ (define (cased-char ch)
+ (if (and (flag-set? flags ~case-insensitive?)
+ (char-alphabetic? ch))
+ `(or ,ch ,(char-altcase ch))
+ ch))
+ (define (cased-string str)
+ (if (flag-set? flags ~case-insensitive?)
+ (sre-sequence (map cased-char (string->list str)))
+ str))
+ ;; accumulate the substring from..i as literal text
+ (define (collect)
+ (if (= i from) res (cons (cased-string (substring str from i)) res)))
+ ;; like collect but breaks off the last single character when
+ ;; collecting literal data, as the argument to ?/*/+ etc.
+ (define (collect/single)
+ (let* ((utf8? (flag-set? flags ~utf8?))
+ (j (if (and utf8? (> i 1))
+ (utf8-backup-to-initial-char str (- i 1))
+ (- i 1))))
+ (cond
+ ((< j from)
+ res)
+ (else
+ (let ((c (cased-char (if utf8?
+ (utf8-string-ref str j (- i j) )
+ (string-ref str j)))))
+ (cond
+ ((= j from)
+ (cons c res))
+ (else
+ (cons c
+ (cons (cased-string (substring str from j))
+ res)))))))))
+ ;; collects for use as a result, reversing and grouping OR
+ ;; terms, and some ugly tweaking of `function-like' groups and
+ ;; conditionals
+ (define (collect/terms)
+ (let* ((ls (collect))
+ (func
+ (and (pair? ls)
+ (memq (last ls)
+ '(atomic if look-ahead neg-look-ahead
+ look-behind neg-look-behind submatch-named
+ w/utf8 w/noutf8))))
+ (prefix (if (and func (eq? 'submatch-named (car func)))
+ (list 'submatch-named (cadr (reverse ls)))
+ (and func (list (car func)))))
+ (ls (if func
+ (if (eq? 'submatch-named (car func))
+ (reverse (cddr (reverse ls)))
+ (reverse (cdr (reverse ls))))
+ ls)))
+ (let lp ((ls ls) (term '()) (res '()))
+ (define (shift)
+ (cons (sre-sequence term) res))
+ (cond
+ ((null? ls)
+ (let* ((res (sre-alternate (shift)))
+ (res (if (flag-set? flags ~save?)
+ (list 'submatch res)
+ res)))
+ (if prefix
+ (if (eq? 'if (car prefix))
+ (cond
+ ((not (pair? res))
+ 'epsilon)
+ ((memq (car res)
+ '(look-ahead neg-look-ahead
+ look-behind neg-look-behind))
+ res)
+ ((eq? 'seq (car res))
+ `(if ,(cadr res)
+ ,(if (pair? (cdr res))
+ (sre-sequence (cddr res))
+ 'epsilon)))
+ (else
+ `(if ,(cadadr res)
+ ,(if (pair? (cdr res))
+ (sre-sequence (cddadr res))
+ 'epsilon)
+ ,(sre-alternate
+ (if (pair? (cdr res)) (cddr res) '())))))
+ `(,@prefix ,res))
+ res)))
+ ((eq? 'or (car ls)) (lp (cdr ls) '() (shift)))
+ (else (lp (cdr ls) (cons (car ls) term) res))))))
+ (define (save)
+ (cons (cons flags (collect)) st))
+
+ ;; main parsing
+ (if (>= i end)
+ (if (pair? st)
+ (error "unterminated parenthesis in regexp" str)
+ (collect/terms))
+ (let ((c (string-ref str i)))
+ (case c
+ ((#\.)
+ (lp (+ i 1) (+ i 1) flags
+ (cons (if (flag-set? flags ~single-line?) 'any 'nonl)
+ (collect))
+ st))
+ ((#\?)
+ (let ((res (collect/single)))
+ (if (null? res)
+ (error "? can't follow empty sre" str res)
+ (let ((x (car res)))
+ (lp (+ i 1)
+ (+ i 1)
+ flags
+ (cons
+ (if (pair? x)
+ (case (car x)
+ ((*) `(*? ,@(cdr x)))
+ ((+) `(**? 1 #f ,@(cdr x)))
+ ((?) `(?? ,@(cdr x)))
+ ((**) `(**? ,@(cdr x)))
+ ((=) `(**? ,(cadr x) ,@(cdr x)))
+ ((>=) `(**? ,(cadr x) #f ,@(cddr x)))
+ (else `(? ,x)))
+ `(? ,x))
+ (cdr res))
+ st)))))
+ ((#\+ #\*)
+ (let* ((res (collect/single))
+ (x (car res))
+ (op (string->symbol (string c))))
+ (cond
+ ((sre-repeater? x)
+ (error "duplicate repetition (e.g. **) in sre" str res))
+ ((sre-empty? x)
+ (error "can't repeat empty sre (e.g. ()*)" str res))
+ (else
+ (lp (+ i 1) (+ i 1) flags
+ (cons (list op x) (cdr res))
+ st)))))
+ ((#\()
+ (cond
+ ((>= (+ i 1) end)
+ (error "unterminated parenthesis in regexp" str))
+ ((not (eqv? #\? (string-ref str (+ i 1))))
+ (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save)))
+ ((>= (+ i 2) end)
+ (error "unterminated parenthesis in regexp" str))
+ (else
+ (case (string-ref str (+ i 2))
+ ((#\#)
+ (let ((j (string-scan-char str #\) (+ i 3))))
+ (lp (+ j i) (+ j 1) flags (collect) st)))
+ ((#\:)
+ (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save)))
+ ((#\=)
+ (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
+ '(look-ahead) (save)))
+ ((#\!)
+ (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
+ '(neg-look-ahead) (save)))
+ ((#\<)
+ (cond
+ ((>= (+ i 3) end)
+ (error "unterminated parenthesis in regexp" str))
+ (else
+ (case (string-ref str (+ i 3))
+ ((#\=)
+ (lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
+ '(look-behind) (save)))
+ ((#\!)
+ (lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
+ '(neg-look-behind) (save)))
+ (else
+ (let ((j (and (char-alphabetic?
+ (string-ref str (+ i 3)))
+ (string-scan-char str #\> (+ i 4)))))
+ (if j
+ (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
+ `(,(string->symbol (substring str (+ i 3) j))
+ submatch-named)
+ (save))
+ (error "invalid (?< sequence" str))))))))
+ ((#\>)
+ (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
+ '(atomic) (save)))
+ ;;((#\' #\P) ; named subpatterns
+ ;; )
+ ;;((#\R) ; recursion
+ ;; )
+ ((#\()
+ (cond
+ ((>= (+ i 3) end)
+ (error "unterminated parenthesis in regexp" str))
+ ((char-numeric? (string-ref str (+ i 3)))
+ (let* ((j (string-scan-char str #\) (+ i 3)))
+ (n (string->number (substring str (+ i 3) j))))
+ (if (not n)
+ (error "invalid conditional reference" str)
+ (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
+ `(,n if) (save)))))
+ ((char-alphabetic? (string-ref str (+ i 3)))
+ (let* ((j (string-scan-char str #\) (+ i 3)))
+ (s (string->symbol (substring str (+ i 3) j))))
+ (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
+ `(,s if) (save))))
+ (else
+ (lp (+ i 2) (+ i 2) (flag-clear flags ~save?)
+ '(if) (save)))))
+ ((#\{)
+ (error "unsupported Perl-style cluster" str))
+ (else
+ (let ((old-flags flags))
+ (let lp2 ((j (+ i 2)) (flags flags) (invert? #f))
+ (define (join x)
+ ((if invert? flag-clear flag-join) flags x))
+ (define (new-res res)
+ (let ((before (flag-set? old-flags ~utf8?))
+ (after (flag-set? flags ~utf8?)))
+ (if (eq? before after)
+ res
+ (cons (if after 'w/utf8 'w/noutf8) res))))
+ (cond
+ ((>= j end)
+ (error "incomplete cluster" str i))
+ (else
+ (case (string-ref str j)
+ ((#\i)
+ (lp2 (+ j 1) (join ~case-insensitive?) invert?))
+ ((#\m)
+ (lp2 (+ j 1) (join ~multi-line?) invert?))
+ ((#\x)
+ (lp2 (+ j 1) (join ~ignore-space?) invert?))
+ ((#\u)
+ (if *allow-utf8-mode?*
+ (lp2 (+ j 1) (join ~utf8?) invert?)
+ (lp2 (+ j 1) flags invert?)))
+ ((#\-)
+ (lp2 (+ j 1) flags (not invert?)))
+ ((#\))
+ (lp (+ j 1) (+ j 1) flags (new-res (collect))
+ st))
+ ((#\:)
+ (lp (+ j 1) (+ j 1) flags (new-res '())
+ (cons (cons old-flags (collect)) st)))
+ (else
+ (error "unknown regex cluster modifier" str)
+ )))))))))))
+ ((#\))
+ (if (null? st)
+ (error "too many )'s in regexp" str)
+ (lp (+ i 1)
+ (+ i 1)
+ (caar st)
+ (cons (collect/terms) (cdar st))
+ (cdr st))))
+ ((#\[)
+ (apply
+ (lambda (sre j)
+ (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))
+ (string-parse-cset str (+ i 1) flags)))
+ ((#\{)
+ (if (or (>= (+ i 1) end)
+ (not (or (char-numeric? (string-ref str (+ i 1)))
+ (eqv? #\, (string-ref str (+ i 1))))))
+ (lp (+ i 1) from flags res st)
+ (let* ((res (collect/single))
+ (x (car res))
+ (tail (cdr res))
+ (j (string-scan-char str #\} (+ i 1)))
+ (s2 (string-split-char (substring str (+ i 1) j) #\,))
+ (n (or (string->number (car s2)) 0))
+ (m (and (pair? (cdr s2)) (string->number (cadr s2)))))
+ (cond
+ ((null? (cdr s2))
+ (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st))
+ (m
+ (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st))
+ (else
+ (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st)
+ )))))
+ ((#\\)
+ (cond
+ ((>= (+ i 1) end)
+ (error "incomplete escape sequence" str))
+ (else
+ (let ((c (string-ref str (+ i 1))))
+ (case c
+ ((#\d)
+ (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st))
+ ((#\D)
+ (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st))
+ ((#\s)
+ (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st))
+ ((#\S)
+ (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st))
+ ((#\w)
+ (lp (+ i 2) (+ i 2) flags
+ `((or alphanumeric ("_")) ,@(collect)) st))
+ ((#\W)
+ (lp (+ i 2) (+ i 2) flags
+ `((~ (or alphanumeric ("_"))) ,@(collect)) st))
+ ((#\b)
+ (lp (+ i 2) (+ i 2) flags
+ `((or bow eow) ,@(collect)) st))
+ ((#\B)
+ (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st))
+ ((#\A)
+ (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st))
+ ((#\Z)
+ (lp (+ i 2) (+ i 2) flags
+ `((? #\newline) eos ,@(collect)) st))
+ ((#\z)
+ (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st))
+ ((#\R)
+ (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st))
+ ((#\K)
+ (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st))
+ ;; these two are from Emacs and TRE, but not PCRE
+ ((#\<)
+ (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st))
+ ((#\>)
+ (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st))
+ ((#\x)
+ (apply
+ (lambda (ch j)
+ (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st))
+ (string-parse-hex-escape str (+ i 2) end)))
+ ((#\k)
+ (let ((c (string-ref str (+ i 2))))
+ (if (not (memv c '(#\< #\{ #\')))
+ (error "bad \\k usage, expected \\k<...>" str)
+ (let* ((terminal (char-mirror c))
+ (j (string-scan-char str terminal (+ i 2)))
+ (s (and j (substring str (+ i 3) j)))
+ (backref
+ (if (flag-set? flags ~case-insensitive?)
+ 'backref-ci
+ 'backref)))
+ (if (not j)
+ (error "interminated named backref" str)
+ (lp (+ j 1) (+ j 1) flags
+ `((,backref ,(string->symbol s))
+ ,@(collect))
+ st))))))
+ ((#\Q) ;; \Q..\E escapes
+ (let ((res (collect)))
+ (let lp2 ((j (+ i 2)))
+ (cond
+ ((>= j end)
+ (lp j (+ i 2) flags res st))
+ ((eqv? #\\ (string-ref str j))
+ (cond
+ ((>= (+ j 1) end)
+ (lp (+ j 1) (+ i 2) flags res st))
+ ((eqv? #\E (string-ref str (+ j 1)))
+ (lp (+ j 2) (+ j 2) flags
+ (cons (substring str (+ i 2) j) res) st))
+ (else
+ (lp2 (+ j 2)))))
+ (else
+ (lp2 (+ j 1)))))))
+ ;;((#\p) ; XXXX unicode properties
+ ;; )
+ ;;((#\P)
+ ;; )
+ (else
+ (cond
+ ((char-numeric? c)
+ (let* ((j (or (string-scan-pred
+ str
+ (lambda (c) (not (char-numeric? c)))
+ (+ i 2))
+ end))
+ (backref
+ (if (flag-set? flags ~case-insensitive?)
+ 'backref-ci
+ 'backref))
+ (res `((,backref ,(string->number
+ (substring str (+ i 1) j)))
+ ,@(collect))))
+ (lp j j flags res st)))
+ ((char-alphabetic? c)
+ (let ((cell (assv c posix-escape-sequences)))
+ (if cell
+ (lp (+ i 2) (+ i 2) flags
+ (cons (cdr cell) (collect)) st)
+ (error "unknown escape sequence" str c))))
+ (else
+ (lp (+ i 2) (+ i 1) flags (collect) st)))))))))
+ ((#\|)
+ (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st))
+ ((#\^)
+ (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos)))
+ (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
+ ((#\$)
+ (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos)))
+ (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
+ ((#\space)
+ (if (flag-set? flags ~ignore-space?)
+ (lp (+ i 1) (+ i 1) flags (collect) st)
+ (lp (+ i 1) from flags res st)))
+ ((#\#)
+ (if (flag-set? flags ~ignore-space?)
+ (let ((j (or (string-scan-char str #\newline (+ i 1))
+ (- end 1))))
+ (lp (+ j 1) (+ j 1) flags (collect) st))
+ (lp (+ i 1) from flags res st)))
+ (else
+ (lp (+ i 1) from flags res st))))))))
+
+(define posix-escape-sequences
+ `((#\n . #\newline)
+ (#\r . ,(integer->char (+ (char->integer #\newline) 3)))
+ (#\t . ,(integer->char (- (char->integer #\newline) 1)))
+ (#\a . ,(integer->char (- (char->integer #\newline) 3)))
+ (#\e . ,(integer->char (+ (char->integer #\newline) #x11)))
+ (#\f . ,(integer->char (+ (char->integer #\newline) 2)))
+ ))
+
+(define (char-altcase c)
+ (if (char-upper-case? c) (char-downcase c) (char-upcase c)))
+
+(define (char-mirror c)
+ (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c)))
+
+(define (string-parse-hex-escape str i end)
+ (cond
+ ((>= i end)
+ (error "incomplete hex escape" str i))
+ ((eqv? #\{ (string-ref str i))
+ (let ((j (string-scan-char-escape str #\} (+ i 1))))
+ (if (not j)
+ (error "incomplete hex brace escape" str i)
+ (let* ((s (substring str (+ i 1) j))
+ (n (string->number s 16)))
+ (if n
+ (list (integer->char n) j)
+ (error "bad hex brace escape" s))))))
+ ((>= (+ i 1) end)
+ (error "incomplete hex escape" str i))
+ (else
+ (let* ((s (substring str i (+ i 2)))
+ (n (string->number s 16)))
+ (if n
+ (list (integer->char n) (+ i 2))
+ (error "bad hex escape" s))))))
+
+(define (string-parse-cset str start flags)
+ (let ((end (string-length str))
+ (invert? (eqv? #\^ (string-ref str start)))
+ (utf8? (flag-set? flags ~utf8?)))
+ (define (go i chars ranges)
+ (if (>= i end)
+ (error "incomplete char set")
+ (let ((c (string-ref str i)))
+ (case c
+ ((#\])
+ (if (and (null? chars) (null? ranges))
+ (go (+ i 1) (cons #\] chars) ranges)
+ (let ((ci? (flag-set? flags ~case-insensitive?))
+ (hi-chars (if utf8? (filter high-char? chars) '()))
+ (chars (if utf8? (remove high-char? chars) chars)))
+ (list
+ ((lambda (res)
+ (if invert? (cons '~ res) (sre-alternate res)))
+ (append
+ hi-chars
+ (if (pair? chars)
+ (list
+ (list (list->string
+ ((if ci?
+ cset-case-insensitive
+ (lambda (x) x))
+ (reverse chars)))))
+ '())
+ (if (pair? ranges)
+ (let ((res (if ci?
+ (cset-case-insensitive
+ (reverse ranges))
+ (reverse ranges))))
+ (list (cons '/ (alist->plist res))))
+ '())))
+ i))))
+ ((#\-)
+ (cond
+ ((or (= i start)
+ (and (= i (+ start 1)) (eqv? #\^ (string-ref str start)))
+ (eqv? #\] (string-ref str (+ i 1))))
+ (go (+ i 1) (cons c chars) ranges))
+ ((null? chars)
+ (error "bad char-set"))
+ (else
+ (let* ((c1 (car chars))
+ (c2 (string-ref str (+ i 1))))
+ (apply
+ (lambda (c2 j)
+ (if (char<? c2 c1)
+ (error "inverted range in char-set" c1 c2)
+ (go j (cdr chars) (cons (cons c1 c2) ranges))))
+ (cond
+ ((and (eqv? #\\ c2) (assv c2 posix-escape-sequences))
+ => (lambda (x) (list (cdr x) (+ i 3))))
+ ((and (eqv? #\\ c2)
+ (eqv? (string-ref str (+ i 2)) #\x))
+ (string-parse-hex-escape str (+ i 3) end))
+ ((and utf8? (<= #x80 (char->integer c2) #xFF))
+ (let ((len (utf8-start-char->length c2)))
+ (list (utf8-string-ref str (+ i 1) len) (+ i 1 len))))
+ (else
+ (list c2 (+ i 2)))))))))
+ ((#\[)
+ (let* ((inv? (eqv? #\^ (string-ref str (+ i 1))))
+ (i2 (if inv? (+ i 2) (+ i 1))))
+ (case (string-ref str i2)
+ ((#\:)
+ (let ((j (string-scan-char str #\: (+ i2 1))))
+ (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1)))))
+ (error "incomplete character class" str)
+ (let* ((cset (sre->cset
+ (string->symbol
+ (substring str (+ i2 1) j))))
+ (cset (if inv? (cset-complement cset) cset)))
+ (go (+ j 2)
+ (append (filter char? cset) chars)
+ (append (filter pair? cset) ranges))))))
+ ((#\= #\.)
+ (error "collating sequences not supported" str))
+ (else
+ (go (+ i 1) (cons #\[ chars) ranges)))))
+ ((#\\)
+ (let ((c (string-ref str (+ i 1))))
+ (case c
+ ((#\d #\D #\s #\S #\w #\W)
+ (let ((cset (sre->cset (string->sre (string #\\ c)))))
+ (go (+ i 2)
+ (append (filter char? cset) chars)
+ (append (filter pair? cset) ranges))))
+ ((#\x)
+ (apply
+ (lambda (ch j)
+ (go j (cons ch chars) ranges))
+ (string-parse-hex-escape str (+ i 2) end)))
+ (else
+ (let ((c (cond ((assv c posix-escape-sequences) => cdr)
+ (else c))))
+ (go (+ i 2)
+ (cons (string-ref str (+ i 1)) (cons c chars))
+ ranges))))))
+ (else
+ (if (and utf8? (<= #x80 (char->integer c) #xFF))
+ (let ((len (utf8-start-char->length c)))
+ (go (+ i len)
+ (cons (utf8-string-ref str i len) chars)
+ ranges))
+ (go (+ i 1) (cons c chars) ranges)))))))
+ (if invert?
+ (go (+ start 1)
+ (if (flag-set? flags ~multi-line?) '(#\newline) '())
+ '())
+ (go start '() '()))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; utf8 utilities
+
+;; Here are some hairy optimizations that need to be documented
+;; better. Thanks to these, we never do any utf8 processing once the
+;; regexp is compiled.
+
+;; two chars: ab..ef
+;; a[b..xFF]|[b-d][x80..xFF]|e[x80..xFF]
+
+;; three chars: abc..ghi
+;; ab[c..xFF]|a[d..xFF][x80..xFF]|
+;; [b..f][x80..xFF][x80..xFF]|
+;; g[x80..g][x80..xFF]|gh[x80..i]
+
+;; four chars: abcd..ghij
+;; abc[d..xFF]|ab[d..xFF][x80..xFF]|a[c..xFF][x80..xFF][x80..xFF]|
+;; [b..f][x80..xFF][x80..xFF][x80..xFF]|
+;; g[x80..g][x80..xFF][x80..xFF]|gh[x80..h][x80..xFF]|ghi[x80..j]
+
+(define (high-char? c) (<= #x80 (char->integer c)))
+
+;; number of total bytes in a utf8 char given the 1st byte
+
+(define utf8-start-char->length
+ (let ((table '#(
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax
+1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx
+2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx
+2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx
+3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex
+4 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx
+)))
+ (lambda (c) (vector-ref table (char->integer c)))))
+
+(define (utf8-string-ref str i len)
+ (define (byte n) (char->integer (string-ref str n)))
+ (case len
+ ((1) ; shouldn't happen in this module
+ (string-ref str i))
+ ((2)
+ (integer->char
+ (+ (bit-shl (bit-and (byte i) #b00011111) 6)
+ (bit-and (byte (+ i 1)) #b00111111))))
+ ((3)
+ (integer->char
+ (+ (bit-shl (bit-and (byte i) #b00001111) 12)
+ (bit-shl (bit-and (byte (+ i 1)) #b00111111) 6)
+ (bit-and (byte (+ i 2)) #b00111111))))
+ ((4)
+ (integer->char
+ (+ (bit-shl (bit-and (byte i) #b00000111) 18)
+ (bit-shl (bit-and (byte (+ i 1)) #b00111111) 12)
+ (bit-shl (bit-and (byte (+ i 2)) #b00111111) 6)
+ (bit-and (byte (+ i 3)) #b00111111))))
+ (else
+ (error "invalid utf8 length" str len i))))
+
+(define (utf8-backup-to-initial-char str i)
+ (let lp ((i i))
+ (if (= i 0)
+ 0
+ (let ((c (char->integer (string-ref str i))))
+ (if (or (< c #x80) (>= c #xC0))
+ i
+ (lp (- i 1)))))))
+
+(define (utf8-lowest-digit-of-length len)
+ (case len
+ ((1) 0) ((2) #xC0) ((3) #xE0) ((4) #xF0)
+ (else (error "invalid utf8 length" len))))
+
+(define (utf8-highest-digit-of-length len)
+ (case len
+ ((1) #x7F) ((2) #xDF) ((3) #xEF) ((4) #xF7)
+ (else (error "invalid utf8 length" len))))
+
+(define (char->utf8-list c)
+ (let ((i (char->integer c)))
+ (cond
+ ((<= i #x7F) (list i))
+ ((<= i #x7FF)
+ (list (bit-ior #b11000000 (bit-shr i 6))
+ (bit-ior #b10000000 (bit-and i #b111111))))
+ ((<= i #xFFFF)
+ (list (bit-ior #b11100000 (bit-shr i 12))
+ (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111))
+ (bit-ior #b10000000 (bit-and i #b111111))))
+ ((<= i #x1FFFFF)
+ (list (bit-ior #b11110000 (bit-shr i 18))
+ (bit-ior #b10000000 (bit-and (bit-shr i 12) #b111111))
+ (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111))
+ (bit-ior #b10000000 (bit-and i #b111111))))
+ (else (error "unicode codepoint out of range:" i)))))
+
+(define (unicode-range->utf8-pattern lo hi)
+ (let ((lo-ls (char->utf8-list lo))
+ (hi-ls (char->utf8-list hi)))
+ (if (not (= (length lo-ls) (length hi-ls)))
+ (sre-alternate (list (unicode-range-climb-digits lo-ls hi-ls)
+ (unicode-range-up-to hi-ls)))
+ (let lp ((lo-ls lo-ls) (hi-ls hi-ls))
+ (cond
+ ((null? lo-ls)
+ '())
+ ((= (car lo-ls) (car hi-ls))
+ (sre-sequence
+ (list (integer->char (car lo-ls))
+ (lp (cdr lo-ls) (cdr hi-ls)))))
+ ((= (+ (car lo-ls) 1) (car hi-ls))
+ (sre-alternate (list (unicode-range-up-from lo-ls)
+ (unicode-range-up-to hi-ls))))
+ (else
+ (sre-alternate (list (unicode-range-up-from lo-ls)
+ (unicode-range-middle lo-ls hi-ls)
+ (unicode-range-up-to hi-ls)))))))))
+
+(define (unicode-range-helper one ls prefix res)
+ (if (null? ls)
+ res
+ (unicode-range-helper
+ one
+ (cdr ls)
+ (cons (car ls) prefix)
+ (cons (sre-sequence
+ `(,@(map integer->char prefix)
+ ,(one (car ls))
+ ,@(map (lambda (_)
+ `(/ ,(integer->char #x80)
+ ,(integer->char #xFF)))
+ (cdr ls))))
+ res))))
+
+(define (unicode-range-up-from lo-ls)
+ (sre-sequence
+ (list (integer->char (car lo-ls))
+ (sre-alternate
+ (unicode-range-helper
+ (lambda (c)
+ `(/ ,(integer->char (+ (car lo-ls) 1)) ,(integer->char #xFF)))
+ (cdr (reverse (cdr lo-ls)))
+ '()
+ (list
+ (sre-sequence
+ (append
+ (map integer->char (reverse (cdr (reverse (cdr lo-ls)))))
+ `((/ ,(integer->char (last lo-ls))
+ ,(integer->char #xFF)))))))))))
+
+(define (unicode-range-up-to hi-ls)
+ (sre-sequence
+ (list (integer->char (car hi-ls))
+ (sre-alternate
+ (unicode-range-helper
+ (lambda (c)
+ `(/ ,(integer->char #x80) ,(integer->char (- (car hi-ls) 1))))
+ (cdr (reverse (cdr hi-ls)))
+ '()
+ (list
+ (sre-sequence
+ (append
+ (map integer->char (reverse (cdr (reverse (cdr hi-ls)))))
+ `((/ ,(integer->char #x80)
+ ,(integer->char (last hi-ls))))))))))))
+
+(define (unicode-range-climb-digits lo-ls hi-ls)
+ (let ((lo-len (length lo-ls)))
+ (sre-alternate
+ (append
+ (list
+ (sre-sequence
+ (cons `(/ ,(integer->char (car lo-ls))
+ ,(integer->char (if (<= (car lo-ls) #x7F) #x7F #xFF)))
+ (map (lambda (_)
+ `(/ ,(integer->char #x80) ,(integer->char #xFF)))
+ (cdr lo-ls)))))
+ (map
+ (lambda (i)
+ (sre-sequence
+ (cons
+ `(/ ,(integer->char (utf8-lowest-digit-of-length (+ i lo-len 1)))
+ ,(integer->char (utf8-highest-digit-of-length (+ i lo-len 1))))
+ (map (lambda (_)
+ `(/ ,(integer->char #x80) ,(integer->char #xFF)))
+ (zero-to (+ i lo-len))))))
+ (zero-to (- (length hi-ls) lo-len 1)))
+ (list
+ (sre-sequence
+ (cons `(/ ,(integer->char
+ (utf8-lowest-digit-of-length
+ (utf8-start-char->length
+ (integer->char (- (car hi-ls) 1)))))
+ ,(integer->char (- (car hi-ls) 1)))
+ (map (lambda (_)
+ `(/ ,(integer->char #x80) ,(integer->char #xFF)))
+ (cdr hi-ls)))))))))
+
+(define (unicode-range-middle lo-ls hi-ls)
+ (let ((lo (integer->char (+ (car lo-ls) 1)))
+ (hi (integer->char (- (car hi-ls) 1))))
+ (sre-sequence
+ (cons (if (char=? lo hi) lo `(/ ,lo ,hi))
+ (map (lambda (_) `(/ ,(integer->char #x80) ,(integer->char #xFF)))
+ (cdr lo-ls))))))
+
+(define (cset->utf8-pattern cset)
+ (let lp ((ls cset) (alts '()) (lo-cset '()))
+ (cond
+ ((null? ls)
+ (sre-alternate (append (reverse alts)
+ (if (null? lo-cset)
+ '()
+ (list (cons '/ (reverse lo-cset)))))))
+ ((char? (car ls))
+ (if (high-char? (car ls))
+ (lp (cdr ls) (cons (car ls) alts) lo-cset)
+ (lp (cdr ls) alts (cons (car ls) lo-cset))))
+ (else
+ (if (or (high-char? (caar ls)) (high-char? (cdar ls)))
+ (lp (cdr ls)
+ (cons (unicode-range->utf8-pattern (caar ls) (cdar ls)) alts)
+ lo-cset)
+ (lp (cdr ls) alts (cons (cdar ls) (cons (caar ls) lo-cset))))))))
+
+(define (sre-adjust-utf8 sre flags)
+ (let adjust ((sre sre)
+ (utf8? (flag-set? flags ~utf8?))
+ (ci? (flag-set? flags ~case-insensitive?)))
+ (define (rec sre) (adjust sre utf8? ci?))
+ (cond
+ ((pair? sre)
+ (case (car sre)
+ ((w/utf8) (adjust (sre-sequence (cdr sre)) #t ci?))
+ ((w/noutf8) (adjust (sre-sequence (cdr sre)) #f ci?))
+ ((w/case)
+ (cons (car sre) (map (lambda (s) (adjust s utf8? #f)) (cdr sre))))
+ ((w/nocase)
+ (cons (car sre) (map (lambda (s) (adjust s utf8? #t)) (cdr sre))))
+ ((/ ~ & -)
+ (if (not utf8?)
+ sre
+ (let ((cset (sre->cset sre ci?)))
+ (if (any (lambda (x)
+ (if (pair? x)
+ (or (high-char? (car x)) (high-char? (cdr x)))
+ (high-char? x)))
+ cset)
+ (if ci?
+ (list 'w/case (cset->utf8-pattern cset))
+ (cset->utf8-pattern cset))
+ sre))))
+ ((*)
+ (case (sre-sequence (cdr sre))
+ ;; special case optimization: .* w/utf8 == .* w/noutf8
+ ((any) '(* any))
+ ((nonl) '(* nonl))
+ (else (cons '* (map rec (cdr sre))))))
+ (else
+ (cons (car sre) (map rec (cdr sre))))))
+ (else
+ (case sre
+ ((any) 'utf8-any)
+ ((nonl) 'utf8-nonl)
+ (else
+ (if (and utf8? (char? sre) (high-char? sre))
+ (sre-sequence (map integer->char (char->utf8-list sre)))
+ sre)))))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; compilation
+
+(define (irregex x . o)
+ (cond
+ ((irregex? x) x)
+ ((string? x) (apply string->irregex x o))
+ (else (apply sre->irregex x o))))
+
+(define (string->irregex str . o)
+ (apply sre->irregex (apply string->sre str o) o))
+
+(define (sre->irregex sre . o)
+ (let* ((pat-flags (symbol-list->flags o))
+ (sre (if *allow-utf8-mode?*
+ (sre-adjust-utf8 sre pat-flags)
+ sre))
+ (searcher? (sre-searcher? sre))
+ (sre-dfa (if searcher? (sre-remove-initial-bos sre) sre))
+ (dfa-limit (cond ((memq 'small o) 1) ((memq 'fast o) 50) (else 10)))
+ (dfa/search
+ (if searcher?
+ #t
+ (cond ((sre->nfa `(seq (* any) ,sre-dfa) pat-flags)
+ => (lambda (nfa) (nfa->dfa nfa (* dfa-limit (length nfa)))))
+ (else #f))))
+ (dfa (cond ((and dfa/search (sre->nfa sre-dfa pat-flags))
+ => (lambda (nfa) (nfa->dfa nfa (* dfa-limit (length nfa)))))
+ (else #f)))
+ (extractor (and dfa dfa/search (sre-match-extractor sre-dfa)))
+ (submatches (sre-count-submatches sre-dfa))
+ (names (sre-names sre-dfa 1 '()))
+ (lens (sre-length-ranges sre-dfa names))
+ (flags (flag-join
+ (flag-join ~none (and searcher? ~searcher?))
+ (and (sre-consumer? sre) ~consumer?))))
+ (cond
+ (dfa
+ (make-irregex dfa dfa/search extractor #f flags submatches lens names))
+ (else
+ (let ((f (sre->procedure sre pat-flags names)))
+ (make-irregex #f #f #f f flags submatches lens names))))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sre analysis
+
+;; returns #t if the sre can ever be empty
+(define (sre-empty? sre)
+ (if (pair? sre)
+ (case (car sre)
+ ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t)
+ ((**) (or (not (number? (cadr sre))) (zero? (cadr sre))))
+ ((or) (any sre-empty? (cdr sre)))
+ ((: seq submatch + atomic) (every sre-empty? (cdr sre)))
+ (else #f))
+ (memq sre '(epsilon bos eos bol eol bow eow commit))))
+
+(define (sre-any? sre)
+ (or (eq? sre 'any)
+ (and (pair? sre)
+ (case (car sre)
+ ((seq : submatch)
+ (and (pair? (cdr sre)) (null? (cddr sre)) (sre-any? (cadr sre))))
+ ((or) (every sre-any? (cdr sre)))
+ (else #f)))))
+
+(define (sre-repeater? sre)
+ (and (pair? sre)
+ (or (memq (car sre) '(* +))
+ (and (memq (car sre) '(submatch seq :))
+ (pair? (cdr sre))
+ (null? (cddr sre))
+ (sre-repeater? (cadr sre))))))
+
+(define (sre-searcher? sre)
+ (if (pair? sre)
+ (case (car sre)
+ ((* +) (sre-any? (sre-sequence (cdr sre))))
+ ((seq : submatch) (and (pair? (cdr sre)) (sre-searcher? (cadr sre))))
+ ((or) (every sre-searcher? (cdr sre)))
+ (else #f))
+ (eq? 'bos sre)))
+
+(define (sre-consumer? sre)
+ (if (pair? sre)
+ (case (car sre)
+ ((* +) (sre-any? (sre-sequence (cdr sre))))
+ ((seq : submatch) (and (pair? (cdr sre)) (sre-consumer? (last sre))))
+ ((or) (every sre-consumer? (cdr sre)))
+ (else #f))
+ (eq? 'eos sre)))
+
+(define (sre-has-submatchs? sre)
+ (and (pair? sre)
+ (or (eq? 'submatch (car sre))
+ (any sre-has-submatchs? (cdr sre)))))
+
+(define (sre-count-submatches sre)
+ (let count ((sre sre) (sum 0))
+ (if (pair? sre)
+ (fold count
+ (+ sum (case (car sre)
+ ((submatch submatch-named) 1)
+ ((dsm) (+ (cadr sre) (caddr sre)))
+ (else 0)))
+ (cdr sre))
+ sum)))
+
+(define (sre-length-ranges sre . o)
+ (let ((names (if (pair? o) (car o) (sre-names sre 1 '())))
+ (sublens (make-vector (+ 1 (sre-count-submatches sre)) #f)))
+ (vector-set!
+ sublens
+ 0
+ (let lp ((sre sre) (n 1) (lo 0) (hi 0) (return cons))
+ (define (grow i) (return (+ lo i) (and hi (+ hi i))))
+ (cond
+ ((pair? sre)
+ (if (string? (car sre))
+ (grow 1)
+ (case (car sre)
+ ((/ ~ & -)
+ (grow 1))
+ ((posix-string)
+ (lp (string->sre (cadr sre)) n lo hi return))
+ ((seq : w/case w/nocase atomic)
+ (let lp2 ((ls (cdr sre)) (n n) (lo2 0) (hi2 0))
+ (if (null? ls)
+ (return (+ lo lo2) (and hi hi2 (+ hi hi2)))
+ (lp (car ls) n 0 0
+ (lambda (lo3 hi3)
+ (lp2 (cdr ls)
+ (+ n (sre-count-submatches (car ls)))
+ (+ lo2 lo3)
+ (and hi2 hi3 (+ hi2 hi3))))))))
+ ((or)
+ (let lp2 ((ls (cdr sre)) (n n) (lo2 #f) (hi2 0))
+ (if (null? ls)
+ (return (+ lo lo2) (and hi hi2 (+ hi hi2)))
+ (lp (car ls) n 0 0
+ (lambda (lo3 hi3)
+ (lp2 (cdr ls)
+ (+ n (sre-count-submatches (car ls)))
+ (if lo2 (min lo2 lo3) lo3)
+ (and hi2 hi3 (max hi2 hi3))))))))
+ ((if)
+ (cond
+ ((or (null? (cdr sre)) (null? (cddr sre)))
+ (return lo hi))
+ (else
+ (let ((n1 (sre-count-submatches (car sre)))
+ (n2 (sre-count-submatches (cadr sre))))
+ (lp (if (or (number? (cadr sre)) (symbol? (cadr sre)))
+ 'epsilon
+ (cadr sre))
+ n lo hi
+ (lambda (lo2 hi2)
+ (lp (caddr sre) (+ n n1) 0 0
+ (lambda (lo3 hi3)
+ (lp (if (pair? (cdddr sre))
+ (cadddr sre)
+ 'epsilon)
+ (+ n n1 n2) 0 0
+ (lambda (lo4 hi4)
+ (return (+ lo2 (min lo3 lo4))
+ (and hi2 hi3 hi4
+ (+ hi2 (max hi3 hi4))
+ ))))))))))))
+ ((dsm)
+ (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) lo hi return))
+ ((submatch submatch-named)
+ (lp (sre-sequence
+ (if (eq? 'submatch (car sre)) (cdr sre) (cddr sre)))
+ (+ n 1) lo hi
+ (lambda (lo2 hi2)
+ (vector-set! sublens n (cons lo2 hi2))
+ (return lo2 hi2))))
+ ((backref backref-ci)
+ (let ((n (cond
+ ((number? (cadr sre)) (cadr sre))
+ ((assq (cadr sre) names) => cdr)
+ (else (error "unknown backreference" (cadr sre))))))
+ (cond
+ ((or (not (integer? n))
+ (not (< 0 n (vector-length sublens))))
+ (error "sre-length: invalid backreference" sre))
+ ((not (vector-ref sublens n))
+ (error "sre-length: invalid forward backreference" sre))
+ (else
+ (let ((lo2 (car (vector-ref sublens n)))
+ (hi2 (cdr (vector-ref sublens n))))
+ (return (+ lo lo2) (and hi hi2 (+ hi hi2))))))))
+ ((* *?)
+ (lp (sre-sequence (cdr sre)) n lo hi (lambda (lo hi) #f))
+ (return lo #f))
+ ((** **?)
+ (cond
+ ((or (and (number? (cadr sre))
+ (number? (caddr sre))
+ (> (cadr sre) (caddr sre)))
+ (and (not (cadr sre)) (caddr sre)))
+ (return lo hi))
+ (else
+ (if (caddr sre)
+ (lp (sre-sequence (cdddr sre)) n 0 0
+ (lambda (lo2 hi2)
+ (return (+ lo (* (cadr sre) lo2))
+ (and hi hi2 (+ hi (* (caddr sre) hi2))))))
+ (lp (sre-sequence (cdddr sre)) n 0 0
+ (lambda (lo2 hi2)
+ (return (+ lo (* (cadr sre) lo2)) #f)))))))
+ ((+)
+ (lp (sre-sequence (cdr sre)) n lo hi
+ (lambda (lo2 hi2)
+ (return (+ lo lo2) #f))))
+ ((? ??)
+ (lp (sre-sequence (cdr sre)) n lo hi
+ (lambda (lo2 hi2)
+ (return lo (and hi hi2 (+ hi hi2))))))
+ ((= =? >= >=?)
+ (lp `(** ,(cadr sre)
+ ,(if (memq (car sre) '(>= >=?)) #f (cadr sre))
+ ,@(cddr sre))
+ n lo hi return))
+ ((look-ahead neg-look-ahead look-behind neg-look-behind)
+ (return lo hi))
+ (else
+ (error "sre-length-ranges: unknown sre operator" sre)))))
+ ((char? sre)
+ (grow 1))
+ ((string? sre)
+ (grow (string-length sre)))
+ ((memq sre '(any nonl))
+ (grow 1))
+ ((memq sre '(epsilon bos eos bol eol bow eow nwb commit))
+ (return lo hi))
+ (else
+ (let ((cell (assq sre sre-named-definitions)))
+ (if cell
+ (lp (cdr cell) n lo hi return)
+ (error "sre-length-ranges: unknown sre" sre)))))))
+ sublens))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sre manipulation
+
+;; build a (seq ls ...) sre from a list
+(define (sre-sequence ls)
+ (cond
+ ((null? ls) 'epsilon)
+ ((null? (cdr ls)) (car ls))
+ (else (cons 'seq ls))))
+
+;; build a (or ls ...) sre from a list
+(define (sre-alternate ls)
+ (cond
+ ((null? ls) 'epsilon)
+ ((null? (cdr ls)) (car ls))
+ (else (cons 'or ls))))
+
+;; returns an equivalent SRE without any match information
+(define (sre-strip-submatches sre)
+ (if (not (pair? sre))
+ sre
+ (case (car sre)
+ ((submatch) (sre-strip-submatches (sre-sequence (cdr sre))))
+ ((dsm) (sre-strip-submatches (sre-sequence (cdddr sre))))
+ (else (map sre-strip-submatches sre)))))
+
+;; given a char-set list of chars and strings, flattens them into
+;; chars only
+(define (sre-flatten-ranges ls)
+ (let lp ((ls ls) (res '()))
+ (cond
+ ((null? ls)
+ (reverse res))
+ ((string? (car ls))
+ (lp (append (string->list (car ls)) (cdr ls)) res))
+ (else
+ (lp (cdr ls) (cons (car ls) res))))))
+
+(define (sre-names sre n names)
+ (if (not (pair? sre))
+ names
+ (case (car sre)
+ ((submatch)
+ (sre-names (sre-sequence (cdr sre)) (+ n 1) names))
+ ((submatch-named)
+ (sre-names (sre-sequence (cddr sre))
+ (+ n 1)
+ (cons (cons (cadr sre) n) names)))
+ ((dsm)
+ (sre-names (sre-sequence (cdddr sre)) (+ n (cadr sre)) names))
+ ((seq : or * + ? *? ?? w/case w/nocase atomic
+ look-ahead look-behind neg-look-ahead neg-look-behind)
+ (sre-sequence-names (cdr sre) n names))
+ ((= >=)
+ (sre-sequence-names (cddr sre) n names))
+ ((** **?)
+ (sre-sequence-names (cdddr sre) n names))
+ (else
+ names))))
+
+(define (sre-sequence-names ls n names)
+ (if (null? ls)
+ names
+ (sre-sequence-names (cdr ls)
+ (+ n (sre-count-submatches (car ls)))
+ (sre-names (car ls) n names))))
+
+(define (sre-remove-initial-bos sre)
+ (cond
+ ((pair? sre)
+ (case (car sre)
+ ((seq : submatch * +)
+ (cond
+ ((not (pair? (cdr sre)))
+ sre)
+ ((eq? 'bos (cadr sre))
+ (cons (car sre) (cddr sre)))
+ (else
+ (cons (car sre)
+ (cons (sre-remove-initial-bos (cadr sre)) (cddr sre))))))
+ ((or)
+ (sre-alternate (map sre-remove-initial-bos (cdr sre))))
+ (else
+ sre)))
+ (else
+ sre)))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; matching
+
+(define (irregex-search x str . o)
+ (let ((irx (irregex x)))
+ (let ((start (if (pair? o) (car o) 0))
+ (end (if (and (pair? o) (pair? (cdr o)))
+ (cadr o) (string-length str)))
+ (matches (irregex-new-matches irx)))
+ (irregex-match-string-set! matches str)
+ (irregex-search/matches irx str start end matches))))
+
+;; internal routine, can be used in loops to avoid reallocating the
+;; match vector
+(define (irregex-search/matches irx str start end matches)
+ (cond
+ ((irregex-dfa irx)
+ (cond
+ ((flag-set? (irregex-flags irx) ~searcher?)
+ (let ((m-end (dfa-match/longest (irregex-dfa irx) str start end)))
+ (cond
+ (m-end
+ (irregex-match-start-index-set! matches 0 start)
+ (irregex-match-end-index-set! matches 0 m-end)
+ ((irregex-dfa/extract irx) str start m-end matches)
+ matches)
+ (else
+ #f))))
+ (else
+ (let ((first-match
+ (dfa-match/shortest (irregex-dfa/search irx) str start end)))
+ (and
+ first-match
+ (let* ((lo+hi (vector-ref (irregex-lengths irx) 0))
+ (m-start (if (cdr lo+hi)
+ (max start (- first-match (cdr lo+hi)))
+ start))
+ (m-limit (- first-match (car lo+hi)))
+ (dfa (irregex-dfa irx)))
+ (let lp ((m-start m-start))
+ (and (<= m-start m-limit)
+ (let ((m-end (dfa-match/longest dfa str m-start end)))
+ (cond
+ (m-end
+ (irregex-match-start-index-set! matches 0 m-start)
+ (irregex-match-end-index-set! matches 0 m-end)
+ ((irregex-dfa/extract irx) str m-start m-end matches)
+ matches)
+ (else
+ (lp (+ m-start 1)))))))))))))
+ (else
+ (let ((matcher (irregex-nfa irx)))
+ (let lp ((start start))
+ (and (<= start end)
+ (let ((i (matcher str start matches (lambda () #f))))
+ (cond
+ (i
+ (irregex-match-start-index-set! matches 0 start)
+ (irregex-match-end-index-set! matches 0 i)
+ matches)
+ (else
+ (lp (+ start 1)))))))))))
+
+(define (irregex-match irx str)
+ (let* ((irx (irregex irx))
+ (matches (irregex-new-matches irx))
+ (start 0)
+ (end (string-length str)))
+ (irregex-match-string-set! matches str)
+ (cond
+ ((irregex-dfa irx)
+ (let ((m-end (dfa-match/longest (irregex-dfa irx) str start end)))
+ (cond
+ ((equal? m-end end)
+ (irregex-match-start-index-set! matches 0 start)
+ (irregex-match-end-index-set! matches 0 m-end)
+ ((irregex-dfa/extract irx) str start m-end matches)
+ matches)
+ (else
+ #f))))
+ (else
+ (let* ((matcher (irregex-nfa irx))
+ (i (matcher str start matches (lambda () #f))))
+ (cond
+ ((equal? i end)
+ (irregex-match-start-index-set! matches 0 start)
+ (irregex-match-end-index-set! matches 0 i)
+ matches)
+ (else
+ #f)))))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; DFA matching
+
+;; inline these
+(define (dfa-init-state dfa)
+ (vector-ref dfa 0))
+(define (dfa-next-state dfa node)
+ (vector-ref dfa (cdr node)))
+(define (dfa-final-state? dfa state)
+ (car state))
+
+;; this searches for the first end index for which a match is possible
+(define (dfa-match/shortest dfa str start end)
+ (let lp ((i start) (state (dfa-init-state dfa)))
+ (if (dfa-final-state? dfa state)
+ i
+ (and (< i end)
+ (let* ((ch (string-ref str i))
+ (next (find (lambda (x)
+ (or (eqv? ch (car x))
+ (and (pair? (car x))
+ (char<=? (caar x) ch)
+ (char<=? ch (cdar x)))))
+ (cdr state))))
+ (and next (lp (+ i 1) (dfa-next-state dfa next))))))))
+
+;; this finds the longest match starting at a given index
+(define (dfa-match/longest dfa str start end)
+ (let lp ((i start)
+ (state (dfa-init-state dfa))
+ (res (and (dfa-final-state? dfa (dfa-init-state dfa)) start)))
+ (if (>= i end)
+ res
+ (let* ((ch (string-ref str i))
+ (cell (find (lambda (x)
+ (or (eqv? ch (car x))
+ (and (pair? (car x))
+ (char<=? (caar x) ch)
+ (char<=? ch (cdar x)))))
+ (cdr state))))
+ (if cell
+ (let ((next (dfa-next-state dfa cell)))
+ (lp (+ i 1)
+ next
+ (if (dfa-final-state? dfa next) (+ i 1) res)))
+ res)))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; SRE->NFA compilation
+;;
+;; An NFA state is a numbered node with a list of patter->number
+;; transitions, where pattern is either a character, (lo . hi)
+;; character range, or epsilon (indicating an empty transition).
+;; There may be duplicate characters and overlapping ranges - since
+;; it's an NFA we process it by considering all possible transitions.
+
+(define sre-named-definitions
+ `((any . ,*all-chars*)
+ (nonl . (- ,*all-chars* (,(string #\newline))))
+ (alphabetic . (/ #\a #\z #\A #\Z))
+ (alpha . alphabetic)
+ (alphanumeric . (/ #\a #\z #\A #\Z #\0 #\9))
+ (alphanum . alphanumeric)
+ (alnum . alphanumeric)
+ (lower-case . (/ #\a #\z))
+ (lower . lower-case)
+ (upper-case . (/ #\A #\Z))
+ (upper . upper-case)
+ (numeric . (/ #\0 #\9))
+ (num . numeric)
+ (digit . numeric)
+ (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\.
+ #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\}))
+ (punct . punctuation)
+ (graphic
+ . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~))
+ (graph . graphic)
+ (blank . (or #\space ,(integer->char (- (char->integer #\space) 23))))
+ (whitespace . (or blank #\newline))
+ (space . whitespace)
+ (white . whitespace)
+ (printing or graphic whitespace)
+ (print . printing)
+ ;; XXXX we assume a (possibly shifted) ASCII-based ordering
+ (control . (/ ,(integer->char (- (char->integer #\space) 32))
+ ,(integer->char (- (char->integer #\space) 1))))
+ (cntrl . control)
+ (hex-digit . (or numeric (/ #\a #\f #\A #\F)))
+ (xdigit . hex-digit)
+ (ascii . (/ ,(integer->char (- (char->integer #\space) 32))
+ ,(integer->char (+ (char->integer #\space) 95))))
+ (ascii-nonl . (/ ,(integer->char (- (char->integer #\space) 32))
+ ,(integer->char (- (char->integer #\newline) 1))
+ ,(integer->char (+ (char->integer #\newline) 1))
+ ,(integer->char (+ (char->integer #\space) 95))))
+ (newline . (or (seq ,(integer->char (+ (char->integer #\newline) 3))
+ #\newline)
+ (/ #\newline
+ ,(integer->char (+ (char->integer #\newline) 3)))))
+
+ ;; ... it's really annoying to support scheme48
+ (word . (seq bow (+ (or alphanumeric #\_)) eow))
+ (utf8-tail-char . (/ ,(integer->char (+ (char->integer #\space) #x60))
+ ,(integer->char (+ (char->integer #\space) #xA1))))
+ (utf8-2-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xA2))
+ ,(integer->char (+ (char->integer #\space) #xBF)))
+ utf8-tail-char))
+ (utf8-3-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xC0))
+ ,(integer->char (+ (char->integer #\space) #xCF)))
+ utf8-tail-char
+ utf8-tail-char))
+ (utf8-4-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xD0))
+ ,(integer->char (+ (char->integer #\space) #xD7)))
+ utf8-tail-char
+ utf8-tail-char
+ utf8-tail-char))
+ (utf8-any . (or ascii utf8-2-char utf8-3-char utf8-4-char))
+ (utf8-nonl . (or ascii-nonl utf8-2-char utf8-3-char utf8-4-char))
+ ))
+
+;; Compile and return the list of NFA states. The start state will be
+;; at the head of the list, and all remaining states will be in
+;; descending numeric order, with state 0 being the unique accepting
+;; state.
+(define (sre->nfa sre . o)
+ ;; we loop over an implicit sequence list
+ (let lp ((ls (list sre))
+ (n 1)
+ (flags (if (pair? o) (car o) ~none))
+ (next (list (list 0))))
+ (define (new-state-number state)
+ (max n (+ 1 (caar state))))
+ (define (extend-state next . trans)
+ (and next
+ (cons (cons (new-state-number next)
+ (map (lambda (x) (cons x (caar next))) trans))
+ next)))
+ (if (null? ls)
+ next
+ (cond
+ ((string? (car ls))
+ ;; process literal strings a char at a time
+ (lp (append (string->list (car ls)) (cdr ls)) n flags next))
+ ((eq? 'epsilon (car ls))
+ ;; chars and epsilons go directly into the transition table
+ (extend-state (lp (cdr ls) n flags next) (car ls)))
+ ((char? (car ls))
+ (let ((alt (char-altcase (car ls))))
+ (if (and (flag-set? flags ~case-insensitive?)
+ (not (eqv? (car ls) alt)))
+ (extend-state (lp (cdr ls) n flags next) (car ls) alt)
+ (extend-state (lp (cdr ls) n flags next) (car ls)))))
+ ((symbol? (car ls))
+ (let ((cell (assq (car ls) sre-named-definitions)))
+ (and cell (lp (cons (cdr cell) (cdr ls)) n flags next))))
+ ((pair? (car ls))
+ (cond
+ ((string? (caar ls))
+ ;; enumerated character set
+ (lp (cons (sre-alternate (string->list (caar ls))) (cdr ls))
+ n
+ flags
+ next))
+ (else
+ (case (caar ls)
+ ((seq :)
+ ;; for an explicit sequence, just append to the list
+ (lp (append (cdar ls) (cdr ls)) n flags next))
+ ((w/case w/nocase w/utf8 w/noutf8)
+ (let* ((next (lp (cdr ls) n flags next))
+ (flags ((if (memq (caar ls) '(w/case w/utf8))
+ flag-clear
+ flag-join)
+ flags
+ (if (memq (caar ls) '(w/case w/nocase))
+ ~case-insensitive?
+ ~utf8?))))
+ (and next (lp (cdar ls) (new-state-number next) flags next))))
+ ((/ - & ~)
+ (let ((ranges (sre->cset (car ls)
+ (flag-set? flags ~case-insensitive?))))
+ (case (length ranges)
+ ((1)
+ (extend-state (lp (cdr ls) n flags next) (car ranges)))
+ (else
+ (let ((next (lp (cdr ls) n flags next)))
+ (and
+ next
+ (lp (list (sre-alternate
+ (map (lambda (x) (if (pair? x)
+ (list '/ (car x) (cdr x))
+ x))
+ ranges)))
+ (new-state-number next)
+ (flag-clear flags ~case-insensitive?)
+ next)))))))
+ ((or)
+ (let* ((next (lp (cdr ls) n flags next))
+ (b (and next
+ (lp (list (sre-alternate (cddar ls)))
+ (new-state-number next)
+ flags
+ next)))
+ (a (and b (lp (list (cadar ls))
+ (new-state-number b)
+ flags
+ next))))
+ ;; compile both branches and insert epsilon
+ ;; transitions to either
+ (and a
+ `((,(new-state-number a)
+ (epsilon . ,(caar a))
+ (epsilon . ,(caar b)))
+ ,@(take-up-to a next)
+ ,@b))))
+ ((?)
+ (let ((next (lp (cdr ls) n flags next)))
+ ;; insert an epsilon transition directly to next
+ (and
+ next
+ (let ((a (lp (cdar ls) (new-state-number next) flags next)))
+ (cond
+ (a
+ (set-cdr! (car a) `((epsilon . ,(caar next)) ,@(cdar a)))
+ a)
+ (else
+ #f))))))
+ ((+ *)
+ (let ((next (lp (cdr ls) n flags next)))
+ (and
+ next
+ (let* ((new (lp '(epsilon)
+ (new-state-number next)
+ flags
+ next))
+ (a (lp (cdar ls) (new-state-number new) flags new)))
+ (and
+ a
+ (begin
+ ;; for *, insert an epsilon transition as in ? above
+ (if (eq? '* (caar ls))
+ (set-cdr! (car a)
+ `((epsilon . ,(caar new)) ,@(cdar a))))
+ ;; for both, insert a loop back to self
+ (set-cdr! (car new)
+ `((epsilon . ,(caar a)) ,@(cdar new)))
+ a))))))
+ ((submatch submatch-named)
+ ;; ignore submatches altogether
+ (lp (cons (sre-sequence (cdar ls)) (cdr ls)) n flags next))
+ (else
+ #f)))))
+ (else
+ #f)))))
+
+;; We don't really want to use this, we use the closure compilation
+;; below instead, but this is included for reference and testing the
+;; sre->nfa conversion.
+
+;; (define (nfa-match nfa str)
+;; (let lp ((ls (string->list str)) (state (car nfa)) (epsilons '()))
+;; (if (null? ls)
+;; (zero? (car state))
+;; (any (lambda (m)
+;; (if (eq? 'epsilon (car m))
+;; (and (not (memv (cdr m) epsilons))
+;; (lp ls (assv (cdr m) nfa) (cons (cdr m) epsilons)))
+;; (and (or (eqv? (car m) (car ls))
+;; (and (pair? (car m))
+;; (char<=? (caar m) (car ls))
+;; (char<=? (car ls) (cdar m))))
+;; (lp (cdr ls) (assv (cdr m) nfa) '()))))
+;; (cdr state)))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; NFA->DFA compilation
+;;
+;; During processing, the DFA is a list of the form:
+;;
+;; ((NFA-states ...) accepting-state? transitions ...)
+;;
+;; where the transitions are as in the NFA, except there are no
+;; epsilons, duplicate characters or overlapping char-set ranges, and
+;; the states moved to are closures (sets of NFA states). Multiple
+;; DFA states may be accepting states.
+
+(define (nfa->dfa nfa . o)
+ (let ((max-states (and (pair? o) (car o))))
+ (let lp ((ls (list (nfa-closure nfa (list (caar nfa)))))
+ (i 0)
+ (res '()))
+ (cond
+ ((null? ls)
+ (dfa-renumber (reverse res)))
+ ((assoc (car ls) res)
+ (lp (cdr ls) i res))
+ (else
+ (let* ((states (car ls))
+ (trans (nfa-state-transitions nfa states))
+ (accept? (and (memv 0 states) #t)))
+ (and (or (not max-states) (< (+ i 1) max-states))
+ (lp (append (map cdr trans) (cdr ls))
+ (+ i 1)
+ `((,states ,accept? ,@trans) ,@res)))))))))
+
+;; When the conversion is complete we renumber the DFA sets-of-states
+;; in order and convert the result to a vector for fast lookup.
+(define (dfa-renumber dfa)
+ (let ((states (map cons (map car dfa) (zero-to (length dfa)))))
+ (define (renumber state)
+ (cdr (assoc state states)))
+ (list->vector
+ (map
+ (lambda (node)
+ (cons (cadr node)
+ (map (lambda (x) (cons (car x) (renumber (cdr x))))
+ (cddr node))))
+ dfa))))
+
+;; Extract all distinct characters or ranges and the potential states
+;; they can transition to from a given set of states. Any ranges that
+;; would overlap with distinct characters are split accordingly.
+(define (nfa-state-transitions nfa states)
+ (let lp ((trans '()) ;; list of (char . state) or ((char . char) . state)
+ (ls states) ;; list of integers (remaining state numbers)
+ (res '())) ;; (char state ...) or ((char . char) state ...)
+ (cond
+ ((null? trans)
+ (if (null? ls)
+ (map (lambda (x) (cons (car x) (nfa-closure nfa (cdr x))))
+ res)
+ (let ((node (assv (car ls) nfa)))
+ (lp (if node (cdr node) '()) (cdr ls) res))))
+ ((eq? 'epsilon (caar trans))
+ (lp (cdr trans) ls res))
+ (else
+ (lp (cdr trans) ls (nfa-join-transitions! res (car trans)))))))
+
+(define (nfa-join-transitions! existing new)
+ (define (join ls elt state)
+ (if (not elt)
+ ls
+ (nfa-join-transitions! ls (cons elt state))))
+ (cond
+ ((char? (car new))
+ (let ((ch (car new)))
+ (let lp ((ls existing) (res '()))
+ (cond
+ ((null? ls)
+ ;; done, just cons this on to the original list
+ (cons (list ch (cdr new)) existing))
+ ((eqv? ch (caar ls))
+ ;; add a new state to an existing char
+ (set-cdr! (car ls) (insert-sorted (cdr new) (cdar ls)))
+ existing)
+ ((and (pair? (caar ls))
+ (char<=? (caaar ls) ch)
+ (char<=? ch (cdaar ls)))
+ ;; split a range
+ (apply
+ (lambda (left right)
+ (cons (cons ch (insert-sorted (cdr new) (cdar ls)))
+ (append (if left (list (cons left (cdar ls))) '())
+ (if right (list (cons right (cdar ls))) '())
+ res
+ (cdr ls))))
+ (split-char-range (caar ls) (car new))))
+ (else
+ ;; keep looking
+ (lp (cdr ls) (cons (car ls) res)))))))
+ (else
+ (let ((lo (caar new))
+ (hi (cdar new)))
+ (let lp ((ls existing) (res '()))
+ (cond
+ ((null? ls)
+ (cons (list (car new) (cdr new)) existing))
+ ((and (char? (caar ls)) (char<=? lo (caar ls)) (char<=? (caar ls) hi))
+ ;; range enclosing a character
+ (apply
+ (lambda (left right)
+ (set-cdr! (car ls) (insert-sorted (cdr new) (cdar ls)))
+ (join (join existing left (cdr new)) right (cdr new)))
+ (split-char-range (car new) (caar ls))))
+ ((and (pair? (caar ls))
+ (or (and (char<=? (caaar ls) hi) (char<=? lo (cdaar ls)))
+ (and (char<=? hi (caaar ls)) (char<=? (cdaar ls) lo))))
+ ;; overlapping ranges
+ (apply
+ (lambda (left1 left2 same right1 right2)
+ (let ((old-states (cdar ls)))
+ (set-car! (car ls) same)
+ (set-cdr! (car ls) (insert-sorted (cdr new) old-states))
+ (let* ((res (if right1
+ (cons (cons right1 old-states) existing)
+ existing))
+ (res (if right2 (cons (cons right2 old-states) res) res)))
+ (join (join res left1 (cdr new)) left2 (cdr new)))))
+ (intersect-char-ranges (car new) (caar ls))))
+ (else
+ (lp (cdr ls) (cons (car ls) res)))))))))
+
+(define (char-range c1 c2)
+ (if (eqv? c1 c2) c1 (cons c1 c2)))
+
+;; assumes ch is included in the range
+(define (split-char-range range ch)
+ (list
+ (and (not (eqv? ch (car range)))
+ (char-range (car range) (integer->char (- (char->integer ch) 1))))
+ (and (not (eqv? ch (cdr range)))
+ (char-range (integer->char (+ (char->integer ch) 1)) (cdr range)))))
+
+;; returns (possibly #f) char ranges:
+;; a-only-1 a-only-2 a-and-b b-only-1 b-only-2
+(define (intersect-char-ranges a b)
+ (if (char>? (car a) (car b))
+ (reverse (intersect-char-ranges b a))
+ (let ((a-lo (car a))
+ (a-hi (cdr a))
+ (b-lo (car b))
+ (b-hi (cdr b)))
+ (list
+ (and (char<? a-lo b-lo)
+ (char-range a-lo (integer->char (- (char->integer b-lo) 1))))
+ (and (char>? a-hi b-hi)
+ (char-range (integer->char (+ (char->integer b-hi) 1)) a-hi))
+ (char-range b-lo (if (char<? b-hi a-hi) b-hi a-hi))
+ #f
+ (and (char>? b-hi a-hi)
+ (char-range (integer->char (+ (char->integer a-hi) 1)) b-hi))))))
+
+;; The `closure' of a list of NFA states - all states that can be
+;; reached from any of them using any number of epsilon transitions.
+(define (nfa-closure nfa states)
+ (let lp ((ls states)
+ (res '()))
+ (cond
+ ((null? ls)
+ res)
+ ((memv (car ls) res)
+ (lp (cdr ls) res))
+ (else
+ (lp (append (map cdr
+ (filter (lambda (trans) (eq? 'epsilon (car trans)))
+ (cdr (assv (car ls) nfa))))
+ (cdr ls))
+ (insert-sorted (car ls) res))))))
+
+;; insert an integer uniquely into a sorted list
+(define (insert-sorted n ls)
+ (cond
+ ((null? ls)
+ (cons n '()))
+ ((<= n (car ls))
+ (if (= n (car ls))
+ ls
+ (cons n ls)))
+ (else
+ (cons (car ls) (insert-sorted n (cdr ls))))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; DFAs don't give us match information, so once we match and
+;; determine the start and end, we need to recursively break the
+;; problem into smaller DFAs to get each submatch.
+;;
+;; See http://compilers.iecc.com/comparch/article/07-10-026
+
+(define (sre-match-extractor sre)
+ (let lp ((sre sre) (n 1) (submatch-deps? #f))
+ (cond
+ ((not (sre-has-submatchs? sre))
+ (if (not submatch-deps?)
+ (lambda (str i j matches) j)
+ (let ((dfa (nfa->dfa (sre->nfa sre))))
+ (lambda (str i j matches)
+ (dfa-match/longest dfa str i j)))))
+ ((pair? sre)
+ (case (car sre)
+ ((: seq)
+ (let* ((right (sre-sequence (cddr sre)))
+ (match-left (lp (cadr sre) n #t))
+ (match-right
+ (lp right (+ n (sre-count-submatches (cadr sre))) #t)))
+ (lambda (str i j matches)
+ (let lp ((k j) (best #f))
+ (if (< k i)
+ best
+ (let* ((middle (match-left str i k matches))
+ (end (and middle
+ (eqv? middle k)
+ (match-right str middle j matches))))
+ (if (eqv? end j)
+ end
+ (lp (- k 1)
+ (if (or (not best) (and end (> end best)))
+ end
+ best)))))))))
+ ((or)
+ (let* ((rest (sre-alternate (cddr sre)))
+ (match-first
+ (lp (cadr sre) n #t))
+ (match-rest
+ (lp rest
+ (+ n (sre-count-submatches (cadr sre)))
+ submatch-deps?)))
+ (lambda (str i j matches)
+ (let ((k (match-first str i j matches)))
+ (if (eqv? k j)
+ k
+ (match-rest str i j matches))))))
+ ((* +)
+ (letrec ((match-once
+ (lp (sre-sequence (cdr sre)) n #t))
+ (match-all
+ (lambda (str i j matches)
+ (let ((k (match-once str i j matches)))
+ (if (and k (< i k))
+ (match-all str k j matches)
+ i)))))
+ (if (eq? '* (car sre))
+ match-all
+ (lambda (str i j matches)
+ (let ((k (match-once str i j matches)))
+ (and k
+ (match-all str k j matches)))))))
+ ((?)
+ (let ((match-once (lp (sre-sequence (cdr sre)) n #t)))
+ (lambda (str i j matches)
+ (let ((k (match-once str i j matches)))
+ (or k i)))))
+ ((submatch)
+ (let ((match-one
+ (lp (sre-sequence (cdr sre)) (+ n 1) #t)))
+ (lambda (str i j matches)
+ (let ((res (match-one str i j matches)))
+ (cond
+ ((number? res)
+ (irregex-match-start-index-set! matches n i)
+ (irregex-match-end-index-set! matches n res)))
+ res))))
+ (else
+ (error "unknown regexp operator" (car sre)))))
+ (else
+ (error "unknown regexp" sre)))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; closure compilation - we use this for non-regular expressions
+;; instead of an interpreted NFA matcher
+
+(define (sre->procedure sre . o)
+ (define names
+ (if (and (pair? o) (pair? (cdr o))) (cadr o) (sre-names sre 1 '())))
+ (let lp ((sre sre)
+ (n 1)
+ (flags (if (pair? o) (car o) ~none))
+ (next (lambda (str i matches fail) i)))
+ (define (rec sre) (lp sre n flags next))
+ (cond
+ ((pair? sre)
+ (if (string? (car sre))
+ (sre-cset->procedure
+ (sre->cset (car sre) (flag-set? flags ~case-insensitive?))
+ next)
+ (case (car sre)
+ ((~ - & /)
+ (sre-cset->procedure
+ (sre->cset sre (flag-set? flags ~case-insensitive?))
+ next))
+ ((or)
+ (case (length (cdr sre))
+ ((0) (lambda (str i matches fail) (fail)))
+ ((1) (rec (cadr sre)))
+ (else
+ (let* ((first (rec (cadr sre)))
+ (rest (lp (sre-alternate (cddr sre))
+ (+ n (sre-count-submatches (cadr sre)))
+ flags
+ next)))
+ (lambda (str i matches fail)
+ (first str i matches (lambda () (rest str i matches fail))))))))
+ ((w/case)
+ (lp (sre-sequence (cdr sre))
+ n
+ (flag-clear flags ~case-insensitive?)
+ next))
+ ((w/nocase)
+ (lp (sre-sequence (cdr sre))
+ n
+ (flag-join flags ~case-insensitive?)
+ next))
+ ((w/utf8)
+ (lp (sre-sequence (cdr sre)) n (flag-join flags ~utf8?) next))
+ ((w/noutf8)
+ (lp (sre-sequence (cdr sre)) n (flag-clear flags ~utf8?) next))
+ ((seq :)
+ (case (length (cdr sre))
+ ((0) next)
+ ((1) (rec (cadr sre)))
+ (else
+ (let ((rest (lp (sre-sequence (cddr sre))
+ (+ n (sre-count-submatches (cadr sre)))
+ flags
+ next)))
+ (lp (cadr sre) n flags rest)))))
+ ((?)
+ (let ((body (rec (sre-sequence (cdr sre)))))
+ (lambda (str i matches fail)
+ (body str i matches (lambda () (next str i matches fail))))))
+ ((??)
+ (let ((body (rec (sre-sequence (cdr sre)))))
+ (lambda (str i matches fail)
+ (next str i matches (lambda () (body str i matches fail))))))
+ ((*)
+ (cond
+ ((sre-empty? (sre-sequence (cdr sre)))
+ (error "invalid sre: empty *" sre))
+ (else
+ (letrec ((body
+ (lp (sre-sequence (cdr sre))
+ n
+ flags
+ (lambda (str i matches fail)
+ (body str
+ i
+ matches
+ (lambda () (next str i matches fail)))))))
+ (lambda (str i matches fail)
+ (body str i matches (lambda () (next str i matches fail))))))))
+ ((*?)
+ (cond
+ ((sre-empty? (sre-sequence (cdr sre)))
+ (error "invalid sre: empty *?" sre))
+ (else
+ (letrec ((body
+ (lp (sre-sequence (cdr sre))
+ n
+ flags
+ (lambda (str i matches fail)
+ (next str
+ i
+ matches
+ (lambda () (body str i matches fail)))))))
+ (lambda (str i matches fail)
+ (next str i matches (lambda () (body str i matches fail))))))))
+ ((+)
+ (lp (sre-sequence (cdr sre))
+ n
+ flags
+ (rec (list '* (sre-sequence (cdr sre))))))
+ ((=)
+ (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre))))
+ ((>=)
+ (rec `(** ,(cadr sre) #f ,@(cddr sre))))
+ ((** **?)
+ (cond
+ ((or (and (number? (cadr sre))
+ (number? (caddr sre))
+ (> (cadr sre) (caddr sre)))
+ (and (not (cadr sre)) (caddr sre)))
+ (lambda (str i matches fail) (fail)))
+ (else
+ (let* ((from (cadr sre))
+ (to (caddr sre))
+ (? (if (eq? '** (car sre)) '? '??))
+ (* (if (eq? '** (car sre)) '* '*?))
+ (sre (sre-sequence (cdddr sre)))
+ (x-sre (sre-strip-submatches sre))
+ (next (if to
+ (if (= from to)
+ next
+ (fold (lambda (x next)
+ (lp `(,? ,sre) n flags next))
+ next
+ (zero-to (- to from))))
+ (rec `(,* ,sre)))))
+ (if (zero? from)
+ next
+ (lp `(seq ,@(map (lambda (x) x-sre) (zero-to (- from 1)))
+ ,sre)
+ n
+ flags
+ next))))))
+ ((word)
+ (rec `(seq bow ,@(cdr sre) eow)))
+ ((word+)
+ (rec `(seq bow (+ (& (or alphanumeric "_")
+ (or ,@(cdr sre)))) eow)))
+ ((posix-string)
+ (rec (string->sre (cadr sre))))
+ ((look-ahead)
+ (let ((check
+ (lp (sre-sequence (cdr sre))
+ n
+ flags
+ (lambda (str i matches fail) i))))
+ (lambda (str i matches fail)
+ (if (check str i matches (lambda () #f))
+ (next str i matches fail)
+ (fail)))))
+ ((neg-look-ahead)
+ (let ((check
+ (lp (sre-sequence (cdr sre))
+ n
+ flags
+ (lambda (str i matches fail) i))))
+ (lambda (str i matches fail)
+ (if (check str i matches (lambda () #f))
+ (fail)
+ (next str i matches fail)))))
+ ((look-behind)
+ (let ((check
+ (lp (sre-sequence (cons '(* any) (cdr sre)))
+ n
+ flags
+ (lambda (str i matches fail) i))))
+ (lambda (str i matches fail)
+ (if (eqv? i (check (substring str 0 i) 0 matches (lambda () #f)))
+ (next str i matches fail)
+ (fail)))))
+ ((neg-look-behind)
+ (let ((check
+ (lp (sre-sequence (cons '(* any) (cdr sre)))
+ n
+ flags
+ (lambda (str i matches fail) i))))
+ (lambda (str i matches fail)
+ (if (eqv? i (check (substring str 0 i) 0 matches (lambda () #f)))
+ (fail)
+ (next str i matches fail)))))
+ ((atomic)
+ (let ((once
+ (lp (sre-sequence (cdr sre))
+ n
+ flags
+ (lambda (str i matches fail) i))))
+ (lambda (str i matches fail)
+ (let ((j (once str i matches (lambda () #f))))
+ (if j
+ (next str j matches fail)
+ (fail))))))
+ ((if)
+ (let* ((test-submatches (sre-count-submatches (cadr sre)))
+ (pass (lp (caddr sre) flags (+ n test-submatches) next))
+ (fail (if (pair? (cdddr sre))
+ (lp (cadddr sre)
+ (+ n test-submatches
+ (sre-count-submatches (caddr sre)))
+ flags
+ next)
+ (lambda (str i matches fail) (fail)))))
+ (cond
+ ((or (number? (cadr sre)) (symbol? (cadr sre)))
+ (let ((index
+ (if (symbol? (cadr sre))
+ (cond
+ ((assq (cadr sre) names) => cdr)
+ (else
+ (error "unknown named backref in SRE IF" sre)))
+ (cadr sre))))
+ (lambda (str i matches fail2)
+ (if (irregex-match-end-index matches index)
+ (pass str i matches fail2)
+ (fail str i matches fail2)))))
+ (else
+ (let ((test (lp (cadr sre) n flags pass)))
+ (lambda (str i matches fail2)
+ (test str i matches (lambda () (fail str i matches fail2)))
+ ))))))
+ ((backref backref-ci)
+ (let ((n (cond ((number? (cadr sre)) (cadr sre))
+ ((assq (cadr sre) names) => cdr)
+ (else (error "unknown backreference" (cadr sre)))))
+ (compare (if (or (eq? (car sre) 'backref-ci)
+ (flag-set? flags ~case-insensitive?))
+ string-ci=?
+ string=?)))
+ (lambda (str i matches fail)
+ (let ((s (irregex-match-substring matches n)))
+ (if (not s)
+ (fail)
+ (let ((j (+ i (string-length s))))
+ (if (and (<= j (string-length str))
+ (compare s (substring str i j)))
+ (next str j matches fail)
+ (fail))))))))
+ ((dsm)
+ (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) flags next))
+ ((submatch)
+ (let ((body
+ (lp (sre-sequence (cdr sre))
+ (+ n 1)
+ flags
+ (lambda (str i matches fail)
+ (let ((old (irregex-match-end-index matches n)))
+ (irregex-match-end-index-set! matches n i)
+ (next str i matches
+ (lambda ()
+ (irregex-match-end-index-set! matches n old)
+ (fail))))))))
+ (lambda (str i matches fail)
+ (let ((old (irregex-match-start-index matches n)))
+ (irregex-match-start-index-set! matches n i)
+ (body str i matches
+ (lambda ()
+ (irregex-match-start-index-set! matches n old)
+ (fail)))))))
+ ((submatch-named)
+ (rec `(submatch ,@(cddr sre))))
+ (else
+ (error "unknown regexp operator" sre)))))
+ ((symbol? sre)
+ (case sre
+ ((any)
+ (lambda (str i matches fail)
+ (if (< i (string-length str))
+ (next str (+ i 1) matches fail)
+ (fail))))
+ ((nonl)
+ (lambda (str i matches fail)
+ (if (and (< i (string-length str))
+ (not (eqv? #\newline (string-ref str i))))
+ (next str (+ i 1) matches fail)
+ (fail))))
+ ((bos)
+ (lambda (str i matches fail)
+ (if (zero? i) (next str i matches fail) (fail))))
+ ((bol)
+ (lambda (str i matches fail)
+ (if (or (zero? i) (eqv? #\newline (string-ref str (- i 1))))
+ (next str i matches fail)
+ (fail))))
+ ((bow)
+ (lambda (str i matches fail)
+ (if (and (or (zero? i)
+ (not (char-alphanumeric? (string-ref str (- i 1)))))
+ (< i (string-length str))
+ (char-alphanumeric? (string-ref str i)))
+ (next str i matches fail)
+ (fail))))
+ ((eos)
+ (lambda (str i matches fail)
+ (if (>= i (string-length str)) (next str i matches fail) (fail))))
+ ((eol)
+ (lambda (str i matches fail)
+ (if (or (>= i (string-length str))
+ (eqv? #\newline (string-ref str i)))
+ (next str i matches fail)
+ (fail))))
+ ((eow)
+ (lambda (str i matches fail)
+ (if (and (or (>= i (string-length str))
+ (not (char-alphanumeric? (string-ref str i))))
+ (> i 0)
+ (char-alphanumeric? (string-ref str (- i 1))))
+ (next str i matches fail)
+ (fail))))
+ ((nwb) ;; non-word-boundary
+ (lambda (str i matches fail)
+ (if (and (not (zero? i))
+ (< i (string-length str))
+ (if (char-alphanumeric? (string-ref str (- i 1)))
+ (char-alphanumeric? (string-ref str i))
+ (not (char-alphanumeric? (string-ref str i)))))
+ (next str i matches fail)
+ (fail))))
+ ((epsilon)
+ next)
+ (else
+ (let ((cell (assq sre sre-named-definitions)))
+ (if cell
+ (rec (cdr cell))
+ (error "unknown regexp" sre))))))
+ ((char? sre)
+ (if (flag-set? flags ~case-insensitive?)
+ (lambda (str i matches fail)
+ (if (and (< i (string-length str))
+ (char-ci=? sre (string-ref str i)))
+ (next str (+ i 1) matches fail)
+ (fail)))
+ (lambda (str i matches fail)
+ (if (and (< i (string-length str))
+ (eqv? sre (string-ref str i)))
+ (next str (+ i 1) matches fail)
+ (fail)))))
+ ((string? sre)
+ (rec (sre-sequence (string->list sre))))
+ (else
+ (error "unknown regexp" sre)))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Simple character sets as lists of ranges, as used in the NFA/DFA
+;; compilation. This is not especially efficient, but is portable and
+;; scalable for any range of character sets.
+
+(define (sre-cset->procedure cset next)
+ (lambda (str i matches fail)
+ (if (and (< i (string-length str))
+ (cset-contains? cset (string-ref str i)))
+ (next str (+ i 1) matches fail)
+ (fail))))
+
+(define (plist->alist ls)
+ (let lp ((ls ls) (res '()))
+ (if (null? ls)
+ (reverse res)
+ (lp (cddr ls) (cons (cons (car ls) (cadr ls)) res)))))
+
+(define (alist->plist ls)
+ (let lp ((ls ls) (res '()))
+ (if (null? ls)
+ (reverse res)
+ (lp (cdr ls) (cons (cdar ls) (cons (caar ls) res))))))
+
+(define (sre->cset sre . o)
+ (let lp ((sre sre) (ci? (and (pair? o) (car o))))
+ (define (rec sre) (lp sre ci?))
+ (cond
+ ((pair? sre)
+ (if (string? (car sre))
+ (if ci?
+ (cset-case-insensitive (string->list (car sre)))
+ (string->list (car sre)))
+ (case (car sre)
+ ((~)
+ (cset-complement
+ (fold cset-union (rec (cadr sre)) (map rec (cddr sre)))))
+ ((&)
+ (fold cset-intersection (rec (cadr sre)) (map rec (cddr sre))))
+ ((-)
+ (fold (lambda (x res) (cset-difference res x))
+ (rec (cadr sre))
+ (map rec (cddr sre))))
+ ((/)
+ (let ((res (plist->alist (sre-flatten-ranges (cdr sre)))))
+ (if ci?
+ (cset-case-insensitive res)
+ res)))
+ ((or)
+ (fold cset-union (rec (cadr sre)) (map rec (cddr sre))))
+ ((w/case)
+ (lp (sre-alternate (cdr sre)) #f))
+ ((w/nocase)
+ (lp (sre-alternate (cdr sre)) #t))
+ (else
+ (error "not a valid sre char-set operator" sre)))))
+ ((char? sre) (rec (list (string sre))))
+ ((string? sre) (rec (list sre)))
+ (else
+ (let ((cell (assq sre sre-named-definitions)))
+ (if cell
+ (rec (cdr cell))
+ (error "not a valid sre char-set" sre)))))))
+
+;;;; another debugging utility
+;; (define (cset->sre cset)
+;; (let lp ((ls cset) (chars '()) (ranges '()))
+;; (cond
+;; ((null? ls)
+;; (sre-alternate
+;; (append
+;; (if (pair? chars) (list (list (list->string chars))) '())
+;; (if (pair? ranges) (list (cons '/ (alist->plist ranges))) '()))))
+;; ((char? (car ls)) (lp (cdr ls) (cons (car ls) chars) ranges))
+;; (else (lp (cdr ls) chars (cons (car ls) ranges))))))
+
+(define (cset-contains? cset ch)
+ (find (lambda (x)
+ (or (eqv? x ch)
+ (and (pair? x) (char<=? (car x) ch) (char<=? ch (cdr x)))))
+ cset))
+
+(define (cset-range x)
+ (if (char? x) (cons x x) x))
+
+(define (char-ranges-overlap? a b)
+ (if (pair? a)
+ (if (pair? b)
+ (or (and (char<=? (car a) (cdr b)) (char<=? (car b) (cdr a)))
+ (and (char<=? (cdr b) (car a)) (char<=? (cdr a) (car b))))
+ (and (char<=? (car a) b) (char<=? b (cdr a))))
+ (if (pair? b)
+ (char-ranges-overlap? b a)
+ (eqv? a b))))
+
+(define (char-ranges-union a b)
+ (cons (if (char<=? (car a) (car b)) (car a) (car b))
+ (if (char>=? (cdr a) (cdr b)) (cdr a) (cdr b))))
+
+(define (cset-union a b)
+ (cond ((null? b) a)
+ ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a)
+ => (lambda (ls)
+ (cset-union
+ (cset-union (append (take-up-to a ls) (cdr ls))
+ (list (char-ranges-union (cset-range (car ls))
+ (cset-range (car b)))))
+ (cdr b))))
+ (else (cset-union (cons (car b) a) (cdr b)))))
+
+(define (cset-difference a b)
+ (cond ((null? b) a)
+ ((not (car b)) (cset-difference a (cdr b)))
+ ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a)
+ => (lambda (ls)
+ (apply
+ (lambda (left1 left2 same right1 right2)
+ (let* ((a (append (take-up-to a ls) (cdr ls)))
+ (a (if left1 (cons left1 a) a))
+ (a (if left2 (cons left2 a) a))
+ (b (if right1 (cset-union b (list right1)) b))
+ (b (if right2 (cset-union b (list right2)) b)))
+ (cset-difference a b)))
+ (intersect-char-ranges (cset-range (car ls))
+ (cset-range (car b))))))
+ (else (cset-difference a (cdr b)))))
+
+(define (cset-intersection a b)
+ (let intersect ((a a) (b b) (res '()))
+ (cond ((null? b) res)
+ ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a)
+ => (lambda (ls)
+ (apply
+ (lambda (left1 left2 same right1 right2)
+ (let* ((a (append (take-up-to a ls) (cdr ls)))
+ (a (if left1 (cons left1 a) a))
+ (a (if left2 (cons left2 a) a))
+ (b (if right1 (cset-union b (list right1)) b))
+ (b (if right2 (cset-union b (list right2)) b)))
+ (intersect a b (cset-union res (list same)))))
+ (intersect-char-ranges (cset-range (car ls))
+ (cset-range (car b))))))
+ (else (intersect a (cdr b) res)))))
+
+(define (cset-complement a)
+ (cset-difference (sre->cset *all-chars*) a))
+
+(define (cset-case-insensitive a)
+ (let lp ((ls a) (res '()))
+ (cond ((null? ls) (reverse res))
+ ((and (char? (car ls)) (char-alphabetic? (car ls)))
+ (let ((c2 (char-altcase (car ls)))
+ (res (cons (car ls) res)))
+ (lp (cdr ls) (if (cset-contains? res c2) res (cons c2 res)))))
+ ((and (pair? (car ls))
+ (char-alphabetic? (caar ls))
+ (char-alphabetic? (cdar ls)))
+ (lp (cdr ls)
+ (cset-union (cset-union res (list (car ls)))
+ (list (cons (char-altcase (caar ls))
+ (char-altcase (cdar ls)))))))
+ (else (lp (cdr ls) (cset-union res (list (car ls))))))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; match and replace utilities
+
+(define (irregex-fold irx kons knil str . o)
+ (let* ((irx (irregex irx))
+ (matches (irregex-new-matches irx))
+ (finish (if (pair? o) (car o) (lambda (i acc) acc)))
+ (start (if (and (pair? o) (pair? (cdr o))) (cadr o) 0))
+ (end (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o)))
+ (caddr o)
+ (string-length str))))
+ (irregex-match-string-set! matches str)
+ (let lp ((i start) (acc knil))
+ (if (>= i end)
+ (finish i acc)
+ (let ((m (irregex-search/matches irx str i end matches)))
+ (if (not m)
+ (finish i acc)
+ (let* ((end (irregex-match-end m 0))
+ (acc (kons i m acc)))
+ (irregex-reset-matches! matches)
+ (lp end acc))))))))
+
+(define (irregex-replace irx str . o)
+ (let ((m (irregex-search (irregex irx) str)))
+ (and
+ m
+ (string-cat-reverse
+ (cons (substring str (irregex-match-end m 0) (string-length str))
+ (append (irregex-apply-match m o)
+ (list (substring str 0 (irregex-match-start m 0)))))))))
+
+(define (irregex-replace/all irx str . o)
+ (irregex-fold
+ irx
+ (lambda (i m acc)
+ (let ((m-start (irregex-match-start m 0)))
+ (append (irregex-apply-match m o)
+ (if (= i m-start)
+ acc
+ (cons (substring str i m-start) acc)))))
+ '()
+ str
+ (lambda (i acc)
+ (let ((end (string-length str)))
+ (string-cat-reverse (if (= i end)
+ acc
+ (cons (substring str i end) acc)))))))
+
+(define (irregex-apply-match m ls)
+ (let lp ((ls ls) (res '()))
+ (if (null? ls)
+ res
+ (cond
+ ((integer? (car ls))
+ (lp (cdr ls)
+ (cons (or (irregex-match-substring m (car ls)) "") res)))
+ ((procedure? (car ls))
+ (lp (cdr ls) (cons ((car ls) m) res)))
+ ((symbol? (car ls))
+ (case (car ls)
+ ((pre)
+ (lp (cdr ls)
+ (cons (substring (irregex-match-string m)
+ 0
+ (irregex-match-start m 0))
+ res)))
+ ((post)
+ (lp (cdr ls)
+ (cons (substring (irregex-match-string m)
+ (irregex-match-end m 0)
+ (string-length (irregex-match-string m)))
+ res)))
+ (else (error "unknown match replacement" (car ls)))))
+ (else
+ (lp (cdr ls) (cons (car ls) res)))))))
diff --git a/library.scm b/library.scm
index 2eba6055..e3ab347d 100644
--- a/library.scm
+++ b/library.scm
@@ -76,8 +76,7 @@
#define C_a_get_current_seconds(ptr, c, dummy) C_flonum(ptr, time(NULL))
#define C_peek_c_string_at(ptr, i) ((C_char *)(((C_char **)ptr)[ i ]))
-static C_word
-fast_read_line_from_file(C_word str, C_word port, C_word size) {
+static C_word fast_read_line_from_file(C_word str, C_word port, C_word size) {
int n = C_unfix(size);
int i;
int c;
@@ -102,7 +101,7 @@ fast_read_line_from_file(C_word str, C_word port, C_word size) {
}
static C_word
-fast_read_string_from_file(C_word dest, C_word port, C_word len, C_word pos)
+fast_read_string_from_file (C_word dest, C_word port, C_word len, C_word pos)
{
int n = C_unfix (len);
char * buf = ((char *)C_data_pointer (dest) + C_unfix (pos));
@@ -1732,8 +1731,7 @@ EOF
(define (##sys#check-port x . loc)
(unless (%port? x)
- (##sys#signal-hook
- #:type-error (and (pair? loc) (car loc)) "argument is not a port" x) ) )
+ (##sys#signal-hook #:type-error (and (pair? loc) (car loc)) "argument is not a port" x) ) )
(define (##sys#check-port-mode port mode . loc)
(unless (eq? mode (##sys#slot port 1))
diff --git a/manual/Supported language b/manual/Supported language
index d7be9c82..3fe014cd 100644
--- a/manual/Supported language
+++ b/manual/Supported language
@@ -19,7 +19,7 @@
* [[Unit ports]] I/O ports
* [[Unit files]] File and pathname operations
* [[Unit extras]] Useful utility definitions
-* [[Unit irregex]] Regular expressions
+* [[Unit regex]] Regular expressions
* [[Unit srfi-1]] List Library
* [[Unit srfi-4]] Homogeneous numeric vectors
* [[Unit srfi-13]] String library
diff --git a/manual/Unit extras b/manual/Unit extras
index 3d0163a8..f7f28cca 100644
--- a/manual/Unit extras
+++ b/manual/Unit extras
@@ -196,4 +196,4 @@ false. Returns a string with the accumulated characters.
---
Previous: [[Unit files]]
-Next: [[Unit irregex]]
+Next: [[Unit regex]]
diff --git a/manual/Unit irregex b/manual/Unit irregex
deleted file mode 100644
index 51073caf..00000000
--- a/manual/Unit irregex
+++ /dev/null
@@ -1,819 +0,0 @@
-[[tags: manual]]
-[[toc:]]
-
-== Unit irregex
-
-This library unit provides support for regular expressions, using the
-powerful ''irregex'' regular expression engine by Alex Shinn. It
-supports both POSIX syntax with various (irregular) PCRE extensions,
-as well as SCSH's SRE syntax, with various aliases for commonly used
-patterns. DFA matching is used when possible, otherwise a
-closure-compiled NFA approach is used. Matching may be performed over
-standard Scheme strings, or over arbitrarily chunked streams of
-strings.
-
-On systems that support dynamic loading, the {{irregex}} unit can
-be made available in the Chicken interpreter ({{csi}}) by entering
-
-<enscript highlight=scheme>
-(require-extension irregex)
-</enscript>
-
-[[toc:]]
-
-=== Specification
-
-==== Procedures
-
-===== irregex
-===== string->irregex
-===== sre->irregex
-
-<procedure>(irregex <posix-string-or-sre> [<options> ...])</procedure><br>
-<procedure>(string->irregex <posix-string> [<options> ...])</procedure><br>
-<procedure>(sre->irregex <sre> [<options> ...])</procedure><br>
-
-Compiles a regular expression from either a POSIX-style regular
-expression string (with most PCRE extensions) or an SCSH-style SRE.
-There is no {{(rx ...)}} syntax - just use normal Scheme lists, with
-{{quasiquote}} if you like.
-
-Technically a string by itself could be considered a valid (though
-rather silly) SRE, so if you want to just match a literal string you
-should use something like {{(irregex `(: ,str))}}, or use the explicit
-{{(sre->irregex str)}}.
-
-The options are a list of any of the following symbols:
-
-; {{'i}}, {{'case-insensitive}} : match case-insensitively
-; {{'m}}, {{'multi-line}} : treat string as multiple lines (effects {{^}} and {{$}})
-; {{'s}}, {{'single-line}} : treat string as a single line ({{.}} can match newline)
-; {{'utf8}} : utf8-mode (assumes strings are byte-strings)
-; {{'fast}} : try to optimize the regular expression
-; {{'small}} : try to compile a smaller regular expression
-; {{'backtrack}} : enforce a backtracking implementation
-
-The {{'fast}} and {{'small}} options are heuristic guidelines and will
-not necessarily make the compiled expression faster or smaller.
-
-===== string->sre
-===== maybe-string->sre
-
-<procedure>(string->sre <str>)</procedure><br>
-<procedure>(maybe-string->sre <obj>)</procedure><br>
-
-For backwards compatibility, procedures to convert a POSIX string into
-an SRE.
-
-{{maybe-string->sre}} does the same thing, but only if the argument is
-a string, otherwise it assumes {{<obj>}} is an SRE and returns it
-as-is. This is useful when you want to provide an API that allows
-either a POSIX string or SRE (like {{irregex}} or {{irregex-search}}
-below) - it ensures the result is an SRE.
-
-===== irregex?
-
-<procedure>(irregex? <obj>)</procedure><br>
-
-Returns {{#t}} iff the object is a regular expression.
-
-===== irregex-search
-
-<procedure>(irregex-search <irx> <str> [<start> <end>])</procedure>
-
-Searches for any instances of the pattern {{<irx>}} (a POSIX string, SRE
-sexp, or pre-compiled regular expression) in {{<str>}}, optionally between
-the given range. If a match is found, returns a match object,
-otherwise returns {{#f}}.
-
-Match objects can be used to query the original range of the string or
-its submatches using the {{irregex-match-*}} procedures below.
-
-Examples:
-
-<enscript highlight=scheme>
-(irregex-search "foobar" "abcFOOBARdef") => #f
-
-(irregex-search "foobar" "abcFOOBARdef" 'i) => #<match>
-
-(irregex-search '(w/nocase "foobar") "abcFOOBARdef") => #<match>
-</enscript>
-
-Note, the actual match result is represented by a vector in the
-default implementation. Throughout this manual, we'll just write
-{{#<match>}} to show that a successful match was returned when the
-details are not important.
-
-Matching follows the POSIX leftmost, longest semantics, when
-searching. That is, of all possible matches in the string,
-{{irregex-search}} will return the match at the first position
-(leftmost). If multiple matches are possible from that same first
-position, the longest match is returned.
-
-===== irregex-match
-
-<procedure>(irregex-match <irx> <str>)</procedure>
-
-Like {{irregex-search}}, but performs an anchored match against the
-beginning and end of the string, without searching.
-
-Examples:
-
-<enscript highlight=scheme>
-(irregex-match '(w/nocase "foobar") "abcFOOBARdef") => #f
-
-(irregex-match '(w/nocase "foobar") "FOOBAR") => #<match>
-</enscript>
-
-===== irregex-match-data?
-
-<procedure>(irregex-match-data? <obj>)</procedure>
-
-Returns {{#t}} iff the object is a successful match result from
-{{irregex-search}} or {{irregex-match}}.
-
-===== irregex-num-submatches
-===== irregex-match-num-submatches
-
-<procedure>(irregex-num-submatches <irx>)</procedure><br>
-<procedure>(irregex-match-num-submatches <match>)</procedure>
-
-Returns the number of numbered submatches that are defined in the
-irregex or match object.
-
-===== irregex-names
-===== irregex-match-names
-
-<procedure>(irregex-names <irx>)</procedure><br>
-<procedure>(irregex-match-names <match>)</procedure>
-
-Returns an association list of named submatches that are defined in
-the irregex or match object. The {{car}} of each item in this list is
-the name of a submatch, the {{cdr}} of each item is the numerical
-submatch corresponding to this name. If a named submatch occurs
-multiple times in the irregex, it will also occur multiple times in
-this list.
-
-===== irregex-match-substring
-===== irregex-match-start-index
-===== irregex-match-end-index
-
-<procedure>(irregex-match-substring <match> [<index-or-name>])</procedure><br>
-<procedure>(irregex-match-start-index <match> <index-or-name>)</procedure><br>
-<procedure>(irregex-match-end-index <match> <index-or-name>)</procedure>
-
-Fetches the matched substring (or its start or end offset) at the
-given submatch index, or named submatch. The entire match is index 0,
-the first 1, etc. The default is index 0.
-
-===== irregex-match-subchunk
-
-<procedure>(irregex-match-subchunk <match> [<index-or-name>])</procedure>
-
-Generates a chunked data-type for the given match item, of the same
-type as the underlying chunk type (see Chunked String Matching below).
-This is only available if the chunk type specifies the get-subchunk
-API, otherwise an error is raised.
-
-===== irregex-replace
-===== irregex-replace/all
-
-<procedure>(irregex-replace <irx> <str> [<replacements> ...])</procedure><br>
-<procedure>(irregex-replace/all <irx> <str> [<replacements> ...])</procedure>
-
-Matches a pattern in a string, and replaces it with a (possibly empty)
-list of substitutions. Each {{<replacement>}} can be either a string
-literal, a numeric index, a symbol (as a named submatch), or a
-procedure which takes one argument (the match object) and returns a
-string.
-
-Examples:
-
-<enscript highlight=scheme>
-(irregex-replace "[aeiou]" "hello world" "*") => "h*llo world"
-
-(irregex-replace/all "[aeiou]" "hello world" "*") => "h*ll* w*rld"
-</enscript>
-
-===== irregex-split
-===== irregex-extract
-
-<procedure>(irregex-split <irx> <str> [<start> <end>])</procedure><br>
-<procedure>(irregex-extract <irx> <str> [<start> <end>])</procedure>
-
-{{irregex-split}} splits the string {{<str>}} into substrings divided
-by the pattern in {{<irx>}}. {{irregex-extract}} does the opposite,
-returning a list of each instance of the pattern matched disregarding
-the substrings in between.
-
-===== irregex-fold
-
-<procedure>(irregex-fold <irx> <kons> <knil> <str> [<finish> <start> <end>])</procedure>
-
-This performs a fold operation over every non-overlapping place
-{{<irx>}} occurs in the string {{str}}.
-
-The {{<kons>}} procedure takes the following signature:
-
-<enscript highlight=scheme>
-(<kons> <from-index> <match> <seed>)
-</enscript>
-
-where {{<from-index>}} is the index from where we started searching
-(initially {{<start>}} and thereafter the end index of the last
-match), {{<match>}} is the resulting match-data object, and {{<seed>}}
-is the accumulated fold result starting with {{<knil>}}.
-
-The rationale for providing the {{<from-index>}} (which is not
-provided in the SCSH {{regexp-fold}} utility), is because this
-information is useful (e.g. for extracting the unmatched portion of
-the string before the current match, as needed in
-{{irregex-replace}}), and not otherwise directly accessible.
-
-The optional {{<finish>}} takes two arguments:
-
-<enscript highlight=scheme>
-(<finish> <from-index> <seed>)
-</enscript>
-
-which simiarly allows you to pick up the unmatched tail of the string,
-and defaults to just returning the {{<seed>}}.
-
-{{<start>}} and {{<end>}} are numeric indices letting you specify the
-boundaries of the string on which you want to fold.
-
-To extract all instances of a match out of a string, you can use
-
-<enscript highlight=scheme>
-(map irregex-match-substring
- (irregex-fold <irx>
- (lambda (i m s) (cons m s))
- '()
- <str>
- (lambda (i s) (reverse s))))
-</enscript>
-
-==== Extended SRE Syntax
-
-Irregex provides the first native implementation of SREs (Scheme
-Regular Expressions), and includes many extensions necessary both for
-minimal POSIX compatibility, as well as for modern extensions found in
-libraries such as PCRE.
-
-The following table summarizes the SRE syntax, with detailed
-explanations following.
-
- ;; basic patterns
- <string> ; literal string
- (seq <sre> ...) ; sequence
- (: <sre> ...)
- (or <sre> ...) ; alternation
-
- ;; optional/multiple patterns
- (? <sre> ...) ; 0 or 1 matches
- (* <sre> ...) ; 0 or more matches
- (+ <sre> ...) ; 1 or more matches
- (= <n> <sre> ...) ; exactly <n> matches
- (>= <n> <sre> ...) ; <n> or more matches
- (** <from> <to> <sre> ...) ; <n> to <m> matches
- (?? <sre> ...) ; non-greedy (non-greedy) pattern: (0 or 1)
- (*? <sre> ...) ; non-greedy kleene star
- (**? <from> <to> <sre> ...) ; non-greedy range
-
- ;; submatch patterns
- (submatch <sre> ...) ; numbered submatch
- ($ <sre> ...)
- (submatch-named <name> <sre> ...) ; named submatch
- (=> <name> <sre> ...)
- (backref <n-or-name>) ; match a previous submatch
-
- ;; toggling case-sensitivity
- (w/case <sre> ...) ; enclosed <sre>s are case-sensitive
- (w/nocase <sre> ...) ; enclosed <sre>s are case-insensitive
-
- ;; character sets
- <char> ; singleton char set
- (<string>) ; set of chars
- (or <cset-sre> ...) ; set union
- (~ <cset-sre> ...) ; set complement (i.e. [^...])
- (- <cset-sre> ...) ; set difference
- (& <cset-sre> ...) ; set intersection
- (/ <range-spec> ...) ; pairs of chars as ranges
-
- ;; named character sets
- any
- nonl
- ascii
- lower-case lower
- upper-case upper
- alphabetic alpha
- numeric num
- alphanumeric alphanum alnum
- punctuation punct
- graphic graph
- whitespace white space
- printing print
- control cntrl
- hex-digit xdigit
-
- ;; assertions and conditionals
- bos eos ; beginning/end of string
- bol eol ; beginning/end of line
- bow eow ; beginning/end of word
- nwb ; non-word-boundary
- (look-ahead <sre> ...) ; zero-width look-ahead assertion
- (look-behind <sre> ...) ; zero-width look-behind assertion
- (neg-look-ahead <sre> ...) ; zero-width negative look-ahead assertion
- (neg-look-behind <sre> ...) ; zero-width negative look-behind assertion
- (atomic <sre> ...) ; for (?>...) independent patterns
- (if <test> <pass> [<fail>]) ; conditional patterns
- commit ; don't backtrack beyond this (i.e. cut)
-
- ;; backwards compatibility
- (posix-string <string>) ; embed a POSIX string literal
-
-===== Basic SRE Patterns
-
-The simplest SRE is a literal string, which matches that string
-exactly.
-
-<enscript highlight=scheme>
-(irregex-search "needle" "hayneedlehay") => #<match>
-</enscipt>
-
-By default the match is case-sensitive, though you can control this
-either with the compiler flags or local overrides:
-
-<enscript highlight=scheme>
-(irregex-search "needle" "haynEEdlehay") => #f
-
-(irregex-search (irregex "needle" 'i) "haynEEdlehay") => #<match>
-
-(irregex-search '(w/nocase "needle") "haynEEdlehay") => #<match>
-</enscript>
-
-You can use {{w/case}} to switch back to case-sensitivity inside a
-{{w/nocase}} or when the SRE was compiled with {{'i}}:
-
-<enscript highlight=scheme>
-(irregex-search '(w/nocase "SMALL" (w/case "BIG")) "smallBIGsmall") => #<match>
-
-(irregex-search '(w/nocase "small" (w/case "big")) "smallBIGsmall") => #f
-</enscript>
-
-Of course, literal strings by themselves aren't very interesting
-regular expressions, so we want to be able to compose them. The most
-basic way to do this is with the {{seq}} operator (or its abbreviation
-{{:}}), which matches one or more patterns consecutively:
-
-<enscript highlight=scheme>
-(irregex-search '(: "one" space "two" space "three") "one two three") => #<match>
-</enscript>
-
-As you may have noticed above, the {{w/case}} and {{w/nocase}}
-operators allowed multiple SREs in a sequence - other operators that
-take any number of arguments (e.g. the repetition operators below)
-allow such implicit sequences.
-
-To match any one of a set of patterns use the {{or}} alternation
-operator:
-
-<enscript highlight=scheme>
-(irregex-search '(or "eeney" "meeney" "miney") "meeney") => #<match>
-
-(irregex-search '(or "eeney" "meeney" "miney") "moe") => #f
-</enscript>
-
-===== SRE Repetition Patterns
-
-There are also several ways to control the number of times a pattern
-is matched. The simplest of these is {{?}} which just optionally
-matches the pattern:
-
-<enscript highlight=scheme>
-(irregex-search '(: "match" (? "es") "!") "matches!") => #<match>
-
-(irregex-search '(: "match" (? "es") "!") "match!") => #<match>
-
-(irregex-search '(: "match" (? "es") "!") "matche!") => #<match>
-</enscript>
-
-To optionally match any number of times, use {{*}}, the Kleene star:
-
-<enscript highlight=scheme>
-(irregex-search '(: "<" (* (~ #\>)) ">") "<html>") => #<match>
-
-(irregex-search '(: "<" (* (~ #\>)) ">") "<>") => #<match>
-
-(irregex-search '(: "<" (* (~ #\>)) ">") "<html") => #f
-</enscript>
-
-Often you want to match any number of times, but at least one time is
-required, and for that you use {{+}}:
-
-<enscript highlight=scheme>
-(irregex-search '(: "<" (+ (~ #\>)) ">") "<html>") => #<match>
-
-(irregex-search '(: "<" (+ (~ #\>)) ">") "<a>") => #<match>
-
-(irregex-search '(: "<" (+ (~ #\>)) ">") "<>") => #f
-</enscript>
-
-More generally, to match at least a given number of times, use {{>=}}:
-
-<enscript highlight=scheme>
-(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<table>") => #<match>
-
-(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<pre>") => #<match>
-
-(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<tr>") => #f
-</enscript>
-
-To match a specific number of times exactly, use {{=}}:
-
-<enscript highlight=scheme>
-(irregex-search '(: "<" (= 4 (~ #\>)) ">") "<html>") => #<match>
-
-(irregex-search '(: "<" (= 4 (~ #\>)) ">") "<table>") => #f
-</enscript>
-
-And finally, the most general form is {{**}} which specifies a range
-of times to match. All of the earlier forms are special cases of this.
-
-<enscript highlight=scheme>
-(irregex-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.168.1.10") => #<match>
-
-(irregex-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.0168.1.10") => #f
-</enscript>
-
-There are also so-called "non-greedy" variants of these repetition
-operators, by convention suffixed with an additional {{?}}. Since the
-normal repetition patterns can match any of the allotted repetition
-range, these operators will match a string if and only if the normal
-versions matched. However, when the endpoints of which submatch
-matched where are taken into account (specifically, all matches when
-using irregex-search since the endpoints of the match itself matter),
-the use of a non-greedy repetition can change the result.
-
-So, whereas {{?}} can be thought to mean "match or don't match,"
-{{??}} means "don't match or match." {{*}} typically consumes as much
-as possible, but {{*?}} tries first to match zero times, and only
-consumes one at a time if that fails. If you have a greedy operator
-followed by a non-greedy operator in the same pattern, they can
-produce surprisins results as they compete to make the match longer or
-shorter. If this seems confusing, that's because it is. Non-greedy
-repetitions are defined only in terms of the specific backtracking
-algorithm used to implement them, which for compatibility purposes
-always means the Perl algorithm. Thus, when using these patterns you
-force IrRegex to use a backtracking engine, and can't rely on
-efficient execution.
-
-===== SRE Character Sets
-
-Perhaps more common than matching specific strings is matching any of
-a set of characters. You can use the {{or}} alternation pattern on a
-list of single-character strings to simulate a character set, but this
-is too clumsy for everyday use so SRE syntax allows a number of
-shortcuts.
-
-A single character matches that character literally, a trivial
-character class. More conveniently, a list holding a single element
-which is a string refers to the character set composed of every
-character in the string.
-
-<enscript highlight=scheme>
-(irregex-match '(* #\-) "---") => #<match>
-
-(irregex-match '(* #\-) "-_-") => #f
-
-(irregex-match '(* ("aeiou")) "oui") => #<match>
-
-(irregex-match '(* ("aeiou")) "ouais") => #f
-</enscript>
-
-Ranges are introduced with the \q{/} operator. Any strings or
-characters in the \q{/} are flattened and then taken in pairs to
-represent the start and end points, inclusive, of character ranges.
-
-<enscript highlight=scheme>
-(irregex-match '(* (/ "AZ09")) "R2D2") => #<match>
-
-(irregex-match '(* (/ "AZ09")) "C-3PO") => #f
-</enscript>
-
-In addition, a number of set algebra operations are provided. \q{or},
-of course, has the same meaning, but when all the options are
-character sets it can be thought of as the set union operator. This
-is further extended by the \q{&} set intersection, \q{-} set
-difference, and \q{~} set complement operators.
-
-<enscript highlight=scheme>
-(irregex-match '(* (& (/ "az") (~ ("aeiou")))) "xyzzy") => #<match>
-
-(irregex-match '(* (& (/ "az") (~ ("aeiou")))) "vowels") => #f
-
-(irregex-match '(* (- (/ "az") ("aeiou"))) "xyzzy") => #<match>
-
-(irregex-match '(* (- (/ "az") ("aeiou"))) "vowels") => #f
-</enscript>
-
-===== SRE Assertion Patterns
-
-There are a number of times it can be useful to assert something about
-the area around a pattern without explicitly making it part of the
-pattern. The most common cases are specifically anchoring some
-pattern to the beginning or end of a word or line or even the whole
-string. For example, to match on the end of a word:
-
-<enscript highlight=scheme>
-(irregex-match '(: "foo" eow) "foo") => #<match>
-
-(irregex-match '(: "foo" eow) "foo!") => #<match>
-
-(irregex-match '(: "foo" eow) "foof") => #f
-</enscript>
-
-The {{bow}}, {{bol}}, {{eol}}, {{bos}} and {{eos}} work similarly.
-{{nwb}} asserts that you are not in a word-boundary - if replaced for
-{{eow}} in the above examples it would reverse all the results.
-
-There is no {{wb}}, since you tend to know from context whether it
-would be the beginning or end of a word, but if you need it you can
-always use {{(or bow eow)}}.
-
-Somewhat more generally, Perl introduced positive and negative
-look-ahead and look-behind patterns. Perl look-behind patterns are
-limited to a fixed length, however the IrRegex versions have no such
-limit.
-
-<enscript highlight=scheme>
-(irregex-match '(: "regular" (look-ahead " expression"))
- "regular expression")
- => #<match>
-</enscript>
-
-The most general case, of course, would be an \q{and} pattern to
-complement the \q{or} pattern - all the patterns must match or the
-whole pattern fails. This may be provided in a future release,
-although it (and look-ahead and look-behind assertions) are unlikely
-to be compiled efficiently.
-
-===== SRE Utility Patterns
-
-The following utility regular expressions are also provided for common
-patterns that people are eternally reinventing. They are not
-necessarily the official patterns matching the RFC definitions of the
-given data, because of the way that such patterns tend to be used.
-There are three general usages for regexps:
-
-; searching : search for a pattern matching a desired object in a larger text
-
-; validation : determine whether an entire string matches a pattern
-
-; extraction : given a string already known to be valid, extract certain fields from it as submatches
-
-In some cases, but not always, these will overlap. When they are
-different, {{irregex-search}} will naturally always want the searching
-version, so IrRegex provides that version.
-
-As an example where these might be different, consider a URL. If you
-want to match all the URLs in some arbitrary text, you probably want
-to exclude a period or comma at the tail end of a URL, since it's more
-likely being used as punctuation rather than part of the URL, despite
-the fact that it would be valid URL syntax.
-
-Another problem with the RFC definitions is the standard itself may
-have become irrelevant. For example, the pattern IrRegex provides for
-email addresses doesn't match quoted local parts (e.g.
-{{"first last"@domain.com}}) because these are increasingly rare, and
-unsupported by enough software that it's better to discourage their use.
-Conversely, technically consecutive periods
-(e.g. {{first..last@domain.com}}) are not allowed in email addresses, but
-most email software does allow this, and in fact such addresses are
-quite common in Japan.
-
-The current patterns provided are:
-
- newline ; general newline pattern (crlf, cr, lf)
- integer ; an integer
- real ; a real number (including scientific)
- string ; a "quoted" string
- symbol ; an R5RS Scheme symbol
- ipv4-address ; a numeric decimal ipv4 address
- ipv6-address ; a numeric hexadecimal ipv6 address
- domain ; a domain name
- email ; an email address
- http-url ; a URL beginning with https?://
-
-Because of these issues the exact definitions of these patterns are
-subject to be changed, but will be documented clearly when they are
-finalized. More common patterns are also planned, but as what you
-want increases in complexity it's probably better to use a real
-parser.
-
-==== Supported PCRE Syntax
-
-Since the PCRE syntax is so overwhelming complex, it's easier to just
-list what we *don't* support for now. Refer to the
-[[http://pcre.org/pcre.txt|PCRE documentation]] for details. You
-should be using the SRE syntax anyway!
-
-Unicode character classes ({{\P}}) are not supported, but will be
-in an upcoming release. {{\C}} named characters are not supported.
-
-Callbacks, subroutine patterns and recursive patterns are not
-supported. ({{*FOO}}) patterns are not supported and may never be.
-
-{{\G}} and {{\K}} are not supported.
-
-Octal character escapes are not supported because they are ambiguous
-with back-references - just use hex character escapes.
-
-Other than that everything should work, including named submatches,
-zero-width assertions, conditional patterns, etc.
-
-In addition, {{\<}} and {{\>}} act as beginning-of-word and end-of-word
-marks, respectively, as in Emacs regular expressions.
-
-Also, two escapes are provided to embed SRE patterns inside PCRE
-strings, {{"\'<sre>"}} and {{"(*'<sre>)"}}. For example, to match a
-comma-delimited list of integers you could use
-
-<enscript highlight=scheme>
-"\\'integer(,\\'integer)*"
-</enscript>
-
-and to match a URL in angle brackets you could use
-
-<enscript highlight=scheme>
-"<('*http-url)>"
-</enscript>
-
-Note in the second example the enclosing {{"('*...)"}} syntax is needed
-because the Scheme reader would consider the closing {{">"}} as part of
-the SRE symbol.
-
-The following chart gives a quick reference from PCRE form to the SRE
-equivalent:
-
- ;; basic syntax
- "^" ;; bos (or eos inside (?m: ...))
- "$" ;; eos (or eos inside (?m: ...))
- "." ;; nonl
- "a?" ;; (? a)
- "a*" ;; (* a)
- "a+" ;; (+ a)
- "a??" ;; (?? a)
- "a*?" ;; (*? a)
- "a+?" ;; (+? a)
- "a{n,m}" ;; (** n m a)
-
- ;; grouping
- "(...)" ;; (submatch ...)
- "(?:...)" ;; (: ...)
- "(?i:...)" ;; (w/nocase ...)
- "(?-i:...)" ;; (w/case ...)
- "(?<name>...)" ;; (=> <name>...)
-
- ;; character classes
- "[aeiou]" ;; ("aeiou")
- "[^aeiou]" ;; (~ "aeiou")
- "[a-z]" ;; (/ "az") or (/ "a" "z")
- "[[:alpha:]]" ;; alpha
-
- ;; assertions
- "(?=...)" ;; (look-ahead ...)
- "(?!...)" ;; (neg-look-ahead ...)
- "(?<=...)" ;; (look-behind ...)
- "(?<!...)" ;; (neg-look-behind ...)
- "(?(test)pass|fail)" ;; (if test pass fail)
- "(*COMMIT)" ;; commit
-
-==== Chunked String Matching
-
-It's often desirable to perform regular expression matching over
-sequences of characters not represented as a single string. The most
-obvious example is a text-buffer data structure, but you may also want
-to match over lists or trees of strings (i.e. ropes), over only
-certain ranges within a string, over an input port, etc. With
-existing regular expression libraries, the only way to accomplish this
-is by converting the abstract sequence into a freshly allocated
-string. This can be expensive, or even impossible if the object is a
-text-buffer opened onto a 500MB file.
-
-IrRegex provides a chunked string API specifically for this purpose.
-You define a chunking API with {{make-irregex-chunker}}:
-
-===== make-irregex-chunker
-
-<procedure>(make-irregex-chunker <get-next> <get-string> [<get-start> <get-end> <get-substring> <get-subchunk>])</procedure>
-
-where
-
-{{(<get-next> chunk) => }} returns the next chunk, or {{#f}} if there are no more chunks
-
-{{(<get-string> chunk) => }} a string source for the chunk
-
-{{(<get-start> chunk) => }} the start index of the result of {{<get-string>}} (defaults to always 0)
-
-{{(<get-end> chunk) => }} the end (exclusive) of the string (defaults to {{string-length}} of the source string)
-
-{{(<get-substring> cnk1 i cnk2 j) => }} a substring for the range between the chunk {{cnk1}} starting at index {{i}} and ending at {{cnk2}} at index {{j}}
-
-{{(<get-subchunk> cnk1 i cnk2 j) => }} as above but returns a new chunked data type instead of a string (optional)
-
-There are two important constraints on the {{<get-next>}} procedure.
-It must return an {{eq?}} identical object when called multiple times
-on the same chunk, and it must not return a chunk with an empty string
-(start == end). This second constraint is for performance reasons -
-we push the work of possibly filtering empty chunks to the chunker
-since there are many chunk types for which empty strings aren't
-possible, and this work is thus not needed. Note that the initial
-chunk passed to match on is allowed to be empty.
-
-{{<get-substring>}} is provided for possible performance improvements
-- without it a default is used. {{<get-subchunk>}} is optional -
-without it you may not use {{irregex-match-subchunk}} described above.
-
-You can then match chunks of these types with the following
-procedures:
-
-===== irregex-search/chunked
-===== irregex-match/chunked
-
-<procedure>(irregex-search/chunked <irx> <chunker> <chunk> [<start>])</procedure><br>
-<procedure>(irregex-match/chunked <irx> <chunker> <chunk> [<start>])</procedure>
-
-These return normal match-data objects.
-
-Example:
-
-To match against a simple, flat list of strings use:
-
-<enscript highlight=scheme>
- (define (rope->string rope1 start rope2 end)
- (if (eq? rope1 rope2)
- (substring (car rope1) start end)
- (let loop ((rope (cdr rope1))
- (res (list (substring (car rope1) start))))
- (if (eq? rope rope2)
- (string-concatenate-reverse ; from SRFI-13
- (cons (substring (car rope) 0 end) res))
- (loop (cdr rope) (cons (car rope) res))))))
-
- (define rope-chunker
- (make-irregex-chunker (lambda (x) (and (pair? (cdr x)) (cdr x)))
- car
- (lambda (x) 0)
- (lambda (x) (string-length (car x)))
- rope->string))
-
- (irregex-search/chunked <pat> rope-chunker <list-of-strings>)
-</enscript>
-
-Here we are just using the default start, end and substring behaviors,
-so the above chunker could simply be defined as:
-
-<enscript highlight=scheme>
- (define rope-chunker
- (make-irregex-chunker (lambda (x) (and (pair? (cdr x)) (cdr x))) car))
-</enscript>
-
-===== irregex-fold/chunked
-
-<procedure>(irregex-fold/chunked <irx> <kons> <knil> <chunker> <chunk> [<finish> [<start-index>]])</procedure>
-
-Chunked version of {{irregex-fold}}.
-
-==== Utilities
-
-The following procedures are also available.
-
-===== irregex-quote
-
-<procedure>(irregex-quote <str>)</procedure>
-
-Returns a new string with any special regular expression characters
-escaped, to match the original string literally in POSIX regular
-expressions.
-
-===== irregex-opt
-
-<procedure>(irregex-opt <list-of-strings>)</procedure>
-
-Returns an optimized SRE matching any of the literal strings
-in the list, like Emacs' \q{regexp-opt}. Note this optimization
-doesn't help when irregex is able to build a DFA.
-
-===== sre->string
-
-<procedure>(sre->string <sre>)</procedure>
-
-Convert an SRE to a POSIX-style regular expression string, if
-possible.
-
-
----
-Previous: [[Unit extras]]
-
-Next: [[Unit srfi-1]]
diff --git a/manual/Unit regex b/manual/Unit regex
index bd6eb479..2d0c249e 100644
--- a/manual/Unit regex
+++ b/manual/Unit regex
@@ -3,15 +3,21 @@
== Unit regex
+This library unit provides support for regular expressions. The regular
+expression package used is {{irregex}}
+written by Alex Shinn. Irregex supports most Perl-extensions and is
+written completely in Scheme.
-This library unit provides some high-level operations for regular
-expression and operations that are kept for backward compatibility
-to older versions of CHICKEN.
+This library unit exposes two APIs: the standard Chicken API described below, and the
+original irregex API. You may use either API or both:
-This unit uses the {{irregex}} unit internally. It is recommended
-to use the {{irregex}} API where possible, since it provides a
-more featureful interface.
+ (require-library regex) ; required for either API, or both
+ (import regex) ; import the Chicken regex API
+ (import irregex) ; import the original irregex API
+Regular expressions may be either POSIX-style strings (with most PCRE
+extensions) or an SCSH-style SRE. There is no {{(rx ...)}} syntax -
+just use normal Scheme lists, with quasiquote if you like.
=== grep
@@ -190,7 +196,266 @@ into a regular expression.
=> "\\^\\[0-9\\]\\+:.\n.\\*\\$"
</enscript>
+=== Extended SRE Syntax
+
+The following table summarizes the SRE syntax, with detailed explanations following.
+
+ ;; basic patterns
+ <string> ; literal string
+ (seq <sre> ...) ; sequence
+ (: <sre> ...)
+ (or <sre> ...) ; alternation
+
+ ;; optional/multiple patterns
+ (? <sre> ...) ; 0 or 1 matches
+ (* <sre> ...) ; 0 or more matches
+ (+ <sre> ...) ; 1 or more matches
+ (= <n> <sre> ...) ; exactly <n> matches
+ (>= <n> <sre> ...) ; <n> or more matches
+ (** <from> <to> <sre> ...) ; <n> to <m> matches
+ (?? <sre> ...) ; non-greedy (non-greedy) pattern: (0 or 1)
+ (*? <sre> ...) ; non-greedy kleene star
+ (**? <from> <to> <sre> ...) ; non-greedy range
+
+ ;; submatch patterns
+ (submatch <sre> ...) ; numbered submatch
+ (submatch-named <name> <sre> ...) ; named submatch
+ (=> <name> <sre> ...)
+ (backref <n-or-name>) ; match a previous submatch
+
+ ;; toggling case-sensitivity
+ (w/case <sre> ...) ; enclosed <sre>s are case-sensitive
+ (w/nocase <sre> ...) ; enclosed <sre>s are case-insensitive
+
+ ;; character sets
+ <char> ; singleton char set
+ (<string>) ; set of chars
+ (or <cset-sre> ...) ; set union
+ (~ <cset-sre> ...) ; set complement (i.e. [^...])
+ (- <cset-sre> ...) ; set difference
+ (& <cset-sre> ...) ; set intersection
+ (/ <range-spec> ...) ; pairs of chars as ranges
+
+ ;; named character sets
+ any
+ nonl
+ ascii
+ lower-case lower
+ upper-case upper
+ alphabetic alpha
+ numeric num
+ alphanumeric alphanum alnum
+ punctuation punct
+ graphic graph
+ whitespace white space
+ printing print
+ control cntrl
+ hex-digit xdigit
+
+ ;; assertions and conditionals
+ bos eos ; beginning/end of string
+ bol eol ; beginning/end of line
+ bow eow ; beginning/end of word
+ nwb ; non-word-boundary
+ (look-ahead <sre> ...) ; zero-width look-ahead assertion
+ (look-behind <sre> ...) ; zero-width look-behind assertion
+ (neg-look-ahead <sre> ...) ; zero-width negative look-ahead assertion
+ (neg-look-behind <sre> ...) ; zero-width negative look-behind assertion
+ (atomic <sre> ...) ; for (?>...) independent patterns
+ (if <test> <pass> [<fail>]) ; conditional patterns
+ commit ; don't backtrack beyond this (i.e. cut)
+
+ ;; backwards compatibility
+ (posix-string <string>) ; embed a POSIX string literal
+
+==== Basic SRE Patterns
+
+The simplest SRE is a literal string, which matches that string exactly.
+
+ (string-search "needle" "hayneedlehay") => <match>
+
+By default the match is case-sensitive, though you can control this either with the compiler flags or local overrides:
+
+ (string-search "needle" "haynEEdlehay") => #f
+
+ (string-search (irregex "needle" 'i) "haynEEdlehay") => <match>
+
+ (string-search '(w/nocase "needle") "haynEEdlehay") => <match>
+
+You can use {{w/case}} to switch back to case-sensitivity inside a {{w/nocase}}:
+
+ (string-search '(w/nocase "SMALL" (w/case "BIG")) "smallBIGsmall") => <match>
+
+ (string-search '(w/nocase "small" (w/case "big")) "smallBIGsmall") => #f
+
+Of course, literal strings by themselves aren't very interesting
+regular expressions, so we want to be able to compose them. The most
+basic way to do this is with the {{seq}} operator (or its abbreviation {{:}}),
+which matches one or more patterns consecutively:
+
+ (string-search '(: "one" space "two" space "three") "one two three") => <match>
+
+As you may have noticed above, the {{w/case}} and {{w/nocase}} operators
+allowed multiple SREs in a sequence - other operators that take any
+number of arguments (e.g. the repetition operators below) allow such
+implicit sequences.
+
+To match any one of a set of patterns use the or alternation operator:
+
+ (string-search '(or "eeney" "meeney" "miney") "meeney") => <match>
+
+ (string-search '(or "eeney" "meeney" "miney") "moe") => #f
+
+==== SRE Repetition Patterns
+
+There are also several ways to control the number of times a pattern
+is matched. The simplest of these is {{?}} which just optionally matches
+the pattern:
+
+ (string-search '(: "match" (? "es") "!") "matches!") => <match>
+
+ (string-search '(: "match" (? "es") "!") "match!") => <match>
+
+ (string-search '(: "match" (? "es") "!") "matche!") => #f
+
+To optionally match any number of times, use {{*}}, the Kleene star:
+
+ (string-search '(: "<" (* (~ #\>)) ">") "<html>") => <match>
+
+ (string-search '(: "<" (* (~ #\>)) ">") "<>") => <match>
+
+ (string-search '(: "<" (* (~ #\>)) ">") "<html") => #f
+
+Often you want to match any number of times, but at least one time is required, and for that you use {{+}}:
+
+ (string-search '(: "<" (+ (~ #\>)) ">") "<html>") => <match>
+
+ (string-search '(: "<" (+ (~ #\>)) ">") "<a>") => <match>
+
+ (string-search '(: "<" (+ (~ #\>)) ">") "<>") => #f
+
+More generally, to match at least a given number of times, use {{>=}}:
+
+ (string-search '(: "<" (>= 3 (~ #\>)) ">") "<table>") => <match>
+
+ (string-search '(: "<" (>= 3 (~ #\>)) ">") "<pre>") => <match>
+
+ (string-search '(: "<" (>= 3 (~ #\>)) ">") "<tr>") => #f
+
+To match a specific number of times exactly, use {=}:
+
+ (string-search '(: "<" (= 4 (~ #\>)) ">") "<html>") => <match>
+
+ (string-search '(: "<" (= 4 (~ #\>)) ">") "<table>") => #f
+
+And finally, the most general form is {{**}} which specifies a range
+of times to match. All of the earlier forms are special cases of this.
+
+ (string-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.168.1.10") => <match>
+
+ (string-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.0168.1.10") => #f
+
+There are also so-called "non-greedy" variants of these repetition
+operators, by convention suffixed with an additional {{?}}. Since the
+normal repetition patterns can match any of the allotted repetition
+range, these operators will match a string if and only if the normal
+versions matched. However, when the endpoints of which submatch
+matched where are taken into account (specifically, all matches when
+using string-search since the endpoints of the match itself matter),
+the use of a non-greedy repetition can change the result.
+
+So, whereas {{?}} can be thought to mean "match or don't match," {{??}} means
+"don't match or match." {{*}} typically consumes as much as possible, but
+{{*?}} tries first to match zero times, and only consumes one at a time if
+that fails. If you have a greedy operator followed by a non-greedy
+operator in the same pattern, they can produce surprisins results as
+they compete to make the match longer or shorter. If this seems
+confusing, that's because it is. Non-greedy repetitions are defined
+only in terms of the specific backtracking algorithm used to implement
+them, which for compatibility purposes always means the Perl
+algorithm. Thus, when using these patterns you force IrRegex to use a
+backtracking engine, and can't rely on efficient execution.
+
+==== SRE Character Sets
+
+Perhaps more common than matching specific strings is matching any of
+a set of characters. You can use the or alternation pattern on a list
+of single-character strings to simulate a character set, but this is
+too clumsy for everyday use so SRE syntax allows a number of
+shortcuts.
+
+A single character matches that character literally, a trivial
+character class. More conveniently, a list holding a single element
+which is a string refers to the character set composed of every
+character in the string.
+
+ (string-match '(* #\-) "---") => <match>
+
+ (string-match '(* #\-) "-_-") => #f
+
+ (string-match '(* ("aeiou")) "oui") => <match>
+
+ (string-match '(* ("aeiou")) "ouais") => #f
+
+Ranges are introduced with the {{/}} operator. Any strings or characters
+in the {{/}} are flattened and then taken in pairs to represent the start
+and end points, inclusive, of character ranges.
+
+ (string-match '(* (/ "AZ09")) "R2D2") => <match>
+
+ (string-match '(* (/ "AZ09")) "C-3PO") => #f
+
+In addition, a number of set algebra operations are provided. or, of
+course, has the same meaning, but when all the options are character
+sets it can be thought of as the set union operator. This is further
+extended by the {{&}} set intersection, {{-}} set difference, and {{~}} set
+complement operators.
+
+ (string-match '(* (& (/ "az") (~ ("aeiou")))) "xyzzy") => <match>
+
+ (string-match '(* (& (/ "az") (~ ("aeiou")))) "vowels") => #f
+
+ (string-match '(* (- (/ "az") ("aeiou"))) "xyzzy") => <match>
+
+ (string-match '(* (- (/ "az") ("aeiou"))) "vowels") => #f
+
+==== SRE Assertion Patterns
+
+There are a number of times it can be useful to assert something about
+the area around a pattern without explicitly making it part of the
+pattern. The most common cases are specifically anchoring some pattern
+to the beginning or end of a word or line or even the whole
+string. For example, to match on the end of a word:
+
+ (string-match '(: "foo" eow) "foo") => <match>
+
+ (string-match '(: "foo" eow) "foo!") => <match>
+
+ (string-match '(: "foo" eow) "foof") => #f
+
+The {{bow}}, {{bol}}, {{eol}}, {{bos}} and {{eos}} work similarly. {{nwb}} asserts that you
+are not in a word-boundary - if replaced for {{eow}} in the above examples
+it would reverse all the results.
+
+There is no {{wb}}, since you tend to know from context whether it
+would be the beginning or end of a word, but if you need it you can
+always use (or bow eow).
+
+Somewhat more generally, Perl introduced positive and negative
+look-ahead and look-behind patterns. Perl look-behind patterns are
+limited to a fixed length, however the IrRegex versions have no such
+limit.
+
+ (string-match '(: "regular" (look-ahead " expression")) "regular expression") => <match>
+
+The most general case, of course, would be an and pattern to
+complement the or pattern - all the patterns must match or the whole
+pattern fails. This may be provided in a future release, although it
+(and look-ahead and look-behind assertions) are unlikely to be
+compiled efficiently.
+
+
---
-Previous: [[Unit irregex]]
+Previous: [[Unit extras]]
Next: [[Unit srfi-1]]
diff --git a/manual/Unit srfi-1 b/manual/Unit srfi-1
index 7a8eb917..72ac271f 100644
--- a/manual/Unit srfi-1
+++ b/manual/Unit srfi-1
@@ -1515,6 +1515,6 @@ arguments.
----
-Previous: [[Unit irregex]]
+Previous: [[Unit regex]]
Next: [[Unit srfi-4]]
diff --git a/manual/Unit utils b/manual/Unit utils
index 8c174b1d..e1af0895 100644
--- a/manual/Unit utils
+++ b/manual/Unit utils
@@ -88,12 +88,11 @@ The initial default options are {{-scrutinize -O2 -d2}}.
<procedure>(scan-input-lines REGEXP [PORT])</procedure>
Reads lines from {{PORT}} (defaults to the result of {{(current-input-port)}})
-using {{read-line}} and returns the result of {{(irregex-search REGEXP LINE)}},
+using {{read-line}} and returns the result of {{(string-search REGEXP LINE)}},
if the match succeeds. If no match could be found, {{#f}} is returned.
{{REGEXP}} may also be a procedure of one argument which is called for each
-input line and should return a non-false value on success, which will then
-be the result of the call to {{scan-input-lines}}.
+input line and should return a non-false value on success.
=== Asking the user for confirmation
diff --git a/manual/faq b/manual/faq
index 24f054af..7abf0381 100644
--- a/manual/faq
+++ b/manual/faq
@@ -533,7 +533,7 @@ Compile the program that uses the module:
The regular expression engine has recently be replaced by [[/users/alex shinn|alex shinn]]'s excellent
{{irregex}} library, which is fully implemented in Scheme. Precompiling regular
expressions to internal form is somewhat slower than with the old PCRE-based
-regex engine. It is advisable to use {{irregex}} to precompile regular expressions
+regex engine. It is advisable to use {{regexp}} to precompile regular expressions
outside of time-critical loops and use them where performance matters.
diff --git a/posix-common.scm b/posix-common.scm
index e77b05fb..55f9f488 100644
--- a/posix-common.scm
+++ b/posix-common.scm
@@ -233,54 +233,64 @@ EOF
;;; Filename globbing:
(define glob
- (lambda paths
- (let conc-loop ((paths paths))
- (if (null? paths)
- '()
- (let ((path (car paths)))
- (let-values (((dir fil ext) (decompose-pathname path)))
- (let ((rx (##sys#glob->regexp (make-pathname #f (or fil "*") ext))))
- (let loop ((fns (directory (or dir ".") #t)))
- (cond ((null? fns) (conc-loop (cdr paths)))
- ((irregex-match rx (car fns))
- => (lambda (m)
- (cons
- (make-pathname dir (irregex-match-substring m))
- (loop (cdr fns)))) )
- (else (loop (cdr fns))) ) ) ) ) ) ) ) ) )
+ (let ((regexp regexp)
+ (string-match string-match)
+ (glob->regexp glob->regexp)
+ (directory directory)
+ (make-pathname make-pathname)
+ (decompose-pathname decompose-pathname) )
+ (lambda paths
+ (let conc-loop ((paths paths))
+ (if (null? paths)
+ '()
+ (let ((path (car paths)))
+ (let-values (((dir fil ext) (decompose-pathname path)))
+ (let* ((patt (glob->regexp (make-pathname #f (or fil "*") ext)))
+ (rx (regexp patt)))
+ (let loop ((fns (directory (or dir ".") #t)))
+ (cond ((null? fns) (conc-loop (cdr paths)))
+ ((string-match rx (car fns))
+ => (lambda (m) (cons (make-pathname dir (car m)) (loop (cdr fns)))) )
+ (else (loop (cdr fns))) ) ) ) ) ) ) ) ) ) )
;;; Find matching files:
(define ##sys#find-files
- (lambda (dir pred action id limit follow dot loc)
- (##sys#check-string dir loc)
- (let* ((depth 0)
- (lproc
- (cond ((not limit) (lambda _ #t))
- ((fixnum? limit) (lambda _ (fx< depth limit)))
- (else limit) ) )
- (pproc
- (if (or (string? pred) (irregex? pred))
- (let ((pred (irregex pred))) ; force compilation
- (lambda (x) (irregex-match pred x)))
- pred) ) )
- (let loop ((fs (glob (make-pathname dir (if dot "?*" "*"))))
- (r id) )
- (if (null? fs)
- r
- (let ((f (##sys#slot fs 0))
- (rest (##sys#slot fs 1)) )
- (cond ((directory? f)
- (cond ((member (pathname-file f) '("." "..")) (loop rest r))
- ((lproc f)
- (loop rest
- (fluid-let ((depth (fx+ depth 1)))
- (loop (glob (make-pathname f "*"))
- (if (pproc f) (action f r) r)) ) ) )
- (else (loop rest (if (pproc f) (action f r) r))) ) )
- ((pproc f) (loop rest (action f r)))
- (else (loop rest r)) ) ) ) ) ) ) )
+ (let ((glob glob)
+ (string-match string-match)
+ (make-pathname make-pathname)
+ (pathname-file pathname-file)
+ (symbolic-link? symbolic-link?)
+ (directory? directory?) )
+ (lambda (dir pred action id limit follow dot loc)
+ (##sys#check-string dir loc)
+ (let* ((depth 0)
+ (lproc
+ (cond ((not limit) (lambda _ #t))
+ ((fixnum? limit) (lambda _ (fx< depth limit)))
+ (else limit) ) )
+ (pproc
+ (if (or (string? pred) (regexp? pred))
+ (let ((pred (regexp pred))) ; force compilation
+ (lambda (x) (string-match pred x)))
+ pred) ) )
+ (let loop ((fs (glob (make-pathname dir (if dot "?*" "*"))))
+ (r id) )
+ (if (null? fs)
+ r
+ (let ((f (##sys#slot fs 0))
+ (rest (##sys#slot fs 1)) )
+ (cond ((directory? f)
+ (cond ((member (pathname-file f) '("." "..")) (loop rest r))
+ ((lproc f)
+ (loop rest
+ (fluid-let ((depth (fx+ depth 1)))
+ (loop (glob (make-pathname f "*"))
+ (if (pproc f) (action f r) r)) ) ) )
+ (else (loop rest (if (pproc f) (action f r) r))) ) )
+ ((pproc f) (loop rest (action f r)))
+ (else (loop rest r)) ) ) ) ) ) ) ) )
(define (find-files dir . args)
(cond ((or (null? args) (not (keyword? (car args))))
diff --git a/posixunix.scm b/posixunix.scm
index ed83fe1b..e17f6b3e 100644
--- a/posixunix.scm
+++ b/posixunix.scm
@@ -27,7 +27,7 @@
(declare
(unit posix)
- (uses scheduler irregex extras utils files ports)
+ (uses scheduler regex extras utils files ports)
(disable-interrupts)
(hide group-member _get-groups _ensure-groups posix-error ##sys#terminal-check)
(not inline ##sys#interrupt-hook ##sys#user-interrupt-hook))
diff --git a/posixwin.scm b/posixwin.scm
index 97a0a232..9dee8ede 100644
--- a/posixwin.scm
+++ b/posixwin.scm
@@ -63,7 +63,7 @@
(declare
(unit posix)
- (uses scheduler irregex extras utils files ports)
+ (uses scheduler regex extras utils files ports)
(disable-interrupts)
(hide $quote-args-list $exec-setup $exec-teardown)
(not inline ##sys#interrupt-hook ##sys#user-interrupt-hook)
diff --git a/regex.import.scm b/regex.import.scm
new file mode 100644
index 00000000..70310a22
--- /dev/null
+++ b/regex.import.scm
@@ -0,0 +1,41 @@
+;;;; regex.import.scm - import library for "regex" module
+;
+; Copyright (c) 2008-2010, The Chicken Team
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following
+; conditions are met:
+;
+; Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+; disclaimer.
+; Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+; disclaimer in the documentation and/or other materials provided with the distribution.
+; Neither the name of the author nor the names of its contributors may be used to endorse or promote
+; products derived from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
+; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+; POSSIBILITY OF SUCH DAMAGE.
+
+
+(##sys#register-primitive-module
+ 'regex
+ '(glob->regexp
+ glob?
+ grep
+ regexp
+ regexp-escape
+ regexp?
+ string-match
+ string-match-positions
+ string-search
+ string-search-positions
+ string-split-fields
+ string-substitute
+ string-substitute*))
diff --git a/regex.scm b/regex.scm
new file mode 100644
index 00000000..526e65ad
--- /dev/null
+++ b/regex.scm
@@ -0,0 +1,360 @@
+;;;; regex.scm
+;
+; Copyright (c) 2008-2010, The Chicken Team
+; Copyright (c) 2000-2007, Felix L. Winkelmann
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following
+; conditions are met:
+;
+; Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+; disclaimer.
+; Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+; disclaimer in the documentation and/or other materials provided with the distribution.
+; Neither the name of the author nor the names of its contributors may be used to endorse or promote
+; products derived from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
+; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+; POSSIBILITY OF SUCH DAMAGE.
+
+
+(declare (unit regex))
+
+(declare
+ (disable-interrupts)
+ (fixnum)
+ (export
+ regexp? regexp
+ string-match string-match-positions string-search string-search-positions
+ string-split-fields string-substitute string-substitute*
+ glob->regexp
+ grep
+ regexp-escape
+
+ irregex string->irregex sre->irregex string->sre
+ irregex? irregex-match-data?
+ irregex-new-matches irregex-reset-matches!
+ irregex-match-start irregex-match-end irregex-match-substring
+ irregex-match-num-submatches
+ irregex-search irregex-search/matches irregex-match irregex-match-string
+ irregex-fold irregex-replace irregex-replace/all irregex-apply-match
+ irregex-dfa irregex-dfa/search irregex-dfa/extract
+ irregex-nfa irregex-flags irregex-submatches irregex-lengths irregex-names
+ ))
+
+(include "common-declarations.scm")
+
+(register-feature! 'regex 'irregex)
+
+(include "irregex.scm")
+
+
+;;; Record `regexp'
+
+(define-record regexp x)
+
+(define-syntax (build-cache x r c)
+ ;; (build-cache N ARG FAIL)
+ (let* ((n (cadr x))
+ (n2 (* n 2))
+ (arg (caddr x))
+ (fail (cadddr x))
+ (%cache (r 'cache))
+ (%index (r 'index))
+ (%arg (r 'arg))
+ (%let (r 'let))
+ (%let* (r 'let*))
+ (%if (r 'if))
+ (%fx+ (r 'fx+))
+ (%fxmod (r 'fxmod))
+ (%equal? (r 'equal?))
+ (%quote (r 'quote))
+ (%tmp (r 'tmp))
+ (%begin (r 'begin))
+ (cache (make-vector (add1 n2) #f)))
+ (vector-set! cache n2 0) ; last slot: current index
+ `(,%let* ((,%cache (,%quote ,cache))
+ (,%arg ,arg))
+ ,(let fold ((i 0))
+ (if (>= i n)
+ ;; this should be thread-safe: a context-switch can only
+ ;; happen before this code and in the call to FAIL.
+ `(,%let ((,%tmp ,fail)
+ (,%index (##sys#slot ,%cache ,n2)))
+ (##sys#setslot ,%cache ,%index ,%arg)
+ (##sys#setslot ,%cache (,%fx+ ,%index 1) ,%tmp)
+ (##sys#setislot
+ ,%cache ,n2
+ (##core#inline "C_u_fixnum_modulo" (,%fx+ ,%index 2) ,n2))
+ ,%tmp)
+ `(,%if (,%equal? (##sys#slot ,%cache ,(* i 2)) ,%arg)
+ (##sys#slot ,%cache ,(add1 (* i 2)))
+ ,(fold (add1 i))))))))
+
+(define (regexp pat #!optional caseless extended utf8)
+ (if (regexp? pat)
+ pat
+ (make-regexp
+ (apply
+ irregex
+ pat
+ (let ((opts '()))
+ (when caseless (set! opts (cons 'i opts)))
+ (when extended (set! opts (cons 'x opts)))
+ (when utf8 (set! opts (cons 'utf8 opts)))
+ opts))) ) )
+
+(define (unregexp x)
+ (cond ((regexp? x) (regexp-x x))
+ ((irregex? x) x)
+ (else
+ (build-cache
+ 5 x
+ (irregex x)))))
+
+
+;;; Basic `regexp' operations
+
+(define (string-match rx str)
+ (let ((rx (unregexp rx)))
+ (and-let* ((m (irregex-match rx str)))
+ (let loop ((i (irregex-match-num-submatches m))
+ (res '()))
+ (if (fx<= i 0)
+ (cons str res)
+ (loop (fx- i 1) (cons (irregex-match-substring m i) res)))))))
+
+(define (string-match-positions rx str)
+ (let ((rx (unregexp rx)))
+ (and-let* ((m (irregex-match rx str)))
+ (let loop ((i (irregex-match-num-submatches m))
+ (res '()))
+ (if (fx<= i 0)
+ (cons (list 0 (string-length str)) res)
+ (loop (fx- i 1) (cons (list (irregex-match-start-index m i)
+ (irregex-match-end-index m i))
+ res)))))))
+
+(define (string-search rx str #!optional (start 0) (range (string-length str)))
+ (let ((rx (unregexp rx)))
+ (let ((n (string-length str)))
+ (and-let* ((m (irregex-search rx str start (min n (fx+ start range)))))
+ (let loop ((i (irregex-match-num-submatches m))
+ (res '()))
+ (if (fx< i 0)
+ res
+ (loop (fx- i 1) (cons (irregex-match-substring m i) res))))))))
+
+(define (string-search-positions rx str #!optional (start 0) (range (string-length str)))
+ (let ((rx (unregexp rx)))
+ (let ((n (string-length str)))
+ (and-let* ((m (irregex-search rx str start (min n (fx+ start range)))))
+ (let loop ((i (irregex-match-num-submatches m))
+ (res '()))
+ (if (fx< i 0)
+ res
+ (loop (fx- i 1) (cons (list (irregex-match-start-index m i)
+ (irregex-match-end-index m i))
+ res))))))))
+
+
+;;; Split string into fields:
+
+(define string-split-fields
+ (let ([reverse reverse]
+ [substring substring]
+ [string-search-positions string-search-positions] )
+ (lambda (rx str . mode-and-start)
+ (##sys#check-string str 'string-split-fields)
+ (let* ([argc (length mode-and-start)]
+ [len (##sys#size str)]
+ [mode (if (fx> argc 0) (car mode-and-start) #t)]
+ [start (if (fx> argc 1) (cadr mode-and-start) 0)]
+ [fini (case mode
+ [(#:suffix)
+ (lambda (ms start)
+ (if (fx< start len)
+ (##sys#error 'string-split-fields
+ "record does not end with suffix" str rx)
+ (reverse ms) ) ) ]
+ [(#:infix)
+ (lambda (ms start)
+ (if (fx>= start len)
+ (reverse (cons "" ms))
+ (reverse (cons (substring str start len) ms)) ) ) ]
+ [else (lambda (ms start) (reverse ms)) ] ) ]
+ [fetch (case mode
+ [(#:infix #:suffix) (lambda (start from to) (substring str start from))]
+ [else (lambda (start from to) (substring str from to))] ) ] )
+ (let loop ([ms '()] [start start])
+ (let ([m (string-search-positions rx str start)])
+ (if m
+ (let* ([mp (car m)]
+ [from (car mp)]
+ [to (cadr mp)] )
+ (if (fx= from to)
+ (if (fx= to len)
+ (fini ms start)
+ (loop (cons (fetch start (fx+ from 1) (fx+ to 2)) ms) (fx+ to 1)) )
+ (loop (cons (fetch start from to) ms) to) ) )
+ (fini ms start) ) ) ) ) ) ) )
+
+
+;;; Substitute matching strings:
+
+(define string-substitute
+ (let ([substring substring]
+ [reverse reverse]
+ [make-string make-string]
+ [string-search-positions string-search-positions] )
+ (lambda (rx subst string . flag)
+ (##sys#check-string subst 'string-substitute)
+ (##sys#check-string string 'string-substitute)
+ (let* ([which (if (pair? flag) (car flag) 1)]
+ [substlen (##sys#size subst)]
+ (strlen (##sys#size string))
+ [substlen-1 (fx- substlen 1)]
+ [result '()]
+ [total 0] )
+ (define (push x)
+ (set! result (cons x result))
+ (set! total (fx+ total (##sys#size x))) )
+ (define (substitute matches)
+ (let loop ([start 0] [index 0])
+ (if (fx>= index substlen-1)
+ (push (if (fx= start 0) subst (substring subst start substlen)))
+ (let ([c (##core#inline "C_subchar" subst index)]
+ [index+1 (fx+ index 1)] )
+ (if (char=? c #\\)
+ (let ([c2 (##core#inline "C_subchar" subst index+1)])
+ (if (and (not (char=? #\\ c2)) (char-numeric? c2))
+ (let ([mi (list-ref matches (fx- (char->integer c2) 48))])
+ (push (substring subst start index))
+ (push (substring string (car mi) (cadr mi)))
+ (loop (fx+ index 2) index+1) )
+ (loop start (fx+ index+1 1)) ) )
+ (loop start index+1) ) ) ) ) )
+ (let loop ([index 0] [count 1])
+ (let ((matches (and (fx< index strlen)
+ (string-search-positions rx string index))))
+ (cond [matches
+ (let* ([range (car matches)]
+ [upto (cadr range)] )
+ (cond ((fx= 0 (fx- (cadr range) (car range)))
+ (##sys#error
+ 'string-substitute "empty substitution match"
+ rx) )
+ ((or (not (fixnum? which)) (fx= count which))
+ (push (substring string index (car range)))
+ (substitute matches)
+ (loop upto #f) )
+ (else
+ (push (substring string index upto))
+ (loop upto (fx+ count 1)) ) ) ) ]
+ [else
+ (push (substring string index (##sys#size string)))
+ (##sys#fragments->string total (reverse result)) ] ) ) ) ) ) ) )
+
+(define string-substitute*
+ (let ([string-substitute string-substitute])
+ (lambda (str smap . mode)
+ (##sys#check-string str 'string-substitute*)
+ (##sys#check-list smap 'string-substitute*)
+ (let ((mode (and (pair? mode) (car mode))))
+ (let loop ((str str) (smap smap))
+ (if (null? smap)
+ str
+ (let ((sm (car smap)))
+ (loop (string-substitute (car sm) (cdr sm) str mode)
+ (cdr smap) ) ) ) ) ) ) ) )
+
+
+;;; Glob support:
+
+(define glob->regexp
+ (let ((list->string list->string)
+ (string->list string->list)
+ (regexp regexp))
+ (lambda (s #!optional sre?)
+ (##sys#check-string s 'glob->regexp)
+ (let ((sre
+ (cons
+ ':
+ (let loop ((cs (string->list s)) (dir #t))
+ (if (null? cs)
+ '()
+ (let ((c (car cs))
+ (rest (cdr cs)) )
+ (cond ((char=? c #\*)
+ (if dir
+ `((or (: (~ ("./\\"))
+ (* (~ ("/\\"))))
+ (* (~ ("./\\"))))
+ ,@(loop rest #f))
+ `((* (~ ("/\\"))) ,@(loop rest #f))))
+ ((char=? c #\?) (cons 'any (loop rest #f)))
+ ((char=? c #\[)
+ (let loop2 ((rest rest) (s '()))
+ (cond ((not (pair? rest))
+ (error 'glob->regexp "unexpected end of character class" s))
+ ((char=? #\] (car rest))
+ `((or ,@s) ,@(loop (cdr rest) #f)))
+ ((and (pair? (cdr rest))
+ (pair? (cddr rest))
+ (char=? #\- (cadr rest)) )
+ (loop2 (cdddr rest) (cons `(/ ,(car rest) ,(caddr rest)) s)))
+ ((and (pair? (cdr rest))
+ (char=? #\- (car rest)))
+ (loop2 (cddr rest)
+ (cons `(~ ,(cadr rest)) s)))
+ (else
+ (loop2 (cdr rest) (cons (car rest) s))))))
+ (else (cons c (loop rest (memq c '(#\\ #\/))))))))))))
+ (if sre? sre (regexp sre))))))
+
+
+;;; Grep-like function on list:
+
+(define grep
+ (let ((string-search string-search)
+ (regexp regexp))
+ (lambda (rx lst #!optional (acc (lambda (x) x)))
+ (##sys#check-list lst 'grep)
+ (##sys#check-closure acc 'grep)
+ (let ((rx (regexp rx)))
+ (let loop ((lst lst))
+ (if (null? lst)
+ '()
+ (let ((x (##sys#slot lst 0))
+ (r (##sys#slot lst 1)) )
+ (if (string-search rx (acc x))
+ (cons x (loop r))
+ (loop r) ) ) ) ) ) ) ) )
+
+
+;;; Escape regular expression (suggested by Peter Bex):
+
+(define regexp-escape
+ (let ([open-output-string open-output-string]
+ [get-output-string get-output-string] )
+ (lambda (str)
+ (##sys#check-string str 'regexp-escape)
+ (let ([out (open-output-string)]
+ [len (##sys#size str)] )
+ (let loop ([i 0])
+ (cond [(fx>= i len) (get-output-string out)]
+ [(memq (##core#inline "C_subchar" str i)
+ '(#\. #\\ #\? #\* #\+ #\^ #\$ #\( #\) #\[ #\] #\| #\{ #\}))
+ (##sys#write-char-0 #\\ out)
+ (##sys#write-char-0 (##core#inline "C_subchar" str i) out)
+ (loop (fx+ i 1)) ]
+ [else
+ (##sys#write-char-0 (##core#inline "C_subchar" str i) out)
+ (loop (fx+ i 1)) ] ) ) ) ) ) )
diff --git a/rules.make b/rules.make
index a2888e16..9c6ac23a 100644
--- a/rules.make
+++ b/rules.make
@@ -28,11 +28,16 @@ VPATH=$(SRCDIR)
# object files
-LIBCHICKEN_OBJECTS_1 = library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 srfi-14 srfi-18 srfi-69 $(POSIXFILE) irregex scheduler profiler stub expand chicken-syntax chicken-ffi-syntax runtime
+LIBCHICKEN_OBJECTS_1 = \
+ library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 \
+ srfi-14 srfi-18 srfi-69 $(POSIXFILE) regex scheduler \
+ profiler stub expand chicken-syntax chicken-ffi-syntax runtime
LIBCHICKEN_SHARED_OBJECTS = $(LIBCHICKEN_OBJECTS_1:=$(O))
LIBCHICKEN_STATIC_OBJECTS = $(LIBCHICKEN_OBJECTS_1:=-static$(O))
-COMPILER_OBJECTS_1 = chicken batch-driver compiler optimizer compiler-syntax scrutinizer unboxing support c-platform c-backend
+COMPILER_OBJECTS_1 = \
+ chicken batch-driver compiler optimizer compiler-syntax scrutinizer unboxing support \
+ c-platform c-backend
COMPILER_OBJECTS = $(COMPILER_OBJECTS_1:=$(O))
COMPILER_STATIC_OBJECTS = $(COMPILER_OBJECTS_1:=-static$(O))
@@ -95,7 +100,7 @@ profiler$(O): profiler.c chicken.h $(CHICKEN_CONFIG_H)
$(C_COMPILER) $(C_COMPILER_OPTIONS) $(INCLUDES) \
$(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) $(C_COMPILER_SHARED_OPTIONS) \
$(C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(C_COMPILER_OUTPUT)
-irregex$(O): irregex.c chicken.h $(CHICKEN_CONFIG_H)
+regex$(O): regex.c chicken.h $(CHICKEN_CONFIG_H)
$(C_COMPILER) $(C_COMPILER_OPTIONS) \
$(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) $(C_COMPILER_SHARED_OPTIONS) \
$(C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(C_COMPILER_OUTPUT)
@@ -210,7 +215,7 @@ profiler-static$(O): profiler.c chicken.h $(CHICKEN_CONFIG_H)
$(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) \
$(C_COMPILER_STATIC_OPTIONS) \
$(C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(C_COMPILER_OUTPUT)
-irregex-static$(O): irregex.c chicken.h $(CHICKEN_CONFIG_H)
+regex-static$(O): regex.c chicken.h $(CHICKEN_CONFIG_H)
$(C_COMPILER) $(C_COMPILER_OPTIONS) $(INCLUDES) \
$(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) \
$(C_COMPILER_STATIC_OPTIONS) \
@@ -321,6 +326,10 @@ extras.import$(O): extras.import.c chicken.h $(CHICKEN_CONFIG_H)
$(HOST_C_COMPILER) $(HOST_C_COMPILER_OPTIONS) $(HOST_C_COMPILER_PTABLES_OPTIONS) $(INCLUDES) -DC_SHARED \
$(HOST_C_COMPILER_COMPILE_OPTION) $(HOST_C_COMPILER_OPTIMIZATION_OPTIONS) $(HOST_C_COMPILER_SHARED_OPTIONS) \
$(HOST_C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(HOST_C_COMPILER_OUTPUT)
+regex.import$(O): regex.import.c chicken.h $(CHICKEN_CONFIG_H)
+ $(HOST_C_COMPILER) $(HOST_C_COMPILER_OPTIONS) $(HOST_C_COMPILER_PTABLES_OPTIONS) $(INCLUDES) -DC_SHARED \
+ $(HOST_C_COMPILER_COMPILE_OPTION) $(HOST_C_COMPILER_OPTIMIZATION_OPTIONS) $(HOST_C_COMPILER_SHARED_OPTIONS) \
+ $(HOST_C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(HOST_C_COMPILER_OUTPUT)
irregex.import$(O): irregex.import.c chicken.h $(CHICKEN_CONFIG_H)
$(HOST_C_COMPILER) $(HOST_C_COMPILER_OPTIONS) $(HOST_C_COMPILER_PTABLES_OPTIONS) $(INCLUDES) -DC_SHARED \
$(HOST_C_COMPILER_COMPILE_OPTION) $(HOST_C_COMPILER_OPTIMIZATION_OPTIONS) $(HOST_C_COMPILER_SHARED_OPTIONS) \
@@ -670,6 +679,7 @@ ifdef STATICBUILD
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) srfi-13.import.scm "$(DESTDIR)$(IEGGDIR)"
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) srfi-69.import.scm "$(DESTDIR)$(IEGGDIR)"
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) extras.import.scm "$(DESTDIR)$(IEGGDIR)"
+ $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) regex.import.scm "$(DESTDIR)$(IEGGDIR)"
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) srfi-14.import.scm "$(DESTDIR)$(IEGGDIR)"
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) tcp.import.scm "$(DESTDIR)$(IEGGDIR)"
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) foreign.import.scm "$(DESTDIR)$(IEGGDIR)"
@@ -692,6 +702,7 @@ else
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) srfi-13.import.so "$(DESTDIR)$(IEGGDIR)"
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) srfi-69.import.so "$(DESTDIR)$(IEGGDIR)"
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) extras.import.so "$(DESTDIR)$(IEGGDIR)"
+ $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) regex.import.so "$(DESTDIR)$(IEGGDIR)"
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) srfi-14.import.so "$(DESTDIR)$(IEGGDIR)"
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) tcp.import.so "$(DESTDIR)$(IEGGDIR)"
$(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) foreign.import.so "$(DESTDIR)$(IEGGDIR)"
@@ -729,6 +740,7 @@ ifneq ($(POSTINSTALL_PROGRAM),true)
$(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)srfi-13.import.so"
$(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)srfi-69.import.so"
$(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)extras.import.so"
+ $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)regex.import.so"
$(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)irregex.import.so"
$(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)srfi-14.import.so"
$(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)tcp.import.so"
@@ -852,8 +864,8 @@ posixunix.c: $(SRCDIR)posixunix.scm $(SRCDIR)posix-common.scm $(SRCDIR)common-de
$(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@
posixwin.c: $(SRCDIR)posixwin.scm $(SRCDIR)posix-common.scm $(SRCDIR)common-declarations.scm
$(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@
-irregex.c: $(SRCDIR)irregex.scm $(SRCDIR)irregex-core.scm $(SRCDIR)irregex-utils.scm $(SRCDIR)common-declarations.scm
- $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@
+regex.c: $(SRCDIR)regex.scm $(SRCDIR)irregex.scm $(SRCDIR)common-declarations.scm
+ $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@
scheduler.c: $(SRCDIR)scheduler.scm $(SRCDIR)common-declarations.scm
$(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@
profiler.c: $(SRCDIR)profiler.scm $(SRCDIR)common-declarations.scm
@@ -883,6 +895,8 @@ srfi-69.import.c: $(SRCDIR)srfi-69.import.scm
$(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@
extras.import.c: $(SRCDIR)extras.import.scm
$(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@
+regex.import.c: $(SRCDIR)regex.import.scm
+ $(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@
irregex.import.c: $(SRCDIR)irregex.import.scm
$(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@
srfi-14.import.c: $(SRCDIR)srfi-14.import.scm
@@ -966,7 +980,7 @@ setup-download.c: $(SRCDIR)setup-download.scm setup-api.c
distfiles: library.c eval.c expand.c chicken-syntax.c chicken-ffi-syntax.c \
data-structures.c ports.c files.c extras.c lolevel.c utils.c \
tcp.c srfi-1.c srfi-4.c srfi-13.c srfi-14.c srfi-18.c srfi-69.c \
- posixunix.c posixwin.c irregex.c scheduler.c profiler.c stub.c \
+ posixunix.c posixwin.c regex.c scheduler.c profiler.c stub.c \
chicken-profile.c chicken-install.c chicken-uninstall.c chicken-status.c \
csc.c csi.c chicken.c batch-driver.c compiler.c optimizer.c \
compiler-syntax.c scrutinizer.c unboxing.c support.c \
@@ -1009,7 +1023,7 @@ spotless: distclean testclean
-$(REMOVE_COMMAND) $(REMOVE_COMMAND_OPTIONS) library.c eval.c data-structures.c \
ports.c files.c extras.c lolevel.c utils.c chicken-syntax.c chicken-ffi-syntax.c \
tcp.c srfi-1.c srfi-4.c srfi-13.c srfi-14.c srfi-18.c srfi-69.c expand.c \
- posixunix.c posixwin.c irregex.c scheduler.c profiler.c stub.c \
+ posixunix.c posixwin.c regex.c scheduler.c profiler.c stub.c \
chicken-profile.c chicken-bug.c \
csc.c csi.c chicken-install.c chicken-uninstall.c chicken-status.c \
chicken.c batch-driver.c compiler.c optimizer.c compiler-syntax.c \
diff --git a/scripts/henrietta.scm b/scripts/henrietta.scm
index 3a3f829e..af227e22 100644
--- a/scripts/henrietta.scm
+++ b/scripts/henrietta.scm
@@ -36,7 +36,7 @@
; list
-(require-library setup-download irregex extras utils ports srfi-1 posix)
+(require-library setup-download regex extras utils ports srfi-1 posix)
(module main ()
@@ -65,8 +65,8 @@
(remove-directory tmpdir)))
(define test-file?
- (let ((rx (irregex "(\\./)?tests(/.*)?")))
- (lambda (path) (irregex-match rx path))))
+ (let ((rx (regexp "(\\./)?tests(/.*)?")))
+ (lambda (path) (string-match rx path))))
(define (retrieve name version)
(let ((dir (handle-exceptions ex
@@ -113,8 +113,8 @@
(display dir)
(fail "unable to retrieve extension-list"))))
- (define query-string-rx (irregex "[^?]+\\?(.+)"))
- (define query-arg-rx (irregex "^&?(\\w+)=([^&]+)"))
+ (define query-string-rx (regexp "[^?]+\\?(.+)"))
+ (define query-arg-rx (regexp "^&?(\\w+)=([^&]+)"))
(define (service)
(let ((qs (getenv "QUERY_STRING"))
@@ -123,13 +123,13 @@
(or ra "<unknown>") qs)
(unless qs
(error "no QUERY_STRING set"))
- (let ((m (irregex-match query-string-rx qs))
+ (let ((m (string-match query-string-rx qs))
(egg #f)
(version #f))
- (let loop ((qs (if m (irregex-match-substring m 1) qs)))
- (let* ((m (irregex-search query-arg-rx qs))
- (ms (and m (irregex-match-substring m 1)))
- (rest (and m (substring qs (irregex-match-end-index m)))))
+ (let loop ((qs (if m (cadr m) qs)))
+ (let* ((m (string-search-positions query-arg-rx qs))
+ (ms (and m (apply substring qs (cadr m))))
+ (rest (and m (substring qs (cadar m)))))
(cond ((not m)
(headers) ; from here on use `fail'
(cond (egg
@@ -137,10 +137,10 @@
(cleanup) )
(else (fail "no extension name specified") ) ))
((string=? ms "version")
- (set! version (irregex-match-substring m 2))
+ (set! version (apply substring qs (caddr m)))
(loop rest))
((string=? ms "name")
- (set! egg (irregex-match-substring m 2))
+ (set! egg (apply substring qs (caddr m)))
(loop rest))
((string=? ms "tests")
(set! *tests* #t)
@@ -149,7 +149,7 @@
(headers)
(listing))
((string=? ms "mode")
- (set! *mode* (string->symbol (irregex-match-substring m 2)))
+ (set! *mode* (string->symbol (apply substring qs (caddr m))))
(loop rest))
(else
(warning "unrecognized query option" ms)
diff --git a/scripts/make-egg-index.scm b/scripts/make-egg-index.scm
index 69e12c2f..055b987e 100644
--- a/scripts/make-egg-index.scm
+++ b/scripts/make-egg-index.scm
@@ -2,8 +2,9 @@
(load-relative "tools.scm")
-(use setup-download matchable sxml-transforms data-structures irregex)
+(use setup-download matchable sxml-transforms data-structures regex)
+(import irregex)
(define *help* #f)
(define *major-version* (##sys#fudge 41))
@@ -223,24 +224,24 @@
(let* ((end (irregex-match-end m 0))
(acc (kons i m acc)))
(lp end acc))))))))
- (irregex-fold
- irx
- (lambda (i m s)
- (cons (matched (irregex-match-substring m 1))
- (cons (did-not-match
- (substring str i (irregex-match-start-index m 0)))
- s)))
- '()
- str
- (lambda (i s)
- (reverse (cons (did-not-match (substring str i))
- s)))))
+ (let ((irregex-match-start-index irregex-match-start)) ;; upcoming API change in irregex 0.7
+ (irregex-fold irx
+ (lambda (i m s)
+ (cons (matched (irregex-match-substring m 1))
+ (cons (did-not-match
+ (substring str i (irregex-match-start-index m 0)))
+ s)))
+ '()
+ str
+ (lambda (i s)
+ (reverse (cons (did-not-match (substring str i))
+ s))))))
(transform
+link-regexp+
str
(lambda (name) ;; wiki username
`(a (@ (href ,(string-append "http://chicken.wiki.br/users/"
- (irregex-replace/all " " name "-" name))))
+ (string-substitute " " "-" name 'global))))
,name))
(lambda (x) ;; raw HTML chunk
`(literal ,x))))
diff --git a/scripts/makedist.scm b/scripts/makedist.scm
index a9ca70d7..e44f234c 100644
--- a/scripts/makedist.scm
+++ b/scripts/makedist.scm
@@ -1,7 +1,7 @@
;;;; makedist.scm - Make distribution tarballs
-(use srfi-69 irregex)
+(use srfi-69)
(define *release* #f)
@@ -15,7 +15,7 @@
(define *platform*
(let ((sv (symbol->string (software-version))))
- (cond ((irregex-match ".*bsd" sv) "bsd")
+ (cond ((string-match ".*bsd" sv) "bsd")
(else
(case (build-platform)
((mingw32)
diff --git a/scripts/setversion b/scripts/setversion
index d751ee66..a46d7290 100644
--- a/scripts/setversion
+++ b/scripts/setversion
@@ -10,6 +10,10 @@ exec csi -s "$0" "$@"
(define files '("README" "manual/The User's Manual"))
+(define-syntax rx
+ (syntax-rules ()
+ ((_ r) (force (delay (regexp r))))))
+
(define (patch which rx subst)
(cond ((and (list? which) (= 2 (length which)))
(let ((from (car which))
@@ -22,17 +26,17 @@ exec csi -s "$0" "$@"
(let loop ()
(let ((ln (read-line)))
(unless (eof-object? ln)
- (write-line (irregex-replace/all rx ln subst))
+ (write-line (string-substitute rx subst ln #t))
(loop) ) ) ) )
- #:binary) )
- #:binary)))
+ binary:) )
+ binary:)))
(else
(let ((tmp (create-temporary-file)))
(patch (list which tmp) rx subst)
(system* "mv ~S ~S" tmp which) ) ) ) )
(define (parse-version v)
- (string-match (irregex "(\\d+)\\.(\\d+)\\.(\\d+)(.*)") v) )
+ (string-match (rx "(\\d+)\\.(\\d+)\\.(\\d+)(.*)") v) )
(define (main args)
(let ((major (##sys#fudge 41))
@@ -61,14 +65,14 @@ exec csi -s "$0" "$@"
binary:)
(system* "cat version.scm")
(let ([vstr (sprintf "version ~A" buildversion)])
- (for-each (cut patch <> (irregex "version [0-9][-.0-9a-zA-Z]+") vstr) files) )
+ (for-each (cut patch <> (rx "version [0-9][-.0-9a-zA-Z]+") vstr) files) )
(patch
"chicken.h"
- (irregex "C_MAJOR_VERSION[ \\t]+[0-9]+")
+ (rx "C_MAJOR_VERSION[ \\t]+[0-9]+")
(sprintf "C_MAJOR_VERSION ~a" major))
(patch
"chicken.h"
- (irregex "C_MINOR_VERSION[ \\t]+[0-9]+")
+ (rx "C_MINOR_VERSION[ \\t]+[0-9]+")
(sprintf "C_MINOR_VERSION ~a" minor))
0))
diff --git a/scripts/tools.scm b/scripts/tools.scm
index 23dac5e5..c809067f 100644
--- a/scripts/tools.scm
+++ b/scripts/tools.scm
@@ -341,19 +341,15 @@
(set! debug #t) )
(else (usage 1)) )
(loop (cdr args)) )
- ((irregex-match "([-_A-Za-z0-9]+)=(.*)" x) =>
+ ((string-match "([-_A-Za-z0-9]+)=(.*)" x) =>
(lambda (m)
- (let* ((sym (string->symbol (irregex-match-substring m 1))))
+ (let* ((sym (string->symbol (cadr m))))
(if (##sys#symbol-has-toplevel-binding? sym)
(let ((val (##sys#slot sym 0)))
- (if (or (boolean? val)
- (string? val)
- (symbol? val)
- (eq? (void) val))
- (##sys#setslot sym 0 (irregex-match-substring m 2))
- (quit "variable `~a' already has a suspicious value"
- sym) ) )
- (##sys#setslot sym 0 (irregex-match-substring m 2)) )
+ (if (or (boolean? val) (string? val) (symbol? val) (eq? (void) val))
+ (##sys#setslot sym 0 (caddr m))
+ (quit "variable `~a' already has a suspicious value" sym) ) )
+ (##sys#setslot sym 0 (caddr m)) )
(loop (cdr args)) ) ) )
(else
(set! targets (cons x targets))
@@ -428,24 +424,22 @@
val)))
(let loop ((args args) (vals '()))
(cond ((null? args) (reverse vals))
- ((irregex-match "(-{1,2})([-_A-Za-z0-9]+)(=)?\\s*(.+)?" (car args))
+ ((string-match "(-{1,2})([-_A-Za-z0-9]+)(=)?\\s*(.+)?" (car args))
=>
(lambda (m)
(let*-values (((next) (cdr args))
((var val)
- (cond ((equal? "=" (irregex-match-substring m 3))
- (let ((opt (irregex-match-substring m 2))
- (val (irregex-match-substring m 4)))
+ (cond ((equal? "=" (fourth m))
+ (let ((opt (third m))
+ (val (fifth m)))
(cond (val (values opt val))
(else
(when (null? next)
- (error "missing argument for option"
- (car args)) )
+ (error "missing argument for option" (car args)) )
(let ((x (car next)))
(set! next (cdr next))
(values opt x))))) )
- ((string? (irregex-match-substring m 1))
- (values (irregex-match-substring m 2) #t))
+ ((string? (second m)) (values (third m) #t))
(else (values #f #f)) ) ) )
(cond (var
(assign var val)
diff --git a/scrutinizer.scm b/scrutinizer.scm
index e9c30800..097a1e55 100644
--- a/scrutinizer.scm
+++ b/scrutinizer.scm
@@ -48,7 +48,6 @@
; | (procedure [NAME] (VAL1 ... [#!optional VALOPT1 ...] [#!rest [VAL | values]]) . RESULTS)
; | BASIC
; | deprecated
-; | (deprecated NAME)
; BASIC = * | string | symbol | char | number | boolean | list | pair |
; procedure | vector | null | eof | undefined | port |
; blob | noreturn | pointer | locative | fixnum | float
@@ -90,14 +89,8 @@
((eq? a 'deprecated)
(report
loc
- (sprintf "use of deprecated library procedure `~a'" id) )
+ (sprintf "use of deprecated toplevel identifier `~a'" id) )
'*)
- ((and (pair? a) (eq? (car a) 'deprecated))
- (report
- loc
- (sprintf "use of deprecated library procedure `~a' - consider using `~a' instead"
- id (cadr a)))
- '*)
(else (list a)))))
(else '*)))
(define (variable-result id e loc)
@@ -476,8 +469,7 @@
(every procedure-type? (cdr t)))))))
(define (procedure-argument-types t n)
(cond ((or (memq t '(* procedure))
- (not-pair? t)
- (eq? 'deprecated (car t)))
+ (not-pair? t) )
(values (make-list n '*) #f))
((eq? 'procedure (car t))
(let* ((vf #f)
@@ -581,10 +573,8 @@
(walk (first subs) e loc var)
loc))
(b (assq var e)) )
- (when (and type
- (not b)
- (not (or (eq? type 'deprecated)
- (and (pair? type) (eq? 'deprecated (car type)))))
+ (when (and type (not b)
+ (not (eq? type 'deprecated))
(not (match type rt)))
(report
loc
diff --git a/setup-api.scm b/setup-api.scm
index 02b3c12e..bffd48d7 100644
--- a/setup-api.scm
+++ b/setup-api.scm
@@ -24,7 +24,7 @@
; POSSIBILITY OF SUCH DAMAGE.
-(require-library srfi-1 irregex utils posix srfi-13 extras ports data-structures files)
+(require-library srfi-1 regex utils posix srfi-13 extras ports data-structures files)
; This code is partially quite messy and the API is not overly consistent,
; mainly because it has grown "organically" while the old chicken-setup program
@@ -66,7 +66,7 @@
shellpath)
(import scheme chicken foreign
- irregex utils posix ports extras data-structures
+ regex utils posix ports extras data-structures
srfi-1 srfi-13 files)
;;; Constants, variables and parameters
@@ -191,7 +191,7 @@
(let loop ()
(let ((ln (read-line)))
(unless (eof-object? ln)
- (write-line (irregex-replace/all rx ln subst))
+ (write-line (string-substitute rx subst ln #t))
(loop) ) ) ) ) ) ) )
(let ((tmp (create-temporary-file)))
(patch (list tmp tmp) rx subst)
@@ -718,7 +718,7 @@
(define (version>=? v1 v2)
(define (version->list v)
(map (lambda (x) (or (string->number x) x))
- (irregex-split "[-\\._]" (->string v))))
+ (string-split-fields "[-\\._]" (->string v) #:infix)))
(let loop ((p1 (version->list v1))
(p2 (version->list v2)))
(cond ((null? p1) (null? p2))
diff --git a/setup-download.scm b/setup-download.scm
index 3508eb2f..02a687ad 100644
--- a/setup-download.scm
+++ b/setup-download.scm
@@ -24,7 +24,7 @@
; POSSIBILITY OF SUCH DAMAGE.
-(require-library extras irregex posix utils setup-api srfi-1 data-structures tcp srfi-13
+(require-library extras regex posix utils setup-api srfi-1 data-structures tcp srfi-13
files)
@@ -37,13 +37,11 @@
temporary-directory)
(import scheme chicken)
- (import extras irregex posix utils srfi-1 data-structures tcp srfi-13 files setup-api)
+ (import extras regex posix utils srfi-1 data-structures tcp srfi-13 files setup-api)
(define-constant +default-tcp-connect-timeout+ 10000) ; 10 seconds
(define-constant +default-tcp-read/write-timeout+ 20000) ; 20 seconds
- (define-constant +url-regex+ "(http://)?([^/:]+)(:([^:/]+))?(/.+)")
-
(tcp-connect-timeout +default-tcp-connect-timeout+)
(tcp-read-timeout +default-tcp-read/write-timeout+)
(tcp-write-timeout +default-tcp-read/write-timeout+)
@@ -140,9 +138,7 @@
[tagver (existing-version
egg version
(filter-map
- (lambda (f)
- (and-let* ((m (irregex-search "^tags/([^/]+)/" f)))
- (irregex-match-substring m 1)))
+ (lambda (f) (and-let* ((m (string-search "^tags/([^/]+)/" f))) (cadr m)))
files))])
(let-values ([(filedir ver)
(if tagver
@@ -174,15 +170,14 @@
(conc dir #\/ egg ".meta"))
(define (deconstruct-url url)
- (let ([m (irregex-match +url-regex+ url)])
+ (let ([m (string-match "(http://)?([^/:]+)(:([^:/]+))?(/.+)" url)])
(values
- (if m (irregex-match-substring m 2) url)
- (if (and m (irregex-match-substring m 3))
- (let ((port (irregex-match-substring m 4)))
- (or (string->number port)
- (error "not a valid port" port)))
+ (if m (caddr m) url)
+ (if (and m (cadddr m))
+ (or (string->number (list-ref m 4))
+ (error "not a valid port" (list-ref m 4)))
80)
- (if m (irregex-match-substring m 5) "/")) ) )
+ (if m (list-ref m 5) "/")) ) )
(define (locate-egg/http egg url #!optional version destination tests
proxy-host proxy-port)
@@ -231,13 +226,13 @@
(define (match-http-response rsp)
(and (string? rsp)
- (irregex-match "HTTP/[0-9.]+\\s+([0-9]+)\\s+.*" rsp)) )
+ (string-match "HTTP/[0-9.]+\\s+([0-9]+)\\s+.*" rsp)) )
(define (response-match-code? mrsp code)
- (and mrsp (string=? (number->string code) (irregex-match-substring mrsp 1))) )
+ (and mrsp (string=? (number->string code) (cadr mrsp))) )
(define (match-chunked-transfer-encoding ln)
- (irregex-match "[Tt]ransfer-[Ee]ncoding:\\s*chunked.*" ln) )
+ (string-match "[Tt]ransfer-[Ee]ncoding:\\s*chunked.*" ln) )
(define (http-fetch host port locn dest proxy-host proxy-port)
(d "connecting to host ~s, port ~a ~a...~%" host port
diff --git a/setup.defaults b/setup.defaults
index 80839dc4..3dce8dc2 100644
--- a/setup.defaults
+++ b/setup.defaults
@@ -24,7 +24,7 @@
(map
(data-structures
extras files foreign irregex lolevel ports tcp utils
- posix irregex setup-api setup-download
+ posix regex setup-api setup-download
srfi-1 srfi-4 srfi-13 srfi-14 srfi-18 srfi-69
->) )
diff --git a/tests/re-tests.txt b/tests/re-tests.txt
index 3b7bf976..a73604c6 100644
--- a/tests/re-tests.txt
+++ b/tests/re-tests.txt
@@ -104,9 +104,6 @@ a[bcd]*dcdcde adcdcde y & adcdcde
a[bcd]+dcdcde adcdcde n - -
(ab|a)b*c abc y &-\1 abc-ab
((a)(b)c)(d) abcd y \1-\2-\3-\4 abc-a-b-d
-((a)(b)?c)(d) abcd y \1-\2-\3-\4 abc-a-b-d
-((a)(b)?c)(d) acd y \1-\2-\3-\4 ac-a--d
-((aa)(bb)?cc)(dd) aaccdd y \1-\2-\3-\4 aacc-aa--dd
[ -~]* abc y & abc
[ -~ -~]* abc y & abc
[ -~ -~ -~]* abc y & abc
@@ -121,13 +118,8 @@ a[bcd]+dcdcde adcdcde n - -
(bc+d$|ef*g.|h?i(j|k)) effg n - -
(bc+d$|ef*g.|h?i(j|k)) bcdd n - -
(bc+d$|ef*g.|h?i(j|k)) reffgz y &-\1-\2 effgz-effgz-
-((((((((((a))))))))) - c - -
-((((((((((a)))))))))) a y &-\10 a-a
(((((((((a))))))))) a y & a
multiple words of text uh-uh n - -
multiple words multiple words, yeah y & multiple words
(.*)c(.*) abcde y &-\1-\2 abcde-ab-de
\((.*), (.*)\) (a, b) y (\2, \1) (b, a)
-(we|wee|week)(knights|night) weeknights y &-\1-\2 weeknights-wee-knights
-(a([^a])*)* abcaBC y &-\1-\2 abcaBC-aBC-C
-([Aa]b).*\1 abxyzab y &-\1 abxyzab-ab
diff --git a/tests/runtests.sh b/tests/runtests.sh
index 22481ccc..d75607f6 100644
--- a/tests/runtests.sh
+++ b/tests/runtests.sh
@@ -20,7 +20,7 @@ for x in setup-api.so setup-api.import.so setup-download.so \
srfi-1.import.so srfi-4.import.so data-structures.import.so \
ports.import.so files.import.so posix.import.so \
srfi-13.import.so srfi-69.import.so extras.import.so \
- irregex.import.so srfi-14.import.so tcp.import.so \
+ regex.import.so srfi-14.import.so tcp.import.so \
foreign.import.so scheme.import.so srfi-18.import.so \
utils.import.so csi.import.so irregex.import.so types.db; do
cp ../$x test-repository
diff --git a/tests/scrutiny.expected b/tests/scrutiny.expected
index cd3a5bc4..9c5c82a8 100644
--- a/tests/scrutiny.expected
+++ b/tests/scrutiny.expected
@@ -1,6 +1,6 @@
Warning: at toplevel:
- use of deprecated library procedure `current-environment'
+ use of deprecated toplevel identifier `current-environment'
Warning: in local procedure `c',
in local procedure `b',
diff --git a/tests/sgrep.scm b/tests/sgrep.scm
index 555829e4..7503256a 100644
--- a/tests/sgrep.scm
+++ b/tests/sgrep.scm
@@ -1,7 +1,7 @@
;;;; sgrep.scm - grepping benchmark
-(use irregex extras utils posix srfi-1)
+(use regex extras utils posix srfi-1)
(define big-string
@@ -21,7 +21,7 @@
(lambda (line)
(set! c (fx+ c 1))
;(when (zero? (fxmod c 500)) (print* "."))
- (when (irregex-search expr line)
+ (when (string-search expr line)
(set! h (fx+ h 1)))
#f))
;(newline)
diff --git a/tests/test-glob.scm b/tests/test-glob.scm
index 91fc3d64..a5732384 100644
--- a/tests/test-glob.scm
+++ b/tests/test-glob.scm
@@ -1,20 +1,20 @@
;;;; test-glob.scm - test glob-pattern -> regex translation
-(use irregex)
+(use regex)
-(assert (irregex-match (##sys#glob->regexp "foo.bar") "foo.bar"))
-(assert (irregex-match (##sys#glob->regexp "foo*") "foo.bar"))
-(assert (irregex-match (##sys#glob->regexp "foo/*") "foo/bar"))
-(assert (not (irregex-match (##sys#glob->regexp "foo/*") "foo/bar/baz")))
-(assert (irregex-match (##sys#glob->regexp "foo/*/*") "foo/bar/baz"))
-(assert (not (irregex-match (##sys#glob->regexp "foo/*") "foo/.bar")))
-(assert (irregex-match (##sys#glob->regexp "*foo") "xyzfoo"))
-(assert (not (irregex-match (##sys#glob->regexp "*foo") ".foo")))
-(assert (not (irregex-match (##sys#glob->regexp "*foo*") "a.fooxxx/yyy")))
-(assert (irregex-match (##sys#glob->regexp "*foo*") "fooxxx"))
-(assert (irregex-match (##sys#glob->regexp "main.[ch]") "main.c"))
-(assert (irregex-match (##sys#glob->regexp "main.[ch]") "main.h"))
-(assert (not (irregex-match (##sys#glob->regexp "main.[ch]") "main.cpp")))
-(assert (irregex-match (##sys#glob->regexp "main.[-c]") "main.h"))
-(assert (not (irregex-match (##sys#glob->regexp "main.[-h]") "main.h")))
+(assert (string-match (glob->regexp "foo.bar") "foo.bar"))
+(assert (string-match (glob->regexp "foo*") "foo.bar"))
+(assert (string-match (glob->regexp "foo/*") "foo/bar"))
+(assert (not (string-match (glob->regexp "foo/*") "foo/bar/baz")))
+(assert (string-match (glob->regexp "foo/*/*") "foo/bar/baz"))
+(assert (not (string-match (glob->regexp "foo/*") "foo/.bar")))
+(assert (string-match (glob->regexp "*foo") "xyzfoo"))
+(assert (not (string-match (glob->regexp "*foo") ".foo")))
+(assert (not (string-match (glob->regexp "*foo*") "a.fooxxx/yyy")))
+(assert (string-match (glob->regexp "*foo*") "fooxxx"))
+(assert (string-match (glob->regexp "main.[ch]") "main.c"))
+(assert (string-match (glob->regexp "main.[ch]") "main.h"))
+(assert (not (string-match (glob->regexp "main.[ch]") "main.cpp")))
+(assert (string-match (glob->regexp "main.[-c]") "main.h"))
+(assert (not (string-match (glob->regexp "main.[-h]") "main.h")))
diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm
index 5fdc0340..12d49adf 100644
--- a/tests/test-irregex.scm
+++ b/tests/test-irregex.scm
@@ -1,13 +1,15 @@
;;;: test-irregex.scm
-(use extras irregex)
+(use extras regex)
(include "test.scm")
+(import irregex)
+
(define (subst-matches matches subst)
(define (submatch n)
- (if (irregex-match-data? matches)
+ (if (vector? matches)
(irregex-match-substring matches n)
(list-ref matches n)))
(and
@@ -26,41 +28,35 @@
((#\\)
(let ((c (read-char in)))
(if (char-numeric? c)
- (let lp ((res (list c)))
- (if (and (char? (peek-char in))
- (char-numeric? (peek-char in)))
- (lp (cons (read-char in) res))
- (display
- (or (submatch (string->number
- (list->string (reverse res))))
- "")
- out)))
+ (display
+ (or (submatch (string->number (string c))) "")
+ out)
(write-char c out))))
(else
(write-char c out)))
(lp)))))))))))
(define (test-re matcher line)
- (let ((splt (string-split line "\t" #t)))
- (if (list? splt)
- (apply
- (lambda (pattern input result subst output)
- (let ((name (sprintf "~A ~A ~A ~A" pattern input result subst)))
- (cond
- ((equal? "c" result)
- (test-error name (matcher pattern input)))
- ((equal? "n" result)
- (test-assert name (not (matcher pattern input))))
- (else
- (test-equal name output
- (subst-matches (matcher pattern input) subst))))))
- splt)
- (warning "invalid regex test line" line))))
+ (apply
+ (lambda (pattern input result subst output)
+ (let ((name (sprintf "~A ~A ~A" pattern input result)))
+ (cond
+ ((equal? "c" result)
+ (test-error name (matcher pattern input)))
+ ((equal? "n" result)
+ (test-assert name (not (matcher pattern input))))
+ ((equal? "y" result)
+ (test-assert name (matcher pattern input)))
+ (else
+ (test-equal name
+ (subst-matches (matcher pattern input) subst)
+ result)))))
+ (string-split line "\t" #t)))
+
(test-begin)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; basic irregex
(for-each
(lambda (opts)
@@ -73,139 +69,9 @@
(irregex-search (apply irregex pat opts) str))
line))
read-line)))))
- '((backtrack)
- (fast)
- ))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; chunked irregex
-
-(define (rope . args)
- (map (lambda (x) (if (pair? x) x (list x 0 (string-length x)))) args))
-
-(define rope-chunker
- (make-irregex-chunker
- (lambda (x) (and (pair? (cdr x)) (cdr x)))
- caar
- cadar
- caddar
- (lambda (src1 i src2 j)
- (if (eq? src1 src2)
- (substring (caar src1) i j)
- (let lp ((src (cdr src1))
- (res (list (substring (caar src1) i (caddar src1)))))
- (if (eq? src src2)
- (string-intersperse
- (reverse (cons (substring (caar src2) (cadar src2) j) res))
- "")
- (lp (cdr src)
- (cons (substring (caar src) (cadar src) (caddar src))
- res))))))))
-
-(define (make-ropes str)
- (let ((len (string-length str)))
- (case len
- ((0 1)
- (list (rope str)))
- ((2)
- (list (rope str)
- (rope (substring str 0 1) (substring str 1 2))))
- ((3)
- (list (rope str)
- (rope (substring str 0 1) (substring str 1 3))
- (rope (substring str 0 2) (substring str 2 3))
- (rope (substring str 0 1)
- (substring str 1 2)
- (substring str 2 3))))
- (else
- (let ((mid (quotient (+ len 1) 2)))
- (list (rope str)
- (rope (substring str 0 1) (substring str 1 len))
- (rope (substring str 0 mid) (substring str mid len))
- (rope (substring str 0 (- len 1))
- (substring str (- len 1) len))
- (rope (substring str 0 1)
- (substring str 1 mid)
- (substring str mid len))
- ))))))
-
-(define (make-shared-ropes str)
- (let ((len (string-length str)))
- (case len
- ((0 1)
- '())
- ((2)
- (list (list (list str 0 1) (list str 1 2))))
- ((3)
- (list (list (list str 0 1) (list str 1 3))
- (list (list str 0 2) (list str 2 3))
- (list (list str 0 1) (list str 1 2) (list str 2 3))))
- (else
- (let ((mid (quotient (+ len 1) 2)))
- (list (list (list str 0 1) (list str 1 len))
- (list (list str 0 mid) (list str mid len))
- (list (list str 0 (- len 1))
- (list str (- len 1) len))
- (list (list str 0 1) (list str 1 mid) (list str mid len))
- ))))))
-
-(for-each
- (lambda (opts)
- (test-group
- (sprintf "irregex/chunked - ~S" opts)
- (with-input-from-file "re-tests.txt"
- (lambda ()
- (port-for-each
- (lambda (line)
- (let ((splt (string-split line "\t" #t)))
- (if (list? splt)
- (apply
- (lambda (pattern input result subst output)
- (let ((name
- (sprintf "~A ~A ~A ~A" pattern input result subst)))
- (cond
- ((equal? "c" result))
- ((equal? "n" result)
- (for-each
- (lambda (rope)
- (test-assert name
- (not (irregex-search/chunked pattern
- rope-chunker
- rope))))
- (append (make-ropes input)
- (make-shared-ropes input))))
- (else
- (for-each
- (lambda (rope)
- (test-equal
- name output
- (subst-matches (irregex-search/chunked pattern
- rope-chunker
- rope)
- subst)))
- (append (make-ropes input)
- (make-shared-ropes input)))))))
- splt)
- (warning "invalid regex test line" line))))
- read-line)))))
- '((backtrack)
- (fast)
- ))
+ '((small) (fast)))
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; pregexp
-
-'(test-group "pregexp"
- (with-input-from-file "re-tests.txt"
- (lambda ()
- (port-for-each
- (lambda (line) (test-re pregexp-match line))
- read-line))))
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; default regex (PCRE)
-
-'(test-group "regex"
+(test-group "regex"
(with-input-from-file "re-tests.txt"
(lambda ()
(port-for-each
@@ -214,139 +80,14 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-(test-group "unmatchable patterns"
- (test-assert (not (irregex-search '(or) "abc")))
- (test-assert (not (irregex-search '(: "ab" (or)) "abc")))
- (test-assert (not (irregex-search '(submatch "ab" (or)) "abc")))
- (test-assert (not (irregex-search '(: "ab" (submatch (or))) "abc")))
- (test-assert (not (irregex-search '(/) "abc")))
- (test-assert (not (irregex-search '(: "ab" (/)) "abc")))
- (test-assert (not (irregex-search '(~ any) "abc")))
- (test-assert (not (irregex-search '(: "ab" (~ any)) "abc")))
- (test-assert (not (irregex-search '("") "abc")))
- (test-assert (not (irregex-search '(: "ab" ("")) "abc")))
- )
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(test-group "beginning/end of chunks"
- (test-assert
- (irregex-search/chunked '(: bos "foo") rope-chunker '((" foo" 0 4)) 1))
- (test-assert
- (irregex-search/chunked '(: bos "foo") rope-chunker '((" foo" 1 5)) 2))
- (test-assert
- (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 1 4)) 1))
- (test-assert
- (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 2 5)) 2))
- (test-assert
- (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 0 4)) 1))
- (test-assert
- (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 1 5)) 2))
- )
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(test-group "API"
- (test-assert (irregex? (irregex "a.*b")))
- (test-assert (irregex? (irregex '(: "a" (* any) "b"))))
- (test-assert (not (irregex? (vector '*irregex-tag* #f #f #f #f #f #f #f))))
- (test-assert (not (irregex? (vector #f #f #f #f #f #f #f #f #f))))
- (test-assert (irregex-match-data? (irregex-search "a.*b" "axxxb")))
- (test-assert (irregex-match-data? (irregex-match "a.*b" "axxxb")))
- (test-assert (not (irregex-match-data? (vector '*irregex-match-tag* #f #f #f #f #f #f #f #f #f))))
- (test-assert (not (irregex-match-data? (vector #f #f #f #f #f #f #f #f #f #f #f))))
- (test-equal 0 (irregex-num-submatches (irregex "a.*b")))
- (test-equal 1 (irregex-num-submatches (irregex "a(.*)b")))
- (test-equal 2 (irregex-num-submatches (irregex "(a(.*))b")))
- (test-equal 2 (irregex-num-submatches (irregex "a(.*)(b)")))
- (test-equal 10 (irregex-num-submatches (irregex "((((((((((a))))))))))")))
- (test-equal 0 (irregex-match-num-submatches (irregex-search "a.*b" "axxxb")))
- (test-equal 1 (irregex-match-num-submatches (irregex-search "a(.*)b" "axxxb")))
- (test-equal 2 (irregex-match-num-submatches (irregex-search "(a(.*))b" "axxxb")))
- (test-equal 2 (irregex-match-num-submatches (irregex-search "a(.*)(b)" "axxxb")))
- (test-equal 10 (irregex-match-num-submatches (irregex-search "((((((((((a))))))))))" "a")))
- )
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
(test-group "utils"
- (test-equal "h*llo world"
- (irregex-replace "[aeiou]" "hello world" "*"))
- (test-equal "h*ll* w*rld"
- (irregex-replace/all "[aeiou]" "hello world" "*"))
- (test-equal '("bob@test.com" "fred@example.com")
- (irregex-fold 'email
- (lambda (i m s) (cons (irregex-match-substring m) s))
- '()
- "bob@test.com and fred@example.com"
- (lambda (i s) (reverse s))))
- (test-equal '("bob@test.com" "fred@example.com")
- (irregex-fold/chunked
- 'email
- (lambda (src i m s) (cons (irregex-match-substring m) s))
- '()
- rope-chunker
- (rope "bob@test.com and fred@example.com")
- (lambda (src i s) (reverse s))))
- )
-
-
-(define (extract name irx str)
- (irregex-match-substring (irregex-match irx str) name))
-
-(test-group "named submatches"
- (test-equal "matching submatch is seen and extracted"
- "first" (extract 'first `(or (submatch-named first "first")
- (submatch-named second "second"))
- "first"))
- (test-equal "nonmatching submatch is known but returns false"
- #f (extract 'second `(or (submatch-named first "first")
- (submatch-named second "second"))
- "first"))
- (test-error "nonexisting submatch is unknown and raises an error"
- (extract 'third `(or (submatch-named first "first")
- (submatch-named second "second"))
- "first"))
- (test-equal "matching alternative is used"
- "first" (extract 'sub `(or (submatch-named sub "first")
- (submatch-named sub "second"))
- "first"))
- (test-equal "matching alternative is used (second match)"
- "second" (extract 'sub `(or (submatch-named sub "first")
- (submatch-named sub "second"))
- "second"))
- (test-equal "last match is used with multiple matches for a name"
- "second" (extract 'sub `(seq (submatch-named sub "first")
- space
- (submatch-named sub "second"))
- "first second")))
+ (test-equal "replace"
+ (irregex-replace "[aeiou]" "hello world" "*")
+ "h*llo world")
+ (test-equal "replace/all"
+ (irregex-replace/all "[aeiou]" "hello world" "*")
+ "h*ll* w*rld"))
(test-end)
-
-
-;;; UTF-8 tests
-
-(test-begin)
-
-(test-assert (irregex-search "(?u:<..>)" "<漢字>"))
-(test-assert (irregex-search "(?u:<.*>)" "<漢字>"))
-(test-assert (irregex-search "(?u:<.+>)" "<漢字>"))
-(test-assert (not (irregex-search "(?u:<.>)" "<漢字>")))
-(test-assert (not (irregex-search "(?u:<...>)" "<漢>")))
-
-(test-assert (irregex-search "(?u:<[^a-z]*>)" "<漢字>"))
-(test-assert (not (irregex-search "(?u:<[^a-z]*>)" "<漢m字>")))
-(test-assert (irregex-search "(?u:<[^a-z][^a-z]>)" "<漢字>"))
-(test-assert (irregex-search "(?u:<あ*>)" "<あ>"))
-(test-assert (irregex-search "(?u:<あ*>)" "<ああ>"))
-(test-assert (not (irregex-search "(?u:<あ*>)" "<あxあ>")))
-
-(test-assert (irregex-search "(?u:<[あ-ん]*>)" "<あん>"))
-(test-assert (irregex-search "(?u:<[あ-ん]*>)" "<ひらがな>"))
-(test-assert (not (irregex-search "(?u:<[あ-ん]*>)" "<ひらgがな>")))
-
-(test-end)
-
-
(test-exit)
diff --git a/tests/test.scm b/tests/test.scm
index c16de6a5..e9b43c14 100644
--- a/tests/test.scm
+++ b/tests/test.scm
@@ -77,8 +77,7 @@
(define-syntax test-equal
(syntax-rules ()
((_ name expr value eq) (run-equal name (lambda () expr) value eq))
- ((_ name expr value) (run-equal name (lambda () expr) value equal?))
- ((_ expr value) (run-equal (->string value) (lambda () expr) value equal?))))
+ ((_ name expr value) (run-equal name (lambda () expr) value equal?))))
(define-syntax test-error
(syntax-rules ()
@@ -90,8 +89,7 @@
(define-syntax test-assert
(syntax-rules ()
- ((_ name expr) (run-equal name (lambda () (if expr #t #f)) #t eq?))
- ((_ expr) (run-equal (->string expr) (lambda () (if expr #t #f)) #t eq?))))
+ ((_ name expr) (run-equal name (lambda () (if expr #t #f)) #t eq?))))
(define-syntax test-group
(syntax-rules ()
diff --git a/types.db b/types.db
index e1b8ca57..c831ee6a 100644
--- a/types.db
+++ b/types.db
@@ -537,58 +537,30 @@
;; irregex
(irregex (procedure irregex (#!rest) *))
-;irregex-apply-match
-(irregex-dfa (procedure irregex-dfa (*) *))
-(irregex-dfa/extract (procedure irregex-dfa/extract (*) *))
-(irregex-dfa/search (procedure irregex-dfa/search (*) *))
-(irregex-extract (procedure irregex-extract (* string #!optional fixnum fixnum) list))
-(irregex-flags (procedure irregex-flags (*) *))
-(irregex-fold (procedure irregex-fold (* (procedure (fixnum (struct regexp-match)) *) * string #!optional (procedure (fixnum *) *) fixnum fixnum) *))
-(irregex-fold/chunked (procedure irregex-fold/chunked (* (procedure (fixnum (struct regexp-match)) *) * procedure * #!optional (procedure (fixnum *) *) fixnum fixnum) *))
-(irregex-lengths (procedure irregex-lengths (*) *))
-(irregex-match (procedure irregex-match (* string) *))
-;irregex-match?
+(string->irregex (procedure string->irregex (string #!rest) *))
+(sre->irregex (procedure sre->irregex (#!rest) *))
+(string->sre (procedure string->sre (string #!rest) *))
+(irregex? (procedure irregex? (*) boolean))
(irregex-match-data? (procedure irregex-match-data? (*) boolean))
-(irregex-match-end (procedure irregex-match-end (* #!optional *) *))
-;irregex-match-end-chunk
-(irregex-match-end-index (procedure irregex-match-end-index ((struct regexp-match) *) fixnum))
-(irregex-match-names (procedure irregex-match-names ((struct regexp-match)) list))
-(irregex-match-num-submatches (procedure irregex-match-num-submatches ((struct regexp-match)) fixnum))
-(irregex-match-start (procedure irregex-match-start (* #!optional *) *))
-;irregex-match-start-chunk
-(irregex-match-start-index (procedure irregex-match-start-index ((struct regexp-match) *) fixnum))
-(irregex-match-string (procedure irregex-match-string (*) *))
-(irregex-match-subchunk (procedure irregex-match-subchunk ((struct regexp-match) #!optional *) *))
-(irregex-match-substring (procedure irregex-match-substring (* #!optional *) *))
-(irregex-match/chunked (procedure irregex-match/chunked (* * * #!optional fixnum) *))
-(irregex-names (procedure irregex-names (*) *))
(irregex-new-matches (procedure irregex-new-matches (*) *))
-(irregex-nfa (procedure irregex-nfa (*) *))
-(irregex-num-submatches (procedure irregex-num-submatches (*) fixnum))
-(irregex-opt (procedure irregex-opt (list) *))
-(irregex-quote (procedure irregex-quote (string) string))
-(irregex-replace (procedure irregex-replace (* string #!rest) *))
-(irregex-replace/all (procedure irregex-replace/all (* string #!rest) *))
(irregex-reset-matches! (procedure irregex-reset-matches! (*) *))
+(irregex-match-start (procedure irregex-match-start (* #!optional *) *))
+(irregex-match-end (procedure irregex-match-end (* #!optional *) *))
+(irregex-match-substring (procedure irregex-match-substring (* #!optional *) *))
(irregex-search (procedure irregex-search (* string #!optional fixnum fixnum) *))
(irregex-search/matches (procedure irregex-search/matches (* string fixnum fixnum *) *))
-(irregex-split (procedure irregex-split (* string #!optional fixnum fixnum) list))
+(irregex-match (procedure irregex-match (* string) *))
+(irregex-match-string (procedure irregex-match-string (*) *))
+(irregex-replace (procedure irregex-replace (* string #!rest) *))
+(irregex-replace/all (procedure irregex-replace/all (* string #!rest) *))
+(irregex-dfa (procedure irregex-dfa (*) *))
+(irregex-dfa/search (procedure irregex-dfa/search (*) *))
+(irregex-dfa/extract (procedure irregex-dfa/extract (*) *))
+(irregex-nfa (procedure irregex-nfa (*) *))
+(irregex-flags (procedure irregex-flags (*) *))
(irregex-submatches (procedure irregex-submatches (*) *))
-(irregex? (procedure irregex? (*) boolean))
-(make-irregex-chunker
- (procedure make-irregex-chunker
- ((procedure (*) *)
- (procedure (*) *)
- #!optional
- (procedure (*) *)
- (procedure (*) *)
- (procedure (* fixnum * fixnum) string)
- (procedure (* fixnum * fixnum) *))
- *))
-(maybe-string->sre (procedure maybe-string->sre (*) *))
-(sre->irregex (procedure sre->irregex (#!rest) *))
-(string->irregex (procedure string->irregex (string #!rest) *))
-(string->sre (procedure string->sre (string #!rest) *))
+(irregex-lengths (procedure irregex-lengths (*) *))
+(irregex-names (procedure irregex-names (*) *))
;; lolevel
@@ -920,6 +892,22 @@
(with-input-from-pipe (procedure with-input-from-pipe (string (procedure () . *) #!optional symbol) . *))
(with-output-to-pipe (procedure with-output-to-pipe (string (procedure () . *) #!optional symbol) . *))
+;; regex
+
+(glob->regexp (procedure glob->regexp (string #!optional *) *))
+(glob? deprecated)
+(grep (procedure grep (* list #!optional (procedure (*) *)) list))
+(regexp (procedure regexp (* #!optional * * *) (struct regexp)))
+(regexp-escape (procedure regexp-escape (string) string))
+(regexp? (procedure regexp? (*) boolean))
+(string-match (procedure string-match (* string) *))
+(string-match-positions (procedure string-match-positions (* string) *))
+(string-search (procedure string-search (* string #!optional fixnum fixnum) *))
+(string-search-positions (procedure string-search-positions (* string #!optional fixnum fixnum) *))
+(string-split-fields (procedure string-split-fields (* string #!optional * fixnum) list))
+(string-substitute (procedure string-substitute (* string string #!optional *) string))
+(string-substitute* (procedure string-substitute* (string list #!optional *) string))
+
;; srfi-1
(alist-cons (procedure alist-cons (* * *) list))
diff --git a/utils.scm b/utils.scm
index b934a550..03bd4ed6 100644
--- a/utils.scm
+++ b/utils.scm
@@ -27,7 +27,7 @@
(declare
(unit utils)
- (uses extras srfi-13 posix files irregex)
+ (uses extras srfi-13 posix files regex)
(fixnum)
(hide chop-pds)
(disable-interrupts) )
@@ -115,15 +115,18 @@
;;; Scan lines until regex or predicate matches
(define scan-input-lines
- (lambda (rx #!optional (port ##sys#standard-input))
- (let ((rx (if (procedure? rx)
- rx
- (cute irregex-search (irregex rx) <>))))
- (let loop ()
- (let ((ln (read-line port)))
- (and (not (eof-object? ln))
- (or (rx ln)
- (loop))))))))
+ (let ((regexp regexp)
+ (read-line read-line)
+ (string-search string-search))
+ (lambda (rx #!optional (port ##sys#standard-input))
+ (let ((rx (if (procedure? rx)
+ rx
+ (cut string-search (regexp rx) <>))))
+ (let loop ()
+ (let ((ln (read-line port)))
+ (and (not (eof-object? ln))
+ (or (rx ln)
+ (loop)))))))))
;; Ask for confirmation
Trap