~ chicken-core (chicken-5) d720a5d15cc0fb0619c70ccc26eb888171744a7f
commit d720a5d15cc0fb0619c70ccc26eb888171744a7f Author: felix <felix@call-with-current-continuation.org> AuthorDate: Tue Aug 3 01:16:11 2010 +0200 Commit: felix <felix@call-with-current-continuation.org> CommitDate: Tue Aug 3 01:16:11 2010 +0200 Revert "Merge branch 'total-irregex' of ssh://chicken@galinha.ucpel.tche.br/~/chicken-core into experimental" This reverts commit c53dcbfd42c6baf325538aa312c3364bd5d3b88f. Well, this can happen to the best of us ... (that was a joke) diff --git a/chicken-install.scm b/chicken-install.scm index eb67238c..6150ef27 100644 --- a/chicken-install.scm +++ b/chicken-install.scm @@ -25,13 +25,13 @@ (require-library setup-download setup-api) -(require-library srfi-1 posix data-structures utils irregex ports extras srfi-13 files) +(require-library srfi-1 posix data-structures utils regex ports extras srfi-13 files) (require-library chicken-syntax) ; in case an import library reexports chicken syntax (require-library chicken-ffi-syntax) ; same reason, also for filling modules.db (module main () - (import scheme chicken srfi-1 posix data-structures utils irregex ports extras + (import scheme chicken srfi-1 posix data-structures utils regex ports extras srfi-13 files) (import setup-download setup-api) @@ -51,6 +51,7 @@ "srfi-13.import.so" "srfi-69.import.so" "extras.import.so" + "regex.import.so" "srfi-14.import.so" "tcp.import.so" "foreign.import.so" @@ -490,17 +491,17 @@ (let* ((files (glob (make-pathname (repository-path) "*.import.*"))) (tmpdir (create-temporary-directory)) (dbfile (make-pathname tmpdir +module-db+)) - (rx (irregex ".*/([^/]+)\\.import\\.(scm|so)"))) + (rx (regexp ".*/([^/]+)\\.import\\.(scm|so)"))) (print "loading import libraries ...") (fluid-let ((##sys#warnings-enabled #f)) (for-each (lambda (f) - (let ((m (irregex-match rx f))) + (let ((m (string-match rx f))) (handle-exceptions ex (print-error-message ex (current-error-port) (sprintf "Failed to import from `~a'" f)) - (eval `(import ,(string->symbol (irregex-match-substring m 1))))))) + (eval `(import ,(string->symbol (cadr m))))))) files)) (print "generating database") (let ((db @@ -586,10 +587,10 @@ EOF (define (setup-proxy uri) (if (string? uri) - (cond ((irregex-match "(.+)\\:([0-9]+)" uri) => + (cond ((string-match "(.+)\\:([0-9]+)" uri) => (lambda (m) - (set! *proxy-host* (irregex-match-substring m 1)) - (set! *proxy-port* (string->number (irregex-match-substring m 2)))) + (set! *proxy-host* (cadr m)) + (set! *proxy-port* (string->number (caddr m)))) (else (set! *proxy-host* uri) (set! *proxy-port* 80)))))) @@ -598,7 +599,7 @@ EOF (define (main args) (let ((update #f) - (rx (irregex "([^:]+):(.+)"))) + (rx (regexp "([^:]+):(.+)"))) (setup-proxy (get-environment-variable "http_proxy")) (let loop ((args args) (eggs '())) (cond ((null? args) @@ -740,14 +741,9 @@ EOF "") *eggs+dirs+vers*)) (loop (cdr args) (cons egg eggs)))) - ((irregex-match rx arg) => + ((string-match rx arg) => (lambda (m) - (loop - (cdr args) - (alist-cons - (irregex-match-substring m 1) - (irregex-match-substring m 2) - eggs)))) + (loop (cdr args) (alist-cons (cadr m) (caddr m) eggs)))) (else (loop (cdr args) (cons arg eggs)))))))))) (register-feature! 'chicken-install) diff --git a/chicken-status.scm b/chicken-status.scm index cbd246c3..5222ebb0 100644 --- a/chicken-status.scm +++ b/chicken-status.scm @@ -24,13 +24,13 @@ ; POSSIBILITY OF SUCH DAMAGE. -(require-library setup-api srfi-1 posix data-structures utils ports irregex files) +(require-library setup-api srfi-1 posix data-structures utils ports regex files) (module main () (import scheme chicken foreign) - (import srfi-1 posix data-structures utils ports irregex + (import srfi-1 posix data-structures utils ports regex files setup-api) (define-foreign-variable C_TARGET_LIB_HOME c-string) @@ -45,9 +45,6 @@ (make-pathname C_TARGET_LIB_HOME (sprintf "chicken/~a" C_BINARY_VERSION)) (repository-path))) - (define (grep rx lst) - (filter (cut irregex-search rx <>) lst)) - (define (gather-eggs patterns) (let ((eggs (map pathname-file (glob (make-pathname (repo-path) "*" "setup-info"))))) @@ -125,10 +122,11 @@ EOF (lambda () (let* ((patterns (map - irregex + regexp (cond ((null? pats) '(".*")) + ;;XXX change for total-irregex branch: (exact (map (lambda (p) - (string-append "^" (irregex-quote p) "$")) + (string-append "^" (regexp-escape p) "$")) pats)) (else pats)))) (eggs (gather-eggs patterns))) diff --git a/chicken-uninstall.scm b/chicken-uninstall.scm index bd6b35ee..8a074b4d 100644 --- a/chicken-uninstall.scm +++ b/chicken-uninstall.scm @@ -26,14 +26,14 @@ (require-library setup-api - srfi-1 posix data-structures utils ports irregex srfi-13 files) + srfi-1 posix data-structures utils ports regex srfi-13 files) (module main () (import scheme chicken foreign) (import setup-api) - (import srfi-1 posix data-structures utils ports irregex srfi-13 files) + (import srfi-1 posix data-structures utils ports regex srfi-13 files) (define-foreign-variable C_TARGET_LIB_HOME c-string) (define-foreign-variable C_BINARY_VERSION int) @@ -49,9 +49,6 @@ (define *force* #f) - (define (grep rx lst) - (filter (cut irregex-search rx <>) lst)) - (define (gather-eggs patterns) (let ((eggs (map pathname-file (glob (make-pathname (repo-path) "*" "setup-info"))))) @@ -120,8 +117,8 @@ EOF (map (lambda (p) (if exact - (irregex (string-append "^" (irregex-quote p) "$")) - (##sys#glob->regexp p))) + (regexp (string-append "^" (regexp-escape p) "$")) + (glob->regexp p))) pats)))) (else (let ((arg (car args))) diff --git a/compiler.scm b/compiler.scm index 5d064d2f..435909b0 100644 --- a/compiler.scm +++ b/compiler.scm @@ -1251,12 +1251,10 @@ (##sys#strip-syntax x se)) (define stripu ##sys#strip-syntax) (define (globalize sym) - (if (symbol? sym) - (let loop ((se se)) ; ignores syntax bindings - (cond ((null? se) (##sys#alias-global-hook sym #f)) - ((and (eq? sym (caar se)) (symbol? (cdar se))) (cdar se)) - (else (loop (cdr se))))) - sym)) + (let loop ((se se)) ; ignores syntax bindings + (cond ((null? se) (##sys#alias-global-hook sym #f)) + ((and (eq? sym (caar se)) (symbol? (cdar se))) (cdar se)) + (else (loop (cdr se)))))) (define (globalize-all syms) (map globalize syms)) (call-with-current-continuation (lambda (return) diff --git a/defaults.make b/defaults.make index 4564b952..e4e8537b 100644 --- a/defaults.make +++ b/defaults.make @@ -293,7 +293,7 @@ CSI ?= csi$(EXE) # Scheme compiler flags -CHICKEN_OPTIONS = -optimize-level 2 -include-path . -include-path $(SRCDIR) -inline -ignore-repository -feature building-chicken +CHICKEN_OPTIONS = -optimize-level 2 -include-path . -include-path $(SRCDIR) -inline -ignore-repository ifdef DEBUGBUILD CHICKEN_OPTIONS += -feature debugbuild -scrutinize -types $(SRCDIR)types.db else @@ -321,9 +321,12 @@ CHICKEN_INSTALL_PROGRAM = $(PROGRAM_PREFIX)chicken-install$(PROGRAM_SUFFIX) CHICKEN_UNINSTALL_PROGRAM = $(PROGRAM_PREFIX)chicken-uninstall$(PROGRAM_SUFFIX) CHICKEN_STATUS_PROGRAM = $(PROGRAM_PREFIX)chicken-status$(PROGRAM_SUFFIX) CHICKEN_BUG_PROGRAM = $(PROGRAM_PREFIX)chicken-bug$(PROGRAM_SUFFIX) -IMPORT_LIBRARIES = chicken lolevel srfi-1 srfi-4 data-structures ports files posix srfi-13 srfi-69 extras srfi-14 tcp foreign scheme srfi-18 utils csi irregex +IMPORT_LIBRARIES = chicken lolevel srfi-1 srfi-4 data-structures ports files posix srfi-13 srfi-69 extras \ + regex srfi-14 tcp foreign scheme srfi-18 utils csi irregex IMPORT_LIBRARIES += setup-api setup-download -SCRUTINIZED_LIBRARIES = library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 srfi-14 srfi-18 srfi-69 $(POSIXFILE) irregex scheduler profiler stub expand chicken-syntax chicken-ffi-syntax +SCRUTINIZED_LIBRARIES = library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 \ + srfi-14 srfi-18 srfi-69 $(POSIXFILE) regex scheduler \ + profiler stub expand chicken-syntax chicken-ffi-syntax ifdef STATICBUILD CHICKEN_STATIC_EXECUTABLE = $(CHICKEN_PROGRAM)$(EXE) diff --git a/distribution/manifest b/distribution/manifest index 01ce9249..affd61e9 100644 --- a/distribution/manifest +++ b/distribution/manifest @@ -28,7 +28,7 @@ optimizer.c compiler-syntax.c scrutinizer.c unboxing.c -irregex.c +regex.c posixunix.c posixwin.c profiler.c @@ -77,9 +77,8 @@ optimizer.scm compiler-syntax.scm scrutinizer.scm unboxing.scm +regex.scm irregex.scm -irregex-core.scm -irregex-utils.scm posixunix.scm posixwin.scm posix-common.scm @@ -209,6 +208,7 @@ posix.import.scm srfi-13.import.scm srfi-69.import.scm extras.import.scm +regex.import.scm irregex.import.scm srfi-14.import.scm tcp.import.scm @@ -222,6 +222,7 @@ posix.import.c srfi-13.import.c srfi-69.import.c extras.import.c +regex.import.c irregex.import.c srfi-14.import.c tcp.import.c @@ -287,7 +288,7 @@ manual/Unit library manual/Unit lolevel manual/Unit ports manual/Unit posix -manual/Unit irregex +manual/Unit regex manual/Unit srfi-1 manual/Unit srfi-13 manual/Unit srfi-14 diff --git a/eval.scm b/eval.scm index 69e2bd03..ff35cfaf 100644 --- a/eval.scm +++ b/eval.scm @@ -55,7 +55,7 @@ (define-foreign-variable binary-version int "C_BINARY_VERSION") (define ##sys#core-library-modules - '(extras lolevel utils files tcp irregex posix srfi-1 srfi-4 srfi-13 + '(extras lolevel utils files tcp regex posix srfi-1 srfi-4 srfi-13 srfi-14 srfi-18 srfi-69 data-structures ports chicken-syntax chicken-ffi-syntax)) diff --git a/files.scm b/files.scm index 1ef2eda2..2c1c167f 100644 --- a/files.scm +++ b/files.scm @@ -36,7 +36,7 @@ (declare (unit files) - (uses irregex data-structures) + (uses regex data-structures) (fixnum) (hide chop-pds absolute-pathname-root root-origin root-directory split-directory) (disable-interrupts) @@ -172,19 +172,20 @@ EOF (define root-origin) (define root-directory) -(if ##sys#windows-platform - (let ((rx (irregex "([A-Za-z]:)?([\\/\\\\]).*"))) - (set! absolute-pathname-root (lambda (pn) (irregex-match rx pn))) - (set! root-origin (lambda (rt) (and rt (irregex-match-substring rt 1)))) - (set! root-directory (lambda (rt) (and rt (irregex-match-substring rt 2)))) ) - (let ((rx (irregex "([\\/\\\\]).*"))) - (set! absolute-pathname-root (lambda (pn) (irregex-match rx pn))) - (set! root-origin (lambda (rt) #f)) - (set! root-directory (lambda (rt) (and rt (irregex-match-substring rt 1)))) ) ) +(let ((string-match string-match)) + (if ##sys#windows-platform + (let ((rx (regexp "([A-Za-z]:)?([\\/\\\\]).*"))) + (set! absolute-pathname-root (lambda (pn) (string-match rx pn))) + (set! root-origin (lambda (rt) (and rt (cadr rt)))) + (set! root-directory (lambda (rt) (and rt (caddr rt)))) ) + (let ((rx (regexp "([\\/\\\\]).*"))) + (set! absolute-pathname-root (lambda (pn) (string-match rx pn))) + (set! root-origin (lambda (rt) #f)) + (set! root-directory (lambda (rt) (and rt (cadr rt)))) ) ) ) (define (absolute-pathname? pn) (##sys#check-string pn 'absolute-pathname?) - (irregex-match-data? (absolute-pathname-root pn)) ) + (pair? (absolute-pathname-root pn)) ) (define-inline (*char-pds? ch) (memq ch '(#\\ #\/))) @@ -261,33 +262,28 @@ EOF file ext def-pds) ) ) ) (define decompose-pathname - (let* ([patt1 "^(.*[\\/\\\\])?([^\\/\\\\]+)(\\.([^\\/\\\\.]+))$"] - [patt2 "^(.*[\\/\\\\])?((\\.)?[^\\/\\\\]+)$"] - [rx1 (irregex patt1)] - [rx2 (irregex patt2)] - [strip-pds - (lambda (dir) - (and dir - (if (member dir '("/" "\\")) - dir - (chop-pds dir #f) ) ) )] ) - (lambda (pn) - (##sys#check-string pn 'decompose-pathname) - (if (fx= 0 (##sys#size pn)) - (values #f #f #f) - (let ([ms (irregex-search rx1 pn)]) - (if ms - (values - (strip-pds (irregex-match-substring ms 1)) - (irregex-match-substring ms 2) - (irregex-match-substring ms 4)) - (let ([ms (irregex-search rx2 pn)]) - (if ms - (values - (strip-pds (irregex-match-substring ms 1)) - (irregex-match-substring ms 2) - #f) - (values (strip-pds pn) #f #f) ) ) ) ) ) ) ) ) + (let ((string-match string-match)) + (let* ([patt1 "^(.*[\\/\\\\])?([^\\/\\\\]+)(\\.([^\\/\\\\.]+))$"] + [patt2 "^(.*[\\/\\\\])?((\\.)?[^\\/\\\\]+)$"] + [rx1 (regexp patt1)] + [rx2 (regexp patt2)] + [strip-pds + (lambda (dir) + (and dir + (if (member dir '("/" "\\")) + dir + (chop-pds dir #f) ) ) )] ) + (lambda (pn) + (##sys#check-string pn 'decompose-pathname) + (if (fx= 0 (##sys#size pn)) + (values #f #f #f) + (let ([ms (string-match rx1 pn)]) + (if ms + (values (strip-pds (cadr ms)) (caddr ms) (car (cddddr ms))) + (let ([ms (string-match rx2 pn)]) + (if ms + (values (strip-pds (cadr ms)) (caddr ms) #f) + (values (strip-pds pn) #f #f) ) ) ) ) ) ) ) ) ) (define pathname-directory) (define pathname-file) diff --git a/irregex-core.scm b/irregex-core.scm deleted file mode 100644 index 7cd57d81..00000000 --- a/irregex-core.scm +++ /dev/null @@ -1,3874 +0,0 @@ -;;;; irregex.scm -- IrRegular Expressions -;; -;; Copyright (c) 2005-2010 Alex Shinn. All rights reserved. -;; BSD-style license: http://synthcode.com/license.txt - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; At this moment there was a loud ring at the bell, and I could -;; hear Mrs. Hudson, our landlady, raising her voice in a wail of -;; expostulation and dismay. -;; -;; "By heaven, Holmes," I said, half rising, "I believe that -;; they are really after us." -;; -;; "No, it's not quite so bad as that. It is the unofficial -;; force, -- the Baker Street irregulars." - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Notes -;; -;; This code should not require any porting - it should work out of -;; the box in any R[45]RS Scheme implementation. Slight modifications -;; are needed for R6RS (a separate R6RS-compatible version is included -;; in the distribution as irregex-r6rs.scm). -;; -;; The goal of portability makes this code a little clumsy and -;; inefficient. Future versions will include both cleanup and -;; performance tuning, but you can only go so far while staying -;; portable. AND-LET*, SRFI-9 records and custom macros would've been -;; nice. - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; History -;; -;; 0.8.2: 2010/07/30 - (...)? submatch extraction fix and alternate -;; named submatches from Peter Bex -;; 0.8.1: 2010/03/09 - backtracking irregex-match fix and other small fixes -;; 0.8.0: 2010/01/20 - optimizing DFA compilation, adding SRE escapes -;; inside PCREs, adding utility SREs -;; 0.7.5: 2009/08/31 - adding irregex-extract and irregex-split -;; *-fold copies match data (use *-fold/fast for speed) -;; irregex-opt now returns an SRE -;; 0.7.4: 2009/05/14 - empty alternates (or) and empty csets always fail, -;; bugfix in default finalizer for irregex-fold/chunked -;; 0.7.3: 2009/04/14 - adding irregex-fold/chunked, minor doc fixes -;; 0.7.2: 2009/02/11 - some bugfixes, much improved documentation -;; 0.7.1: 2008/10/30 - several bugfixes (thanks to Derick Eddington) -;; 0.7.0: 2008/10/20 - support abstract chunked strings -;; 0.6.2: 2008/07/26 - minor bugfixes, allow global disabling of utf8 mode, -;; friendlier error messages in parsing, \Q..\E support -;; 0.6.1: 2008/07/21 - added utf8 mode, more utils, bugfixes -;; 0.6: 2008/05/01 - most of PCRE supported -;; 0.5: 2008/04/24 - fully portable R4RS, many PCRE features implemented -;; 0.4: 2008/04/17 - rewriting NFA to use efficient closure compilation, -;; normal strings only, but all of the spencer tests pass -;; 0.3: 2008/03/10 - adding DFA converter (normal strings only) -;; 0.2: 2005/09/27 - adding irregex-opt (like elisp's regexp-opt) utility -;; 0.1: 2005/08/18 - simple NFA interpreter over abstract chunked strings - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Data Structures - -(cond-expand - (building-chicken - (begin - (define-syntax (internal x r c) - `(,(with-input-from-string (cadr x) read) ,@(cddr x))) - ;; make-irregex defined elsewhere - (define (irregex? x) - (internal "##sys#structure?" x 'regexp)) - (define (irregex-dfa x) - (internal "##sys#check-structure" x 'regexp 'irregex-dfa) - (internal "##sys#slot" x 1)) - (define (irregex-dfa/search x) - (internal "##sys#check-structure" x 'regexp 'irregex-dfa/search) - (internal "##sys#slot" x 2)) - (define (irregex-dfa/extract x) - (internal "##sys#check-structure" x 'regexp 'irregex-dfa/extract) - (internal "##sys#slot" x 3)) - (define (irregex-nfa x) - (internal "##sys#check-structure" x 'regexp 'irregex-nfa) - (internal "##sys#slot" x 4)) - (define (irregex-flags x) - (internal "##sys#check-structure" x 'regexp 'irregex-flags) - (internal "##sys#slot" x 5)) - (define (irregex-num-submatches x) - (internal "##sys#check-structure" x 'regexp 'irregex-num-submatches) - (internal "##sys#slot" x 6)) - (define (irregex-lengths x) - (internal "##sys#check-structure" x 'regexp 'irregex-lengths) - (internal "##sys#slot" x 7)) - (define (irregex-names x) - (internal "##sys#check-structure" x 'regexp 'irregex-names) - (internal "##sys#slot" x 8)))) - (else - (begin - (define irregex-tag '*irregex-tag*) - (define (make-irregex dfa dfa/search dfa/extract nfa flags - submatches lengths names) - (vector irregex-tag dfa dfa/search dfa/extract nfa flags - submatches lengths names)) - (define (irregex? obj) - (and (vector? obj) - (= 9 (vector-length obj)) - (eq? irregex-tag (vector-ref obj 0)))) - (define (irregex-dfa x) (vector-ref x 1)) - (define (irregex-dfa/search x) (vector-ref x 2)) - (define (irregex-dfa/extract x) (vector-ref x 3)) - (define (irregex-nfa x) (vector-ref x 4)) - (define (irregex-flags x) (vector-ref x 5)) - (define (irregex-num-submatches x) (vector-ref x 6)) - (define (irregex-lengths x) (vector-ref x 7)) - (define (irregex-names x) (vector-ref x 8))))) - -(cond-expand - (building-chicken - (begin - ;; make-irregex-match defined elsewhere - (define (irregex-new-matches irx) - (make-irregex-match (irregex-num-submatches irx) (irregex-names irx))) - (define (irregex-reset-matches! m) - (let ((v (internal "##sys#slot" m 1))) - (vector-fill! v #f) - m)) - (define (irregex-copy-matches m) - (and (internal "##sys#structure?" m 'regexp-match) - (internal - "##sys#make-structure" - 'regexp-match - (let* ((v (internal "##sys#slot" m 1)) - (v2 (make-vector (internal "##sys#size" v)))) - (vector-copy! v v2) - v2) - (internal "##sys#slot" m 2) - (internal "##sys#slot" m 3) - (internal "##sys#slot" m 4)))) - (define (irregex-match-data? obj) - (internal "##sys#structure?" obj 'regexp-match)) - (define (irregex-match-num-submatches m) - (internal "##sys#check-structure" m 'regexp-match 'irregex-match-num-submatches) - (- (fx/ (internal "##sys#size" (internal "##sys#slot" m 1)) 4) 2)) - (define (irregex-match-chunker m) - (internal "##sys#slot" m 3)) - (define (irregex-match-names m) - (internal "##sys#check-structure" m 'regexp-match 'irregex-match-names) - (internal "##sys#slot" m 2)) - (define (irregex-match-chunker-set! m str) - (internal "##sys#setslot" m 3 str)) - (define-inline (%irregex-match-start-chunk m n) - (internal "##sys#slot" (internal "##sys#slot" m 1) (* n 4))) - (define-inline (%irregex-match-start-index m n) - (internal "##sys#slot" (internal "##sys#slot" m 1) (+ 1 (* n 4)))) - (define-inline (%irregex-match-end-chunk m n) - (internal "##sys#slot" (internal "##sys#slot" m 1) (+ 2 (* n 4)))) - (define (%irregex-match-end-index m n) - (internal "##sys#slot" (internal "##sys#slot" m 1) (+ 3 (* n 4)))) - (define (%irregex-match-fail m) (internal "##sys#slot" m 4)) - (define (%irregex-match-fail-set! m x) (internal "##sys#setslot" m 4 x)) - (define-record-printer (regexp-match m out) - (let ((n (irregex-match-num-submatches m))) - (display "#<regexp-match (" out) - (display n out) - (display " submatches)>" out))))) - (else - (begin - (define (irregex-new-matches irx) - (make-irregex-match (irregex-num-submatches irx) (irregex-names irx))) - (define (irregex-reset-matches! m) - (do ((i (- (vector-length m) 1) (- i 1))) - ((<= i 3) m) - (vector-set! m i #f))) - (define (irregex-copy-matches m) - (and (vector? m) - (let ((r (make-vector (vector-length m)))) - (do ((i (- (vector-length m) 1) (- i 1))) - ((< i 0) r) - (vector-set! r i (vector-ref m i)))))) - (define irregex-match-tag '*irregex-match-tag*) - (define (irregex-match-data? obj) - (and (vector? obj) - (>= (vector-length obj) 11) - (eq? irregex-match-tag (vector-ref obj 0)))) - (define (make-irregex-match count names) - (let ((res (make-vector (+ (* 4 (+ 2 count)) 4) #f))) - (vector-set! res 0 irregex-match-tag) - (vector-set! res 2 names) - res)) - (define (irregex-match-num-submatches m) - (- (quotient (- (vector-length m) 3) 4) 2)) - (define (irregex-match-chunker m) - (vector-ref m 1)) - (define (irregex-match-names m) - (vector-ref m 2)) - (define (irregex-match-chunker-set! m str) - (vector-set! m 1 str)) - (define (%irregex-match-start-chunk m n) (vector-ref m (+ 3 (* n 4)))) - (define (%irregex-match-start-index m n) (vector-ref m (+ 4 (* n 4)))) - (define (%irregex-match-end-chunk m n) (vector-ref m (+ 5 (* n 4)))) - (define (%irregex-match-end-index m n) (vector-ref m (+ 6 (* n 4)))) - (define (%irregex-match-fail m) (vector-ref m (- (vector-length m) 1))) - (define (%irregex-match-fail-set! m x) (vector-set! m (- (vector-length m) 1) x))))) - -;; public interface with error checking -(define (irregex-match-start-chunk m n) - (if (not (irregex-match-valid-index? m n)) - (error "irregex-match-start-chunk: not a valid index" m n)) - (%irregex-match-start-chunk m n)) -(define (irregex-match-start-index m n) - (if (not (irregex-match-valid-index? m n)) - (error "irregex-match-start-index: not a valid index" m n)) - (%irregex-match-start-index m n)) -(define (irregex-match-end-chunk m n) - (if (not (irregex-match-valid-index? m n)) - (error "irregex-match-end-chunk: not a valid index" m n)) - (%irregex-match-end-chunk m n)) -(define (irregex-match-end-index m n) - (if (not (irregex-match-valid-index? m n)) - (error "irregex-match-end-index: not a valid index" m n)) - (%irregex-match-end-index m n)) - -(define (irregex-match-start-chunk-set! m n start) - (vector-set! m (+ 3 (* n 4)) start)) -(define (irregex-match-start-index-set! m n start) - (vector-set! m (+ 4 (* n 4)) start)) -(define (irregex-match-end-chunk-set! m n end) - (vector-set! m (+ 5 (* n 4)) end)) -(define (irregex-match-end-index-set! m n end) - (vector-set! m (+ 6 (* n 4)) end)) - -(define (irregex-match-index m opt) - (if (pair? opt) - (if (number? (car opt)) - (car opt) - (let lp ((ls (irregex-match-names m)) - (exists #f)) - (cond ((null? ls) - (if exists #f (error "unknown match name" (car opt)))) - ((eq? (car opt) (caar ls)) - (if (%irregex-match-start-chunk m (cdar ls)) - (cdar ls) - (lp (cdr ls) #t))) - (else (lp (cdr ls) exists))))) - 0)) - -(cond-expand - (building-chicken - (define-inline (%irregex-match-valid-index? m n) - (let ((v (internal "##sys#slot" m 1))) - (and (< (* n 4) (internal "##sys#size" v)) - (internal "##sys#slot" v (+ 1 (* n 4))))))) - (else - (define (%irregex-match-valid-index? m n) - (and (< (+ 3 (* n 4)) (vector-length m)) - (vector-ref m (+ 4 (* n 4))))))) - -(define (irregex-match-valid-index? m n) - (if (not (irregex-match-data? m)) - (error "irregex-match-valid-index?: not match data" m)) - (if (not (integer? n)) - (error "irregex-match-valid-index?: not an integer" n)) - (%irregex-match-valid-index? m n)) - -(define (irregex-match-substring m . opt) - (if (not (irregex-match-data? m)) - (error "irregex-match-substring: not match data" m)) - (let* ((cnk (irregex-match-chunker m)) - (n (irregex-match-index m opt))) - (and (%irregex-match-valid-index? m n) - ((chunker-get-substring cnk) - (%irregex-match-start-chunk m n) - (%irregex-match-start-index m n) - (%irregex-match-end-chunk m n) - (%irregex-match-end-index m n))))) - -(define (irregex-match-subchunk m . opt) - (if (not (irregex-match-data? m)) - (error "irregex-match-subchunk: not match data" m)) - (let* ((cnk (irregex-match-chunker m)) - (n (irregex-match-index m opt)) - (get-subchunk (chunker-get-subchunk cnk))) - (if (not get-subchunk) - (error "this chunk type does not support match subchunks") - (and n - (%irregex-match-valid-index? m n) - (get-subchunk - (%irregex-match-start-chunk m n) - (%irregex-match-start-index m n) - (%irregex-match-end-chunk m n) - (%irregex-match-end-index m n)))))) - -;; chunkers tell us how to navigate through chained chunks of strings - -(define (make-irregex-chunker get-next get-str . o) - (let* ((get-start (or (and (pair? o) (car o)) (lambda (cnk) 0))) - (o (if (pair? o) (cdr o) o)) - (get-end (or (and (pair? o) (car o)) - (lambda (cnk) (string-length (get-str cnk))))) - (o (if (pair? o) (cdr o) o)) - (get-substr - (or (and (pair? o) (car o)) - (lambda (cnk1 start cnk2 end) - (if (eq? cnk1 cnk2) - (substring (get-str cnk1) start end) - (let loop ((cnk (get-next cnk1)) - (res (list (substring (get-str cnk1) - start - (get-end cnk1))))) - (if (eq? cnk cnk2) - (string-cat-reverse - (cons (substring (get-str cnk) - (get-start cnk) - end) - res)) - (loop (get-next cnk) - (cons (substring (get-str cnk) - (get-start cnk) - (get-end cnk)) - res)))))))) - (o (if (pair? o) (cdr o) o)) - (get-subchunk (and (pair? o) (car o)))) - (if (not (and (procedure? get-next) (procedure? get-str) - (procedure? get-start) (procedure? get-substr))) - (error "make-irregex-chunker: expected a procdure")) - (vector get-next get-str get-start get-end get-substr get-subchunk))) - -(define (chunker-get-next cnk) (vector-ref cnk 0)) -(define (chunker-get-str cnk) (vector-ref cnk 1)) -(define (chunker-get-start cnk) (vector-ref cnk 2)) -(define (chunker-get-end cnk) (vector-ref cnk 3)) -(define (chunker-get-substring cnk) (vector-ref cnk 4)) -(define (chunker-get-subchunk cnk) (vector-ref cnk 5)) - -(define (chunker-prev-chunk cnk start end) - (if (eq? start end) - #f - (let ((get-next (chunker-get-next cnk))) - (let lp ((start start)) - (let ((next (get-next start))) - (if (eq? next end) - start - (and next (lp next)))))))) - -(define (chunker-prev-char cnk start end) - (let ((prev (chunker-prev-chunk cnk start end))) - (and prev - (string-ref ((chunker-get-str cnk) prev) - (- ((chunker-get-end cnk) prev) 1))))) - -(define (chunker-next-char cnk src) - (let ((next ((chunker-get-next cnk) src))) - (and next - (string-ref ((chunker-get-str cnk) next) - ((chunker-get-start cnk) next))))) - -(define (chunk-before? cnk a b) - (and (not (eq? a b)) - (let ((next ((chunker-get-next cnk) a))) - (and next - (if (eq? next b) - #t - (chunk-before? cnk next b)))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; String Utilities - -;; Unicode version (skip surrogates) -(define *all-chars* - `(/ ,(integer->char 0) ,(integer->char #xD7FF) - ,(integer->char #xE000) ,(integer->char #x10FFFF))) - -;; ASCII version, offset to not assume 0-255 -;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223)))) - -;; set to #f to ignore even an explicit request for utf8 handling -(define *allow-utf8-mode?* #t) - -;; (define *named-char-properties* '()) - -(define (string-scan-char str c . o) - (let ((end (string-length str))) - (let scan ((i (if (pair? o) (car o) 0))) - (cond ((= i end) #f) - ((eqv? c (string-ref str i)) i) - (else (scan (+ i 1))))))) - -(define (string-scan-char-escape str c . o) - (let ((end (string-length str))) - (let scan ((i (if (pair? o) (car o) 0))) - (cond ((= i end) #f) - ((eqv? c (string-ref str i)) i) - ((eqv? c #\\) (scan (+ i 2))) - (else (scan (+ i 1))))))) - -(define (string-scan-pred str pred . o) - (let ((end (string-length str))) - (let scan ((i (if (pair? o) (car o) 0))) - (cond ((= i end) #f) - ((pred (string-ref str i)) i) - (else (scan (+ i 1))))))) - -(define (string-split-char str c) - (let ((end (string-length str))) - (let lp ((i 0) (from 0) (res '())) - (define (collect) (cons (substring str from i) res)) - (cond ((>= i end) (reverse (collect))) - ((eqv? c (string-ref str i)) (lp (+ i 1) (+ i 1) (collect))) - (else (lp (+ i 1) from res)))))) - -(define (char-alphanumeric? c) - (or (char-alphabetic? c) (char-numeric? c))) - -(define (%substring=? a b start1 start2 len) - (let lp ((i 0)) - (cond ((>= i len) - #t) - ((char=? (string-ref a (+ start1 i)) (string-ref b (+ start2 i))) - (lp (+ i 1))) - (else - #f)))) - -;; SRFI-13 extracts - -(define (%%string-copy! to tstart from fstart fend) - (do ((i fstart (+ i 1)) - (j tstart (+ j 1))) - ((>= i fend)) - (string-set! to j (string-ref from i)))) - -(define (string-cat-reverse string-list) - (string-cat-reverse/aux - (fold (lambda (s a) (+ (string-length s) a)) 0 string-list) - string-list)) - -(define (string-cat-reverse/aux len string-list) - (let ((res (make-string len))) - (let lp ((i len) (ls string-list)) - (if (pair? ls) - (let* ((s (car ls)) - (slen (string-length s)) - (i (- i slen))) - (%%string-copy! res i s 0 slen) - (lp i (cdr ls))))) - res)) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; List Utilities - -;; like the one-arg IOTA case -(define (zero-to n) - (if (<= n 0) - '() - (let lp ((i (- n 1)) (res '())) - (if (zero? i) (cons 0 res) (lp (- i 1) (cons i res)))))) - -;; take the head of list FROM up to but not including TO, which must -;; be a tail of the list -(define (take-up-to from to) - (let lp ((ls from) (res '())) - (if (and (pair? ls) (not (eq? ls to))) - (lp (cdr ls) (cons (car ls) res)) - (reverse res)))) - -;; SRFI-1 extracts (simplified 1-ary versions) - -(define (find pred ls) - (let lp ((ls ls)) - (cond ((null? ls) #f) - ((pred (car ls)) (car ls)) - (else (lp (cdr ls)))))) - -(define (find-tail pred ls) - (let lp ((ls ls)) - (cond ((null? ls) #f) - ((pred (car ls)) ls) - (else (lp (cdr ls)))))) - -(define (last ls) - (if (not (pair? ls)) - (error "can't take last of empty list" ls) - (let lp ((ls ls)) - (if (pair? (cdr ls)) - (lp (cdr ls)) - (car ls))))) - -(define (any pred ls) - (and (pair? ls) - (let lp ((head (car ls)) (tail (cdr ls))) - (if (null? tail) - (pred head) - (or (pred head) (lp (car tail) (cdr tail))))))) - -(define (every pred ls) - (or (null? ls) - (let lp ((head (car ls)) (tail (cdr ls))) - (if (null? tail) - (pred head) - (and (pred head) (lp (car tail) (cdr tail))))))) - -(define (fold kons knil ls) - (let lp ((ls ls) (res knil)) - (if (null? ls) - res - (lp (cdr ls) (kons (car ls) res))))) - -(define (filter pred ls) - (let lp ((ls ls) (res '())) - (if (null? ls) - (reverse res) - (lp (cdr ls) (if (pred (car ls)) (cons (car ls) res) res))))) - -(define (remove pred ls) - (let lp ((ls ls) (res '())) - (if (null? ls) - (reverse res) - (lp (cdr ls) (if (pred (car ls)) res (cons (car ls) res)))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Flags - -(define (bit-shr n i) - (quotient n (expt 2 i))) - -(define (bit-shl n i) - (* n (expt 2 i))) - -(define (bit-not n) (- #xFFFF n)) - -(define (bit-ior a b) - (cond - ((zero? a) b) - ((zero? b) a) - (else - (+ (if (or (odd? a) (odd? b)) 1 0) - (* 2 (bit-ior (quotient a 2) (quotient b 2))))))) - -(define (bit-and a b) - (cond - ((zero? a) 0) - ((zero? b) 0) - (else - (+ (if (and (odd? a) (odd? b)) 1 0) - (* 2 (bit-and (quotient a 2) (quotient b 2))))))) - -(define (integer-log n) - (define (b8 n r) - (if (>= n (bit-shl 1 8)) (b4 (bit-shr n 8) (+ r 8)) (b4 n r))) - (define (b4 n r) - (if (>= n (bit-shl 1 4)) (b2 (bit-shr n 4) (+ r 4)) (b2 n r))) - (define (b2 n r) - (if (>= n (bit-shl 1 2)) (b1 (bit-shr n 2) (+ r 2)) (b1 n r))) - (define (b1 n r) (if (>= n (bit-shl 1 1)) (+ r 1) r)) - (if (>= n (bit-shl 1 16)) (b8 (bit-shr n 16) 16) (b8 n 0))) - -(define (flag-set? flags i) - (= i (bit-and flags i))) -(define (flag-join a b) - (if b (bit-ior a b) a)) -(define (flag-clear a b) - (bit-and a (bit-not b))) - -(define ~none 0) -(define ~searcher? 1) -(define ~consumer? 2) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Parsing Embedded SREs in PCRE Strings - -;; (define (with-read-from-string str i proc) -;; (define (port-size in) -;; (let lp ((i 0)) (if (eof-object? (read-char in)) i (lp (+ i 1))))) -;; (let* ((len (string-length str)) -;; (tail-len (- len i)) -;; (in (open-input-string (substring str i len))) -;; (sre (read in)) -;; (unused-len (port-size in))) -;; (close-input-port in) -;; (proc sre (- tail-len unused-len)))) - -(define close-token (list 'close)) -(define dot-token (string->symbol ".")) - -(define (with-read-from-string str i proc) - (define end (string-length str)) - (define (read i k) - (cond - ((>= i end) (error "unterminated embedded SRE" str)) - (else - (case (string-ref str i) - ((#\() - (let lp ((i (+ i 1)) (ls '())) - (read - i - (lambda (x j) - (cond - ((eq? x close-token) - (k (reverse ls) j)) - ((eq? x dot-token) - (if (null? ls) - (error "bad dotted form" str) - (read j (lambda (y j2) - (read j2 (lambda (z j3) - (if (not (eq? z close-token)) - (error "bad dotted form" str) - (k (append (reverse (cdr ls)) - (cons (car ls) y)) - j3)))))))) - (else - (lp j (cons x ls)))))))) - ((#\)) - (k close-token (+ i 1))) - ((#\;) - (let skip ((i (+ i 1))) - (if (or (>= i end) (eqv? #\newline (string-ref str i))) - (read (+ i 1) k) - (skip (+ i 1))))) - ((#\' #\`) - (read (+ i 1) - (lambda (sexp j) - (let ((q (if (eqv? #\' (string-ref str i)) 'quote 'quasiquote))) - (k (list q sexp) j))))) - ((#\,) - (let* ((at? (and (< (+ i 1) end) (eqv? #\@ (string-ref str (+ i 1))))) - (u (if at? 'uquote-splicing 'unquote)) - (j (if at? (+ i 2) (+ i 1)))) - (read j (lambda (sexp j) (k (list u sexp) j))))) - ((#\") - (let scan ((from (+ i 1)) (i (+ i 1)) (res '())) - (define (collect) - (if (= from i) res (cons (substring str from i) res))) - (if (>= i end) - (error "unterminated string in embedded SRE" str) - (case (string-ref str i) - ((#\") (k (string-cat-reverse (collect)) (+ i 1))) - ((#\\) (scan (+ i 1) (+ i 2) (collect))) - (else (scan from (+ i 1) res)))))) - ((#\#) - (case (string-ref str (+ i 1)) - ((#\;) - (read (+ i 2) (lambda (sexp j) (read j k)))) - ((#\\) - (read (+ i 2) - (lambda (sexp j) - (k (case sexp - ((space) #\space) - ((newline) #\newline) - (else (let ((s (if (number? sexp) - (number->string sexp) - (symbol->string sexp)))) - (string-ref s 0)))) - j)))) - ((#\t #\f) - (k (eqv? #\t (string-ref str (+ i 1))) (+ i 2))) - (else - (error "bad # syntax in simplified SRE" i)))) - (else - (cond - ((char-whitespace? (string-ref str i)) - (read (+ i 1) k)) - (else ;; symbol/number - (let scan ((j (+ i 1))) - (cond - ((or (>= j end) - (let ((c (string-ref str j))) - (or (char-whitespace? c) - (memv c '(#\; #\( #\) #\" #\# #\\))))) - (let ((str2 (substring str i j))) - (k (or (string->number str2) (string->symbol str2)) j))) - (else (scan (+ j 1)))))))))))) - (read i (lambda (res j) - (if (eq? res 'close-token) - (error "unexpected ')' in SRE" str j) - (proc res j))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Parsing PCRE Strings - -(define ~save? 1) -(define ~case-insensitive? 2) -(define ~multi-line? 4) -(define ~single-line? 8) -(define ~ignore-space? 16) -(define ~utf8? 32) - -(define (symbol-list->flags ls) - (let lp ((ls ls) (res ~none)) - (if (not (pair? ls)) - res - (lp (cdr ls) - (flag-join - res - (case (car ls) - ((i ci case-insensitive) ~case-insensitive?) - ((m multi-line) ~multi-line?) - ((s single-line) ~single-line?) - ((x ignore-space) ~ignore-space?) - ((u utf8) (if *allow-utf8-mode?* ~utf8? ~none)) - (else #f))))))) - -(define (maybe-string->sre obj) - (if (string? obj) (string->sre obj) obj)) - -(define (string->sre str . o) - (if (not (string? str)) (error "string->sre: expected a string" str)) - (let ((end (string-length str)) - (flags (symbol-list->flags o))) - - (let lp ((i 0) (from 0) (flags flags) (res '()) (st '())) - - ;; handle case sensitivity at the literal char/string level - (define (cased-char ch) - (if (and (flag-set? flags ~case-insensitive?) - (char-alphabetic? ch)) - `(or ,ch ,(char-altcase ch)) - ch)) - (define (cased-string str) - (if (flag-set? flags ~case-insensitive?) - (sre-sequence (map cased-char (string->list str))) - str)) - ;; accumulate the substring from..i as literal text - (define (collect) - (if (= i from) res (cons (cased-string (substring str from i)) res))) - ;; like collect but breaks off the last single character when - ;; collecting literal data, as the argument to ?/*/+ etc. - (define (collect/single) - (let* ((utf8? (flag-set? flags ~utf8?)) - (j (if (and utf8? (> i 1)) - (utf8-backup-to-initial-char str (- i 1)) - (- i 1)))) - (cond - ((< j from) - res) - (else - (let ((c (cased-char (if utf8? - (utf8-string-ref str j (- i j)) - (string-ref str j))))) - (cond - ((= j from) - (cons c res)) - (else - (cons c - (cons (cased-string (substring str from j)) - res))))))))) - ;; collects for use as a result, reversing and grouping OR - ;; terms, and some ugly tweaking of `function-like' groups and - ;; conditionals - (define (collect/terms) - (let* ((ls (collect)) - (func - (and (pair? ls) - (memq (last ls) - '(atomic if look-ahead neg-look-ahead - look-behind neg-look-behind - => submatch-named - w/utf8 w/noutf8)))) - (prefix (if (and func (memq (car func) '(=> submatch-named))) - (list 'submatch-named (cadr (reverse ls))) - (and func (list (car func))))) - (ls (if func - (if (memq (car func) '(=> submatch-named)) - (reverse (cddr (reverse ls))) - (reverse (cdr (reverse ls)))) - ls))) - (let lp ((ls ls) (term '()) (res '())) - (define (shift) - (cons (sre-sequence term) res)) - (cond - ((null? ls) - (let* ((res (sre-alternate (shift))) - (res (if (flag-set? flags ~save?) - (list 'submatch res) - res))) - (if prefix - (if (eq? 'if (car prefix)) - (cond - ((not (pair? res)) - 'epsilon) - ((memq (car res) - '(look-ahead neg-look-ahead - look-behind neg-look-behind)) - res) - ((eq? 'seq (car res)) - `(if ,(cadr res) - ,(if (pair? (cdr res)) - (sre-sequence (cddr res)) - 'epsilon))) - (else - `(if ,(cadadr res) - ,(if (pair? (cdr res)) - (sre-sequence (cddadr res)) - 'epsilon) - ,(sre-alternate - (if (pair? (cdr res)) (cddr res) '()))))) - `(,@prefix ,res)) - res))) - ((eq? 'or (car ls)) (lp (cdr ls) '() (shift))) - (else (lp (cdr ls) (cons (car ls) term) res)))))) - (define (save) - (cons (cons flags (collect)) st)) - - ;; main parsing - (if (>= i end) - (if (pair? st) - (error "unterminated parenthesis in regexp" str) - (collect/terms)) - (let ((c (string-ref str i))) - (case c - ((#\.) - (lp (+ i 1) (+ i 1) flags - (cons (if (flag-set? flags ~single-line?) 'any 'nonl) - (collect)) - st)) - ((#\?) - (let ((res (collect/single))) - (if (null? res) - (error "? can't follow empty pattern" str res) - (let ((x (car res))) - (lp (+ i 1) - (+ i 1) - flags - (cons - (if (pair? x) - (case (car x) - ((*) `(*? ,@(cdr x))) - ((+) `(**? 1 #f ,@(cdr x))) - ((?) `(?? ,@(cdr x))) - ((**) `(**? ,@(cdr x))) - ((=) `(**? ,(cadr x) ,@(cdr x))) - ((>=) `(**? ,(cadr x) #f ,@(cddr x))) - (else `(? ,x))) - `(? ,x)) - (cdr res)) - st))))) - ((#\+ #\*) - (let* ((res (collect/single)) - (x (if (pair? res) (car res) 'epsilon)) - (op (string->symbol (string c)))) - (cond - ((sre-repeater? x) - (error "duplicate repetition (e.g. **) in pattern" str res)) - ((sre-empty? x) - (error "can't repeat empty pattern (e.g. ()*)" str res)) - (else - (lp (+ i 1) (+ i 1) flags - (cons (list op x) (cdr res)) - st))))) - ((#\() - (cond - ((>= (+ i 1) end) - (error "unterminated parenthesis in regexp" str)) - ((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case - (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save))) - ((>= (+ i 2) end) - (error "unterminated parenthesis in regexp" str)) - ((eqv? (string-ref str (+ i 1)) #\*) - (if (eqv? #\' (string-ref str (+ i 2))) - (with-read-from-string str (+ i 3) - (lambda (sre j) - (if (or (>= j end) (not (eqv? #\) (string-ref str j)))) - (error "unterminated (*'...) SRE escape" str) - (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)))) - (error "bad regexp syntax: (*FOO) not supported" str))) - (else ;; (?...) case - (case (string-ref str (+ i 2)) - ((#\#) - (let ((j (string-scan-char str #\) (+ i 3)))) - (lp (+ j i) (+ j 1) flags (collect) st))) - ((#\:) - (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save))) - ((#\=) - (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) - '(look-ahead) (save))) - ((#\!) - (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) - '(neg-look-ahead) (save))) - ((#\<) - (cond - ((>= (+ i 3) end) - (error "unterminated parenthesis in regexp" str)) - (else - (case (string-ref str (+ i 3)) - ((#\=) - (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) - '(look-behind) (save))) - ((#\!) - (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) - '(neg-look-behind) (save))) - (else - (let ((j (and (char-alphabetic? - (string-ref str (+ i 3))) - (string-scan-char str #\> (+ i 4))))) - (if j - (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) - `(,(string->symbol (substring str (+ i 3) j)) - submatch-named) - (save)) - (error "invalid (?< sequence" str)))))))) - ((#\>) - (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) - '(atomic) (save))) - ;;((#\' #\P) ; named subpatterns - ;; ) - ;;((#\R) ; recursion - ;; ) - ((#\() - (cond - ((>= (+ i 3) end) - (error "unterminated parenthesis in regexp" str)) - ((char-numeric? (string-ref str (+ i 3))) - (let* ((j (string-scan-char str #\) (+ i 3))) - (n (string->number (substring str (+ i 3) j)))) - (if (not n) - (error "invalid conditional reference" str) - (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) - `(,n if) (save))))) - ((char-alphabetic? (string-ref str (+ i 3))) - (let* ((j (string-scan-char str #\) (+ i 3))) - (s (string->symbol (substring str (+ i 3) j)))) - (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) - `(,s if) (save)))) - (else - (lp (+ i 2) (+ i 2) (flag-clear flags ~save?) - '(if) (save))))) - ((#\{) - (error "unsupported Perl-style cluster" str)) - (else - (let ((old-flags flags)) - (let lp2 ((j (+ i 2)) (flags flags) (invert? #f)) - (define (join x) - ((if invert? flag-clear flag-join) flags x)) - (define (new-res res) - (let ((before (flag-set? old-flags ~utf8?)) - (after (flag-set? flags ~utf8?))) - (if (eq? before after) - res - (cons (if after 'w/utf8 'w/noutf8) res)))) - (cond - ((>= j end) - (error "incomplete cluster" str i)) - (else - (case (string-ref str j) - ((#\i) - (lp2 (+ j 1) (join ~case-insensitive?) invert?)) - ((#\m) - (lp2 (+ j 1) (join ~multi-line?) invert?)) - ((#\x) - (lp2 (+ j 1) (join ~ignore-space?) invert?)) - ((#\u) - (if *allow-utf8-mode?* - (lp2 (+ j 1) (join ~utf8?) invert?) - (lp2 (+ j 1) flags invert?))) - ((#\-) - (lp2 (+ j 1) flags (not invert?))) - ((#\)) - (lp (+ j 1) (+ j 1) flags (new-res (collect)) - st)) - ((#\:) - (lp (+ j 1) (+ j 1) flags (new-res '()) - (cons (cons old-flags (collect)) st))) - (else - (error "unknown regex cluster modifier" str) - ))))))))))) - ((#\)) - (if (null? st) - (error "too many )'s in regexp" str) - (lp (+ i 1) - (+ i 1) - (caar st) - (cons (collect/terms) (cdar st)) - (cdr st)))) - ((#\[) - (apply - (lambda (sre j) - (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)) - (string-parse-cset str (+ i 1) flags))) - ((#\{) - (cond - ((or (>= (+ i 1) end) - (not (or (char-numeric? (string-ref str (+ i 1))) - (eqv? #\, (string-ref str (+ i 1)))))) - (lp (+ i 1) from flags res st)) - (else - (let ((res (collect/single))) - (cond - ((null? res) - (error "{ can't follow empty pattern")) - (else - (let* ((x (car res)) - (tail (cdr res)) - (j (string-scan-char str #\} (+ i 1))) - (s2 (string-split-char (substring str (+ i 1) j) - #\,)) - (n (string->number (car s2))) - (m (and (pair? (cdr s2)) - (string->number (cadr s2))))) - (cond - ((or (not n) - (and (pair? (cdr s2)) - (not (equal? "" (cadr s2))) - (not m))) - (error "invalid {n} repetition syntax" s2)) - ((null? (cdr s2)) - (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st)) - (m - (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st)) - (else - (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st) - ))))))))) - ((#\\) - (cond - ((>= (+ i 1) end) - (error "incomplete escape sequence" str)) - (else - (let ((c (string-ref str (+ i 1)))) - (case c - ((#\d) - (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st)) - ((#\D) - (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st)) - ((#\s) - (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st)) - ((#\S) - (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st)) - ((#\w) - (lp (+ i 2) (+ i 2) flags - `((or alphanumeric ("_")) ,@(collect)) st)) - ((#\W) - (lp (+ i 2) (+ i 2) flags - `((~ (or alphanumeric ("_"))) ,@(collect)) st)) - ((#\b) - (lp (+ i 2) (+ i 2) flags - `((or bow eow) ,@(collect)) st)) - ((#\B) - (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st)) - ((#\A) - (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st)) - ((#\Z) - (lp (+ i 2) (+ i 2) flags - `((? #\newline) eos ,@(collect)) st)) - ((#\z) - (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st)) - ((#\R) - (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st)) - ((#\K) - (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st)) - ;; these two are from Emacs and TRE, but not in PCRE - ((#\<) - (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st)) - ((#\>) - (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st)) - ((#\x) - (apply - (lambda (ch j) - (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st)) - (string-parse-hex-escape str (+ i 2) end))) - ((#\k) - (let ((c (string-ref str (+ i 2)))) - (if (not (memv c '(#\< #\{ #\'))) - (error "bad \\k usage, expected \\k<...>" str) - (let* ((terminal (char-mirror c)) - (j (string-scan-char str terminal (+ i 2))) - (s (and j (substring str (+ i 3) j))) - (backref - (if (flag-set? flags ~case-insensitive?) - 'backref-ci - 'backref))) - (if (not j) - (error "unterminated named backref" str) - (lp (+ j 1) (+ j 1) flags - `((,backref ,(string->symbol s)) - ,@(collect)) - st)))))) - ((#\Q) ;; \Q..\E escapes - (let ((res (collect))) - (let lp2 ((j (+ i 2))) - (cond - ((>= j end) - (lp j (+ i 2) flags res st)) - ((eqv? #\\ (string-ref str j)) - (cond - ((>= (+ j 1) end) - (lp (+ j 1) (+ i 2) flags res st)) - ((eqv? #\E (string-ref str (+ j 1))) - (lp (+ j 2) (+ j 2) flags - (cons (substring str (+ i 2) j) res) st)) - (else - (lp2 (+ j 2))))) - (else - (lp2 (+ j 1))))))) - ((#\') - (with-read-from-string str (+ i 2) - (lambda (sre j) - (lp j j flags (cons sre (collect)) st)))) - ;;((#\p) ; XXXX unicode properties - ;; ) - ;;((#\P) - ;; ) - (else - (cond - ((char-numeric? c) - (let* ((j (or (string-scan-pred - str - (lambda (c) (not (char-numeric? c))) - (+ i 2)) - end)) - (backref - (if (flag-set? flags ~case-insensitive?) - 'backref-ci - 'backref)) - (res `((,backref ,(string->number - (substring str (+ i 1) j))) - ,@(collect)))) - (lp j j flags res st))) - ((char-alphabetic? c) - (let ((cell (assv c posix-escape-sequences))) - (if cell - (lp (+ i 2) (+ i 2) flags - (cons (cdr cell) (collect)) st) - (error "unknown escape sequence" str c)))) - (else - (lp (+ i 2) (+ i 1) flags (collect) st))))))))) - ((#\|) - (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st)) - ((#\^) - (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos))) - (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) - ((#\$) - (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos))) - (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) - ((#\space) - (if (flag-set? flags ~ignore-space?) - (lp (+ i 1) (+ i 1) flags (collect) st) - (lp (+ i 1) from flags res st))) - ((#\#) - (if (flag-set? flags ~ignore-space?) - (let ((j (or (string-scan-char str #\newline (+ i 1)) - (- end 1)))) - (lp (+ j 1) (+ j 1) flags (collect) st)) - (lp (+ i 1) from flags res st))) - (else - (lp (+ i 1) from flags res st)))))))) - -(define posix-escape-sequences - `((#\n . #\newline) - (#\r . ,(integer->char (+ (char->integer #\newline) 3))) - (#\t . ,(integer->char (- (char->integer #\newline) 1))) - (#\a . ,(integer->char (- (char->integer #\newline) 3))) - (#\e . ,(integer->char (+ (char->integer #\newline) #x11))) - (#\f . ,(integer->char (+ (char->integer #\newline) 2))) - )) - -(define (char-altcase c) - (if (char-upper-case? c) (char-downcase c) (char-upcase c))) - -(define (char-mirror c) - (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c))) - -(define (string-parse-hex-escape str i end) - (cond - ((>= i end) - (error "incomplete hex escape" str i)) - ((eqv? #\{ (string-ref str i)) - (let ((j (string-scan-char-escape str #\} (+ i 1)))) - (if (not j) - (error "incomplete hex brace escape" str i) - (let* ((s (substring str (+ i 1) j)) - (n (string->number s 16))) - (if n - (list (integer->char n) j) - (error "bad hex brace escape" s)))))) - ((>= (+ i 1) end) - (error "incomplete hex escape" str i)) - (else - (let* ((s (substring str i (+ i 2))) - (n (string->number s 16))) - (if n - (list (integer->char n) (+ i 2)) - (error "bad hex escape" s)))))) - -(define (string-parse-cset str start flags) - (let* ((end (string-length str)) - (invert? (and (< start end) (eqv? #\^ (string-ref str start)))) - (utf8? (flag-set? flags ~utf8?))) - (define (go i chars ranges) - (if (>= i end) - (error "incomplete char set" str i end) - (let ((c (string-ref str i))) - (case c - ((#\]) - (if (and (null? chars) (null? ranges)) - (go (+ i 1) (cons #\] chars) ranges) - (let ((ci? (flag-set? flags ~case-insensitive?)) - (hi-chars (if utf8? (filter high-char? chars) '())) - (chars (if utf8? (remove high-char? chars) chars))) - (list - ((lambda (res) - (if invert? (cons '~ res) (sre-alternate res))) - (append - hi-chars - (if (pair? chars) - (list - (list (list->string - ((if ci? - cset-case-insensitive - (lambda (x) x)) - (reverse chars))))) - '()) - (if (pair? ranges) - (let ((res (if ci? - (cset-case-insensitive - (reverse ranges)) - (reverse ranges)))) - (list (cons '/ (alist->plist res)))) - '()))) - i)))) - ((#\-) - (cond - ((or (= i start) - (and (= i (+ start 1)) (eqv? #\^ (string-ref str start))) - (eqv? #\] (string-ref str (+ i 1)))) - (go (+ i 1) (cons c chars) ranges)) - ((null? chars) - (error "bad char-set")) - (else - (let* ((c1 (car chars)) - (c2 (string-ref str (+ i 1)))) - (apply - (lambda (c2 j) - (if (char<? c2 c1) - (error "inverted range in char-set" c1 c2) - (go j (cdr chars) (cons (cons c1 c2) ranges)))) - (cond - ((and (eqv? #\\ c2) (assv c2 posix-escape-sequences)) - => (lambda (x) (list (cdr x) (+ i 3)))) - ((and (eqv? #\\ c2) - (eqv? (string-ref str (+ i 2)) #\x)) - (string-parse-hex-escape str (+ i 3) end)) - ((and utf8? (<= #x80 (char->integer c2) #xFF)) - (let ((len (utf8-start-char->length c2))) - (list (utf8-string-ref str (+ i 1) len) (+ i 1 len)))) - (else - (list c2 (+ i 2))))))))) - ((#\[) - (let* ((inv? (eqv? #\^ (string-ref str (+ i 1)))) - (i2 (if inv? (+ i 2) (+ i 1)))) - (case (string-ref str i2) - ((#\:) - (let ((j (string-scan-char str #\: (+ i2 1)))) - (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1))))) - (error "incomplete character class" str) - (let* ((cset (sre->cset - (string->symbol - (substring str (+ i2 1) j)))) - (cset (if inv? (cset-complement cset) cset))) - (go (+ j 2) - (append (filter char? cset) chars) - (append (filter pair? cset) ranges)))))) - ((#\= #\.) - (error "collating sequences not supported" str)) - (else - (go (+ i 1) (cons #\[ chars) ranges))))) - ((#\\) - (let ((c (string-ref str (+ i 1)))) - (case c - ((#\d #\D #\s #\S #\w #\W) - (let ((cset (sre->cset (string->sre (string #\\ c))))) - (go (+ i 2) - (append (filter char? cset) chars) - (append (filter pair? cset) ranges)))) - ((#\x) - (apply - (lambda (ch j) - (go j (cons ch chars) ranges)) - (string-parse-hex-escape str (+ i 2) end))) - (else - (let ((c (cond ((assv c posix-escape-sequences) => cdr) - (else c)))) - (go (+ i 2) - (cons (string-ref str (+ i 1)) (cons c chars)) - ranges)))))) - (else - (if (and utf8? (<= #x80 (char->integer c) #xFF)) - (let ((len (utf8-start-char->length c))) - (go (+ i len) - (cons (utf8-string-ref str i len) chars) - ranges)) - (go (+ i 1) (cons c chars) ranges))))))) - (if invert? - (go (+ start 1) - (if (flag-set? flags ~multi-line?) '(#\newline) '()) - '()) - (go start '() '())))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; UTF-8 Utilities - -;; Here are some hairy optimizations that need to be documented -;; better. Thanks to these, we never do any utf8 processing once the -;; regexp is compiled. - -;; two chars: ab..ef -;; a[b..xFF]|[b-d][x80..xFF]|e[x80..xFF] - -;; three chars: abc..ghi -;; ab[c..xFF]|a[d..xFF][x80..xFF]| -;; [b..f][x80..xFF][x80..xFF]| -;; g[x80..g][x80..xFF]|gh[x80..i] - -;; four chars: abcd..ghij -;; abc[d..xFF]|ab[d..xFF][x80..xFF]|a[c..xFF][x80..xFF][x80..xFF]| -;; [b..f][x80..xFF][x80..xFF][x80..xFF]| -;; g[x80..g][x80..xFF][x80..xFF]|gh[x80..h][x80..xFF]|ghi[x80..j] - -(define (high-char? c) (<= #x80 (char->integer c))) - -;; number of total bytes in a utf8 char given the 1st byte - -(define utf8-start-char->length - (let ((table '#( -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx -2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx -2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx -3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex -4 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx -))) - (lambda (c) (vector-ref table (char->integer c))))) - -(define (utf8-string-ref str i len) - (define (byte n) (char->integer (string-ref str n))) - (case len - ((1) ; shouldn't happen in this module - (string-ref str i)) - ((2) - (integer->char - (+ (bit-shl (bit-and (byte i) #b00011111) 6) - (bit-and (byte (+ i 1)) #b00111111)))) - ((3) - (integer->char - (+ (bit-shl (bit-and (byte i) #b00001111) 12) - (bit-shl (bit-and (byte (+ i 1)) #b00111111) 6) - (bit-and (byte (+ i 2)) #b00111111)))) - ((4) - (integer->char - (+ (bit-shl (bit-and (byte i) #b00000111) 18) - (bit-shl (bit-and (byte (+ i 1)) #b00111111) 12) - (bit-shl (bit-and (byte (+ i 2)) #b00111111) 6) - (bit-and (byte (+ i 3)) #b00111111)))) - (else - (error "invalid utf8 length" str len i)))) - -(define (utf8-backup-to-initial-char str i) - (let lp ((i i)) - (if (= i 0) - 0 - (let ((c (char->integer (string-ref str i)))) - (if (or (< c #x80) (>= c #xC0)) - i - (lp (- i 1))))))) - -(define (utf8-lowest-digit-of-length len) - (case len - ((1) 0) ((2) #xC0) ((3) #xE0) ((4) #xF0) - (else (error "invalid utf8 length" len)))) - -(define (utf8-highest-digit-of-length len) - (case len - ((1) #x7F) ((2) #xDF) ((3) #xEF) ((4) #xF7) - (else (error "invalid utf8 length" len)))) - -(define (char->utf8-list c) - (let ((i (char->integer c))) - (cond - ((<= i #x7F) (list i)) - ((<= i #x7FF) - (list (bit-ior #b11000000 (bit-shr i 6)) - (bit-ior #b10000000 (bit-and i #b111111)))) - ((<= i #xFFFF) - (list (bit-ior #b11100000 (bit-shr i 12)) - (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111)) - (bit-ior #b10000000 (bit-and i #b111111)))) - ((<= i #x1FFFFF) - (list (bit-ior #b11110000 (bit-shr i 18)) - (bit-ior #b10000000 (bit-and (bit-shr i 12) #b111111)) - (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111)) - (bit-ior #b10000000 (bit-and i #b111111)))) - (else (error "unicode codepoint out of range:" i))))) - -(define (unicode-range->utf8-pattern lo hi) - (let ((lo-ls (char->utf8-list lo)) - (hi-ls (char->utf8-list hi))) - (if (not (= (length lo-ls) (length hi-ls))) - (sre-alternate (list (unicode-range-climb-digits lo-ls hi-ls) - (unicode-range-up-to hi-ls))) - (let lp ((lo-ls lo-ls) (hi-ls hi-ls)) - (cond - ((null? lo-ls) - '()) - ((= (car lo-ls) (car hi-ls)) - (sre-sequence - (list (integer->char (car lo-ls)) - (lp (cdr lo-ls) (cdr hi-ls))))) - ((= (+ (car lo-ls) 1) (car hi-ls)) - (sre-alternate (list (unicode-range-up-from lo-ls) - (unicode-range-up-to hi-ls)))) - (else - (sre-alternate (list (unicode-range-up-from lo-ls) - (unicode-range-middle lo-ls hi-ls) - (unicode-range-up-to hi-ls))))))))) - -(define (unicode-range-helper one ls prefix res) - (if (null? ls) - res - (unicode-range-helper - one - (cdr ls) - (cons (car ls) prefix) - (cons (sre-sequence - `(,@(map integer->char prefix) - ,(one (car ls)) - ,@(map (lambda (_) - `(/ ,(integer->char #x80) - ,(integer->char #xFF))) - (cdr ls)))) - res)))) - -(define (unicode-range-up-from lo-ls) - (sre-sequence - (list (integer->char (car lo-ls)) - (sre-alternate - (unicode-range-helper - (lambda (c) - `(/ ,(integer->char (+ (car lo-ls) 1)) ,(integer->char #xFF))) - (cdr (reverse (cdr lo-ls))) - '() - (list - (sre-sequence - (append - (map integer->char (reverse (cdr (reverse (cdr lo-ls))))) - `((/ ,(integer->char (last lo-ls)) - ,(integer->char #xFF))))))))))) - -(define (unicode-range-up-to hi-ls) - (sre-sequence - (list (integer->char (car hi-ls)) - (sre-alternate - (unicode-range-helper - (lambda (c) - `(/ ,(integer->char #x80) ,(integer->char (- (car hi-ls) 1)))) - (cdr (reverse (cdr hi-ls))) - '() - (list - (sre-sequence - (append - (map integer->char (reverse (cdr (reverse (cdr hi-ls))))) - `((/ ,(integer->char #x80) - ,(integer->char (last hi-ls)))))))))))) - -(define (unicode-range-climb-digits lo-ls hi-ls) - (let ((lo-len (length lo-ls))) - (sre-alternate - (append - (list - (sre-sequence - (cons `(/ ,(integer->char (car lo-ls)) - ,(integer->char (if (<= (car lo-ls) #x7F) #x7F #xFF))) - (map (lambda (_) - `(/ ,(integer->char #x80) ,(integer->char #xFF))) - (cdr lo-ls))))) - (map - (lambda (i) - (sre-sequence - (cons - `(/ ,(integer->char (utf8-lowest-digit-of-length (+ i lo-len 1))) - ,(integer->char (utf8-highest-digit-of-length (+ i lo-len 1)))) - (map (lambda (_) - `(/ ,(integer->char #x80) ,(integer->char #xFF))) - (zero-to (+ i lo-len)))))) - (zero-to (- (length hi-ls) lo-len 1))) - (list - (sre-sequence - (cons `(/ ,(integer->char - (utf8-lowest-digit-of-length - (utf8-start-char->length - (integer->char (- (car hi-ls) 1))))) - ,(integer->char (- (car hi-ls) 1))) - (map (lambda (_) - `(/ ,(integer->char #x80) ,(integer->char #xFF))) - (cdr hi-ls))))))))) - -(define (unicode-range-middle lo-ls hi-ls) - (let ((lo (integer->char (+ (car lo-ls) 1))) - (hi (integer->char (- (car hi-ls) 1)))) - (sre-sequence - (cons (if (char=? lo hi) lo `(/ ,lo ,hi)) - (map (lambda (_) `(/ ,(integer->char #x80) ,(integer->char #xFF))) - (cdr lo-ls)))))) - -(define (cset->utf8-pattern cset) - (let lp ((ls cset) (alts '()) (lo-cset '())) - (cond - ((null? ls) - (sre-alternate (append (reverse alts) - (if (null? lo-cset) - '() - (list (cons '/ (reverse lo-cset))))))) - ((char? (car ls)) - (if (high-char? (car ls)) - (lp (cdr ls) (cons (car ls) alts) lo-cset) - (lp (cdr ls) alts (cons (car ls) lo-cset)))) - (else - (if (or (high-char? (caar ls)) (high-char? (cdar ls))) - (lp (cdr ls) - (cons (unicode-range->utf8-pattern (caar ls) (cdar ls)) alts) - lo-cset) - (lp (cdr ls) alts (cons (cdar ls) (cons (caar ls) lo-cset)))))))) - -(define (sre-adjust-utf8 sre flags) - (let adjust ((sre sre) - (utf8? (flag-set? flags ~utf8?)) - (ci? (flag-set? flags ~case-insensitive?))) - (define (rec sre) (adjust sre utf8? ci?)) - (cond - ((pair? sre) - (case (car sre) - ((w/utf8) (adjust (sre-sequence (cdr sre)) #t ci?)) - ((w/noutf8) (adjust (sre-sequence (cdr sre)) #f ci?)) - ((w/case) - (cons (car sre) (map (lambda (s) (adjust s utf8? #f)) (cdr sre)))) - ((w/nocase) - (cons (car sre) (map (lambda (s) (adjust s utf8? #t)) (cdr sre)))) - ((/ ~ & -) - (if (not utf8?) - sre - (let ((cset (sre->cset sre ci?))) - (if (any (lambda (x) - (if (pair? x) - (or (high-char? (car x)) (high-char? (cdr x))) - (high-char? x))) - cset) - (if ci? - (list 'w/case (cset->utf8-pattern cset)) - (cset->utf8-pattern cset)) - sre)))) - ((*) - (case (sre-sequence (cdr sre)) - ;; special case optimization: .* w/utf8 == .* w/noutf8 - ((any) '(* any)) - ((nonl) '(* nonl)) - (else (cons '* (map rec (cdr sre)))))) - (else - (cons (car sre) (map rec (cdr sre)))))) - (else - (case sre - ((any) 'utf8-any) - ((nonl) 'utf8-nonl) - (else - (if (and utf8? (char? sre) (high-char? sre)) - (sre-sequence (map integer->char (char->utf8-list sre))) - sre))))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Compilation - -(cond-expand - (building-chicken - (define-syntax cached - (syntax-rules () - ((_ arg fail) (build-cache 5 arg fail))))) - (else - (define-syntax cached - (syntax-rules () - ((_ arg fail) fail))))) - -(define (irregex x . o) - (cond ((irregex? x) x) - ((null? o) - (cached - x - (if (string? x) - (string->irregex x) - (sre->irregex x)))) - (else - (if (string? x) - (apply string->irregex x o) - (apply sre->irregex x o))))) - -(define (string->irregex str . o) - (apply sre->irregex (apply string->sre str o) o)) - -(define (sre->irregex sre . o) - (let* ((pat-flags (symbol-list->flags o)) - (sre (if *allow-utf8-mode?* - (sre-adjust-utf8 sre pat-flags) - sre)) - (searcher? (sre-searcher? sre)) - (sre-dfa (if searcher? (sre-remove-initial-bos sre) sre)) - (dfa-limit (cond ((memq 'small o) 1) ((memq 'fast o) 50) (else 10))) - (dfa/search - (cond ((memq 'backtrack o) #f) - (searcher? #t) - ((sre->nfa `(seq (* any) ,sre-dfa) pat-flags) - => (lambda (nfa) - (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa))))) - (else #f))) - (dfa (cond ((and dfa/search (sre->nfa sre-dfa pat-flags)) - => (lambda (nfa) - (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa))))) - (else #f))) - (submatches (sre-count-submatches sre-dfa)) - (extractor - (and dfa dfa/search (sre-match-extractor sre-dfa submatches))) - (names (sre-names sre-dfa 1 '())) - (lens (sre-length-ranges sre-dfa names)) - (flags (flag-join - (flag-join ~none (and searcher? ~searcher?)) - (and (sre-consumer? sre) ~consumer?)))) - (cond - (dfa - (make-irregex dfa dfa/search extractor #f flags submatches lens names)) - (else - (let ((f (sre->procedure sre pat-flags names))) - (make-irregex #f #f #f f flags submatches lens names)))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; SRE Analysis - -;; returns #t if the sre can ever be empty -(define (sre-empty? sre) - (if (pair? sre) - (case (car sre) - ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t) - ((**) (or (not (number? (cadr sre))) (zero? (cadr sre)))) - ((or) (any sre-empty? (cdr sre))) - ((: seq $ submatch => submatch-named + atomic) - (every sre-empty? (cdr sre))) - (else #f)) - (memq sre '(epsilon bos eos bol eol bow eow commit)))) - -(define (sre-any? sre) - (or (eq? sre 'any) - (and (pair? sre) - (case (car sre) - ((seq : $ submatch => submatch-named) - (and (pair? (cdr sre)) (null? (cddr sre)) (sre-any? (cadr sre)))) - ((or) (every sre-any? (cdr sre))) - (else #f))))) - -(define (sre-repeater? sre) - (and (pair? sre) - (or (memq (car sre) '(* +)) - (and (memq (car sre) '($ submatch => submatch-named seq :)) - (pair? (cdr sre)) - (null? (cddr sre)) - (sre-repeater? (cadr sre)))))) - -(define (sre-searcher? sre) - (if (pair? sre) - (case (car sre) - ((* +) (sre-any? (sre-sequence (cdr sre)))) - ((seq : $ submatch => submatch-named) - (and (pair? (cdr sre)) (sre-searcher? (cadr sre)))) - ((or) (every sre-searcher? (cdr sre))) - (else #f)) - (eq? 'bos sre))) - -(define (sre-consumer? sre) - (if (pair? sre) - (case (car sre) - ((* +) (sre-any? (sre-sequence (cdr sre)))) - ((seq : $ submatch => submatch-named) - (and (pair? (cdr sre)) (sre-consumer? (last sre)))) - ((or) (every sre-consumer? (cdr sre))) - (else #f)) - (eq? 'eos sre))) - -(define (sre-has-submatches? sre) - (and (pair? sre) - (or (memq (car sre) '($ submatch => submatch-named)) - (if (eq? 'posix-string (car sre)) - (sre-has-submatches? (string->sre (cadr sre))) - (any sre-has-submatches? (cdr sre)))))) - -(define (sre-count-submatches sre) - (let count ((sre sre) (sum 0)) - (if (pair? sre) - (fold count - (+ sum (case (car sre) - (($ submatch => submatch-named) 1) - ((dsm) (+ (cadr sre) (caddr sre))) - ((posix-string) - (sre-count-submatches (string->sre (cadr sre)))) - (else 0))) - (cdr sre)) - sum))) - -(define (sre-length-ranges sre . o) - (let ((names (if (pair? o) (car o) (sre-names sre 1 '()))) - (sublens (make-vector (+ 1 (sre-count-submatches sre)) #f))) - (vector-set! - sublens - 0 - (let lp ((sre sre) (n 1) (lo 0) (hi 0) (return cons)) - (define (grow i) (return (+ lo i) (and hi (+ hi i)))) - (cond - ((pair? sre) - (if (string? (car sre)) - (grow 1) - (case (car sre) - ((/ ~ & -) - (grow 1)) - ((posix-string) - (lp (string->sre (cadr sre)) n lo hi return)) - ((seq : w/case w/nocase atomic) - (let lp2 ((ls (cdr sre)) (n n) (lo2 0) (hi2 0)) - (if (null? ls) - (return (+ lo lo2) (and hi hi2 (+ hi hi2))) - (lp (car ls) n 0 0 - (lambda (lo3 hi3) - (lp2 (cdr ls) - (+ n (sre-count-submatches (car ls))) - (+ lo2 lo3) - (and hi2 hi3 (+ hi2 hi3)))))))) - ((or) - (let lp2 ((ls (cdr sre)) (n n) (lo2 #f) (hi2 0)) - (if (null? ls) - (return (+ lo (or lo2 1)) (and hi hi2 (+ hi hi2))) - (lp (car ls) n 0 0 - (lambda (lo3 hi3) - (lp2 (cdr ls) - (+ n (sre-count-submatches (car ls))) - (if lo2 (min lo2 lo3) lo3) - (and hi2 hi3 (max hi2 hi3)))))))) - ((if) - (cond - ((or (null? (cdr sre)) (null? (cddr sre))) - (return lo hi)) - (else - (let ((n1 (sre-count-submatches (car sre))) - (n2 (sre-count-submatches (cadr sre)))) - (lp (if (or (number? (cadr sre)) (symbol? (cadr sre))) - 'epsilon - (cadr sre)) - n lo hi - (lambda (lo2 hi2) - (lp (caddr sre) (+ n n1) 0 0 - (lambda (lo3 hi3) - (lp (if (pair? (cdddr sre)) - (cadddr sre) - 'epsilon) - (+ n n1 n2) 0 0 - (lambda (lo4 hi4) - (return (+ lo2 (min lo3 lo4)) - (and hi2 hi3 hi4 - (+ hi2 (max hi3 hi4)) - )))))))))))) - ((dsm) - (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) lo hi return)) - (($ submatch => submatch-named) - (lp (sre-sequence - (if (eq? 'submatch (car sre)) (cdr sre) (cddr sre))) - (+ n 1) lo hi - (lambda (lo2 hi2) - (vector-set! sublens n (cons lo2 hi2)) - (return lo2 hi2)))) - ((backref backref-ci) - (let ((n (cond - ((number? (cadr sre)) (cadr sre)) - ((assq (cadr sre) names) => cdr) - (else (error "unknown backreference" (cadr sre)))))) - (cond - ((or (not (integer? n)) - (not (< 0 n (vector-length sublens)))) - (error "sre-length: invalid backreference" sre)) - ((not (vector-ref sublens n)) - (error "sre-length: invalid forward backreference" sre)) - (else - (let ((lo2 (car (vector-ref sublens n))) - (hi2 (cdr (vector-ref sublens n)))) - (return (+ lo lo2) (and hi hi2 (+ hi hi2)))))))) - ((* *?) - (lp (sre-sequence (cdr sre)) n lo hi (lambda (lo hi) #f)) - (return lo #f)) - ((** **?) - (cond - ((or (and (number? (cadr sre)) - (number? (caddr sre)) - (> (cadr sre) (caddr sre))) - (and (not (cadr sre)) (caddr sre))) - (return lo hi)) - (else - (if (caddr sre) - (lp (sre-sequence (cdddr sre)) n 0 0 - (lambda (lo2 hi2) - (return (+ lo (* (cadr sre) lo2)) - (and hi hi2 (+ hi (* (caddr sre) hi2)))))) - (lp (sre-sequence (cdddr sre)) n 0 0 - (lambda (lo2 hi2) - (return (+ lo (* (cadr sre) lo2)) #f))))))) - ((+) - (lp (sre-sequence (cdr sre)) n lo hi - (lambda (lo2 hi2) - (return (+ lo lo2) #f)))) - ((? ??) - (lp (sre-sequence (cdr sre)) n lo hi - (lambda (lo2 hi2) - (return lo (and hi hi2 (+ hi hi2)))))) - ((= =? >= >=?) - (lp `(** ,(cadr sre) - ,(if (memq (car sre) '(>= >=?)) #f (cadr sre)) - ,@(cddr sre)) - n lo hi return)) - ((look-ahead neg-look-ahead look-behind neg-look-behind) - (return lo hi)) - (else - (cond - ((assq (car sre) sre-named-definitions) - => (lambda (cell) - (lp (apply (cdr cell) (cdr sre)) n lo hi return))) - (else - (error "sre-length-ranges: unknown sre operator" sre))))))) - ((char? sre) - (grow 1)) - ((string? sre) - (grow (string-length sre))) - ((memq sre '(any nonl)) - (grow 1)) - ((memq sre '(epsilon bos eos bol eol bow eow nwb commit)) - (return lo hi)) - (else - (let ((cell (assq sre sre-named-definitions))) - (if cell - (lp (if (procedure? (cdr cell)) ((cdr cell)) (cdr cell)) - n lo hi return) - (error "sre-length-ranges: unknown sre" sre))))))) - sublens)) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; SRE Manipulation - -;; build a (seq ls ...) sre from a list -(define (sre-sequence ls) - (cond - ((null? ls) 'epsilon) - ((null? (cdr ls)) (car ls)) - (else (cons 'seq ls)))) - -;; build a (or ls ...) sre from a list -(define (sre-alternate ls) - (cond - ((null? ls) '(or)) - ((null? (cdr ls)) (car ls)) - (else (cons 'or ls)))) - -;; returns an equivalent SRE without any match information -(define (sre-strip-submatches sre) - (if (not (pair? sre)) - sre - (case (car sre) - (($ submatch) (sre-strip-submatches (sre-sequence (cdr sre)))) - ((=> submatch-named) (sre-strip-submatches (sre-sequence (cddr sre)))) - ((dsm) (sre-strip-submatches (sre-sequence (cdddr sre)))) - (else (map sre-strip-submatches sre))))) - -;; given a char-set list of chars and strings, flattens them into -;; chars only -(define (sre-flatten-ranges ls) - (let lp ((ls ls) (res '())) - (cond - ((null? ls) - (reverse res)) - ((string? (car ls)) - (lp (append (string->list (car ls)) (cdr ls)) res)) - (else - (lp (cdr ls) (cons (car ls) res)))))) - -(define (sre-names sre n names) - (if (not (pair? sre)) - names - (case (car sre) - (($ submatch) - (sre-names (sre-sequence (cdr sre)) (+ n 1) names)) - ((=> submatch-named) - (sre-names (sre-sequence (cddr sre)) - (+ n 1) - (cons (cons (cadr sre) n) names))) - ((dsm) - (sre-names (sre-sequence (cdddr sre)) (+ n (cadr sre)) names)) - ((seq : or * + ? *? ?? w/case w/nocase atomic - look-ahead look-behind neg-look-ahead neg-look-behind) - (sre-sequence-names (cdr sre) n names)) - ((= >=) - (sre-sequence-names (cddr sre) n names)) - ((** **?) - (sre-sequence-names (cdddr sre) n names)) - (else - names)))) - -(define (sre-sequence-names ls n names) - (if (null? ls) - names - (sre-sequence-names (cdr ls) - (+ n (sre-count-submatches (car ls))) - (sre-names (car ls) n names)))) - -(define (sre-remove-initial-bos sre) - (cond - ((pair? sre) - (case (car sre) - ((seq : $ submatch => submatch-named * +) - (cond - ((not (pair? (cdr sre))) - sre) - ((eq? 'bos (cadr sre)) - (cons (car sre) (cddr sre))) - (else - (cons (car sre) - (cons (sre-remove-initial-bos (cadr sre)) (cddr sre)))))) - ((or) - (sre-alternate (map sre-remove-initial-bos (cdr sre)))) - (else - sre))) - (else - sre))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Basic Matching - -(define irregex-basic-string-chunker - (make-irregex-chunker (lambda (x) #f) - car - cadr - caddr - (lambda (src1 i src2 j) - (substring (car src1) i j)))) - -(define (irregex-search x str . o) - (if (not (string? str)) (error "irregex-search: not a string" str)) - (let ((start (or (and (pair? o) (car o)) 0)) - (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str)))) - (irregex-search/chunked x - irregex-basic-string-chunker - (list str start end) - start))) - -(define (irregex-search/chunked x cnk src . o) - (let* ((irx (irregex x)) - (matches (irregex-new-matches irx)) - (i (if (pair? o) (car o) ((chunker-get-start cnk) src)))) - (irregex-match-chunker-set! matches cnk) - (irregex-search/matches irx cnk src i matches))) - -;; internal routine, can be used in loops to avoid reallocating the -;; match vector -(define (irregex-search/matches irx cnk src i matches) - (cond - ((irregex-dfa irx) - (cond - ((flag-set? (irregex-flags irx) ~searcher?) - (cond - ((dfa-match/longest (irregex-dfa irx) cnk src i #f #f matches 0) - (irregex-match-start-chunk-set! matches 0 src) - (irregex-match-start-index-set! matches 0 i) - ((irregex-dfa/extract irx) - cnk src i - (%irregex-match-end-chunk matches 0) - (%irregex-match-end-index matches 0) - matches) - matches) - (else - #f))) - ((dfa-match/shortest - (irregex-dfa/search irx) cnk src i matches 0) - (let ((dfa (irregex-dfa irx)) - (get-start (chunker-get-start cnk)) - (get-end (chunker-get-end cnk)) - (get-next (chunker-get-next cnk))) - (let lp1 ((src src) (i i)) - (let ((end (get-end src))) - (let lp2 ((i i)) - (cond - ((dfa-match/longest dfa cnk src i #f #f matches 0) - (irregex-match-start-chunk-set! matches 0 src) - (irregex-match-start-index-set! matches 0 i) - ((irregex-dfa/extract irx) - cnk src i - (%irregex-match-end-chunk matches 0) - (%irregex-match-end-index matches 0) - matches) - matches) - ((>= i end) - (let ((next (get-next src))) - (and next (lp1 next (get-start next))))) - (else - (lp2 (+ i 1))))))))) - (else - #f))) - (else - (let ((res (irregex-search/backtrack irx cnk src i matches))) - (if res (%irregex-match-fail-set! res #f)) - res)))) - -(define (irregex-search/backtrack irx cnk src i matches) - (let ((matcher (irregex-nfa irx)) - (str ((chunker-get-str cnk) src)) - (end ((chunker-get-end cnk) src)) - (get-next (chunker-get-next cnk)) - (init (cons src i))) - (if (flag-set? (irregex-flags irx) ~searcher?) - (matcher cnk init src str i end matches (lambda () #f)) - (let lp ((src2 src) - (str str) - (i i) - (end end)) - (cond - ((matcher cnk init src2 str i end matches (lambda () #f)) - (irregex-match-start-chunk-set! matches 0 src2) - (irregex-match-start-index-set! matches 0 i) - matches) - ((< i end) - (lp src2 str (+ i 1) end)) - (else - (let ((src2 (get-next src2))) - (if src2 - (lp src2 - ((chunker-get-str cnk) src2) - ((chunker-get-start cnk) src2) - ((chunker-get-end cnk) src2)) - #f)))))))) - -(define (irregex-match irx str . o) - (if (not (string? str)) (error "irregex-match: not a string" str)) - (let ((start (or (and (pair? o) (car o)) 0)) - (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str)))) - (irregex-match/chunked irx - irregex-basic-string-chunker - (list str start end)))) - -(define (irregex-match/chunked irx cnk src) - (let* ((irx (irregex irx)) - (matches (irregex-new-matches irx))) - (irregex-match-chunker-set! matches cnk) - (cond - ((irregex-dfa irx) - (and - (dfa-match/longest - (irregex-dfa irx) cnk src ((chunker-get-start cnk) src) #f #f matches 0) - (= ((chunker-get-end cnk) (%irregex-match-end-chunk matches 0)) - (%irregex-match-end-index matches 0)) - (begin - (irregex-match-start-chunk-set! matches 0 src) - (irregex-match-start-index-set! matches - 0 - ((chunker-get-start cnk) src)) - ((irregex-dfa/extract irx) - cnk src ((chunker-get-start cnk) src) - (%irregex-match-end-chunk matches 0) - (%irregex-match-end-index matches 0) - matches) - matches))) - (else - (let* ((matcher (irregex-nfa irx)) - (str ((chunker-get-str cnk) src)) - (i ((chunker-get-start cnk) src)) - (end ((chunker-get-end cnk) src)) - (init (cons src i))) - (let lp ((m (matcher cnk init src str i end matches (lambda () #f)))) - (and m - (cond - ((and (not ((chunker-get-next cnk) - (%irregex-match-end-chunk m 0))) - (= ((chunker-get-end cnk) - (%irregex-match-end-chunk m 0)) - (%irregex-match-end-index m 0))) - (%irregex-match-fail-set! m #f) - m) - ((%irregex-match-fail m) - (lp ((%irregex-match-fail m)))) - (else - #f))))))))) - -(define (irregex-match? . args) - (and (apply irregex-match args) #t)) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; DFA Matching - -;; inline these -(define (dfa-init-state dfa) - (vector-ref dfa 0)) -(define (dfa-next-state dfa node) - (vector-ref dfa (cdr node))) -(define (dfa-final-state? dfa state) - (car state)) - -;; this searches for the first end index for which a match is possible -(define (dfa-match/shortest dfa cnk src start matches index) - (let ((get-str (chunker-get-str cnk)) - (get-start (chunker-get-start cnk)) - (get-end (chunker-get-end cnk)) - (get-next (chunker-get-next cnk))) - (let lp1 ((src src) (start start) (state (dfa-init-state dfa))) - (and - src - (let ((str (get-str src)) - (end (get-end src))) - (let lp2 ((i start) (state state)) - (cond - ((dfa-final-state? dfa state) - (cond - (index - (irregex-match-end-chunk-set! matches index src) - (irregex-match-end-index-set! matches index i))) - #t) - ((< i end) - (let* ((ch (string-ref str i)) - (next (find (lambda (x) - (if (eqv? ch (car x)) - #t - (and (pair? (car x)) - (char<=? (caar x) ch) - (char<=? ch (cdar x))))) - (cdr state)))) - (and next (lp2 (+ i 1) (dfa-next-state dfa next))))) - (else - (let ((next (get-next src))) - (and next (lp1 next (get-start next) state))))))))))) - -;; this finds the longest match starting at a given index -(define (dfa-match/longest dfa cnk src start end-src end matches index) - (let ((get-str (chunker-get-str cnk)) - (get-start (chunker-get-start cnk)) - (get-end (chunker-get-end cnk)) - (get-next (chunker-get-next cnk)) - (start-is-final? (dfa-final-state? dfa (dfa-init-state dfa)))) - (cond - (index - (irregex-match-end-chunk-set! matches index #f) - (irregex-match-end-index-set! matches index #f))) - (let lp1 ((src src) - (start start) - (state (dfa-init-state dfa)) - (res-src (and start-is-final? src)) - (res-index (and start-is-final? start))) - (let ((str (get-str src)) - (end (if (eq? src end-src) end (get-end src)))) - (let lp2 ((i start) - (state state) - (res-src res-src) - (res-index res-index)) - (cond - ((>= i end) - (cond - ((and index res-src) - (irregex-match-end-chunk-set! matches index res-src) - (irregex-match-end-index-set! matches index res-index))) - (let ((next (and (not (eq? src end-src)) (get-next src)))) - (if next - (lp1 next (get-start next) state res-src res-index) - (and index - (%irregex-match-end-chunk matches index) - #t)))) - (else - (let* ((ch (string-ref str i)) - (cell (find (lambda (x) - (if (eqv? ch (car x)) - #t - (and (pair? (car x)) - (char<=? (caar x) ch) - (char<=? ch (cdar x))))) - (cdr state)))) - (cond - (cell - (let ((next (dfa-next-state dfa cell))) - (if (dfa-final-state? dfa next) - (lp2 (+ i 1) next src (+ i 1)) - (lp2 (+ i 1) next res-src res-index)))) - (res-src - (cond - (index - (irregex-match-end-chunk-set! matches index res-src) - (irregex-match-end-index-set! matches index res-index))) - #t) - ((and index (%irregex-match-end-chunk matches index)) - #t) - (else - #f)))))))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Named Definitions - -(define sre-named-definitions - `((any . ,*all-chars*) - (nonl . (- ,*all-chars* (,(string #\newline)))) - (alphabetic . (/ #\a #\z #\A #\Z)) - (alpha . alphabetic) - (alphanumeric . (/ #\a #\z #\A #\Z #\0 #\9)) - (alphanum . alphanumeric) - (alnum . alphanumeric) - (lower-case . (/ #\a #\z)) - (lower . lower-case) - (upper-case . (/ #\A #\Z)) - (upper . upper-case) - (numeric . (/ #\0 #\9)) - (num . numeric) - (digit . numeric) - (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\. - #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\})) - (punct . punctuation) - (graphic - . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~)) - (graph . graphic) - (blank . (or #\space ,(integer->char (- (char->integer #\space) 23)))) - (whitespace . (or blank #\newline)) - (space . whitespace) - (white . whitespace) - (printing or graphic whitespace) - (print . printing) - - ;; XXXX we assume a (possibly shifted) ASCII-based ordering - (control . (/ ,(integer->char (- (char->integer #\space) 32)) - ,(integer->char (- (char->integer #\space) 1)))) - (cntrl . control) - (hex-digit . (or numeric (/ #\a #\f #\A #\F))) - (xdigit . hex-digit) - (ascii . (/ ,(integer->char (- (char->integer #\space) 32)) - ,(integer->char (+ (char->integer #\space) 95)))) - (ascii-nonl . (/ ,(integer->char (- (char->integer #\space) 32)) - ,(integer->char (- (char->integer #\newline) 1)) - ,(integer->char (+ (char->integer #\newline) 1)) - ,(integer->char (+ (char->integer #\space) 95)))) - (newline . (or (seq ,(integer->char (+ (char->integer #\newline) 3)) - #\newline) - (/ #\newline - ,(integer->char (+ (char->integer #\newline) 3))))) - - ;; ... it's really annoying to support old Scheme48 - (word . (seq bow (+ (or alphanumeric #\_)) eow)) - (utf8-tail-char . (/ ,(integer->char (+ (char->integer #\space) #x60)) - ,(integer->char (+ (char->integer #\space) #xA1)))) - (utf8-2-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xA2)) - ,(integer->char (+ (char->integer #\space) #xBF))) - utf8-tail-char)) - (utf8-3-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xC0)) - ,(integer->char (+ (char->integer #\space) #xCF))) - utf8-tail-char - utf8-tail-char)) - (utf8-4-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xD0)) - ,(integer->char (+ (char->integer #\space) #xD7))) - utf8-tail-char - utf8-tail-char - utf8-tail-char)) - (utf8-any . (or ascii utf8-2-char utf8-3-char utf8-4-char)) - (utf8-nonl . (or ascii-nonl utf8-2-char utf8-3-char utf8-4-char)) - - ;; extended library patterns - (integer . (seq (? (or #\+ #\-)) (+ numeric))) - (real . (seq (+ numeric) (? #\. (+ numeric)) (? (or #\e #\E) integer))) - ;; slightly more lax than R5RS, allow ->foo, etc. - (symbol-initial . (or alpha ("!$%&*/:<=>?^_~"))) - (symbol-subsequent . (or symbol-initial digit ("+-.@"))) - (symbol . (or (seq symbol-initial (* symbol-subsequent)) - (seq ("+-") (? symbol-initial (* symbol-subsequent))) - (seq ".." (* ".")))) - (sexp-space . (seq (* (* space) ";" (* nonl) newline) (+ space))) - (string . (seq #\" (escape #\\ #\") #\")) - (escape . ,(lambda (esc . o) `(* (or (~ ,esc ,@o) (seq ,esc any))))) - - (ipv4-digit . (seq (? (/ "12")) (? numeric) numeric)) - (ipv4-address . (seq ipv4-digit (= 3 #\. ipv4-digit))) - ;; XXXX lax, allows multiple double-colons or < 8 terms w/o a :: - (ipv6-address . (seq (** 0 4 hex-digit) - (** 1 7 #\: (? #\:) (** 0 4 hex-digit)))) - (ip-address . (or ipv4-address ipv6-address)) - (domain-atom . (+ (or alphanumeric #\_ #\-))) - (domain . (seq domain-atom (+ #\. domain-atom))) - ;; XXXX now anything can be a top-level domain, but this is still handy - (top-level-domain . (w/nocase (or "arpa" "com" "gov" "mil" "net" "org" - "aero" "biz" "coop" "info" "museum" - "name" "pro" (= 2 alpha)))) - (domain/common . (seq (+ domain-atom #\.) top-level-domain)) - ;;(email-local-part . (seq (+ (or (~ #\") string)))) - (email-local-part . (+ (or alphanumeric #\_ #\- #\. #\+))) - (email . (seq email-local-part #\@ domain)) - (url-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\. #\, #\& #\; - (seq "%" hex-digit hex-digit))) - (url-final-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\& - (seq "%" hex-digit hex-digit))) - (http-url . (w/nocase - "http" (? "s") "://" - (or domain/common ipv4-address) ;; (seq "[" ipv6-address "]") - (? ":" (+ numeric)) ;; port - ;; path - (? "/" (* url-char) - (? "?" (* url-char)) ;; query - (? "#" (? (* url-char) url-final-char)) ;; fragment - ))) - - )) - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; SRE->NFA compilation -;; -;; An NFA state is a numbered node with a list of patter->number -;; transitions, where pattern is either a character, (lo . hi) -;; character range, or epsilon (indicating an empty transition). -;; There may be duplicate characters and overlapping ranges - since -;; it's an NFA we process it by considering all possible transitions. - -(define *nfa-presize* 128) ;; constant -(define *nfa-num-fields* 4) ;; constant - -(define (nfa-num-states nfa) (quotient (vector-length nfa) *nfa-num-fields*)) -(define (nfa-start-state nfa) (- (nfa-num-states nfa) 1)) - -(define (nfa-get-state-trans nfa i) - (vector-ref nfa (* i *nfa-num-fields*))) -(define (nfa-set-state-trans! nfa i x) - (vector-set! nfa (* i *nfa-num-fields*) x)) -(define (nfa-push-state-trans! nfa i x) - (nfa-set-state-trans! nfa i (cons x (nfa-get-state-trans nfa i)))) - -(define (nfa-get-epsilons nfa i) - (vector-ref nfa (+ (* i *nfa-num-fields*) 1))) -(define (nfa-set-epsilons! nfa i x) - (vector-set! nfa (+ (* i *nfa-num-fields*) 1) x)) -(define (nfa-add-epsilon! nfa i x) - (let ((eps (nfa-get-epsilons nfa i))) - (if (not (memq x eps)) - (nfa-set-epsilons! nfa i (cons x eps))))) - -(define (nfa-get-state-closure nfa i) - (vector-ref nfa (+ (* i *nfa-num-fields*) 2))) -(define (nfa-set-state-closure! nfa i x) - (vector-set! nfa (+ (* i *nfa-num-fields*) 2) x)) - -(define (nfa-get-closure nfa mst) - (cond ((assoc mst - (vector-ref nfa (+ (* (nfa-multi-state-hash nfa mst) - *nfa-num-fields*) - (- *nfa-num-fields* 1)))) - => cdr) - (else #f))) -(define (nfa-add-closure! nfa mst x) - (let ((i (+ (* (nfa-multi-state-hash nfa mst) *nfa-num-fields*) - (- *nfa-num-fields* 1)))) - (vector-set! nfa i (cons (cons mst x) (vector-ref nfa i))))) - -;; Compile and return the vector of NFA states (in groups of -;; *nfa-num-fields* packed elements). The start state will be the -;; last element(s) of the vector, and all remaining states will be in -;; descending numeric order, with state 0 being the unique accepting -;; state. -(define (sre->nfa sre init-flags) - (let ((buf (make-vector (* *nfa-presize* *nfa-num-fields*) '()))) - ;; we loop over an implicit sequence list - (define (lp ls n flags next) - (define (new-state-number state) - (max n (+ 1 state))) - (define (add-state! n2 trans-ls) - (if (>= (* n2 *nfa-num-fields*) (vector-length buf)) - (let ((tmp (make-vector (* 2 (vector-length buf)) '()))) - (do ((i (- (vector-length buf) 1) (- i 1))) - ((< i 0)) - (vector-set! tmp i (vector-ref buf i))) - (set! buf tmp))) - (nfa-set-state-trans! buf n2 trans-ls) - n2) - (define (extend-state! next . trans) - (and next - (add-state! (new-state-number next) - (map (lambda (x) (cons x next)) trans)))) - (define (add-char-state! next ch) - (let ((alt (char-altcase ch))) - (if (and (flag-set? flags ~case-insensitive?) (not (eqv? ch alt))) - (extend-state! next ch alt) - (extend-state! next ch)))) - (if (null? ls) - next - (cond - ((or (eq? 'epsilon (car ls)) (equal? "" (car ls))) - ;; chars and epsilons go directly into the transition table - (let ((next (lp (cdr ls) n flags next))) - (and next - (let ((new (add-state! (new-state-number next) '()))) - (nfa-add-epsilon! buf new next) - new)))) - ((string? (car ls)) - ;; process literal strings a char at a time - (let ((next (lp (cdr ls) n flags next))) - (and next - (let lp2 ((i (- (string-length (car ls)) 1)) - (next next)) - (if (< i 0) - next - (lp2 (- i 1) - (add-char-state! next (string-ref (car ls) i)))) - )))) - ((char? (car ls)) - (add-char-state! (lp (cdr ls) n flags next) (car ls))) - ((symbol? (car ls)) - (let ((cell (assq (car ls) sre-named-definitions))) - (and cell - (lp (cons (if (procedure? (cdr cell)) - ((cdr cell)) - (cdr cell)) - (cdr ls)) - n - flags - next)))) - ((pair? (car ls)) - (cond - ((string? (caar ls)) - ;; enumerated character set - (lp (cons (sre-alternate (string->list (caar ls))) (cdr ls)) - n - flags - next)) - (else - (case (caar ls) - ((seq :) - ;; for an explicit sequence, just append to the list - (lp (append (cdar ls) (cdr ls)) n flags next)) - ((w/case w/nocase w/utf8 w/noutf8) - (let* ((next (lp (cdr ls) n flags next)) - (flags ((if (memq (caar ls) '(w/case w/utf8)) - flag-clear - flag-join) - flags - (if (memq (caar ls) '(w/case w/nocase)) - ~case-insensitive? - ~utf8?)))) - (and next - (lp (cdar ls) (new-state-number next) flags next)))) - ((/ - & ~) - (let ((ranges - (sre->cset (car ls) - (flag-set? flags ~case-insensitive?)))) - (case (length ranges) - ((1) - (extend-state! (lp (cdr ls) n flags next) (car ranges))) - (else - (let ((next (lp (cdr ls) n flags next))) - (and - next - (lp (list (sre-alternate - (map (lambda (x) (if (pair? x) - (list '/ (car x) (cdr x)) - x)) - ranges))) - (new-state-number next) - (flag-clear flags ~case-insensitive?) - next))))))) - ((or) - (let ((next (lp (cdr ls) n flags next))) - (and - next - (if (null? (cdar ls)) - ;; empty (or) always fails - (add-state! (new-state-number next) '()) - ;; compile both branches and insert epsilon - ;; transitions to either - (let* ((b (lp (list (sre-alternate (cddar ls))) - (new-state-number next) - flags - next)) - (a (and b - (lp (list (cadar ls)) - (new-state-number (max b next)) - flags - next)))) - (and a - (let ((c (add-state! (new-state-number a) '()))) - (nfa-add-epsilon! buf c a) - (nfa-add-epsilon! buf c b) - c))))))) - ((?) - (let ((next (lp (cdr ls) n flags next))) - ;; insert an epsilon transition directly to next - (and - next - (let ((a (lp (cdar ls) (new-state-number next) flags next))) - (if a - (nfa-add-epsilon! buf a next)) - a)))) - ((+ *) - (let ((next (lp (cdr ls) n flags next))) - (and - next - (let* ((new (lp '(epsilon) - (new-state-number next) - flags - next)) - (a (lp (cdar ls) (new-state-number new) flags new))) - (cond - (a - ;; for *, insert an epsilon transition as in ? above - (if (eq? '* (caar ls)) - (nfa-add-epsilon! buf a new)) - ;; for both, insert a loop back to self - (nfa-add-epsilon! buf new a))) - a)))) - ;; need to add these to the match extractor first, - ;; but they tend to generate large DFAs - ;;((=) - ;; (lp (append (vector->list - ;; (make-vector (cadar ls) - ;; (sre-sequence (cddar ls)))) - ;; (cdr ls)) - ;; n flags next)) - ;;((>=) - ;; (lp (append (vector->list - ;; (make-vector (- (cadar ls) 1) - ;; (sre-sequence (cddar ls)))) - ;; (cons `(+ ,@(cddar ls)) (cdr ls))) - ;; n flags next)) - ;;((**) - ;; (lp (append (vector->list - ;; (make-vector (cadar ls) - ;; (sre-sequence (cdddar ls)))) - ;; (map - ;; (lambda (x) `(? ,x)) - ;; (vector->list - ;; (make-vector (- (caddar ls) (cadar ls)) - ;; (sre-sequence (cdddar ls))))) - ;; (cdr ls)) - ;; n flags next)) - ;; ignore submatches altogether - (($ submatch) - (lp (cons (sre-sequence (cdar ls)) (cdr ls)) n flags next)) - ((=> submatch-named) - (lp (cons (sre-sequence (cddar ls)) (cdr ls)) n flags next)) - (else - (cond - ((assq (caar ls) sre-named-definitions) - => (lambda (cell) - (if (procedure? (cdr cell)) - (lp (cons (apply (cdr cell) (cdar ls)) (cdr ls)) - n flags next) - (error "non-procedure in op position" (caar ls))))) - (else #f))))))) - (else - #f)))) - (let ((len (lp (list sre) 1 init-flags 0))) - (and len - (let ((nfa (make-vector (* *nfa-num-fields* (+ len 1))))) - (do ((i (- (vector-length nfa) 1) (- i 1))) - ((< i 0)) - (vector-set! nfa i (vector-ref buf i))) - nfa))))) - -;; We don't really want to use this, we use the closure compilation -;; below instead, but this is included for reference and testing the -;; sre->nfa conversion. - -;; (define (nfa-match nfa str) -;; (let lp ((ls (string->list str)) (state (car nfa)) (epsilons '())) -;; (if (null? ls) -;; (zero? (car state)) -;; (any (lambda (m) -;; (if (eq? 'epsilon (car m)) -;; (and (not (memv (cdr m) epsilons)) -;; (lp ls (assv (cdr m) nfa) (cons (cdr m) epsilons))) -;; (and (or (eqv? (car m) (car ls)) -;; (and (pair? (car m)) -;; (char<=? (caar m) (car ls)) -;; (char<=? (car ls) (cdar m)))) -;; (lp (cdr ls) (assv (cdr m) nfa) '())))) -;; (cdr state))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; NFA multi-state representation - -;; Cache closures in a simple hash-table keyed on the smallest state -;; (define (nfa-multi-state-hash nfa mst) -;; (car mst)) - -;; Original sorted list-based representation - -;; (define (make-nfa-multi-state nfa) -;; '()) - -;; (define (nfa-state->multi-state nfa state) -;; (list state)) - -;; (define (nfa-multi-state-copy mst) -;; (map (lambda (x) x) mst)) - -;; (define (list->nfa-multi-state nfa ls) -;; (nfa-multi-state-copy ls)) - -;; (define (nfa-multi-state-contains? mst i) -;; (memq i mst)) - -;; (define (nfa-multi-state-fold mst kons knil) -;; (fold kons knil mst)) - -;; (define (nfa-multi-state-add! mst i) -;; (insert-sorted i mst)) - -;; (define (nfa-multi-state-add mst i) -;; (insert-sorted i mst)) - -;; (define (nfa-multi-state-union a b) -;; (merge-sorted a b)) - -;; Sorted List Utilities - -;; (define (insert-sorted n ls) -;; (cond -;; ((null? ls) -;; (cons n '())) -;; ((<= n (car ls)) -;; (if (= n (car ls)) -;; ls -;; (cons n ls))) -;; (else -;; (cons (car ls) (insert-sorted n (cdr ls)))))) - -;; (define (insert-sorted! n ls) -;; (cond -;; ((null? ls) -;; (cons n '())) -;; ((<= n (car ls)) -;; (if (= n (car ls)) -;; ls -;; (cons n ls))) -;; (else -;; (let lp ((head ls) (tail (cdr ls))) -;; (cond ((or (null? tail) (< n (car tail))) -;; (set-cdr! head (cons n tail))) -;; ((> n (car tail)) -;; (lp tail (cdr tail))))) -;; ls))) - -;; (define (merge-sorted a b) -;; (cond ((null? a) b) -;; ((null? b) a) -;; ((< (car a) (car b)) -;; (cons (car a) (merge-sorted (cdr a) b))) -;; ((> (car a) (car b)) -;; (cons (car b) (merge-sorted a (cdr b)))) -;; (else (merge-sorted (cdr a) b)))) - -;; ========================================================= ;; - -;; Presized bit-vector based - -(define (nfa-multi-state-hash nfa mst) - (modulo (vector-ref mst 0) (nfa-num-states nfa))) - -(define (make-nfa-multi-state nfa) - (make-vector (quotient (+ (nfa-num-states nfa) 24 -1) 24) 0)) - -(define (nfa-state->multi-state nfa state) - (nfa-multi-state-add! (make-nfa-multi-state nfa) state)) - -(define (nfa-multi-state-copy mst) - (let ((res (make-vector (vector-length mst)))) - (do ((i (- (vector-length mst) 1) (- i 1))) - ((< i 0) res) - (vector-set! res i (vector-ref mst i))))) - -(define (nfa-multi-state-contains? mst i) - (let ((cell (quotient i 24)) - (bit (remainder i 24))) - (not (zero? (bit-and (vector-ref mst cell) (bit-shl 1 bit)))))) - -(define (nfa-multi-state-add! mst i) - (let ((cell (quotient i 24)) - (bit (remainder i 24))) - (vector-set! mst cell (bit-ior (vector-ref mst cell) (bit-shl 1 bit))) - mst)) - -(define (nfa-multi-state-add mst i) - (nfa-multi-state-add! (nfa-multi-state-copy mst) i)) - -(define (nfa-multi-state-union! a b) - (do ((i (- (vector-length a) 1) (- i 1))) - ((< i 0) a) - (vector-set! a i (bit-ior (vector-ref a i) (vector-ref b i))))) - -(define (nfa-multi-state-union a b) - (nfa-multi-state-union! (nfa-multi-state-copy a) b)) - -(define (nfa-multi-state-fold mst kons knil) - (let ((limit (vector-length mst))) - (let lp1 ((i 0) - (acc knil)) - (if (>= i limit) - acc - (let lp2 ((n (vector-ref mst i)) - (acc acc)) - (if (zero? n) - (lp1 (+ i 1) acc) - (let* ((n2 (bit-and n (- n 1))) - (n-tail (- n n2)) - (bit (+ (* i 24) (integer-log n-tail)))) - (lp2 n2 (kons bit acc))))))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; NFA->DFA compilation -;; -;; During processing, the DFA is a list of the form: -;; -;; ((NFA-states ...) accepting-state? transitions ...) -;; -;; where the transitions are as in the NFA, except there are no -;; epsilons, duplicate characters or overlapping char-set ranges, and -;; the states moved to are closures (sets of NFA states). Multiple -;; DFA states may be accepting states. - -(define (nfa->dfa nfa . o) - (let ((max-states (and (pair? o) (car o)))) - (let lp ((ls (list (nfa-cache-state-closure! nfa (nfa-start-state nfa)))) - (i 0) - (res '())) - (cond - ((null? ls) - (dfa-renumber nfa (reverse res))) - ((assoc (car ls) res) ;; already seen this combination of states - (lp (cdr ls) i res)) - ((and max-states (> i max-states)) ;; too many DFA states - #f) - (else - (let* ((states (car ls)) - (trans (nfa-state-transitions nfa states)) - (accept? (and (nfa-multi-state-contains? states 0) #t))) - (lp (append (map cdr trans) (cdr ls)) - (+ i 1) - `((,states ,accept? ,@trans) ,@res)))))))) - -;; When the conversion is complete we renumber the DFA sets-of-states -;; in order and convert the result to a vector for fast lookup. -(define (dfa-renumber nfa dfa) - (let* ((len (length dfa)) - (states (make-vector (nfa-num-states nfa) '())) - (res (make-vector len))) - (define (renumber mst) - (cdr (assoc mst (vector-ref states (nfa-multi-state-hash nfa mst))))) - (let lp ((ls dfa) (i 0)) - (cond ((pair? ls) - (let ((j (nfa-multi-state-hash nfa (caar ls)))) - (vector-set! states j (cons (cons (caar ls) i) - (vector-ref states j)))) - (lp (cdr ls) (+ i 1))))) - (let lp ((ls dfa) (i 0)) - (cond ((pair? ls) - (for-each - (lambda (x) (set-cdr! x (renumber (cdr x)))) - (cddar ls)) - (vector-set! res i (cdar ls)) - (lp (cdr ls) (+ i 1))))) - res)) - -;; Extract all distinct characters or ranges and the potential states -;; they can transition to from a given set of states. Any ranges that -;; would overlap with distinct characters are split accordingly. -(define (nfa-state-transitions nfa states) - (let ((res (nfa-multi-state-fold - states - (lambda (st res) - (fold (lambda (trans res) - (nfa-join-transitions! nfa res (car trans) (cdr trans))) - res - (nfa-get-state-trans nfa st))) - '()))) - (for-each (lambda (x) (set-cdr! x (nfa-closure nfa (cdr x)))) res) - res)) - -(define (nfa-join-transitions! nfa existing elt state) - (define (join! ls elt state) - (if (not elt) - ls - (nfa-join-transitions! nfa ls elt state))) - (cond - ((char? elt) - (let lp ((ls existing) (res '())) - (cond - ((null? ls) - ;; done, just cons this on to the original list - (cons (cons elt (nfa-state->multi-state nfa state)) existing)) - ((eq? elt (caar ls)) - ;; add a new state to an existing char - (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) - existing) - ((and (pair? (caar ls)) - (char<=? (caaar ls) elt) - (char<=? elt (cdaar ls))) - ;; split a range - (apply - (lambda (left right) - (let ((left-copy (nfa-multi-state-copy (cdar ls))) - (right-copy (nfa-multi-state-copy (cdar ls)))) - (cons (cons elt (nfa-multi-state-add! (cdar ls) state)) - (append (if left (list (cons left left-copy)) '()) - (if right (list (cons right right-copy)) '()) - res - (cdr ls))))) - (split-char-range (caar ls) elt))) - (else - ;; keep looking - (lp (cdr ls) (cons (car ls) res)))))) - (else - (let ((lo (car elt)) - (hi (cdr elt))) - (let lp ((ls existing) (res '())) - (cond - ((null? ls) - ;; done, just cons this on to the original list - (cons (cons elt (nfa-state->multi-state nfa state)) existing)) - ((and (char? (caar ls)) (char<=? lo (caar ls)) (char<=? (caar ls) hi)) - ;; range enclosing a character - (apply - (lambda (left right) - (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) - (join! (join! existing left state) right state)) - (split-char-range elt (caar ls)))) - ((and (pair? (caar ls)) - (or (and (char<=? (caaar ls) hi) (char<=? lo (cdaar ls))) - (and (char<=? hi (caaar ls)) (char<=? (cdaar ls) lo)))) - ;; overlapping ranges - (apply - (lambda (left1 left2 same right1 right2) ;; 5 regions - (let ((right1-copy (nfa-multi-state-copy (cdar ls))) - (right2-copy (nfa-multi-state-copy (cdar ls)))) - (set-car! (car ls) same) - (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) - (let* ((res (if right1 - (cons (cons right1 right1-copy) existing) - existing)) - (res (if right2 - (cons (cons right2 right2-copy) res) - res))) - (join! (join! res left1 state) left2 state)))) - (intersect-char-ranges elt (caar ls)))) - (else - (lp (cdr ls) (cons (car ls) res))))))))) - -(define (char-range c1 c2) - (if (eqv? c1 c2) c1 (cons c1 c2))) - -;; assumes ch is included in the range -(define (split-char-range range ch) - (list - (and (not (eqv? ch (car range))) - (char-range (car range) (integer->char (- (char->integer ch) 1)))) - (and (not (eqv? ch (cdr range))) - (char-range (integer->char (+ (char->integer ch) 1)) (cdr range))))) - -;; returns 5 (possibly #f) char ranges: -;; a-only-1 a-only-2 a-and-b b-only-1 b-only-2 -(define (intersect-char-ranges a b) - (if (char>? (car a) (car b)) - (reverse (intersect-char-ranges b a)) - (let ((a-lo (car a)) - (a-hi (cdr a)) - (b-lo (car b)) - (b-hi (cdr b))) - (list - (and (char<? a-lo b-lo) - (char-range a-lo (integer->char (- (char->integer b-lo) 1)))) - (and (char>? a-hi b-hi) - (char-range (integer->char (+ (char->integer b-hi) 1)) a-hi)) - (char-range b-lo (if (char<? b-hi a-hi) b-hi a-hi)) - #f - (and (char>? b-hi a-hi) - (char-range (integer->char (+ (char->integer a-hi) 1)) b-hi)))))) - -(define (nfa-cache-state-closure! nfa state) - (let ((cached (nfa-get-state-closure nfa state))) - (cond - ((not (null? cached)) - cached) - (else - (let ((res (nfa-state-closure-internal nfa state))) - (nfa-set-state-closure! nfa state res) - res))))) - -;; The `closure' of a list of NFA states - all states that can be -;; reached from any of them using any number of epsilon transitions. -(define (nfa-state-closure-internal nfa state) - (let lp ((ls (list state)) - (res (make-nfa-multi-state nfa))) - (cond - ((null? ls) - res) - ((nfa-multi-state-contains? res (car ls)) - (lp (cdr ls) res)) - (else - (lp (append (nfa-get-epsilons nfa (car ls)) (cdr ls)) - (nfa-multi-state-add! res (car ls))))))) - -(define (nfa-closure-internal nfa states) - (nfa-multi-state-fold - states - (lambda (st res) - (nfa-multi-state-union! res (nfa-cache-state-closure! nfa st))) - (make-nfa-multi-state nfa))) - -(define (nfa-closure nfa states) - (or (nfa-get-closure nfa states) - (let ((res (nfa-closure-internal nfa states))) - (nfa-add-closure! nfa states res) - res))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Match Extraction -;; -;; DFAs don't give us match information, so once we match and -;; determine the start and end, we need to recursively break the -;; problem into smaller DFAs to get each submatch. -;; -;; See http://compilers.iecc.com/comparch/article/07-10-026 - -(define (match-vector-ref v i) (vector-ref v (+ 3 i))) - -(define (match-vector-set! v i x) (vector-set! v (+ 3 i) x)) - -(define (sre-match-extractor sre num-submatches) - (let* ((tmp (+ num-submatches 1)) - (tmp-end-src-offset (+ 2 (* tmp 4))) - (tmp-end-index-offset (+ 3 (* tmp 4)))) - (let lp ((sre sre) (n 1) (submatch-deps? #f)) - (cond - ((not (sre-has-submatches? sre)) - (if (not submatch-deps?) - (lambda (cnk start i end j matches) #t) - (let ((dfa (nfa->dfa (sre->nfa sre ~none)))) - (lambda (cnk start i end j matches) - (dfa-match/longest dfa cnk start i end j matches tmp))))) - ((pair? sre) - (case (car sre) - ((: seq) - (let* ((right (sre-sequence (cddr sre))) - (match-left (lp (cadr sre) n #t)) - (match-right - (lp right (+ n (sre-count-submatches (cadr sre))) #t))) - (lambda (cnk start i end j matches) - (let lp1 ((end2 end) (j2 j) (best-src #f) (best-index #f)) - (let ((limit (if (eq? start end2) - i - ((chunker-get-start cnk) end2)))) - (let lp2 ((k j2) (best-src best-src) (best-index best-index)) - (if (< k limit) - (cond - ((not (eq? start end2)) - (let ((prev (chunker-prev-chunk cnk start end2))) - (lp1 prev - ((chunker-get-end cnk) prev) - best-src - best-index))) - (best-src - (match-vector-set! matches tmp-end-src-offset best-src) - (match-vector-set! matches tmp-end-index-offset best-index) - #t) - (else - #f)) - (if (and (match-left cnk start i end2 k matches) - (eq? end2 (match-vector-ref matches - tmp-end-src-offset)) - (eqv? k (match-vector-ref matches - tmp-end-index-offset)) - (match-right cnk end2 k end j matches)) - (let ((right-src - (match-vector-ref matches tmp-end-src-offset)) - (right - (match-vector-ref matches tmp-end-index-offset))) - (cond - ((and (eq? end right-src) (eqv? j right)) - (match-vector-set! matches tmp-end-src-offset end) - (match-vector-set! matches tmp-end-index-offset j) - #t) - ((or (not best-src) - (if (eq? best-src right-src) - (> right best-index) - (chunk-before? cnk - best-src - right-src))) - (lp2 (- k 1) right-src right)) - (else - (lp2 (- k 1) best-src best-index)))) - (lp2 (- k 1) best-src best-index))))))))) - ((or) - (if (null? (cdr sre)) - (lambda (cnk start i end j matches) #f) - (let* ((rest (sre-alternate (cddr sre))) - (match-first - (lp (cadr sre) n #t)) - (match-rest - (lp rest - (+ n (sre-count-submatches (cadr sre))) - submatch-deps?))) - (lambda (cnk start i end j matches) - (or (and (match-first cnk start i end j matches) - (eq? end (match-vector-ref matches tmp-end-src-offset)) - (eqv? j (match-vector-ref matches tmp-end-index-offset))) - (match-rest cnk start i end j matches)))))) - ((* +) - (letrec ((match-once - (lp (sre-sequence (cdr sre)) n #t)) - (match-all - (lambda (cnk start i end j matches) - (if (match-once cnk start i end j matches) - (let ((src (match-vector-ref matches tmp-end-src-offset)) - (k (match-vector-ref matches tmp-end-index-offset))) - (if (and src (or (not (eq? start src)) (< i k))) - (match-all cnk src k end j matches) - #t)) - (begin - (match-vector-set! matches tmp-end-src-offset start) - (match-vector-set! matches tmp-end-index-offset i) - #t))))) - (if (eq? '* (car sre)) - match-all - (lambda (cnk start i end j matches) - (and (match-once cnk start i end j matches) - (let ((src (match-vector-ref matches tmp-end-src-offset)) - (k (match-vector-ref matches tmp-end-index-offset))) - (match-all cnk src k end j matches))))))) - ((?) - (let ((match-once (lp (sre-sequence (cdr sre)) n #t))) - (lambda (cnk start i end j matches) - (cond - ((match-once cnk start i end j matches) - #t) - (else - (match-vector-set! matches tmp-end-src-offset start) - (match-vector-set! matches tmp-end-index-offset i) - #t))))) - (($ submatch => submatch-named) - (let ((match-one - (lp (sre-sequence (if (memq (car sre) '($ submatch)) - (cdr sre) - (cddr sre))) - (+ n 1) - #t)) - (start-src-offset (* n 4)) - (start-index-offset (+ 1 (* n 4))) - (end-src-offset (+ 2 (* n 4))) - (end-index-offset (+ 3 (* n 4)))) - (lambda (cnk start i end j matches) - (cond - ((match-one cnk start i end j matches) - (match-vector-set! matches start-src-offset start) - (match-vector-set! matches start-index-offset i) - (match-vector-set! matches end-src-offset - (match-vector-ref matches tmp-end-src-offset)) - (match-vector-set! matches end-index-offset - (match-vector-ref matches tmp-end-index-offset)) - #t) - (else - #f))))) - (else - (error "unknown regexp operator" (car sre))))) - (else - (error "unknown regexp" sre)))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Closure Compilation -;; -;; We use this for non-regular expressions instead of an interpreted -;; NFA matcher. We use backtracking anyway, but this gives us more -;; freedom of implementation, allowing us to support patterns that -;; can't be represented in the above NFA representation. - -(define (sre->procedure sre . o) - (define names - (if (and (pair? o) (pair? (cdr o))) (cadr o) (sre-names sre 1 '()))) - (let lp ((sre sre) - (n 1) - (flags (if (pair? o) (car o) ~none)) - (next (lambda (cnk init src str i end matches fail) - (irregex-match-start-chunk-set! matches 0 (car init)) - (irregex-match-start-index-set! matches 0 (cdr init)) - (irregex-match-end-chunk-set! matches 0 src) - (irregex-match-end-index-set! matches 0 i) - (%irregex-match-fail-set! matches fail) - matches))) - ;; XXXX this should be inlined - (define (rec sre) (lp sre n flags next)) - (cond - ((pair? sre) - (if (string? (car sre)) - (sre-cset->procedure - (sre->cset (car sre) (flag-set? flags ~case-insensitive?)) - next) - (case (car sre) - ((~ - & /) - (sre-cset->procedure - (sre->cset sre (flag-set? flags ~case-insensitive?)) - next)) - ((or) - (case (length (cdr sre)) - ((0) (lambda (cnk init src str i end matches fail) (fail))) - ((1) (rec (cadr sre))) - (else - (let* ((first (rec (cadr sre))) - (rest (lp (sre-alternate (cddr sre)) - (+ n (sre-count-submatches (cadr sre))) - flags - next))) - (lambda (cnk init src str i end matches fail) - (first cnk init src str i end matches - (lambda () - (rest cnk init src str i end matches fail)))))))) - ((w/case) - (lp (sre-sequence (cdr sre)) - n - (flag-clear flags ~case-insensitive?) - next)) - ((w/nocase) - (lp (sre-sequence (cdr sre)) - n - (flag-join flags ~case-insensitive?) - next)) - ((w/utf8) - (lp (sre-sequence (cdr sre)) n (flag-join flags ~utf8?) next)) - ((w/noutf8) - (lp (sre-sequence (cdr sre)) n (flag-clear flags ~utf8?) next)) - ((seq :) - (case (length (cdr sre)) - ((0) next) - ((1) (rec (cadr sre))) - (else - (let ((rest (lp (sre-sequence (cddr sre)) - (+ n (sre-count-submatches (cadr sre))) - flags - next))) - (lp (cadr sre) n flags rest))))) - ((?) - (let ((body (rec (sre-sequence (cdr sre))))) - (lambda (cnk init src str i end matches fail) - (body cnk init src str i end matches - (lambda () (next cnk init src str i end matches fail)))))) - ((??) - (let ((body (rec (sre-sequence (cdr sre))))) - (lambda (cnk init src str i end matches fail) - (next cnk init src str i end matches - (lambda () (body cnk init src str i end matches fail)))))) - ((*) - (cond - ((sre-empty? (sre-sequence (cdr sre))) - (error "invalid sre: empty *" sre)) - (else - (letrec - ((body - (lp (sre-sequence (cdr sre)) - n - flags - (lambda (cnk init src str i end matches fail) - (body cnk init src str i end matches - (lambda () - (next cnk init src str i end matches fail) - )))))) - (lambda (cnk init src str i end matches fail) - (body cnk init src str i end matches - (lambda () - (next cnk init src str i end matches fail)))))))) - ((*?) - (cond - ((sre-empty? (sre-sequence (cdr sre))) - (error "invalid sre: empty *?" sre)) - (else - (letrec - ((body - (lp (sre-sequence (cdr sre)) - n - flags - (lambda (cnk init src str i end matches fail) - (next cnk init src str i end matches - (lambda () - (body cnk init src str i end matches fail) - )))))) - (lambda (cnk init src str i end matches fail) - (next cnk init src str i end matches - (lambda () - (body cnk init src str i end matches fail)))))))) - ((+) - (lp (sre-sequence (cdr sre)) - n - flags - (rec (list '* (sre-sequence (cdr sre)))))) - ((=) - (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre)))) - ((>=) - (rec `(** ,(cadr sre) #f ,@(cddr sre)))) - ((** **?) - (cond - ((or (and (number? (cadr sre)) - (number? (caddr sre)) - (> (cadr sre) (caddr sre))) - (and (not (cadr sre)) (caddr sre))) - (lambda (cnk init src str i end matches fail) (fail))) - (else - (let* ((from (cadr sre)) - (to (caddr sre)) - (? (if (eq? '** (car sre)) '? '??)) - (* (if (eq? '** (car sre)) '* '*?)) - (sre (sre-sequence (cdddr sre))) - (x-sre (sre-strip-submatches sre)) - (next (if to - (if (= from to) - next - (fold (lambda (x next) - (lp `(,? ,sre) n flags next)) - next - (zero-to (- to from)))) - (rec `(,* ,sre))))) - (if (zero? from) - next - (lp `(seq ,@(map (lambda (x) x-sre) (zero-to (- from 1))) - ,sre) - n - flags - next)))))) - ((word) - (rec `(seq bow ,@(cdr sre) eow))) - ((word+) - (rec `(seq bow (+ (& (or alphanumeric "_") - (or ,@(cdr sre)))) eow))) - ((posix-string) - (rec (string->sre (cadr sre)))) - ((look-ahead) - (let ((check - (lp (sre-sequence (cdr sre)) - n - flags - (lambda (cnk init src str i end matches fail) i)))) - (lambda (cnk init src str i end matches fail) - (if (check cnk init src str i end matches (lambda () #f)) - (next cnk init src str i end matches fail) - (fail))))) - ((neg-look-ahead) - (let ((check - (lp (sre-sequence (cdr sre)) - n - flags - (lambda (cnk init src str i end matches fail) i)))) - (lambda (cnk init src str i end matches fail) - (if (check cnk init src str i end matches (lambda () #f)) - (fail) - (next cnk init src str i end matches fail))))) - ((look-behind neg-look-behind) - (let ((check - (lp (sre-sequence - (cons '(* any) (append (cdr sre) '(eos)))) - n - flags - (lambda (cnk init src str i end matches fail) i)))) - (lambda (cnk init src str i end matches fail) - (let* ((prev ((chunker-get-substring cnk) - (car init) - (cdr init) - src - i)) - (len (string-length prev)) - (src2 (list prev 0 len))) - (if ((if (eq? (car sre) 'look-behind) (lambda (x) x) not) - (check irregex-basic-string-chunker - (cons src2 0) src2 prev 0 len matches (lambda () #f))) - (next cnk init src str i end matches fail) - (fail)))))) - ((atomic) - (let ((once - (lp (sre-sequence (cdr sre)) - n - flags - (lambda (cnk init src str i end matches fail) i)))) - (lambda (cnk init src str i end matches fail) - (let ((j (once cnk init src str i end matches (lambda () #f)))) - (if j - (next cnk init src str j end matches fail) - (fail)))))) - ((if) - (let* ((test-submatches (sre-count-submatches (cadr sre))) - (pass (lp (caddr sre) flags (+ n test-submatches) next)) - (fail (if (pair? (cdddr sre)) - (lp (cadddr sre) - (+ n test-submatches - (sre-count-submatches (caddr sre))) - flags - next) - (lambda (cnk init src str i end matches fail) - (fail))))) - (cond - ((or (number? (cadr sre)) (symbol? (cadr sre))) - (let ((index - (if (symbol? (cadr sre)) - (cond - ((assq (cadr sre) names) => cdr) - (else - (error "unknown named backref in SRE IF" sre))) - (cadr sre)))) - (lambda (cnk init src str i end matches fail2) - (if (%irregex-match-end-chunk matches index) - (pass cnk init src str i end matches fail2) - (fail cnk init src str i end matches fail2))))) - (else - (let ((test (lp (cadr sre) n flags pass))) - (lambda (cnk init src str i end matches fail2) - (test cnk init src str i end matches - (lambda () (fail cnk init src str i end matches fail2))) - )))))) - ((backref backref-ci) - (let ((n (cond ((number? (cadr sre)) (cadr sre)) - ((assq (cadr sre) names) => cdr) - (else (error "unknown backreference" (cadr sre))))) - (compare (if (or (eq? (car sre) 'backref-ci) - (flag-set? flags ~case-insensitive?)) - string-ci=? - string=?))) - (lambda (cnk init src str i end matches fail) - (let ((s (irregex-match-substring matches n))) - (if (not s) - (fail) - ;; XXXX create an abstract subchunk-compare - (let lp ((src src) - (str str) - (i i) - (end end) - (j 0) - (len (string-length s))) - (cond - ((<= len (- end i)) - (cond - ((compare (substring s j (string-length s)) - (substring str i (+ i len))) - (next cnk init src str (+ i len) end matches fail)) - (else - (fail)))) - (else - (cond - ((compare (substring s j (+ j (- end i))) - (substring str i end)) - (let ((src2 ((chunker-get-next cnk) src))) - (if src2 - (lp src2 - ((chunker-get-str cnk) src2) - ((chunker-get-start cnk) src2) - ((chunker-get-end cnk) src2) - (+ j (- end i)) - (- len (- end i))) - (fail)))) - (else - (fail))))))))))) - ((dsm) - (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) flags next)) - (($ submatch) - (let ((body - (lp (sre-sequence (cdr sre)) - (+ n 1) - flags - (lambda (cnk init src str i end matches fail) - (let ((old-source - (%irregex-match-end-chunk matches n)) - (old-index - (%irregex-match-end-index matches n))) - (irregex-match-end-chunk-set! matches n src) - (irregex-match-end-index-set! matches n i) - (next cnk init src str i end matches - (lambda () - (irregex-match-end-chunk-set! - matches n old-source) - (irregex-match-end-index-set! - matches n old-index) - (fail)))))))) - (lambda (cnk init src str i end matches fail) - (let ((old-source (%irregex-match-start-chunk matches n)) - (old-index (%irregex-match-start-index matches n))) - (irregex-match-start-chunk-set! matches n src) - (irregex-match-start-index-set! matches n i) - (body cnk init src str i end matches - (lambda () - (irregex-match-start-chunk-set! - matches n old-source) - (irregex-match-start-index-set! - matches n old-index) - (fail))))))) - ((=> submatch-named) - (rec `(submatch ,@(cddr sre)))) - (else - (error "unknown regexp operator" sre))))) - ((symbol? sre) - (case sre - ((any) - (lambda (cnk init src str i end matches fail) - (if (< i end) - (next cnk init src str (+ i 1) end matches fail) - (let ((src2 ((chunker-get-next cnk) src))) - (if src2 - (let ((str2 ((chunker-get-str cnk) src2)) - (i2 ((chunker-get-start cnk) src2)) - (end2 ((chunker-get-end cnk) src2))) - (next cnk init src2 str2 (+ i2 1) end2 matches fail)) - (fail)))))) - ((nonl) - (lambda (cnk init src str i end matches fail) - (if (< i end) - (if (not (eqv? #\newline (string-ref str i))) - (next cnk init src str (+ i 1) end matches fail) - (fail)) - (let ((src2 ((chunker-get-next cnk) src))) - (if src2 - (let ((str2 ((chunker-get-str cnk) src2)) - (i2 ((chunker-get-start cnk) src2)) - (end2 ((chunker-get-end cnk) src2))) - (if (not (eqv? #\newline (string-ref str2 i2))) - (next cnk init src2 str2 (+ i2 1) end2 matches fail) - (fail))) - (fail)))))) - ((bos) - (lambda (cnk init src str i end matches fail) - (if (and (eq? src (car init)) (eqv? i (cdr init))) - (next cnk init src str i end matches fail) - (fail)))) - ((bol) - (lambda (cnk init src str i end matches fail) - (if (or (and (eq? src (car init)) (eqv? i (cdr init))) - (and (> i ((chunker-get-start cnk) src)) - (eqv? #\newline (string-ref str (- i 1))))) - (next cnk init src str i end matches fail) - (fail)))) - ((bow) - (lambda (cnk init src str i end matches fail) - (if (and (or (if (> i ((chunker-get-start cnk) src)) - (not (char-alphanumeric? (string-ref str (- i 1)))) - (let ((ch (chunker-prev-char cnk src end))) - (and ch (not (char-alphanumeric? ch))))) - (and (eq? src (car init)) (eqv? i (cdr init)))) - (if (< i end) - (char-alphanumeric? (string-ref str i)) - (let ((next ((chunker-get-next cnk) src))) - (and next - (char-alphanumeric? - (string-ref ((chunker-get-str cnk) next) - ((chunker-get-start cnk) next))))))) - (next cnk init src str i end matches fail) - (fail)))) - ((eos) - (lambda (cnk init src str i end matches fail) - (if (and (>= i end) (not ((chunker-get-next cnk) src))) - (next cnk init src str i end matches fail) - (fail)))) - ((eol) - (lambda (cnk init src str i end matches fail) - (if (if (< i end) - (eqv? #\newline (string-ref str i)) - (let ((src2 ((chunker-get-next cnk) src))) - (if (not src2) - #t - (eqv? #\newline - (string-ref ((chunker-get-str cnk) src2) - ((chunker-get-start cnk) src2)))))) - (next cnk init src str i end matches fail) - (fail)))) - ((eow) - (lambda (cnk init src str i end matches fail) - (if (and (if (< i end) - (not (char-alphanumeric? (string-ref str i))) - (let ((ch (chunker-next-char cnk src))) - (or (not ch) (not (char-alphanumeric? ch))))) - (if (> i ((chunker-get-start cnk) src)) - (char-alphanumeric? (string-ref str (- i 1))) - (let ((prev (chunker-prev-char cnk init src))) - (or (not prev) (char-alphanumeric? prev))))) - (next cnk init src str i end matches fail) - (fail)))) - ((nwb) ;; non-word-boundary - (lambda (cnk init src str i end matches fail) - (let ((c1 (if (< i end) - (string-ref str i) - (chunker-next-char cnk src))) - (c2 (if (> i ((chunker-get-start cnk) src)) - (string-ref str (- i 1)) - (chunker-prev-char cnk init src)))) - (if (and c1 c2 - (if (char-alphanumeric? c1) - (char-alphanumeric? c2) - (not (char-alphanumeric? c2)))) - (next cnk init src str i end matches fail) - (fail))))) - ((epsilon) - next) - (else - (let ((cell (assq sre sre-named-definitions))) - (if cell - (rec (cdr cell)) - (error "unknown regexp" sre)))))) - ((char? sre) - (if (flag-set? flags ~case-insensitive?) - ;; case-insensitive - (lambda (cnk init src str i end matches fail) - (if (>= i end) - (let lp ((src2 ((chunker-get-next cnk) src))) - (if src2 - (let ((str2 ((chunker-get-str cnk) src2)) - (i2 ((chunker-get-start cnk) src2)) - (end2 ((chunker-get-end cnk) src2))) - (if (>= i2 end2) - (lp ((chunker-get-next cnk) src2)) - (if (char-ci=? sre (string-ref str2 i2)) - (next cnk init src2 str2 (+ i2 1) end2 - matches fail) - (fail)))) - (fail))) - (if (char-ci=? sre (string-ref str i)) - (next cnk init src str (+ i 1) end matches fail) - (fail)))) - ;; case-sensitive - (lambda (cnk init src str i end matches fail) - (if (>= i end) - (let lp ((src2 ((chunker-get-next cnk) src))) - (if src2 - (let ((str2 ((chunker-get-str cnk) src2)) - (i2 ((chunker-get-start cnk) src2)) - (end2 ((chunker-get-end cnk) src2))) - (if (>= i2 end2) - (lp ((chunker-get-next cnk) src2)) - (if (char=? sre (string-ref str2 i2)) - (next cnk init src2 str2 (+ i2 1) end2 - matches fail) - (fail)))) - (fail))) - (if (char=? sre (string-ref str i)) - (next cnk init src str (+ i 1) end matches fail) - (fail)))) - )) - ((string? sre) - (rec (sre-sequence (string->list sre))) -;; XXXX reintroduce faster string matching on chunks -;; (if (flag-set? flags ~case-insensitive?) -;; (rec (sre-sequence (string->list sre))) -;; (let ((len (string-length sre))) -;; (lambda (cnk init src str i end matches fail) -;; (if (and (<= (+ i len) end) -;; (%substring=? sre str 0 i len)) -;; (next str (+ i len) matches fail) -;; (fail))))) - ) - (else - (error "unknown regexp" sre))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Character Sets -;; -;; Simple character sets as lists of ranges, as used in the NFA/DFA -;; compilation. This is not especially efficient, but is portable and -;; scalable for any range of character sets. - -(define (sre-cset->procedure cset next) - (lambda (cnk init src str i end matches fail) - (if (< i end) - (if (cset-contains? cset (string-ref str i)) - (next cnk init src str (+ i 1) end matches fail) - (fail)) - (let ((src2 ((chunker-get-next cnk) src))) - (if src2 - (let ((str2 ((chunker-get-str cnk) src2)) - (i2 ((chunker-get-start cnk) src2)) - (end2 ((chunker-get-end cnk) src2))) - (if (cset-contains? cset (string-ref str2 i2)) - (next cnk init src2 str2 (+ i2 1) end2 matches fail) - (fail))) - (fail)))))) - -(define (plist->alist ls) - (let lp ((ls ls) (res '())) - (if (null? ls) - (reverse res) - (lp (cddr ls) (cons (cons (car ls) (cadr ls)) res))))) - -(define (alist->plist ls) - (let lp ((ls ls) (res '())) - (if (null? ls) - (reverse res) - (lp (cdr ls) (cons (cdar ls) (cons (caar ls) res)))))) - -(define (sre->cset sre . o) - (let lp ((sre sre) (ci? (and (pair? o) (car o)))) - (define (rec sre) (lp sre ci?)) - (cond - ((pair? sre) - (if (string? (car sre)) - (if ci? - (cset-case-insensitive (string->list (car sre))) - (string->list (car sre))) - (case (car sre) - ((~) - (cset-complement - (fold cset-union (rec (cadr sre)) (map rec (cddr sre))))) - ((&) - (fold cset-intersection (rec (cadr sre)) (map rec (cddr sre)))) - ((-) - (fold (lambda (x res) (cset-difference res x)) - (rec (cadr sre)) - (map rec (cddr sre)))) - ((/) - (let ((res (plist->alist (sre-flatten-ranges (cdr sre))))) - (if ci? - (cset-case-insensitive res) - res))) - ((or) - (fold cset-union (rec (cadr sre)) (map rec (cddr sre)))) - ((w/case) - (lp (sre-alternate (cdr sre)) #f)) - ((w/nocase) - (lp (sre-alternate (cdr sre)) #t)) - (else - (error "not a valid sre char-set operator" sre))))) - ((char? sre) (rec (list (string sre)))) - ((string? sre) (rec (list sre))) - (else - (let ((cell (assq sre sre-named-definitions))) - (if cell - (rec (cdr cell)) - (error "not a valid sre char-set" sre))))))) - -;; another debugging utility -;; (define (cset->sre cset) -;; (let lp ((ls cset) (chars '()) (ranges '())) -;; (cond -;; ((null? ls) -;; (sre-alternate -;; (append -;; (if (pair? chars) (list (list (list->string chars))) '()) -;; (if (pair? ranges) (list (cons '/ (alist->plist ranges))) '())))) -;; ((char? (car ls)) (lp (cdr ls) (cons (car ls) chars) ranges)) -;; (else (lp (cdr ls) chars (cons (car ls) ranges)))))) - -(define (cset-contains? cset ch) - (find (lambda (x) - (or (eqv? x ch) - (and (pair? x) (char<=? (car x) ch) (char<=? ch (cdr x))))) - cset)) - -(define (cset-range x) - (if (char? x) (cons x x) x)) - -(define (char-ranges-overlap? a b) - (if (pair? a) - (if (pair? b) - (or (and (char<=? (car a) (cdr b)) (char<=? (car b) (cdr a))) - (and (char<=? (cdr b) (car a)) (char<=? (cdr a) (car b)))) - (and (char<=? (car a) b) (char<=? b (cdr a)))) - (if (pair? b) - (char-ranges-overlap? b a) - (eqv? a b)))) - -(define (char-ranges-union a b) - (cons (if (char<=? (car a) (car b)) (car a) (car b)) - (if (char>=? (cdr a) (cdr b)) (cdr a) (cdr b)))) - -(define (cset-union a b) - (cond ((null? b) a) - ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) - => (lambda (ls) - (cset-union - (cset-union (append (take-up-to a ls) (cdr ls)) - (list (char-ranges-union (cset-range (car ls)) - (cset-range (car b))))) - (cdr b)))) - (else (cset-union (cons (car b) a) (cdr b))))) - -(define (cset-difference a b) - (cond ((null? b) a) - ((not (car b)) (cset-difference a (cdr b))) - ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) - => (lambda (ls) - (apply - (lambda (left1 left2 same right1 right2) - (let* ((a (append (take-up-to a ls) (cdr ls))) - (a (if left1 (cons left1 a) a)) - (a (if left2 (cons left2 a) a)) - (b (if right1 (cset-union b (list right1)) b)) - (b (if right2 (cset-union b (list right2)) b))) - (cset-difference a b))) - (intersect-char-ranges (cset-range (car ls)) - (cset-range (car b)))))) - (else (cset-difference a (cdr b))))) - -(define (cset-intersection a b) - (let intersect ((a a) (b b) (res '())) - (cond ((null? b) res) - ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) - => (lambda (ls) - (apply - (lambda (left1 left2 same right1 right2) - (let* ((a (append (take-up-to a ls) (cdr ls))) - (a (if left1 (cons left1 a) a)) - (a (if left2 (cons left2 a) a)) - (b (if right1 (cset-union b (list right1)) b)) - (b (if right2 (cset-union b (list right2)) b))) - (intersect a b (cset-union res (list same))))) - (intersect-char-ranges (cset-range (car ls)) - (cset-range (car b)))))) - (else (intersect a (cdr b) res))))) - -(define (cset-complement a) - (cset-difference (sre->cset *all-chars*) a)) - -(define (cset-case-insensitive a) - (let lp ((ls a) (res '())) - (cond ((null? ls) (reverse res)) - ((and (char? (car ls)) (char-alphabetic? (car ls))) - (let ((c2 (char-altcase (car ls))) - (res (cons (car ls) res))) - (lp (cdr ls) (if (cset-contains? res c2) res (cons c2 res))))) - ((and (pair? (car ls)) - (char-alphabetic? (caar ls)) - (char-alphabetic? (cdar ls))) - (lp (cdr ls) - (cset-union (cset-union res (list (car ls))) - (list (cons (char-altcase (caar ls)) - (char-altcase (cdar ls))))))) - (else (lp (cdr ls) (cset-union res (list (car ls)))))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;; Match and Replace Utilities - -(define (irregex-fold/fast irx kons knil str . o) - (if (not (string? str)) (error "irregex-fold: not a string" str)) - (if (not (procedure? kons)) (error "irregex-fold: not a procedure" kons)) - (let* ((irx (irregex irx)) - (matches (irregex-new-matches irx)) - (finish (or (and (pair? o) (car o)) (lambda (i acc) acc))) - (start (if (and (pair? o) (pair? (cdr o))) (cadr o) 0)) - (end (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o))) - (caddr o) - (string-length str)))) - (irregex-match-chunker-set! matches irregex-basic-string-chunker) - (let lp ((i start) (acc knil)) - (if (>= i end) - (finish i acc) - (let ((m (irregex-search/matches - irx - irregex-basic-string-chunker - (list str i end) - i - matches))) - (if (not m) - (finish i acc) - (let* ((end (%irregex-match-end-index m 0)) - (acc (kons i m acc))) - (irregex-reset-matches! matches) - (lp end acc)))))))) - -(define (irregex-fold irx kons . args) - (let ((kons2 (lambda (i m acc) (kons i (irregex-copy-matches m) acc)))) - (apply irregex-fold/fast irx kons2 args))) - -(define (irregex-fold/chunked/fast irx kons knil cnk start . o) - (let* ((irx (irregex irx)) - (matches (irregex-new-matches irx)) - (finish (or (and (pair? o) (car o)) (lambda (src i acc) acc))) - (i (if (and (pair? o) (pair? (cdr o))) - (cadr o) - ((chunker-get-start cnk) start)))) - (irregex-match-chunker-set! matches cnk) - (let lp ((start start) (i i) (acc knil)) - (if (not start) - (finish start i acc) - (let ((m (irregex-search/matches irx cnk start i matches))) - (if (not m) - (finish start i acc) - (let* ((acc (kons start i m acc)) - (end-src (%irregex-match-end-chunk m 0)) - (end-index (%irregex-match-end-index m 0))) - (irregex-reset-matches! matches) - (lp end-src end-index acc)))))))) - -(define (irregex-fold/chunked irx kons . args) - (if (not (procedure? kons)) (error "irregex-fold/chunked: not a procedure" kons)) - (let ((kons2 (lambda (s i m acc) (kons s i (irregex-copy-matches m) acc)))) - (apply irregex-fold/chunked/fast irx kons2 args))) - -(define (irregex-replace irx str . o) - (if (not (string? str)) (error "irregex-replace: not a string" str)) - (let ((m (irregex-search irx str))) - (and - m - (string-cat-reverse - (cons (substring str (%irregex-match-end-index m 0) (string-length str)) - (append (irregex-apply-match m o) - (list (substring str 0 (%irregex-match-start-index m 0))) - )))))) - -(define (irregex-replace/all irx str . o) - (if (not (string? str)) (error "irregex-replace/all: not a string" str)) - (irregex-fold/fast - irx - (lambda (i m acc) - (let ((m-start (%irregex-match-start-index m 0))) - (append (irregex-apply-match m o) - (if (>= i m-start) - acc - (cons (substring str i m-start) acc))))) - '() - str - (lambda (i acc) - (let ((end (string-length str))) - (string-cat-reverse (if (>= i end) - acc - (cons (substring str i end) acc))))))) - -(define (irregex-apply-match m ls) - (let lp ((ls ls) (res '())) - (if (null? ls) - res - (cond - ((integer? (car ls)) - (lp (cdr ls) - (cons (or (irregex-match-substring m (car ls)) "") res))) - ((procedure? (car ls)) - (lp (cdr ls) (cons ((car ls) m) res))) - ((symbol? (car ls)) - (case (car ls) - ((pre) - (lp (cdr ls) - (cons (substring (car (%irregex-match-start-chunk m 0)) - 0 - (%irregex-match-start-index m 0)) - res))) - ((post) - (let ((str (car (%irregex-match-start-chunk m 0)))) - (lp (cdr ls) - (cons (substring str - (%irregex-match-end-index m 0) - (string-length str)) - res)))) - (else - (cond - ((assq (car ls) (irregex-match-names m)) - => (lambda (x) (lp (cons (cdr x) (cdr ls)) res))) - (else - (error "unknown match replacement" (car ls))))))) - (else - (lp (cdr ls) (cons (car ls) res))))))) - -(define (irregex-extract irx str . o) - (if (not (string? str)) (error "irregex-extract: not a string" str)) - (apply irregex-fold/fast - irx - (lambda (i m a) (cons (irregex-match-substring m) a)) - '() - str - (lambda (i a) (reverse a)) - o)) - -(define (irregex-split irx str . o) - (if (not (string? str)) (error "irregex-split: not a string" str)) - (let ((start (if (pair? o) (car o) 0)) - (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str)))) - (irregex-fold/fast - irx - (lambda (i m a) - (if (= i (%irregex-match-start-index m 0)) - a - (cons (substring str i (%irregex-match-start-index m 0)) a))) - '() - str - (lambda (i a) - (reverse (if (= i end) a (cons (substring str i end) a)))) - start - end))) diff --git a/irregex-utils.scm b/irregex-utils.scm deleted file mode 100644 index 8332791d..00000000 --- a/irregex-utils.scm +++ /dev/null @@ -1,154 +0,0 @@ -;;;; irregex-utils.scm -;; -;; Copyright (c) 2010 Alex Shinn. All rights reserved. -;; BSD-style license: http://synthcode.com/license.txt - -(define rx-special-chars - "\\|[](){}.*+?^$#") - -(define (string-scan-char str c . o) - (let ((end (string-length str))) - (let scan ((i (if (pair? o) (car o) 0))) - (cond ((= i end) #f) - ((eqv? c (string-ref str i)) i) - (else (scan (+ i 1))))))) - -(define (irregex-quote str) - (list->string - (let loop ((ls (string->list str)) (res '())) - (if (null? ls) - (reverse res) - (let ((c (car ls))) - (if (string-scan-char rx-special-chars c) - (loop (cdr ls) (cons c (cons #\\ res))) - (loop (cdr ls) (cons c res)))))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(define (irregex-opt ls) - (define (make-alt ls) - (cond ((null? (cdr ls)) (car ls)) - ((every char? ls) (list (list->string ls))) - (else (cons 'or ls)))) - (define (make-seq ls) - (cond ((null? (cdr ls)) (car ls)) - ((every (lambda (x) (or (string? x) (char? x))) ls) - (apply string-append (map (lambda (x) (if (char? x) (string x) x)) ls))) - (else (cons 'seq ls)))) - (cond - ((null? ls) "") - ((null? (cdr ls)) (car ls)) - (else - (let ((chars (make-vector 256 '()))) - (let lp1 ((ls ls) (empty? #f)) - (if (null? ls) - (let lp2 ((i 0) (res '())) - (if (= i 256) - (let ((res (make-alt (reverse res)))) - (if empty? `(? ,res) res)) - (let ((c (integer->char i)) - (opts (vector-ref chars i))) - (lp2 (+ i 1) - (cond - ((null? opts) res) - ((equal? opts '("")) `(,c ,@res)) - (else `(,(make-seq (list c (irregex-opt opts))) - ,@res))))))) - (let* ((str (car ls)) - (len (string-length str))) - (if (zero? len) - (lp1 (cdr ls) #t) - (let ((i (char->integer (string-ref str 0)))) - (vector-set! - chars - i - (cons (substring str 1 len) (vector-ref chars i))) - (lp1 (cdr ls) empty?)))))))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(define (cset->string ls) - (let ((out (open-output-string))) - (let lp ((ls ls)) - (cond - ((pair? ls) - (cond - ((pair? (car ls)) - (display (irregex-quote (string (caar ls))) out) - (write-char #\- out) - (display (irregex-quote (string (cdar ls))) out)) - (else (display (irregex-quote (string (car ls))) out))) - (lp (cdr ls))))) - (get-output-string out))) - -(define (sre->string obj) - (let ((out (open-output-string))) - (let lp ((x obj)) - (cond - ((pair? x) - (case (car x) - ((: seq) - (cond - ((and (pair? (cddr x)) (pair? (cddr x)) (not (eq? x obj))) - (display "(?:" out) (for-each lp (cdr x)) (display ")" out)) - (else (for-each lp (cdr x))))) - ((submatch) - (display "(" out) (for-each lp (cdr x)) (display ")" out)) - ((submatch-named) - (display "(?<" out) (display (cadr x) out) (display ">" out) - (for-each lp (cddr x)) (display ")" out)) - ((or) - (display "(?:" out) - (lp (cadr x)) - (for-each (lambda (x) (display "|" out) (lp x)) (cddr x)) - (display ")" out)) - ((* + ? *? ??) - (cond - ((pair? (cddr x)) - (display "(?:" out) (for-each lp (cdr x)) (display ")" out)) - (else (lp (cadr x)))) - (display (car x) out)) - ((not) - (cond - ((and (pair? (cadr x)) (eq? 'cset (caadr x))) - (display "[^" out) - (display (cset->string (cdadr x)) out) - (display "]" out)) - (else (error "can't represent general 'not' in strings" x)))) - ((cset) - (display "[" out) - (display (cset->string (cdr x)) out) - (display "]" out)) - ((- & / ~) - (cond - ((or (eq? #\~ (car x)) - (and (eq? '- (car x)) (pair? (cdr x)) (eq? 'any (cadr x)))) - (display "[^" out) - (display (cset->string (if (eq? #\~ (car x)) (cdr x) (cddr x))) out) - (display "]" out)) - (else - (lp `(cset ,@(sre->cset x)))))) - ((w/case w/nocase) - (display "(?" out) - (if (eq? (car x) 'w/case) (display "-" out)) - (display ":" out) - (for-each lp (cdr x)) - (display ")" out)) - (else - (if (string? (car x)) - (lp `(cset ,@(string->list (car x)))) - (error "unknown sre operator" x))))) - ((symbol? x) - (case x - ((bos bol) (display "^" out)) - ((eos eol) (display "$" out)) - ((any nonl) (display "." out)) - (else (error "unknown sre symbol" x)))) - ((string? x) - (display (irregex-quote x) out)) - ((char? x) - (display (irregex-quote (string x)) out)) - (else - (error "unknown sre pattern" x)))) - (get-output-string out))) - diff --git a/irregex.import.scm b/irregex.import.scm index 63bd1320..7fc3bde7 100644 --- a/irregex.import.scm +++ b/irregex.import.scm @@ -26,49 +26,11 @@ (##sys#register-primitive-module 'irregex - '(irregex - irregex-apply-match - irregex-dfa - irregex-dfa/extract - irregex-dfa/search - irregex-extract - irregex-flags - irregex-fold - irregex-fold/chunked - irregex-lengths - irregex-match - irregex-match? - irregex-match-data? - irregex-match-end - irregex-match-end-chunk - irregex-match-end-index - irregex-match-names - irregex-match-num-submatches - irregex-match-start - irregex-match-start-chunk - irregex-match-start-index - irregex-match-string - irregex-match-subchunk - irregex-match-substring - irregex-match/chunked - irregex-names - irregex-new-matches - irregex-nfa - irregex-num-submatches - irregex-opt - irregex-quote - irregex-replace - irregex-replace/all - irregex-reset-matches! - irregex-search - irregex-search/chunked - irregex-search/matches - irregex-split - irregex? - make-irregex-chunker - maybe-string->sre - sre->irregex - sre->string - string->irregex - string->sre - )) + '(irregex string->irregex sre->irregex string->sre + irregex? irregex-match-data? + irregex-new-matches irregex-reset-matches! + irregex-match-start irregex-match-end irregex-match-substring + irregex-search irregex-search/matches irregex-match irregex-match-string + irregex-fold irregex-replace irregex-replace/all irregex-apply-match + irregex-dfa irregex-dfa/search irregex-dfa/extract + irregex-nfa irregex-flags irregex-submatches irregex-lengths irregex-names)) diff --git a/irregex.scm b/irregex.scm index 0447d3de..5d0f77e7 100644 --- a/irregex.scm +++ b/irregex.scm @@ -1,248 +1,2718 @@ -;;;; irregex.scm - container for irregex-core.scm -; -; Copyright (c) 2010, The Chicken Team -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following -; conditions are met: -; -; Redistributions of source code must retain the above copyright notice, this list of conditions and the following -; disclaimer. -; Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following -; disclaimer in the documentation and/or other materials provided with the distribution. -; Neither the name of the author nor the names of its contributors may be used to endorse or promote -; products derived from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS -; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY -; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR -; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -; POSSIBILITY OF SUCH DAMAGE. - - - -(declare (unit irregex)) - -(declare - (no-procedure-checks) - (fixnum) - (export - ##sys#glob->regexp - irregex - irregex-apply-match - irregex-dfa - irregex-dfa/extract - irregex-dfa/search - irregex-extract - irregex-flags - irregex-fold - irregex-fold/chunked - irregex-lengths - irregex-match - irregex-match? - irregex-match-data? - irregex-match-end - irregex-match-end-chunk - irregex-match-end-index - irregex-match-names - irregex-match-num-submatches - irregex-match-start - irregex-match-start-chunk - irregex-match-start-index - irregex-match-string - irregex-match-subchunk - irregex-match-substring - irregex-match/chunked - irregex-names - irregex-new-matches - irregex-nfa - irregex-num-submatches - irregex-opt - irregex-quote - irregex-replace - irregex-replace/all - irregex-reset-matches! - irregex-search - irregex-search/chunked - irregex-search/matches - irregex-split - irregex-submatches - irregex? - make-irregex-chunker - maybe-string->sre - irregex-search/chunked - sre->irregex - sre->string - string->irregex - string->sre - )) - -(include "common-declarations.scm") - -(register-feature! 'irregex) - -(define-syntax (build-cache x r c) - ;; (build-cache N ARG FAIL) - (let* ((n (cadr x)) - (n2 (* n 2)) - (arg (caddr x)) - (fail (cadddr x)) - (%cache (r 'cache)) - (%index (r 'index)) - (%arg (r 'arg)) - (%let (r 'let)) - (%let* (r 'let*)) - (%if (r 'if)) - (%fx+ (r 'fx+)) - (%fxmod (r 'fxmod)) - (%equal? (r 'equal?)) - (%quote (r 'quote)) - (%tmp (r 'tmp)) - (%begin (r 'begin)) - (cache (make-vector (add1 n2) #f))) - (##sys#setslot cache n2 0) ; last slot: current index - `(,%let* ((,%cache (,%quote ,cache)) ; we mutate a literal vector - (,%arg ,arg)) - ,(let fold ((i 0)) - (if (fx>= i n) - ;; this should be thread-safe: a context-switch can only - ;; happen before this code and in the call to FAIL. - `(,%let ((,%tmp ,fail) - (,%index (##sys#slot ,%cache ,n2))) - (##sys#setslot ,%cache ,%index ,%arg) - (##sys#setslot ,%cache (,%fx+ ,%index 1) ,%tmp) - (##sys#setislot - ,%cache ,n2 - (##core#inline "C_u_fixnum_modulo" (,%fx+ ,%index 2) ,n2)) - ,%tmp) - `(,%if (,%equal? (##sys#slot ,%cache ,(* i 2)) ,%arg) - (##sys#slot ,%cache ,(add1 (* i 2))) - ,(fold (add1 i)))))))) - -(define-compiler-syntax %%string-copy! - (syntax-rules () - ((_ to tstart from fstart fend) - (let ((x to) - (y tstart) - (z from) - (u fstart) - (v fend)) - (##core#inline "C_substring_copy" z x u v y))))) - -(define-compiler-syntax %substring=? - (syntax-rules () - ((_ a b start1 start2 len) - (##core#inline "C_substring_compare" a b start1 start2 len)))) - -(define-compiler-syntax make-irregex - (syntax-rules () - ((_ dfa dfa/search dfa/extract nfa flags submatches lengths names) - (##sys#make-structure - 'regexp dfa dfa/search dfa/extract nfa flags submatches lengths names)))) - -(define-compiler-syntax make-irregex-match - (syntax-rules () - ((_ count names) - (##sys#make-structure - 'regexp-match - (make-vector (+ (* 4 (+ 2 count)) 3) #f) ; #1: submatches - names ; #2: (guess) - #f ; #3: chunka - #f)))) ; #4: fail - -(define-compiler-syntax bit-shl - (syntax-rules () - ((_ n i) (fxshl n i)))) - -(define-compiler-syntax bit-shr - (syntax-rules () - ((_ n i) (fxshr n i)))) - -(define-compiler-syntax bit-not - (syntax-rules () - ((_ n) (fxnot n)))) - -(define-compiler-syntax bit-ior - (syntax-rules () - ((_ a b) (fxior a b)))) - -(define-compiler-syntax bit-and - (syntax-rules () - ((_ a b) (fxand a b)))) - -(define-compiler-syntax match-vector-ref - (syntax-rules () - ((_ m i) (##sys#slot (##sys#slot m 1) i)))) - -(define-compiler-syntax match-vector-set! - (syntax-rules () - ((_ m i x) (##sys#setslot (##sys#slot m 1) i x)))) - -(define-compiler-syntax irregex-match-start-chunk-set! - (syntax-rules () - ((_ m n start) - (vector-set! (##sys#slot m 1) (* n 4) start)))) - -(define-compiler-syntax irregex-match-start-index-set! - (syntax-rules () - ((_ m n start) - (vector-set! (##sys#slot m 1) (+ 1 (* n 4)) start)))) - -(define-compiler-syntax irregex-match-end-chunk-set! - (syntax-rules () - ((_ m n end) - (vector-set! (##sys#slot m 1) (+ 2 (* n 4)) end)))) - -(define-compiler-syntax irregex-match-end-index-set! - (syntax-rules () - ((_ m n end) - (vector-set! (##sys#slot m 1) (+ 3 (* n 4)) end)))) - -(include "irregex-core.scm") -(include "irregex-utils.scm") - -(define ##sys#glob->regexp - (let ((list->string list->string) - (string->list string->list)) - (lambda (s #!optional sre?) - (##sys#check-string s 'glob->regexp) - (let ((sre - (cons - ': - (let loop ((cs (string->list s)) (dir #t)) - (if (null? cs) - '() - (let ((c (car cs)) - (rest (cdr cs)) ) - (cond ((char=? c #\*) - (if dir - `((or (: (~ ("./\\")) - (* (~ ("/\\")))) - (* (~ ("./\\")))) - ,@(loop rest #f)) - `((* (~ ("/\\"))) ,@(loop rest #f)))) - ((char=? c #\?) (cons 'any (loop rest #f))) - ((char=? c #\[) - (let loop2 ((rest rest) (s '())) - (cond ((not (pair? rest)) - (error 'glob->regexp - "unexpected end of character class" s)) - ((char=? #\] (car rest)) - `((or ,@s) ,@(loop (cdr rest) #f))) - ((and (pair? (cdr rest)) - (pair? (cddr rest)) - (char=? #\- (cadr rest)) ) - (loop2 (cdddr rest) - (cons `(/ ,(car rest) ,(caddr rest)) s))) - ((and (pair? (cdr rest)) - (char=? #\- (car rest))) - (loop2 (cddr rest) - (cons `(~ ,(cadr rest)) s))) - (else - (loop2 (cdr rest) (cons (car rest) s)))))) - (else (cons c (loop rest (memq c '(#\\ #\/)))))))))))) - (if sre? sre (irregex sre)))))) +;;;; irregex.scm -- IrRegular Expressions +;; +;; Copyright (c) 2005-2008 Alex Shinn. All rights reserved. +;; BSD-style license: http://synthcode.com/license.txt + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; At this moment there was a loud ring at the bell, and I could +;; hear Mrs. Hudson, our landlady, raising her voice in a wail of +;; expostulation and dismay. +;; +;; "By heaven, Holmes," I said, half rising, "I believe that +;; they are really after us." +;; +;; "No, it's not quite so bad as that. It is the unofficial +;; force, -- the Baker Street irregulars." + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; History +;; +;; 0.6.2: 2008/07/26 - minor bugfixes, allow global disabling of utf8 mode, +;; friendlier error messages in parsing, \Q..\E support +;; 0.6.1: 2008/07/21 - added utf8 mode, more utils, bugfixes +;; 0.6: 2008/05/01 - most of PCRE supported +;; 0.5: 2008/04/24 - fully portable R4RS, many PCRE features implemented +;; 0.4: 2008/04/17 - rewriting NFA to use efficient closure compilation, +;; normal strings only, but all of the spencer tests pass +;; 0.3: 2008/03/10 - adding DFA converter (normal strings only) +;; 0.2: 2005/09/27 - adding irregex-opt (like elisp's regexp-opt) utility +;; 0.1: 2005/08/18 - simple NFA interpreter over abstract chunked strings + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define irregex-tag '*irregex-tag*) + +(define (make-irregex dfa dfa/search dfa/extract nfa flags + submatches lengths names) + (vector irregex-tag dfa dfa/search dfa/extract nfa flags + submatches lengths names)) + +(define (irregex? obj) + (and (vector? obj) + (= 9 (vector-length obj)) + (eq? irregex-tag (vector-ref obj 0)))) + +(define (irregex-dfa x) (vector-ref x 1)) +(define (irregex-dfa/search x) (vector-ref x 2)) +(define (irregex-dfa/extract x) (vector-ref x 3)) +(define (irregex-nfa x) (vector-ref x 4)) +(define (irregex-flags x) (vector-ref x 5)) +(define (irregex-submatches x) (vector-ref x 6)) +(define (irregex-lengths x) (vector-ref x 7)) +(define (irregex-names x) (vector-ref x 8)) + +(define (irregex-new-matches irx) + (make-irregex-match #f (irregex-submatches irx) (irregex-names irx))) +(define (irregex-reset-matches! m) + (do ((i (- (vector-length m) 1) (- i 1))) + ((<= i 3) m) + (vector-set! m i #f))) + +(define irregex-match-tag '*irregex-match-tag*) + +(define (irregex-match-data? obj) + (and (vector? obj) + (>= (vector-length obj) 5) + (eq? irregex-match-tag (vector-ref obj 0)))) + +(define (make-irregex-match str count names) + (let ((res (make-vector (+ (* 2 (+ 1 count)) 3) #f))) + (vector-set! res 0 irregex-match-tag) + (vector-set! res 1 str) + (vector-set! res 2 names) + res)) + +(define (irregex-match-num-submatches m) + (- (quotient (- (vector-length m) 3) 2) 1)) + +(define (irregex-match-string m) + (vector-ref m 1)) +(define (irregex-match-names m) + (vector-ref m 2)) +(define (irregex-match-string-set! m str) + (vector-set! m 1 str)) + +(define (irregex-match-start-index m n) + (vector-ref m (+ 3 (* n 2)))) +(define (irregex-match-end-index m n) + (vector-ref m (+ 4 (* n 2)))) + +(define (irregex-match-start-index-set! m n start) + (vector-set! m (+ 3 (* n 2)) start)) +(define (irregex-match-end-index-set! m n end) + (vector-set! m (+ 4 (* n 2)) end)) + +(define (irregex-match-index m opt) + (if (pair? opt) + (cond ((number? (car opt)) (car opt)) + ((assq (car opt) (irregex-match-names m)) => cdr) + (else (error "unknown match name" (car opt)))) + 0)) + +(define (irregex-match-valid-index? m n) + (and (< (+ 3 (* n 2)) (vector-length m)) + (vector-ref m (+ 4 (* n 2))))) + +(define (irregex-match-substring m . opt) + (let ((n (irregex-match-index m opt))) + (and (irregex-match-valid-index? m n) + (substring (irregex-match-string m) + (vector-ref m (+ 3 (* n 2))) + (vector-ref m (+ 4 (* n 2))))))) + +(define (irregex-match-start m . opt) + (let ((n (irregex-match-index m opt))) + (and (irregex-match-valid-index? m n) + (vector-ref m (+ 3 (* n 2)))))) + +(define (irregex-match-end m . opt) + (irregex-match-valid-index? m (irregex-match-index m opt))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; string utilities + +;;;; Unicode version (skip surrogates) +(define *all-chars* + `(/ ,(integer->char 0) ,(integer->char #xD7FF) + ,(integer->char #xE000) ,(integer->char #x10FFFF))) + +;;;; ASCII version, offset to not assume 0-255 +;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223)))) + +;; set to #f to ignore even an explicit request for utf8 handling +(define *allow-utf8-mode?* #t) + +;; (define *named-char-properties* '()) + +(define (string-scan-char str c . o) + (let ((end (string-length str))) + (let scan ((i (if (pair? o) (car o) 0))) + (cond ((= i end) #f) + ((eqv? c (string-ref str i)) i) + (else (scan (+ i 1))))))) + +(define (string-scan-char-escape str c . o) + (let ((end (string-length str))) + (let scan ((i (if (pair? o) (car o) 0))) + (cond ((= i end) #f) + ((eqv? c (string-ref str i)) i) + ((eqv? c #\\) (scan (+ i 2))) + (else (scan (+ i 1))))))) + +(define (string-scan-pred str pred . o) + (let ((end (string-length str))) + (let scan ((i (if (pair? o) (car o) 0))) + (cond ((= i end) #f) + ((pred (string-ref str i)) i) + (else (scan (+ i 1))))))) + +(define (string-split-char str c) + (let ((end (string-length str))) + (let lp ((i 0) (from 0) (res '())) + (define (collect) (cons (substring str from i) res)) + (cond ((>= i end) (reverse (collect))) + ((eqv? c (string-ref str i)) (lp (+ i 1) (+ i 1) (collect))) + (else (lp (+ i 1) from res)))))) + +(define (char-alphanumeric? c) + (or (char-alphabetic? c) (char-numeric? c))) + +;; SRFI-13 extracts + +(define (%%string-copy! to tstart from fstart fend) + (do ((i fstart (+ i 1)) + (j tstart (+ j 1))) + ((>= i fend)) + (string-set! to j (string-ref from i)))) + +(define (string-cat-reverse string-list) + (string-cat-reverse/aux + (fold (lambda (s a) (+ (string-length s) a)) 0 string-list) + string-list)) + +(define (string-cat-reverse/aux len string-list) + (let ((res (make-string len))) + (let lp ((i len) (ls string-list)) + (if (pair? ls) + (let* ((s (car ls)) + (slen (string-length s)) + (i (- i slen))) + (%%string-copy! res i s 0 slen) + (lp i (cdr ls))))) + res)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; list utilities + +;; like the one-arg IOTA case +(define (zero-to n) + (if (<= n 0) + '() + (let lp ((i (- n 1)) (res '())) + (if (zero? i) (cons 0 res) (lp (- i 1) (cons i res)))))) + +;; take the head of list FROM up to but not including TO, which must +;; be a tail of the list +(define (take-up-to from to) + (let lp ((ls from) (res '())) + (if (and (pair? ls) (not (eq? ls to))) + (lp (cdr ls) (cons (car ls) res)) + (reverse res)))) + +;; SRFI-1 extracts (simplified 1-ary versions) + +(define (find pred ls) + (cond ((find-tail pred ls) => car) + (else #f))) + +(define (find-tail pred ls) + (let lp ((ls ls)) + (cond ((null? ls) #f) + ((pred (car ls)) ls) + (else (lp (cdr ls)))))) + +(define (last ls) + (if (not (pair? ls)) + (error "can't take last of empty list" ls) + (let lp ((ls ls)) + (if (pair? (cdr ls)) + (lp (cdr ls)) + (car ls))))) + +(define (any pred ls) + (and (pair? ls) + (let lp ((head (car ls)) (tail (cdr ls))) + (if (null? tail) + (pred head) + (or (pred head) (lp (car tail) (cdr tail))))))) + +(define (every pred ls) + (or (null? ls) + (let lp ((head (car ls)) (tail (cdr ls))) + (if (null? tail) + (pred head) + (and (pred head) (lp (car tail) (cdr tail))))))) + +(define (fold kons knil ls) + (let lp ((ls ls) (res knil)) + (if (null? ls) + res + (lp (cdr ls) (kons (car ls) res))))) + +(define (filter pred ls) + (let lp ((ls ls) (res '())) + (if (null? ls) + (reverse res) + (lp (cdr ls) (if (pred (car ls)) (cons (car ls) res) res))))) + +(define (remove pred ls) + (let lp ((ls ls) (res '())) + (if (null? ls) + (reverse res) + (lp (cdr ls) (if (pred (car ls)) res (cons (car ls) res)))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; flags + +(define (bit-shr n i) + (quotient n (expt 2 i))) + +(define (bit-shl n i) + (* n (expt 2 i))) + +(define (bit-not n) (- #xFFFF n)) + +(define (bit-ior a b) + (cond + ((zero? a) b) + ((zero? b) a) + (else + (+ (if (or (odd? a) (odd? b)) 1 0) + (* 2 (bit-ior (quotient a 2) (quotient b 2))))))) + +(define (bit-and a b) + (cond + ((zero? a) 0) + ((zero? b) 0) + (else + (+ (if (and (odd? a) (odd? b)) 1 0) + (* 2 (bit-and (quotient a 2) (quotient b 2))))))) + +(define (flag-set? flags i) + (= i (bit-and flags i))) +(define (flag-join a b) + (if b (bit-ior a b) a)) +(define (flag-clear a b) + (bit-and a (bit-not b))) + +(define ~none 0) +(define ~searcher? 1) +(define ~consumer? 2) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; parsing + +(define ~save? 1) +(define ~case-insensitive? 2) +(define ~multi-line? 4) +(define ~single-line? 8) +(define ~ignore-space? 16) +(define ~utf8? 32) + +(define (symbol-list->flags ls) + (let lp ((ls ls) (res ~none)) + (if (not (pair? ls)) + res + (lp (cdr ls) + (flag-join + res + (case (car ls) + ((i ci case-insensitive) ~case-insensitive?) + ((m multi-line) ~multi-line?) + ((s single-line) ~single-line?) + ((x ignore-space) ~ignore-space?) + ((u utf8) ~utf8?) + (else #f))))))) + +(define (string->sre str . o) + (let ((end (string-length str)) + (flags (symbol-list->flags o))) + + (let lp ((i 0) (from 0) (flags flags) (res '()) (st '())) + + ;; handle case sensitivity at the literal char/string level + (define (cased-char ch) + (if (and (flag-set? flags ~case-insensitive?) + (char-alphabetic? ch)) + `(or ,ch ,(char-altcase ch)) + ch)) + (define (cased-string str) + (if (flag-set? flags ~case-insensitive?) + (sre-sequence (map cased-char (string->list str))) + str)) + ;; accumulate the substring from..i as literal text + (define (collect) + (if (= i from) res (cons (cased-string (substring str from i)) res))) + ;; like collect but breaks off the last single character when + ;; collecting literal data, as the argument to ?/*/+ etc. + (define (collect/single) + (let* ((utf8? (flag-set? flags ~utf8?)) + (j (if (and utf8? (> i 1)) + (utf8-backup-to-initial-char str (- i 1)) + (- i 1)))) + (cond + ((< j from) + res) + (else + (let ((c (cased-char (if utf8? + (utf8-string-ref str j (- i j) ) + (string-ref str j))))) + (cond + ((= j from) + (cons c res)) + (else + (cons c + (cons (cased-string (substring str from j)) + res))))))))) + ;; collects for use as a result, reversing and grouping OR + ;; terms, and some ugly tweaking of `function-like' groups and + ;; conditionals + (define (collect/terms) + (let* ((ls (collect)) + (func + (and (pair? ls) + (memq (last ls) + '(atomic if look-ahead neg-look-ahead + look-behind neg-look-behind submatch-named + w/utf8 w/noutf8)))) + (prefix (if (and func (eq? 'submatch-named (car func))) + (list 'submatch-named (cadr (reverse ls))) + (and func (list (car func))))) + (ls (if func + (if (eq? 'submatch-named (car func)) + (reverse (cddr (reverse ls))) + (reverse (cdr (reverse ls)))) + ls))) + (let lp ((ls ls) (term '()) (res '())) + (define (shift) + (cons (sre-sequence term) res)) + (cond + ((null? ls) + (let* ((res (sre-alternate (shift))) + (res (if (flag-set? flags ~save?) + (list 'submatch res) + res))) + (if prefix + (if (eq? 'if (car prefix)) + (cond + ((not (pair? res)) + 'epsilon) + ((memq (car res) + '(look-ahead neg-look-ahead + look-behind neg-look-behind)) + res) + ((eq? 'seq (car res)) + `(if ,(cadr res) + ,(if (pair? (cdr res)) + (sre-sequence (cddr res)) + 'epsilon))) + (else + `(if ,(cadadr res) + ,(if (pair? (cdr res)) + (sre-sequence (cddadr res)) + 'epsilon) + ,(sre-alternate + (if (pair? (cdr res)) (cddr res) '()))))) + `(,@prefix ,res)) + res))) + ((eq? 'or (car ls)) (lp (cdr ls) '() (shift))) + (else (lp (cdr ls) (cons (car ls) term) res)))))) + (define (save) + (cons (cons flags (collect)) st)) + + ;; main parsing + (if (>= i end) + (if (pair? st) + (error "unterminated parenthesis in regexp" str) + (collect/terms)) + (let ((c (string-ref str i))) + (case c + ((#\.) + (lp (+ i 1) (+ i 1) flags + (cons (if (flag-set? flags ~single-line?) 'any 'nonl) + (collect)) + st)) + ((#\?) + (let ((res (collect/single))) + (if (null? res) + (error "? can't follow empty sre" str res) + (let ((x (car res))) + (lp (+ i 1) + (+ i 1) + flags + (cons + (if (pair? x) + (case (car x) + ((*) `(*? ,@(cdr x))) + ((+) `(**? 1 #f ,@(cdr x))) + ((?) `(?? ,@(cdr x))) + ((**) `(**? ,@(cdr x))) + ((=) `(**? ,(cadr x) ,@(cdr x))) + ((>=) `(**? ,(cadr x) #f ,@(cddr x))) + (else `(? ,x))) + `(? ,x)) + (cdr res)) + st))))) + ((#\+ #\*) + (let* ((res (collect/single)) + (x (car res)) + (op (string->symbol (string c)))) + (cond + ((sre-repeater? x) + (error "duplicate repetition (e.g. **) in sre" str res)) + ((sre-empty? x) + (error "can't repeat empty sre (e.g. ()*)" str res)) + (else + (lp (+ i 1) (+ i 1) flags + (cons (list op x) (cdr res)) + st))))) + ((#\() + (cond + ((>= (+ i 1) end) + (error "unterminated parenthesis in regexp" str)) + ((not (eqv? #\? (string-ref str (+ i 1)))) + (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save))) + ((>= (+ i 2) end) + (error "unterminated parenthesis in regexp" str)) + (else + (case (string-ref str (+ i 2)) + ((#\#) + (let ((j (string-scan-char str #\) (+ i 3)))) + (lp (+ j i) (+ j 1) flags (collect) st))) + ((#\:) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save))) + ((#\=) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) + '(look-ahead) (save))) + ((#\!) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) + '(neg-look-ahead) (save))) + ((#\<) + (cond + ((>= (+ i 3) end) + (error "unterminated parenthesis in regexp" str)) + (else + (case (string-ref str (+ i 3)) + ((#\=) + (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) + '(look-behind) (save))) + ((#\!) + (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) + '(neg-look-behind) (save))) + (else + (let ((j (and (char-alphabetic? + (string-ref str (+ i 3))) + (string-scan-char str #\> (+ i 4))))) + (if j + (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) + `(,(string->symbol (substring str (+ i 3) j)) + submatch-named) + (save)) + (error "invalid (?< sequence" str)))))))) + ((#\>) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) + '(atomic) (save))) + ;;((#\' #\P) ; named subpatterns + ;; ) + ;;((#\R) ; recursion + ;; ) + ((#\() + (cond + ((>= (+ i 3) end) + (error "unterminated parenthesis in regexp" str)) + ((char-numeric? (string-ref str (+ i 3))) + (let* ((j (string-scan-char str #\) (+ i 3))) + (n (string->number (substring str (+ i 3) j)))) + (if (not n) + (error "invalid conditional reference" str) + (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) + `(,n if) (save))))) + ((char-alphabetic? (string-ref str (+ i 3))) + (let* ((j (string-scan-char str #\) (+ i 3))) + (s (string->symbol (substring str (+ i 3) j)))) + (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) + `(,s if) (save)))) + (else + (lp (+ i 2) (+ i 2) (flag-clear flags ~save?) + '(if) (save))))) + ((#\{) + (error "unsupported Perl-style cluster" str)) + (else + (let ((old-flags flags)) + (let lp2 ((j (+ i 2)) (flags flags) (invert? #f)) + (define (join x) + ((if invert? flag-clear flag-join) flags x)) + (define (new-res res) + (let ((before (flag-set? old-flags ~utf8?)) + (after (flag-set? flags ~utf8?))) + (if (eq? before after) + res + (cons (if after 'w/utf8 'w/noutf8) res)))) + (cond + ((>= j end) + (error "incomplete cluster" str i)) + (else + (case (string-ref str j) + ((#\i) + (lp2 (+ j 1) (join ~case-insensitive?) invert?)) + ((#\m) + (lp2 (+ j 1) (join ~multi-line?) invert?)) + ((#\x) + (lp2 (+ j 1) (join ~ignore-space?) invert?)) + ((#\u) + (if *allow-utf8-mode?* + (lp2 (+ j 1) (join ~utf8?) invert?) + (lp2 (+ j 1) flags invert?))) + ((#\-) + (lp2 (+ j 1) flags (not invert?))) + ((#\)) + (lp (+ j 1) (+ j 1) flags (new-res (collect)) + st)) + ((#\:) + (lp (+ j 1) (+ j 1) flags (new-res '()) + (cons (cons old-flags (collect)) st))) + (else + (error "unknown regex cluster modifier" str) + ))))))))))) + ((#\)) + (if (null? st) + (error "too many )'s in regexp" str) + (lp (+ i 1) + (+ i 1) + (caar st) + (cons (collect/terms) (cdar st)) + (cdr st)))) + ((#\[) + (apply + (lambda (sre j) + (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)) + (string-parse-cset str (+ i 1) flags))) + ((#\{) + (if (or (>= (+ i 1) end) + (not (or (char-numeric? (string-ref str (+ i 1))) + (eqv? #\, (string-ref str (+ i 1)))))) + (lp (+ i 1) from flags res st) + (let* ((res (collect/single)) + (x (car res)) + (tail (cdr res)) + (j (string-scan-char str #\} (+ i 1))) + (s2 (string-split-char (substring str (+ i 1) j) #\,)) + (n (or (string->number (car s2)) 0)) + (m (and (pair? (cdr s2)) (string->number (cadr s2))))) + (cond + ((null? (cdr s2)) + (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st)) + (m + (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st)) + (else + (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st) + ))))) + ((#\\) + (cond + ((>= (+ i 1) end) + (error "incomplete escape sequence" str)) + (else + (let ((c (string-ref str (+ i 1)))) + (case c + ((#\d) + (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st)) + ((#\D) + (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st)) + ((#\s) + (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st)) + ((#\S) + (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st)) + ((#\w) + (lp (+ i 2) (+ i 2) flags + `((or alphanumeric ("_")) ,@(collect)) st)) + ((#\W) + (lp (+ i 2) (+ i 2) flags + `((~ (or alphanumeric ("_"))) ,@(collect)) st)) + ((#\b) + (lp (+ i 2) (+ i 2) flags + `((or bow eow) ,@(collect)) st)) + ((#\B) + (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st)) + ((#\A) + (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st)) + ((#\Z) + (lp (+ i 2) (+ i 2) flags + `((? #\newline) eos ,@(collect)) st)) + ((#\z) + (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st)) + ((#\R) + (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st)) + ((#\K) + (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st)) + ;; these two are from Emacs and TRE, but not PCRE + ((#\<) + (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st)) + ((#\>) + (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st)) + ((#\x) + (apply + (lambda (ch j) + (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st)) + (string-parse-hex-escape str (+ i 2) end))) + ((#\k) + (let ((c (string-ref str (+ i 2)))) + (if (not (memv c '(#\< #\{ #\'))) + (error "bad \\k usage, expected \\k<...>" str) + (let* ((terminal (char-mirror c)) + (j (string-scan-char str terminal (+ i 2))) + (s (and j (substring str (+ i 3) j))) + (backref + (if (flag-set? flags ~case-insensitive?) + 'backref-ci + 'backref))) + (if (not j) + (error "interminated named backref" str) + (lp (+ j 1) (+ j 1) flags + `((,backref ,(string->symbol s)) + ,@(collect)) + st)))))) + ((#\Q) ;; \Q..\E escapes + (let ((res (collect))) + (let lp2 ((j (+ i 2))) + (cond + ((>= j end) + (lp j (+ i 2) flags res st)) + ((eqv? #\\ (string-ref str j)) + (cond + ((>= (+ j 1) end) + (lp (+ j 1) (+ i 2) flags res st)) + ((eqv? #\E (string-ref str (+ j 1))) + (lp (+ j 2) (+ j 2) flags + (cons (substring str (+ i 2) j) res) st)) + (else + (lp2 (+ j 2))))) + (else + (lp2 (+ j 1))))))) + ;;((#\p) ; XXXX unicode properties + ;; ) + ;;((#\P) + ;; ) + (else + (cond + ((char-numeric? c) + (let* ((j (or (string-scan-pred + str + (lambda (c) (not (char-numeric? c))) + (+ i 2)) + end)) + (backref + (if (flag-set? flags ~case-insensitive?) + 'backref-ci + 'backref)) + (res `((,backref ,(string->number + (substring str (+ i 1) j))) + ,@(collect)))) + (lp j j flags res st))) + ((char-alphabetic? c) + (let ((cell (assv c posix-escape-sequences))) + (if cell + (lp (+ i 2) (+ i 2) flags + (cons (cdr cell) (collect)) st) + (error "unknown escape sequence" str c)))) + (else + (lp (+ i 2) (+ i 1) flags (collect) st))))))))) + ((#\|) + (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st)) + ((#\^) + (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos))) + (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) + ((#\$) + (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos))) + (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) + ((#\space) + (if (flag-set? flags ~ignore-space?) + (lp (+ i 1) (+ i 1) flags (collect) st) + (lp (+ i 1) from flags res st))) + ((#\#) + (if (flag-set? flags ~ignore-space?) + (let ((j (or (string-scan-char str #\newline (+ i 1)) + (- end 1)))) + (lp (+ j 1) (+ j 1) flags (collect) st)) + (lp (+ i 1) from flags res st))) + (else + (lp (+ i 1) from flags res st)))))))) + +(define posix-escape-sequences + `((#\n . #\newline) + (#\r . ,(integer->char (+ (char->integer #\newline) 3))) + (#\t . ,(integer->char (- (char->integer #\newline) 1))) + (#\a . ,(integer->char (- (char->integer #\newline) 3))) + (#\e . ,(integer->char (+ (char->integer #\newline) #x11))) + (#\f . ,(integer->char (+ (char->integer #\newline) 2))) + )) + +(define (char-altcase c) + (if (char-upper-case? c) (char-downcase c) (char-upcase c))) + +(define (char-mirror c) + (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c))) + +(define (string-parse-hex-escape str i end) + (cond + ((>= i end) + (error "incomplete hex escape" str i)) + ((eqv? #\{ (string-ref str i)) + (let ((j (string-scan-char-escape str #\} (+ i 1)))) + (if (not j) + (error "incomplete hex brace escape" str i) + (let* ((s (substring str (+ i 1) j)) + (n (string->number s 16))) + (if n + (list (integer->char n) j) + (error "bad hex brace escape" s)))))) + ((>= (+ i 1) end) + (error "incomplete hex escape" str i)) + (else + (let* ((s (substring str i (+ i 2))) + (n (string->number s 16))) + (if n + (list (integer->char n) (+ i 2)) + (error "bad hex escape" s)))))) + +(define (string-parse-cset str start flags) + (let ((end (string-length str)) + (invert? (eqv? #\^ (string-ref str start))) + (utf8? (flag-set? flags ~utf8?))) + (define (go i chars ranges) + (if (>= i end) + (error "incomplete char set") + (let ((c (string-ref str i))) + (case c + ((#\]) + (if (and (null? chars) (null? ranges)) + (go (+ i 1) (cons #\] chars) ranges) + (let ((ci? (flag-set? flags ~case-insensitive?)) + (hi-chars (if utf8? (filter high-char? chars) '())) + (chars (if utf8? (remove high-char? chars) chars))) + (list + ((lambda (res) + (if invert? (cons '~ res) (sre-alternate res))) + (append + hi-chars + (if (pair? chars) + (list + (list (list->string + ((if ci? + cset-case-insensitive + (lambda (x) x)) + (reverse chars))))) + '()) + (if (pair? ranges) + (let ((res (if ci? + (cset-case-insensitive + (reverse ranges)) + (reverse ranges)))) + (list (cons '/ (alist->plist res)))) + '()))) + i)))) + ((#\-) + (cond + ((or (= i start) + (and (= i (+ start 1)) (eqv? #\^ (string-ref str start))) + (eqv? #\] (string-ref str (+ i 1)))) + (go (+ i 1) (cons c chars) ranges)) + ((null? chars) + (error "bad char-set")) + (else + (let* ((c1 (car chars)) + (c2 (string-ref str (+ i 1)))) + (apply + (lambda (c2 j) + (if (char<? c2 c1) + (error "inverted range in char-set" c1 c2) + (go j (cdr chars) (cons (cons c1 c2) ranges)))) + (cond + ((and (eqv? #\\ c2) (assv c2 posix-escape-sequences)) + => (lambda (x) (list (cdr x) (+ i 3)))) + ((and (eqv? #\\ c2) + (eqv? (string-ref str (+ i 2)) #\x)) + (string-parse-hex-escape str (+ i 3) end)) + ((and utf8? (<= #x80 (char->integer c2) #xFF)) + (let ((len (utf8-start-char->length c2))) + (list (utf8-string-ref str (+ i 1) len) (+ i 1 len)))) + (else + (list c2 (+ i 2))))))))) + ((#\[) + (let* ((inv? (eqv? #\^ (string-ref str (+ i 1)))) + (i2 (if inv? (+ i 2) (+ i 1)))) + (case (string-ref str i2) + ((#\:) + (let ((j (string-scan-char str #\: (+ i2 1)))) + (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1))))) + (error "incomplete character class" str) + (let* ((cset (sre->cset + (string->symbol + (substring str (+ i2 1) j)))) + (cset (if inv? (cset-complement cset) cset))) + (go (+ j 2) + (append (filter char? cset) chars) + (append (filter pair? cset) ranges)))))) + ((#\= #\.) + (error "collating sequences not supported" str)) + (else + (go (+ i 1) (cons #\[ chars) ranges))))) + ((#\\) + (let ((c (string-ref str (+ i 1)))) + (case c + ((#\d #\D #\s #\S #\w #\W) + (let ((cset (sre->cset (string->sre (string #\\ c))))) + (go (+ i 2) + (append (filter char? cset) chars) + (append (filter pair? cset) ranges)))) + ((#\x) + (apply + (lambda (ch j) + (go j (cons ch chars) ranges)) + (string-parse-hex-escape str (+ i 2) end))) + (else + (let ((c (cond ((assv c posix-escape-sequences) => cdr) + (else c)))) + (go (+ i 2) + (cons (string-ref str (+ i 1)) (cons c chars)) + ranges)))))) + (else + (if (and utf8? (<= #x80 (char->integer c) #xFF)) + (let ((len (utf8-start-char->length c))) + (go (+ i len) + (cons (utf8-string-ref str i len) chars) + ranges)) + (go (+ i 1) (cons c chars) ranges))))))) + (if invert? + (go (+ start 1) + (if (flag-set? flags ~multi-line?) '(#\newline) '()) + '()) + (go start '() '())))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; utf8 utilities + +;; Here are some hairy optimizations that need to be documented +;; better. Thanks to these, we never do any utf8 processing once the +;; regexp is compiled. + +;; two chars: ab..ef +;; a[b..xFF]|[b-d][x80..xFF]|e[x80..xFF] + +;; three chars: abc..ghi +;; ab[c..xFF]|a[d..xFF][x80..xFF]| +;; [b..f][x80..xFF][x80..xFF]| +;; g[x80..g][x80..xFF]|gh[x80..i] + +;; four chars: abcd..ghij +;; abc[d..xFF]|ab[d..xFF][x80..xFF]|a[c..xFF][x80..xFF][x80..xFF]| +;; [b..f][x80..xFF][x80..xFF][x80..xFF]| +;; g[x80..g][x80..xFF][x80..xFF]|gh[x80..h][x80..xFF]|ghi[x80..j] + +(define (high-char? c) (<= #x80 (char->integer c))) + +;; number of total bytes in a utf8 char given the 1st byte + +(define utf8-start-char->length + (let ((table '#( +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx +2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx +2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx +3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex +4 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx +))) + (lambda (c) (vector-ref table (char->integer c))))) + +(define (utf8-string-ref str i len) + (define (byte n) (char->integer (string-ref str n))) + (case len + ((1) ; shouldn't happen in this module + (string-ref str i)) + ((2) + (integer->char + (+ (bit-shl (bit-and (byte i) #b00011111) 6) + (bit-and (byte (+ i 1)) #b00111111)))) + ((3) + (integer->char + (+ (bit-shl (bit-and (byte i) #b00001111) 12) + (bit-shl (bit-and (byte (+ i 1)) #b00111111) 6) + (bit-and (byte (+ i 2)) #b00111111)))) + ((4) + (integer->char + (+ (bit-shl (bit-and (byte i) #b00000111) 18) + (bit-shl (bit-and (byte (+ i 1)) #b00111111) 12) + (bit-shl (bit-and (byte (+ i 2)) #b00111111) 6) + (bit-and (byte (+ i 3)) #b00111111)))) + (else + (error "invalid utf8 length" str len i)))) + +(define (utf8-backup-to-initial-char str i) + (let lp ((i i)) + (if (= i 0) + 0 + (let ((c (char->integer (string-ref str i)))) + (if (or (< c #x80) (>= c #xC0)) + i + (lp (- i 1))))))) + +(define (utf8-lowest-digit-of-length len) + (case len + ((1) 0) ((2) #xC0) ((3) #xE0) ((4) #xF0) + (else (error "invalid utf8 length" len)))) + +(define (utf8-highest-digit-of-length len) + (case len + ((1) #x7F) ((2) #xDF) ((3) #xEF) ((4) #xF7) + (else (error "invalid utf8 length" len)))) + +(define (char->utf8-list c) + (let ((i (char->integer c))) + (cond + ((<= i #x7F) (list i)) + ((<= i #x7FF) + (list (bit-ior #b11000000 (bit-shr i 6)) + (bit-ior #b10000000 (bit-and i #b111111)))) + ((<= i #xFFFF) + (list (bit-ior #b11100000 (bit-shr i 12)) + (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111)) + (bit-ior #b10000000 (bit-and i #b111111)))) + ((<= i #x1FFFFF) + (list (bit-ior #b11110000 (bit-shr i 18)) + (bit-ior #b10000000 (bit-and (bit-shr i 12) #b111111)) + (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111)) + (bit-ior #b10000000 (bit-and i #b111111)))) + (else (error "unicode codepoint out of range:" i))))) + +(define (unicode-range->utf8-pattern lo hi) + (let ((lo-ls (char->utf8-list lo)) + (hi-ls (char->utf8-list hi))) + (if (not (= (length lo-ls) (length hi-ls))) + (sre-alternate (list (unicode-range-climb-digits lo-ls hi-ls) + (unicode-range-up-to hi-ls))) + (let lp ((lo-ls lo-ls) (hi-ls hi-ls)) + (cond + ((null? lo-ls) + '()) + ((= (car lo-ls) (car hi-ls)) + (sre-sequence + (list (integer->char (car lo-ls)) + (lp (cdr lo-ls) (cdr hi-ls))))) + ((= (+ (car lo-ls) 1) (car hi-ls)) + (sre-alternate (list (unicode-range-up-from lo-ls) + (unicode-range-up-to hi-ls)))) + (else + (sre-alternate (list (unicode-range-up-from lo-ls) + (unicode-range-middle lo-ls hi-ls) + (unicode-range-up-to hi-ls))))))))) + +(define (unicode-range-helper one ls prefix res) + (if (null? ls) + res + (unicode-range-helper + one + (cdr ls) + (cons (car ls) prefix) + (cons (sre-sequence + `(,@(map integer->char prefix) + ,(one (car ls)) + ,@(map (lambda (_) + `(/ ,(integer->char #x80) + ,(integer->char #xFF))) + (cdr ls)))) + res)))) + +(define (unicode-range-up-from lo-ls) + (sre-sequence + (list (integer->char (car lo-ls)) + (sre-alternate + (unicode-range-helper + (lambda (c) + `(/ ,(integer->char (+ (car lo-ls) 1)) ,(integer->char #xFF))) + (cdr (reverse (cdr lo-ls))) + '() + (list + (sre-sequence + (append + (map integer->char (reverse (cdr (reverse (cdr lo-ls))))) + `((/ ,(integer->char (last lo-ls)) + ,(integer->char #xFF))))))))))) + +(define (unicode-range-up-to hi-ls) + (sre-sequence + (list (integer->char (car hi-ls)) + (sre-alternate + (unicode-range-helper + (lambda (c) + `(/ ,(integer->char #x80) ,(integer->char (- (car hi-ls) 1)))) + (cdr (reverse (cdr hi-ls))) + '() + (list + (sre-sequence + (append + (map integer->char (reverse (cdr (reverse (cdr hi-ls))))) + `((/ ,(integer->char #x80) + ,(integer->char (last hi-ls)))))))))))) + +(define (unicode-range-climb-digits lo-ls hi-ls) + (let ((lo-len (length lo-ls))) + (sre-alternate + (append + (list + (sre-sequence + (cons `(/ ,(integer->char (car lo-ls)) + ,(integer->char (if (<= (car lo-ls) #x7F) #x7F #xFF))) + (map (lambda (_) + `(/ ,(integer->char #x80) ,(integer->char #xFF))) + (cdr lo-ls))))) + (map + (lambda (i) + (sre-sequence + (cons + `(/ ,(integer->char (utf8-lowest-digit-of-length (+ i lo-len 1))) + ,(integer->char (utf8-highest-digit-of-length (+ i lo-len 1)))) + (map (lambda (_) + `(/ ,(integer->char #x80) ,(integer->char #xFF))) + (zero-to (+ i lo-len)))))) + (zero-to (- (length hi-ls) lo-len 1))) + (list + (sre-sequence + (cons `(/ ,(integer->char + (utf8-lowest-digit-of-length + (utf8-start-char->length + (integer->char (- (car hi-ls) 1))))) + ,(integer->char (- (car hi-ls) 1))) + (map (lambda (_) + `(/ ,(integer->char #x80) ,(integer->char #xFF))) + (cdr hi-ls))))))))) + +(define (unicode-range-middle lo-ls hi-ls) + (let ((lo (integer->char (+ (car lo-ls) 1))) + (hi (integer->char (- (car hi-ls) 1)))) + (sre-sequence + (cons (if (char=? lo hi) lo `(/ ,lo ,hi)) + (map (lambda (_) `(/ ,(integer->char #x80) ,(integer->char #xFF))) + (cdr lo-ls)))))) + +(define (cset->utf8-pattern cset) + (let lp ((ls cset) (alts '()) (lo-cset '())) + (cond + ((null? ls) + (sre-alternate (append (reverse alts) + (if (null? lo-cset) + '() + (list (cons '/ (reverse lo-cset))))))) + ((char? (car ls)) + (if (high-char? (car ls)) + (lp (cdr ls) (cons (car ls) alts) lo-cset) + (lp (cdr ls) alts (cons (car ls) lo-cset)))) + (else + (if (or (high-char? (caar ls)) (high-char? (cdar ls))) + (lp (cdr ls) + (cons (unicode-range->utf8-pattern (caar ls) (cdar ls)) alts) + lo-cset) + (lp (cdr ls) alts (cons (cdar ls) (cons (caar ls) lo-cset)))))))) + +(define (sre-adjust-utf8 sre flags) + (let adjust ((sre sre) + (utf8? (flag-set? flags ~utf8?)) + (ci? (flag-set? flags ~case-insensitive?))) + (define (rec sre) (adjust sre utf8? ci?)) + (cond + ((pair? sre) + (case (car sre) + ((w/utf8) (adjust (sre-sequence (cdr sre)) #t ci?)) + ((w/noutf8) (adjust (sre-sequence (cdr sre)) #f ci?)) + ((w/case) + (cons (car sre) (map (lambda (s) (adjust s utf8? #f)) (cdr sre)))) + ((w/nocase) + (cons (car sre) (map (lambda (s) (adjust s utf8? #t)) (cdr sre)))) + ((/ ~ & -) + (if (not utf8?) + sre + (let ((cset (sre->cset sre ci?))) + (if (any (lambda (x) + (if (pair? x) + (or (high-char? (car x)) (high-char? (cdr x))) + (high-char? x))) + cset) + (if ci? + (list 'w/case (cset->utf8-pattern cset)) + (cset->utf8-pattern cset)) + sre)))) + ((*) + (case (sre-sequence (cdr sre)) + ;; special case optimization: .* w/utf8 == .* w/noutf8 + ((any) '(* any)) + ((nonl) '(* nonl)) + (else (cons '* (map rec (cdr sre)))))) + (else + (cons (car sre) (map rec (cdr sre)))))) + (else + (case sre + ((any) 'utf8-any) + ((nonl) 'utf8-nonl) + (else + (if (and utf8? (char? sre) (high-char? sre)) + (sre-sequence (map integer->char (char->utf8-list sre))) + sre))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; compilation + +(define (irregex x . o) + (cond + ((irregex? x) x) + ((string? x) (apply string->irregex x o)) + (else (apply sre->irregex x o)))) + +(define (string->irregex str . o) + (apply sre->irregex (apply string->sre str o) o)) + +(define (sre->irregex sre . o) + (let* ((pat-flags (symbol-list->flags o)) + (sre (if *allow-utf8-mode?* + (sre-adjust-utf8 sre pat-flags) + sre)) + (searcher? (sre-searcher? sre)) + (sre-dfa (if searcher? (sre-remove-initial-bos sre) sre)) + (dfa-limit (cond ((memq 'small o) 1) ((memq 'fast o) 50) (else 10))) + (dfa/search + (if searcher? + #t + (cond ((sre->nfa `(seq (* any) ,sre-dfa) pat-flags) + => (lambda (nfa) (nfa->dfa nfa (* dfa-limit (length nfa))))) + (else #f)))) + (dfa (cond ((and dfa/search (sre->nfa sre-dfa pat-flags)) + => (lambda (nfa) (nfa->dfa nfa (* dfa-limit (length nfa))))) + (else #f))) + (extractor (and dfa dfa/search (sre-match-extractor sre-dfa))) + (submatches (sre-count-submatches sre-dfa)) + (names (sre-names sre-dfa 1 '())) + (lens (sre-length-ranges sre-dfa names)) + (flags (flag-join + (flag-join ~none (and searcher? ~searcher?)) + (and (sre-consumer? sre) ~consumer?)))) + (cond + (dfa + (make-irregex dfa dfa/search extractor #f flags submatches lens names)) + (else + (let ((f (sre->procedure sre pat-flags names))) + (make-irregex #f #f #f f flags submatches lens names)))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sre analysis + +;; returns #t if the sre can ever be empty +(define (sre-empty? sre) + (if (pair? sre) + (case (car sre) + ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t) + ((**) (or (not (number? (cadr sre))) (zero? (cadr sre)))) + ((or) (any sre-empty? (cdr sre))) + ((: seq submatch + atomic) (every sre-empty? (cdr sre))) + (else #f)) + (memq sre '(epsilon bos eos bol eol bow eow commit)))) + +(define (sre-any? sre) + (or (eq? sre 'any) + (and (pair? sre) + (case (car sre) + ((seq : submatch) + (and (pair? (cdr sre)) (null? (cddr sre)) (sre-any? (cadr sre)))) + ((or) (every sre-any? (cdr sre))) + (else #f))))) + +(define (sre-repeater? sre) + (and (pair? sre) + (or (memq (car sre) '(* +)) + (and (memq (car sre) '(submatch seq :)) + (pair? (cdr sre)) + (null? (cddr sre)) + (sre-repeater? (cadr sre)))))) + +(define (sre-searcher? sre) + (if (pair? sre) + (case (car sre) + ((* +) (sre-any? (sre-sequence (cdr sre)))) + ((seq : submatch) (and (pair? (cdr sre)) (sre-searcher? (cadr sre)))) + ((or) (every sre-searcher? (cdr sre))) + (else #f)) + (eq? 'bos sre))) + +(define (sre-consumer? sre) + (if (pair? sre) + (case (car sre) + ((* +) (sre-any? (sre-sequence (cdr sre)))) + ((seq : submatch) (and (pair? (cdr sre)) (sre-consumer? (last sre)))) + ((or) (every sre-consumer? (cdr sre))) + (else #f)) + (eq? 'eos sre))) + +(define (sre-has-submatchs? sre) + (and (pair? sre) + (or (eq? 'submatch (car sre)) + (any sre-has-submatchs? (cdr sre))))) + +(define (sre-count-submatches sre) + (let count ((sre sre) (sum 0)) + (if (pair? sre) + (fold count + (+ sum (case (car sre) + ((submatch submatch-named) 1) + ((dsm) (+ (cadr sre) (caddr sre))) + (else 0))) + (cdr sre)) + sum))) + +(define (sre-length-ranges sre . o) + (let ((names (if (pair? o) (car o) (sre-names sre 1 '()))) + (sublens (make-vector (+ 1 (sre-count-submatches sre)) #f))) + (vector-set! + sublens + 0 + (let lp ((sre sre) (n 1) (lo 0) (hi 0) (return cons)) + (define (grow i) (return (+ lo i) (and hi (+ hi i)))) + (cond + ((pair? sre) + (if (string? (car sre)) + (grow 1) + (case (car sre) + ((/ ~ & -) + (grow 1)) + ((posix-string) + (lp (string->sre (cadr sre)) n lo hi return)) + ((seq : w/case w/nocase atomic) + (let lp2 ((ls (cdr sre)) (n n) (lo2 0) (hi2 0)) + (if (null? ls) + (return (+ lo lo2) (and hi hi2 (+ hi hi2))) + (lp (car ls) n 0 0 + (lambda (lo3 hi3) + (lp2 (cdr ls) + (+ n (sre-count-submatches (car ls))) + (+ lo2 lo3) + (and hi2 hi3 (+ hi2 hi3)))))))) + ((or) + (let lp2 ((ls (cdr sre)) (n n) (lo2 #f) (hi2 0)) + (if (null? ls) + (return (+ lo lo2) (and hi hi2 (+ hi hi2))) + (lp (car ls) n 0 0 + (lambda (lo3 hi3) + (lp2 (cdr ls) + (+ n (sre-count-submatches (car ls))) + (if lo2 (min lo2 lo3) lo3) + (and hi2 hi3 (max hi2 hi3)))))))) + ((if) + (cond + ((or (null? (cdr sre)) (null? (cddr sre))) + (return lo hi)) + (else + (let ((n1 (sre-count-submatches (car sre))) + (n2 (sre-count-submatches (cadr sre)))) + (lp (if (or (number? (cadr sre)) (symbol? (cadr sre))) + 'epsilon + (cadr sre)) + n lo hi + (lambda (lo2 hi2) + (lp (caddr sre) (+ n n1) 0 0 + (lambda (lo3 hi3) + (lp (if (pair? (cdddr sre)) + (cadddr sre) + 'epsilon) + (+ n n1 n2) 0 0 + (lambda (lo4 hi4) + (return (+ lo2 (min lo3 lo4)) + (and hi2 hi3 hi4 + (+ hi2 (max hi3 hi4)) + )))))))))))) + ((dsm) + (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) lo hi return)) + ((submatch submatch-named) + (lp (sre-sequence + (if (eq? 'submatch (car sre)) (cdr sre) (cddr sre))) + (+ n 1) lo hi + (lambda (lo2 hi2) + (vector-set! sublens n (cons lo2 hi2)) + (return lo2 hi2)))) + ((backref backref-ci) + (let ((n (cond + ((number? (cadr sre)) (cadr sre)) + ((assq (cadr sre) names) => cdr) + (else (error "unknown backreference" (cadr sre)))))) + (cond + ((or (not (integer? n)) + (not (< 0 n (vector-length sublens)))) + (error "sre-length: invalid backreference" sre)) + ((not (vector-ref sublens n)) + (error "sre-length: invalid forward backreference" sre)) + (else + (let ((lo2 (car (vector-ref sublens n))) + (hi2 (cdr (vector-ref sublens n)))) + (return (+ lo lo2) (and hi hi2 (+ hi hi2)))))))) + ((* *?) + (lp (sre-sequence (cdr sre)) n lo hi (lambda (lo hi) #f)) + (return lo #f)) + ((** **?) + (cond + ((or (and (number? (cadr sre)) + (number? (caddr sre)) + (> (cadr sre) (caddr sre))) + (and (not (cadr sre)) (caddr sre))) + (return lo hi)) + (else + (if (caddr sre) + (lp (sre-sequence (cdddr sre)) n 0 0 + (lambda (lo2 hi2) + (return (+ lo (* (cadr sre) lo2)) + (and hi hi2 (+ hi (* (caddr sre) hi2)))))) + (lp (sre-sequence (cdddr sre)) n 0 0 + (lambda (lo2 hi2) + (return (+ lo (* (cadr sre) lo2)) #f))))))) + ((+) + (lp (sre-sequence (cdr sre)) n lo hi + (lambda (lo2 hi2) + (return (+ lo lo2) #f)))) + ((? ??) + (lp (sre-sequence (cdr sre)) n lo hi + (lambda (lo2 hi2) + (return lo (and hi hi2 (+ hi hi2)))))) + ((= =? >= >=?) + (lp `(** ,(cadr sre) + ,(if (memq (car sre) '(>= >=?)) #f (cadr sre)) + ,@(cddr sre)) + n lo hi return)) + ((look-ahead neg-look-ahead look-behind neg-look-behind) + (return lo hi)) + (else + (error "sre-length-ranges: unknown sre operator" sre))))) + ((char? sre) + (grow 1)) + ((string? sre) + (grow (string-length sre))) + ((memq sre '(any nonl)) + (grow 1)) + ((memq sre '(epsilon bos eos bol eol bow eow nwb commit)) + (return lo hi)) + (else + (let ((cell (assq sre sre-named-definitions))) + (if cell + (lp (cdr cell) n lo hi return) + (error "sre-length-ranges: unknown sre" sre))))))) + sublens)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sre manipulation + +;; build a (seq ls ...) sre from a list +(define (sre-sequence ls) + (cond + ((null? ls) 'epsilon) + ((null? (cdr ls)) (car ls)) + (else (cons 'seq ls)))) + +;; build a (or ls ...) sre from a list +(define (sre-alternate ls) + (cond + ((null? ls) 'epsilon) + ((null? (cdr ls)) (car ls)) + (else (cons 'or ls)))) + +;; returns an equivalent SRE without any match information +(define (sre-strip-submatches sre) + (if (not (pair? sre)) + sre + (case (car sre) + ((submatch) (sre-strip-submatches (sre-sequence (cdr sre)))) + ((dsm) (sre-strip-submatches (sre-sequence (cdddr sre)))) + (else (map sre-strip-submatches sre))))) + +;; given a char-set list of chars and strings, flattens them into +;; chars only +(define (sre-flatten-ranges ls) + (let lp ((ls ls) (res '())) + (cond + ((null? ls) + (reverse res)) + ((string? (car ls)) + (lp (append (string->list (car ls)) (cdr ls)) res)) + (else + (lp (cdr ls) (cons (car ls) res)))))) + +(define (sre-names sre n names) + (if (not (pair? sre)) + names + (case (car sre) + ((submatch) + (sre-names (sre-sequence (cdr sre)) (+ n 1) names)) + ((submatch-named) + (sre-names (sre-sequence (cddr sre)) + (+ n 1) + (cons (cons (cadr sre) n) names))) + ((dsm) + (sre-names (sre-sequence (cdddr sre)) (+ n (cadr sre)) names)) + ((seq : or * + ? *? ?? w/case w/nocase atomic + look-ahead look-behind neg-look-ahead neg-look-behind) + (sre-sequence-names (cdr sre) n names)) + ((= >=) + (sre-sequence-names (cddr sre) n names)) + ((** **?) + (sre-sequence-names (cdddr sre) n names)) + (else + names)))) + +(define (sre-sequence-names ls n names) + (if (null? ls) + names + (sre-sequence-names (cdr ls) + (+ n (sre-count-submatches (car ls))) + (sre-names (car ls) n names)))) + +(define (sre-remove-initial-bos sre) + (cond + ((pair? sre) + (case (car sre) + ((seq : submatch * +) + (cond + ((not (pair? (cdr sre))) + sre) + ((eq? 'bos (cadr sre)) + (cons (car sre) (cddr sre))) + (else + (cons (car sre) + (cons (sre-remove-initial-bos (cadr sre)) (cddr sre)))))) + ((or) + (sre-alternate (map sre-remove-initial-bos (cdr sre)))) + (else + sre))) + (else + sre))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; matching + +(define (irregex-search x str . o) + (let ((irx (irregex x))) + (let ((start (if (pair? o) (car o) 0)) + (end (if (and (pair? o) (pair? (cdr o))) + (cadr o) (string-length str))) + (matches (irregex-new-matches irx))) + (irregex-match-string-set! matches str) + (irregex-search/matches irx str start end matches)))) + +;; internal routine, can be used in loops to avoid reallocating the +;; match vector +(define (irregex-search/matches irx str start end matches) + (cond + ((irregex-dfa irx) + (cond + ((flag-set? (irregex-flags irx) ~searcher?) + (let ((m-end (dfa-match/longest (irregex-dfa irx) str start end))) + (cond + (m-end + (irregex-match-start-index-set! matches 0 start) + (irregex-match-end-index-set! matches 0 m-end) + ((irregex-dfa/extract irx) str start m-end matches) + matches) + (else + #f)))) + (else + (let ((first-match + (dfa-match/shortest (irregex-dfa/search irx) str start end))) + (and + first-match + (let* ((lo+hi (vector-ref (irregex-lengths irx) 0)) + (m-start (if (cdr lo+hi) + (max start (- first-match (cdr lo+hi))) + start)) + (m-limit (- first-match (car lo+hi))) + (dfa (irregex-dfa irx))) + (let lp ((m-start m-start)) + (and (<= m-start m-limit) + (let ((m-end (dfa-match/longest dfa str m-start end))) + (cond + (m-end + (irregex-match-start-index-set! matches 0 m-start) + (irregex-match-end-index-set! matches 0 m-end) + ((irregex-dfa/extract irx) str m-start m-end matches) + matches) + (else + (lp (+ m-start 1))))))))))))) + (else + (let ((matcher (irregex-nfa irx))) + (let lp ((start start)) + (and (<= start end) + (let ((i (matcher str start matches (lambda () #f)))) + (cond + (i + (irregex-match-start-index-set! matches 0 start) + (irregex-match-end-index-set! matches 0 i) + matches) + (else + (lp (+ start 1))))))))))) + +(define (irregex-match irx str) + (let* ((irx (irregex irx)) + (matches (irregex-new-matches irx)) + (start 0) + (end (string-length str))) + (irregex-match-string-set! matches str) + (cond + ((irregex-dfa irx) + (let ((m-end (dfa-match/longest (irregex-dfa irx) str start end))) + (cond + ((equal? m-end end) + (irregex-match-start-index-set! matches 0 start) + (irregex-match-end-index-set! matches 0 m-end) + ((irregex-dfa/extract irx) str start m-end matches) + matches) + (else + #f)))) + (else + (let* ((matcher (irregex-nfa irx)) + (i (matcher str start matches (lambda () #f)))) + (cond + ((equal? i end) + (irregex-match-start-index-set! matches 0 start) + (irregex-match-end-index-set! matches 0 i) + matches) + (else + #f))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; DFA matching + +;; inline these +(define (dfa-init-state dfa) + (vector-ref dfa 0)) +(define (dfa-next-state dfa node) + (vector-ref dfa (cdr node))) +(define (dfa-final-state? dfa state) + (car state)) + +;; this searches for the first end index for which a match is possible +(define (dfa-match/shortest dfa str start end) + (let lp ((i start) (state (dfa-init-state dfa))) + (if (dfa-final-state? dfa state) + i + (and (< i end) + (let* ((ch (string-ref str i)) + (next (find (lambda (x) + (or (eqv? ch (car x)) + (and (pair? (car x)) + (char<=? (caar x) ch) + (char<=? ch (cdar x))))) + (cdr state)))) + (and next (lp (+ i 1) (dfa-next-state dfa next)))))))) + +;; this finds the longest match starting at a given index +(define (dfa-match/longest dfa str start end) + (let lp ((i start) + (state (dfa-init-state dfa)) + (res (and (dfa-final-state? dfa (dfa-init-state dfa)) start))) + (if (>= i end) + res + (let* ((ch (string-ref str i)) + (cell (find (lambda (x) + (or (eqv? ch (car x)) + (and (pair? (car x)) + (char<=? (caar x) ch) + (char<=? ch (cdar x))))) + (cdr state)))) + (if cell + (let ((next (dfa-next-state dfa cell))) + (lp (+ i 1) + next + (if (dfa-final-state? dfa next) (+ i 1) res))) + res))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; SRE->NFA compilation +;; +;; An NFA state is a numbered node with a list of patter->number +;; transitions, where pattern is either a character, (lo . hi) +;; character range, or epsilon (indicating an empty transition). +;; There may be duplicate characters and overlapping ranges - since +;; it's an NFA we process it by considering all possible transitions. + +(define sre-named-definitions + `((any . ,*all-chars*) + (nonl . (- ,*all-chars* (,(string #\newline)))) + (alphabetic . (/ #\a #\z #\A #\Z)) + (alpha . alphabetic) + (alphanumeric . (/ #\a #\z #\A #\Z #\0 #\9)) + (alphanum . alphanumeric) + (alnum . alphanumeric) + (lower-case . (/ #\a #\z)) + (lower . lower-case) + (upper-case . (/ #\A #\Z)) + (upper . upper-case) + (numeric . (/ #\0 #\9)) + (num . numeric) + (digit . numeric) + (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\. + #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\})) + (punct . punctuation) + (graphic + . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~)) + (graph . graphic) + (blank . (or #\space ,(integer->char (- (char->integer #\space) 23)))) + (whitespace . (or blank #\newline)) + (space . whitespace) + (white . whitespace) + (printing or graphic whitespace) + (print . printing) + ;; XXXX we assume a (possibly shifted) ASCII-based ordering + (control . (/ ,(integer->char (- (char->integer #\space) 32)) + ,(integer->char (- (char->integer #\space) 1)))) + (cntrl . control) + (hex-digit . (or numeric (/ #\a #\f #\A #\F))) + (xdigit . hex-digit) + (ascii . (/ ,(integer->char (- (char->integer #\space) 32)) + ,(integer->char (+ (char->integer #\space) 95)))) + (ascii-nonl . (/ ,(integer->char (- (char->integer #\space) 32)) + ,(integer->char (- (char->integer #\newline) 1)) + ,(integer->char (+ (char->integer #\newline) 1)) + ,(integer->char (+ (char->integer #\space) 95)))) + (newline . (or (seq ,(integer->char (+ (char->integer #\newline) 3)) + #\newline) + (/ #\newline + ,(integer->char (+ (char->integer #\newline) 3))))) + + ;; ... it's really annoying to support scheme48 + (word . (seq bow (+ (or alphanumeric #\_)) eow)) + (utf8-tail-char . (/ ,(integer->char (+ (char->integer #\space) #x60)) + ,(integer->char (+ (char->integer #\space) #xA1)))) + (utf8-2-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xA2)) + ,(integer->char (+ (char->integer #\space) #xBF))) + utf8-tail-char)) + (utf8-3-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xC0)) + ,(integer->char (+ (char->integer #\space) #xCF))) + utf8-tail-char + utf8-tail-char)) + (utf8-4-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xD0)) + ,(integer->char (+ (char->integer #\space) #xD7))) + utf8-tail-char + utf8-tail-char + utf8-tail-char)) + (utf8-any . (or ascii utf8-2-char utf8-3-char utf8-4-char)) + (utf8-nonl . (or ascii-nonl utf8-2-char utf8-3-char utf8-4-char)) + )) + +;; Compile and return the list of NFA states. The start state will be +;; at the head of the list, and all remaining states will be in +;; descending numeric order, with state 0 being the unique accepting +;; state. +(define (sre->nfa sre . o) + ;; we loop over an implicit sequence list + (let lp ((ls (list sre)) + (n 1) + (flags (if (pair? o) (car o) ~none)) + (next (list (list 0)))) + (define (new-state-number state) + (max n (+ 1 (caar state)))) + (define (extend-state next . trans) + (and next + (cons (cons (new-state-number next) + (map (lambda (x) (cons x (caar next))) trans)) + next))) + (if (null? ls) + next + (cond + ((string? (car ls)) + ;; process literal strings a char at a time + (lp (append (string->list (car ls)) (cdr ls)) n flags next)) + ((eq? 'epsilon (car ls)) + ;; chars and epsilons go directly into the transition table + (extend-state (lp (cdr ls) n flags next) (car ls))) + ((char? (car ls)) + (let ((alt (char-altcase (car ls)))) + (if (and (flag-set? flags ~case-insensitive?) + (not (eqv? (car ls) alt))) + (extend-state (lp (cdr ls) n flags next) (car ls) alt) + (extend-state (lp (cdr ls) n flags next) (car ls))))) + ((symbol? (car ls)) + (let ((cell (assq (car ls) sre-named-definitions))) + (and cell (lp (cons (cdr cell) (cdr ls)) n flags next)))) + ((pair? (car ls)) + (cond + ((string? (caar ls)) + ;; enumerated character set + (lp (cons (sre-alternate (string->list (caar ls))) (cdr ls)) + n + flags + next)) + (else + (case (caar ls) + ((seq :) + ;; for an explicit sequence, just append to the list + (lp (append (cdar ls) (cdr ls)) n flags next)) + ((w/case w/nocase w/utf8 w/noutf8) + (let* ((next (lp (cdr ls) n flags next)) + (flags ((if (memq (caar ls) '(w/case w/utf8)) + flag-clear + flag-join) + flags + (if (memq (caar ls) '(w/case w/nocase)) + ~case-insensitive? + ~utf8?)))) + (and next (lp (cdar ls) (new-state-number next) flags next)))) + ((/ - & ~) + (let ((ranges (sre->cset (car ls) + (flag-set? flags ~case-insensitive?)))) + (case (length ranges) + ((1) + (extend-state (lp (cdr ls) n flags next) (car ranges))) + (else + (let ((next (lp (cdr ls) n flags next))) + (and + next + (lp (list (sre-alternate + (map (lambda (x) (if (pair? x) + (list '/ (car x) (cdr x)) + x)) + ranges))) + (new-state-number next) + (flag-clear flags ~case-insensitive?) + next))))))) + ((or) + (let* ((next (lp (cdr ls) n flags next)) + (b (and next + (lp (list (sre-alternate (cddar ls))) + (new-state-number next) + flags + next))) + (a (and b (lp (list (cadar ls)) + (new-state-number b) + flags + next)))) + ;; compile both branches and insert epsilon + ;; transitions to either + (and a + `((,(new-state-number a) + (epsilon . ,(caar a)) + (epsilon . ,(caar b))) + ,@(take-up-to a next) + ,@b)))) + ((?) + (let ((next (lp (cdr ls) n flags next))) + ;; insert an epsilon transition directly to next + (and + next + (let ((a (lp (cdar ls) (new-state-number next) flags next))) + (cond + (a + (set-cdr! (car a) `((epsilon . ,(caar next)) ,@(cdar a))) + a) + (else + #f)))))) + ((+ *) + (let ((next (lp (cdr ls) n flags next))) + (and + next + (let* ((new (lp '(epsilon) + (new-state-number next) + flags + next)) + (a (lp (cdar ls) (new-state-number new) flags new))) + (and + a + (begin + ;; for *, insert an epsilon transition as in ? above + (if (eq? '* (caar ls)) + (set-cdr! (car a) + `((epsilon . ,(caar new)) ,@(cdar a)))) + ;; for both, insert a loop back to self + (set-cdr! (car new) + `((epsilon . ,(caar a)) ,@(cdar new))) + a)))))) + ((submatch submatch-named) + ;; ignore submatches altogether + (lp (cons (sre-sequence (cdar ls)) (cdr ls)) n flags next)) + (else + #f))))) + (else + #f))))) + +;; We don't really want to use this, we use the closure compilation +;; below instead, but this is included for reference and testing the +;; sre->nfa conversion. + +;; (define (nfa-match nfa str) +;; (let lp ((ls (string->list str)) (state (car nfa)) (epsilons '())) +;; (if (null? ls) +;; (zero? (car state)) +;; (any (lambda (m) +;; (if (eq? 'epsilon (car m)) +;; (and (not (memv (cdr m) epsilons)) +;; (lp ls (assv (cdr m) nfa) (cons (cdr m) epsilons))) +;; (and (or (eqv? (car m) (car ls)) +;; (and (pair? (car m)) +;; (char<=? (caar m) (car ls)) +;; (char<=? (car ls) (cdar m)))) +;; (lp (cdr ls) (assv (cdr m) nfa) '())))) +;; (cdr state))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; NFA->DFA compilation +;; +;; During processing, the DFA is a list of the form: +;; +;; ((NFA-states ...) accepting-state? transitions ...) +;; +;; where the transitions are as in the NFA, except there are no +;; epsilons, duplicate characters or overlapping char-set ranges, and +;; the states moved to are closures (sets of NFA states). Multiple +;; DFA states may be accepting states. + +(define (nfa->dfa nfa . o) + (let ((max-states (and (pair? o) (car o)))) + (let lp ((ls (list (nfa-closure nfa (list (caar nfa))))) + (i 0) + (res '())) + (cond + ((null? ls) + (dfa-renumber (reverse res))) + ((assoc (car ls) res) + (lp (cdr ls) i res)) + (else + (let* ((states (car ls)) + (trans (nfa-state-transitions nfa states)) + (accept? (and (memv 0 states) #t))) + (and (or (not max-states) (< (+ i 1) max-states)) + (lp (append (map cdr trans) (cdr ls)) + (+ i 1) + `((,states ,accept? ,@trans) ,@res))))))))) + +;; When the conversion is complete we renumber the DFA sets-of-states +;; in order and convert the result to a vector for fast lookup. +(define (dfa-renumber dfa) + (let ((states (map cons (map car dfa) (zero-to (length dfa))))) + (define (renumber state) + (cdr (assoc state states))) + (list->vector + (map + (lambda (node) + (cons (cadr node) + (map (lambda (x) (cons (car x) (renumber (cdr x)))) + (cddr node)))) + dfa)))) + +;; Extract all distinct characters or ranges and the potential states +;; they can transition to from a given set of states. Any ranges that +;; would overlap with distinct characters are split accordingly. +(define (nfa-state-transitions nfa states) + (let lp ((trans '()) ;; list of (char . state) or ((char . char) . state) + (ls states) ;; list of integers (remaining state numbers) + (res '())) ;; (char state ...) or ((char . char) state ...) + (cond + ((null? trans) + (if (null? ls) + (map (lambda (x) (cons (car x) (nfa-closure nfa (cdr x)))) + res) + (let ((node (assv (car ls) nfa))) + (lp (if node (cdr node) '()) (cdr ls) res)))) + ((eq? 'epsilon (caar trans)) + (lp (cdr trans) ls res)) + (else + (lp (cdr trans) ls (nfa-join-transitions! res (car trans))))))) + +(define (nfa-join-transitions! existing new) + (define (join ls elt state) + (if (not elt) + ls + (nfa-join-transitions! ls (cons elt state)))) + (cond + ((char? (car new)) + (let ((ch (car new))) + (let lp ((ls existing) (res '())) + (cond + ((null? ls) + ;; done, just cons this on to the original list + (cons (list ch (cdr new)) existing)) + ((eqv? ch (caar ls)) + ;; add a new state to an existing char + (set-cdr! (car ls) (insert-sorted (cdr new) (cdar ls))) + existing) + ((and (pair? (caar ls)) + (char<=? (caaar ls) ch) + (char<=? ch (cdaar ls))) + ;; split a range + (apply + (lambda (left right) + (cons (cons ch (insert-sorted (cdr new) (cdar ls))) + (append (if left (list (cons left (cdar ls))) '()) + (if right (list (cons right (cdar ls))) '()) + res + (cdr ls)))) + (split-char-range (caar ls) (car new)))) + (else + ;; keep looking + (lp (cdr ls) (cons (car ls) res))))))) + (else + (let ((lo (caar new)) + (hi (cdar new))) + (let lp ((ls existing) (res '())) + (cond + ((null? ls) + (cons (list (car new) (cdr new)) existing)) + ((and (char? (caar ls)) (char<=? lo (caar ls)) (char<=? (caar ls) hi)) + ;; range enclosing a character + (apply + (lambda (left right) + (set-cdr! (car ls) (insert-sorted (cdr new) (cdar ls))) + (join (join existing left (cdr new)) right (cdr new))) + (split-char-range (car new) (caar ls)))) + ((and (pair? (caar ls)) + (or (and (char<=? (caaar ls) hi) (char<=? lo (cdaar ls))) + (and (char<=? hi (caaar ls)) (char<=? (cdaar ls) lo)))) + ;; overlapping ranges + (apply + (lambda (left1 left2 same right1 right2) + (let ((old-states (cdar ls))) + (set-car! (car ls) same) + (set-cdr! (car ls) (insert-sorted (cdr new) old-states)) + (let* ((res (if right1 + (cons (cons right1 old-states) existing) + existing)) + (res (if right2 (cons (cons right2 old-states) res) res))) + (join (join res left1 (cdr new)) left2 (cdr new))))) + (intersect-char-ranges (car new) (caar ls)))) + (else + (lp (cdr ls) (cons (car ls) res))))))))) + +(define (char-range c1 c2) + (if (eqv? c1 c2) c1 (cons c1 c2))) + +;; assumes ch is included in the range +(define (split-char-range range ch) + (list + (and (not (eqv? ch (car range))) + (char-range (car range) (integer->char (- (char->integer ch) 1)))) + (and (not (eqv? ch (cdr range))) + (char-range (integer->char (+ (char->integer ch) 1)) (cdr range))))) + +;; returns (possibly #f) char ranges: +;; a-only-1 a-only-2 a-and-b b-only-1 b-only-2 +(define (intersect-char-ranges a b) + (if (char>? (car a) (car b)) + (reverse (intersect-char-ranges b a)) + (let ((a-lo (car a)) + (a-hi (cdr a)) + (b-lo (car b)) + (b-hi (cdr b))) + (list + (and (char<? a-lo b-lo) + (char-range a-lo (integer->char (- (char->integer b-lo) 1)))) + (and (char>? a-hi b-hi) + (char-range (integer->char (+ (char->integer b-hi) 1)) a-hi)) + (char-range b-lo (if (char<? b-hi a-hi) b-hi a-hi)) + #f + (and (char>? b-hi a-hi) + (char-range (integer->char (+ (char->integer a-hi) 1)) b-hi)))))) + +;; The `closure' of a list of NFA states - all states that can be +;; reached from any of them using any number of epsilon transitions. +(define (nfa-closure nfa states) + (let lp ((ls states) + (res '())) + (cond + ((null? ls) + res) + ((memv (car ls) res) + (lp (cdr ls) res)) + (else + (lp (append (map cdr + (filter (lambda (trans) (eq? 'epsilon (car trans))) + (cdr (assv (car ls) nfa)))) + (cdr ls)) + (insert-sorted (car ls) res)))))) + +;; insert an integer uniquely into a sorted list +(define (insert-sorted n ls) + (cond + ((null? ls) + (cons n '())) + ((<= n (car ls)) + (if (= n (car ls)) + ls + (cons n ls))) + (else + (cons (car ls) (insert-sorted n (cdr ls)))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; DFAs don't give us match information, so once we match and +;; determine the start and end, we need to recursively break the +;; problem into smaller DFAs to get each submatch. +;; +;; See http://compilers.iecc.com/comparch/article/07-10-026 + +(define (sre-match-extractor sre) + (let lp ((sre sre) (n 1) (submatch-deps? #f)) + (cond + ((not (sre-has-submatchs? sre)) + (if (not submatch-deps?) + (lambda (str i j matches) j) + (let ((dfa (nfa->dfa (sre->nfa sre)))) + (lambda (str i j matches) + (dfa-match/longest dfa str i j))))) + ((pair? sre) + (case (car sre) + ((: seq) + (let* ((right (sre-sequence (cddr sre))) + (match-left (lp (cadr sre) n #t)) + (match-right + (lp right (+ n (sre-count-submatches (cadr sre))) #t))) + (lambda (str i j matches) + (let lp ((k j) (best #f)) + (if (< k i) + best + (let* ((middle (match-left str i k matches)) + (end (and middle + (eqv? middle k) + (match-right str middle j matches)))) + (if (eqv? end j) + end + (lp (- k 1) + (if (or (not best) (and end (> end best))) + end + best))))))))) + ((or) + (let* ((rest (sre-alternate (cddr sre))) + (match-first + (lp (cadr sre) n #t)) + (match-rest + (lp rest + (+ n (sre-count-submatches (cadr sre))) + submatch-deps?))) + (lambda (str i j matches) + (let ((k (match-first str i j matches))) + (if (eqv? k j) + k + (match-rest str i j matches)))))) + ((* +) + (letrec ((match-once + (lp (sre-sequence (cdr sre)) n #t)) + (match-all + (lambda (str i j matches) + (let ((k (match-once str i j matches))) + (if (and k (< i k)) + (match-all str k j matches) + i))))) + (if (eq? '* (car sre)) + match-all + (lambda (str i j matches) + (let ((k (match-once str i j matches))) + (and k + (match-all str k j matches))))))) + ((?) + (let ((match-once (lp (sre-sequence (cdr sre)) n #t))) + (lambda (str i j matches) + (let ((k (match-once str i j matches))) + (or k i))))) + ((submatch) + (let ((match-one + (lp (sre-sequence (cdr sre)) (+ n 1) #t))) + (lambda (str i j matches) + (let ((res (match-one str i j matches))) + (cond + ((number? res) + (irregex-match-start-index-set! matches n i) + (irregex-match-end-index-set! matches n res))) + res)))) + (else + (error "unknown regexp operator" (car sre))))) + (else + (error "unknown regexp" sre))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; closure compilation - we use this for non-regular expressions +;; instead of an interpreted NFA matcher + +(define (sre->procedure sre . o) + (define names + (if (and (pair? o) (pair? (cdr o))) (cadr o) (sre-names sre 1 '()))) + (let lp ((sre sre) + (n 1) + (flags (if (pair? o) (car o) ~none)) + (next (lambda (str i matches fail) i))) + (define (rec sre) (lp sre n flags next)) + (cond + ((pair? sre) + (if (string? (car sre)) + (sre-cset->procedure + (sre->cset (car sre) (flag-set? flags ~case-insensitive?)) + next) + (case (car sre) + ((~ - & /) + (sre-cset->procedure + (sre->cset sre (flag-set? flags ~case-insensitive?)) + next)) + ((or) + (case (length (cdr sre)) + ((0) (lambda (str i matches fail) (fail))) + ((1) (rec (cadr sre))) + (else + (let* ((first (rec (cadr sre))) + (rest (lp (sre-alternate (cddr sre)) + (+ n (sre-count-submatches (cadr sre))) + flags + next))) + (lambda (str i matches fail) + (first str i matches (lambda () (rest str i matches fail)))))))) + ((w/case) + (lp (sre-sequence (cdr sre)) + n + (flag-clear flags ~case-insensitive?) + next)) + ((w/nocase) + (lp (sre-sequence (cdr sre)) + n + (flag-join flags ~case-insensitive?) + next)) + ((w/utf8) + (lp (sre-sequence (cdr sre)) n (flag-join flags ~utf8?) next)) + ((w/noutf8) + (lp (sre-sequence (cdr sre)) n (flag-clear flags ~utf8?) next)) + ((seq :) + (case (length (cdr sre)) + ((0) next) + ((1) (rec (cadr sre))) + (else + (let ((rest (lp (sre-sequence (cddr sre)) + (+ n (sre-count-submatches (cadr sre))) + flags + next))) + (lp (cadr sre) n flags rest))))) + ((?) + (let ((body (rec (sre-sequence (cdr sre))))) + (lambda (str i matches fail) + (body str i matches (lambda () (next str i matches fail)))))) + ((??) + (let ((body (rec (sre-sequence (cdr sre))))) + (lambda (str i matches fail) + (next str i matches (lambda () (body str i matches fail)))))) + ((*) + (cond + ((sre-empty? (sre-sequence (cdr sre))) + (error "invalid sre: empty *" sre)) + (else + (letrec ((body + (lp (sre-sequence (cdr sre)) + n + flags + (lambda (str i matches fail) + (body str + i + matches + (lambda () (next str i matches fail))))))) + (lambda (str i matches fail) + (body str i matches (lambda () (next str i matches fail)))))))) + ((*?) + (cond + ((sre-empty? (sre-sequence (cdr sre))) + (error "invalid sre: empty *?" sre)) + (else + (letrec ((body + (lp (sre-sequence (cdr sre)) + n + flags + (lambda (str i matches fail) + (next str + i + matches + (lambda () (body str i matches fail))))))) + (lambda (str i matches fail) + (next str i matches (lambda () (body str i matches fail)))))))) + ((+) + (lp (sre-sequence (cdr sre)) + n + flags + (rec (list '* (sre-sequence (cdr sre)))))) + ((=) + (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre)))) + ((>=) + (rec `(** ,(cadr sre) #f ,@(cddr sre)))) + ((** **?) + (cond + ((or (and (number? (cadr sre)) + (number? (caddr sre)) + (> (cadr sre) (caddr sre))) + (and (not (cadr sre)) (caddr sre))) + (lambda (str i matches fail) (fail))) + (else + (let* ((from (cadr sre)) + (to (caddr sre)) + (? (if (eq? '** (car sre)) '? '??)) + (* (if (eq? '** (car sre)) '* '*?)) + (sre (sre-sequence (cdddr sre))) + (x-sre (sre-strip-submatches sre)) + (next (if to + (if (= from to) + next + (fold (lambda (x next) + (lp `(,? ,sre) n flags next)) + next + (zero-to (- to from)))) + (rec `(,* ,sre))))) + (if (zero? from) + next + (lp `(seq ,@(map (lambda (x) x-sre) (zero-to (- from 1))) + ,sre) + n + flags + next)))))) + ((word) + (rec `(seq bow ,@(cdr sre) eow))) + ((word+) + (rec `(seq bow (+ (& (or alphanumeric "_") + (or ,@(cdr sre)))) eow))) + ((posix-string) + (rec (string->sre (cadr sre)))) + ((look-ahead) + (let ((check + (lp (sre-sequence (cdr sre)) + n + flags + (lambda (str i matches fail) i)))) + (lambda (str i matches fail) + (if (check str i matches (lambda () #f)) + (next str i matches fail) + (fail))))) + ((neg-look-ahead) + (let ((check + (lp (sre-sequence (cdr sre)) + n + flags + (lambda (str i matches fail) i)))) + (lambda (str i matches fail) + (if (check str i matches (lambda () #f)) + (fail) + (next str i matches fail))))) + ((look-behind) + (let ((check + (lp (sre-sequence (cons '(* any) (cdr sre))) + n + flags + (lambda (str i matches fail) i)))) + (lambda (str i matches fail) + (if (eqv? i (check (substring str 0 i) 0 matches (lambda () #f))) + (next str i matches fail) + (fail))))) + ((neg-look-behind) + (let ((check + (lp (sre-sequence (cons '(* any) (cdr sre))) + n + flags + (lambda (str i matches fail) i)))) + (lambda (str i matches fail) + (if (eqv? i (check (substring str 0 i) 0 matches (lambda () #f))) + (fail) + (next str i matches fail))))) + ((atomic) + (let ((once + (lp (sre-sequence (cdr sre)) + n + flags + (lambda (str i matches fail) i)))) + (lambda (str i matches fail) + (let ((j (once str i matches (lambda () #f)))) + (if j + (next str j matches fail) + (fail)))))) + ((if) + (let* ((test-submatches (sre-count-submatches (cadr sre))) + (pass (lp (caddr sre) flags (+ n test-submatches) next)) + (fail (if (pair? (cdddr sre)) + (lp (cadddr sre) + (+ n test-submatches + (sre-count-submatches (caddr sre))) + flags + next) + (lambda (str i matches fail) (fail))))) + (cond + ((or (number? (cadr sre)) (symbol? (cadr sre))) + (let ((index + (if (symbol? (cadr sre)) + (cond + ((assq (cadr sre) names) => cdr) + (else + (error "unknown named backref in SRE IF" sre))) + (cadr sre)))) + (lambda (str i matches fail2) + (if (irregex-match-end-index matches index) + (pass str i matches fail2) + (fail str i matches fail2))))) + (else + (let ((test (lp (cadr sre) n flags pass))) + (lambda (str i matches fail2) + (test str i matches (lambda () (fail str i matches fail2))) + )))))) + ((backref backref-ci) + (let ((n (cond ((number? (cadr sre)) (cadr sre)) + ((assq (cadr sre) names) => cdr) + (else (error "unknown backreference" (cadr sre))))) + (compare (if (or (eq? (car sre) 'backref-ci) + (flag-set? flags ~case-insensitive?)) + string-ci=? + string=?))) + (lambda (str i matches fail) + (let ((s (irregex-match-substring matches n))) + (if (not s) + (fail) + (let ((j (+ i (string-length s)))) + (if (and (<= j (string-length str)) + (compare s (substring str i j))) + (next str j matches fail) + (fail)))))))) + ((dsm) + (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) flags next)) + ((submatch) + (let ((body + (lp (sre-sequence (cdr sre)) + (+ n 1) + flags + (lambda (str i matches fail) + (let ((old (irregex-match-end-index matches n))) + (irregex-match-end-index-set! matches n i) + (next str i matches + (lambda () + (irregex-match-end-index-set! matches n old) + (fail)))))))) + (lambda (str i matches fail) + (let ((old (irregex-match-start-index matches n))) + (irregex-match-start-index-set! matches n i) + (body str i matches + (lambda () + (irregex-match-start-index-set! matches n old) + (fail))))))) + ((submatch-named) + (rec `(submatch ,@(cddr sre)))) + (else + (error "unknown regexp operator" sre))))) + ((symbol? sre) + (case sre + ((any) + (lambda (str i matches fail) + (if (< i (string-length str)) + (next str (+ i 1) matches fail) + (fail)))) + ((nonl) + (lambda (str i matches fail) + (if (and (< i (string-length str)) + (not (eqv? #\newline (string-ref str i)))) + (next str (+ i 1) matches fail) + (fail)))) + ((bos) + (lambda (str i matches fail) + (if (zero? i) (next str i matches fail) (fail)))) + ((bol) + (lambda (str i matches fail) + (if (or (zero? i) (eqv? #\newline (string-ref str (- i 1)))) + (next str i matches fail) + (fail)))) + ((bow) + (lambda (str i matches fail) + (if (and (or (zero? i) + (not (char-alphanumeric? (string-ref str (- i 1))))) + (< i (string-length str)) + (char-alphanumeric? (string-ref str i))) + (next str i matches fail) + (fail)))) + ((eos) + (lambda (str i matches fail) + (if (>= i (string-length str)) (next str i matches fail) (fail)))) + ((eol) + (lambda (str i matches fail) + (if (or (>= i (string-length str)) + (eqv? #\newline (string-ref str i))) + (next str i matches fail) + (fail)))) + ((eow) + (lambda (str i matches fail) + (if (and (or (>= i (string-length str)) + (not (char-alphanumeric? (string-ref str i)))) + (> i 0) + (char-alphanumeric? (string-ref str (- i 1)))) + (next str i matches fail) + (fail)))) + ((nwb) ;; non-word-boundary + (lambda (str i matches fail) + (if (and (not (zero? i)) + (< i (string-length str)) + (if (char-alphanumeric? (string-ref str (- i 1))) + (char-alphanumeric? (string-ref str i)) + (not (char-alphanumeric? (string-ref str i))))) + (next str i matches fail) + (fail)))) + ((epsilon) + next) + (else + (let ((cell (assq sre sre-named-definitions))) + (if cell + (rec (cdr cell)) + (error "unknown regexp" sre)))))) + ((char? sre) + (if (flag-set? flags ~case-insensitive?) + (lambda (str i matches fail) + (if (and (< i (string-length str)) + (char-ci=? sre (string-ref str i))) + (next str (+ i 1) matches fail) + (fail))) + (lambda (str i matches fail) + (if (and (< i (string-length str)) + (eqv? sre (string-ref str i))) + (next str (+ i 1) matches fail) + (fail))))) + ((string? sre) + (rec (sre-sequence (string->list sre)))) + (else + (error "unknown regexp" sre))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Simple character sets as lists of ranges, as used in the NFA/DFA +;; compilation. This is not especially efficient, but is portable and +;; scalable for any range of character sets. + +(define (sre-cset->procedure cset next) + (lambda (str i matches fail) + (if (and (< i (string-length str)) + (cset-contains? cset (string-ref str i))) + (next str (+ i 1) matches fail) + (fail)))) + +(define (plist->alist ls) + (let lp ((ls ls) (res '())) + (if (null? ls) + (reverse res) + (lp (cddr ls) (cons (cons (car ls) (cadr ls)) res))))) + +(define (alist->plist ls) + (let lp ((ls ls) (res '())) + (if (null? ls) + (reverse res) + (lp (cdr ls) (cons (cdar ls) (cons (caar ls) res)))))) + +(define (sre->cset sre . o) + (let lp ((sre sre) (ci? (and (pair? o) (car o)))) + (define (rec sre) (lp sre ci?)) + (cond + ((pair? sre) + (if (string? (car sre)) + (if ci? + (cset-case-insensitive (string->list (car sre))) + (string->list (car sre))) + (case (car sre) + ((~) + (cset-complement + (fold cset-union (rec (cadr sre)) (map rec (cddr sre))))) + ((&) + (fold cset-intersection (rec (cadr sre)) (map rec (cddr sre)))) + ((-) + (fold (lambda (x res) (cset-difference res x)) + (rec (cadr sre)) + (map rec (cddr sre)))) + ((/) + (let ((res (plist->alist (sre-flatten-ranges (cdr sre))))) + (if ci? + (cset-case-insensitive res) + res))) + ((or) + (fold cset-union (rec (cadr sre)) (map rec (cddr sre)))) + ((w/case) + (lp (sre-alternate (cdr sre)) #f)) + ((w/nocase) + (lp (sre-alternate (cdr sre)) #t)) + (else + (error "not a valid sre char-set operator" sre))))) + ((char? sre) (rec (list (string sre)))) + ((string? sre) (rec (list sre))) + (else + (let ((cell (assq sre sre-named-definitions))) + (if cell + (rec (cdr cell)) + (error "not a valid sre char-set" sre))))))) + +;;;; another debugging utility +;; (define (cset->sre cset) +;; (let lp ((ls cset) (chars '()) (ranges '())) +;; (cond +;; ((null? ls) +;; (sre-alternate +;; (append +;; (if (pair? chars) (list (list (list->string chars))) '()) +;; (if (pair? ranges) (list (cons '/ (alist->plist ranges))) '())))) +;; ((char? (car ls)) (lp (cdr ls) (cons (car ls) chars) ranges)) +;; (else (lp (cdr ls) chars (cons (car ls) ranges)))))) + +(define (cset-contains? cset ch) + (find (lambda (x) + (or (eqv? x ch) + (and (pair? x) (char<=? (car x) ch) (char<=? ch (cdr x))))) + cset)) + +(define (cset-range x) + (if (char? x) (cons x x) x)) + +(define (char-ranges-overlap? a b) + (if (pair? a) + (if (pair? b) + (or (and (char<=? (car a) (cdr b)) (char<=? (car b) (cdr a))) + (and (char<=? (cdr b) (car a)) (char<=? (cdr a) (car b)))) + (and (char<=? (car a) b) (char<=? b (cdr a)))) + (if (pair? b) + (char-ranges-overlap? b a) + (eqv? a b)))) + +(define (char-ranges-union a b) + (cons (if (char<=? (car a) (car b)) (car a) (car b)) + (if (char>=? (cdr a) (cdr b)) (cdr a) (cdr b)))) + +(define (cset-union a b) + (cond ((null? b) a) + ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) + => (lambda (ls) + (cset-union + (cset-union (append (take-up-to a ls) (cdr ls)) + (list (char-ranges-union (cset-range (car ls)) + (cset-range (car b))))) + (cdr b)))) + (else (cset-union (cons (car b) a) (cdr b))))) + +(define (cset-difference a b) + (cond ((null? b) a) + ((not (car b)) (cset-difference a (cdr b))) + ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) + => (lambda (ls) + (apply + (lambda (left1 left2 same right1 right2) + (let* ((a (append (take-up-to a ls) (cdr ls))) + (a (if left1 (cons left1 a) a)) + (a (if left2 (cons left2 a) a)) + (b (if right1 (cset-union b (list right1)) b)) + (b (if right2 (cset-union b (list right2)) b))) + (cset-difference a b))) + (intersect-char-ranges (cset-range (car ls)) + (cset-range (car b)))))) + (else (cset-difference a (cdr b))))) + +(define (cset-intersection a b) + (let intersect ((a a) (b b) (res '())) + (cond ((null? b) res) + ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) + => (lambda (ls) + (apply + (lambda (left1 left2 same right1 right2) + (let* ((a (append (take-up-to a ls) (cdr ls))) + (a (if left1 (cons left1 a) a)) + (a (if left2 (cons left2 a) a)) + (b (if right1 (cset-union b (list right1)) b)) + (b (if right2 (cset-union b (list right2)) b))) + (intersect a b (cset-union res (list same))))) + (intersect-char-ranges (cset-range (car ls)) + (cset-range (car b)))))) + (else (intersect a (cdr b) res))))) + +(define (cset-complement a) + (cset-difference (sre->cset *all-chars*) a)) + +(define (cset-case-insensitive a) + (let lp ((ls a) (res '())) + (cond ((null? ls) (reverse res)) + ((and (char? (car ls)) (char-alphabetic? (car ls))) + (let ((c2 (char-altcase (car ls))) + (res (cons (car ls) res))) + (lp (cdr ls) (if (cset-contains? res c2) res (cons c2 res))))) + ((and (pair? (car ls)) + (char-alphabetic? (caar ls)) + (char-alphabetic? (cdar ls))) + (lp (cdr ls) + (cset-union (cset-union res (list (car ls))) + (list (cons (char-altcase (caar ls)) + (char-altcase (cdar ls))))))) + (else (lp (cdr ls) (cset-union res (list (car ls)))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; match and replace utilities + +(define (irregex-fold irx kons knil str . o) + (let* ((irx (irregex irx)) + (matches (irregex-new-matches irx)) + (finish (if (pair? o) (car o) (lambda (i acc) acc))) + (start (if (and (pair? o) (pair? (cdr o))) (cadr o) 0)) + (end (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o))) + (caddr o) + (string-length str)))) + (irregex-match-string-set! matches str) + (let lp ((i start) (acc knil)) + (if (>= i end) + (finish i acc) + (let ((m (irregex-search/matches irx str i end matches))) + (if (not m) + (finish i acc) + (let* ((end (irregex-match-end m 0)) + (acc (kons i m acc))) + (irregex-reset-matches! matches) + (lp end acc)))))))) + +(define (irregex-replace irx str . o) + (let ((m (irregex-search (irregex irx) str))) + (and + m + (string-cat-reverse + (cons (substring str (irregex-match-end m 0) (string-length str)) + (append (irregex-apply-match m o) + (list (substring str 0 (irregex-match-start m 0))))))))) + +(define (irregex-replace/all irx str . o) + (irregex-fold + irx + (lambda (i m acc) + (let ((m-start (irregex-match-start m 0))) + (append (irregex-apply-match m o) + (if (= i m-start) + acc + (cons (substring str i m-start) acc))))) + '() + str + (lambda (i acc) + (let ((end (string-length str))) + (string-cat-reverse (if (= i end) + acc + (cons (substring str i end) acc))))))) + +(define (irregex-apply-match m ls) + (let lp ((ls ls) (res '())) + (if (null? ls) + res + (cond + ((integer? (car ls)) + (lp (cdr ls) + (cons (or (irregex-match-substring m (car ls)) "") res))) + ((procedure? (car ls)) + (lp (cdr ls) (cons ((car ls) m) res))) + ((symbol? (car ls)) + (case (car ls) + ((pre) + (lp (cdr ls) + (cons (substring (irregex-match-string m) + 0 + (irregex-match-start m 0)) + res))) + ((post) + (lp (cdr ls) + (cons (substring (irregex-match-string m) + (irregex-match-end m 0) + (string-length (irregex-match-string m))) + res))) + (else (error "unknown match replacement" (car ls))))) + (else + (lp (cdr ls) (cons (car ls) res))))))) diff --git a/library.scm b/library.scm index 2eba6055..e3ab347d 100644 --- a/library.scm +++ b/library.scm @@ -76,8 +76,7 @@ #define C_a_get_current_seconds(ptr, c, dummy) C_flonum(ptr, time(NULL)) #define C_peek_c_string_at(ptr, i) ((C_char *)(((C_char **)ptr)[ i ])) -static C_word -fast_read_line_from_file(C_word str, C_word port, C_word size) { +static C_word fast_read_line_from_file(C_word str, C_word port, C_word size) { int n = C_unfix(size); int i; int c; @@ -102,7 +101,7 @@ fast_read_line_from_file(C_word str, C_word port, C_word size) { } static C_word -fast_read_string_from_file(C_word dest, C_word port, C_word len, C_word pos) +fast_read_string_from_file (C_word dest, C_word port, C_word len, C_word pos) { int n = C_unfix (len); char * buf = ((char *)C_data_pointer (dest) + C_unfix (pos)); @@ -1732,8 +1731,7 @@ EOF (define (##sys#check-port x . loc) (unless (%port? x) - (##sys#signal-hook - #:type-error (and (pair? loc) (car loc)) "argument is not a port" x) ) ) + (##sys#signal-hook #:type-error (and (pair? loc) (car loc)) "argument is not a port" x) ) ) (define (##sys#check-port-mode port mode . loc) (unless (eq? mode (##sys#slot port 1)) diff --git a/manual/Supported language b/manual/Supported language index d7be9c82..3fe014cd 100644 --- a/manual/Supported language +++ b/manual/Supported language @@ -19,7 +19,7 @@ * [[Unit ports]] I/O ports * [[Unit files]] File and pathname operations * [[Unit extras]] Useful utility definitions -* [[Unit irregex]] Regular expressions +* [[Unit regex]] Regular expressions * [[Unit srfi-1]] List Library * [[Unit srfi-4]] Homogeneous numeric vectors * [[Unit srfi-13]] String library diff --git a/manual/Unit extras b/manual/Unit extras index 3d0163a8..f7f28cca 100644 --- a/manual/Unit extras +++ b/manual/Unit extras @@ -196,4 +196,4 @@ false. Returns a string with the accumulated characters. --- Previous: [[Unit files]] -Next: [[Unit irregex]] +Next: [[Unit regex]] diff --git a/manual/Unit irregex b/manual/Unit irregex deleted file mode 100644 index 51073caf..00000000 --- a/manual/Unit irregex +++ /dev/null @@ -1,819 +0,0 @@ -[[tags: manual]] -[[toc:]] - -== Unit irregex - -This library unit provides support for regular expressions, using the -powerful ''irregex'' regular expression engine by Alex Shinn. It -supports both POSIX syntax with various (irregular) PCRE extensions, -as well as SCSH's SRE syntax, with various aliases for commonly used -patterns. DFA matching is used when possible, otherwise a -closure-compiled NFA approach is used. Matching may be performed over -standard Scheme strings, or over arbitrarily chunked streams of -strings. - -On systems that support dynamic loading, the {{irregex}} unit can -be made available in the Chicken interpreter ({{csi}}) by entering - -<enscript highlight=scheme> -(require-extension irregex) -</enscript> - -[[toc:]] - -=== Specification - -==== Procedures - -===== irregex -===== string->irregex -===== sre->irregex - -<procedure>(irregex <posix-string-or-sre> [<options> ...])</procedure><br> -<procedure>(string->irregex <posix-string> [<options> ...])</procedure><br> -<procedure>(sre->irregex <sre> [<options> ...])</procedure><br> - -Compiles a regular expression from either a POSIX-style regular -expression string (with most PCRE extensions) or an SCSH-style SRE. -There is no {{(rx ...)}} syntax - just use normal Scheme lists, with -{{quasiquote}} if you like. - -Technically a string by itself could be considered a valid (though -rather silly) SRE, so if you want to just match a literal string you -should use something like {{(irregex `(: ,str))}}, or use the explicit -{{(sre->irregex str)}}. - -The options are a list of any of the following symbols: - -; {{'i}}, {{'case-insensitive}} : match case-insensitively -; {{'m}}, {{'multi-line}} : treat string as multiple lines (effects {{^}} and {{$}}) -; {{'s}}, {{'single-line}} : treat string as a single line ({{.}} can match newline) -; {{'utf8}} : utf8-mode (assumes strings are byte-strings) -; {{'fast}} : try to optimize the regular expression -; {{'small}} : try to compile a smaller regular expression -; {{'backtrack}} : enforce a backtracking implementation - -The {{'fast}} and {{'small}} options are heuristic guidelines and will -not necessarily make the compiled expression faster or smaller. - -===== string->sre -===== maybe-string->sre - -<procedure>(string->sre <str>)</procedure><br> -<procedure>(maybe-string->sre <obj>)</procedure><br> - -For backwards compatibility, procedures to convert a POSIX string into -an SRE. - -{{maybe-string->sre}} does the same thing, but only if the argument is -a string, otherwise it assumes {{<obj>}} is an SRE and returns it -as-is. This is useful when you want to provide an API that allows -either a POSIX string or SRE (like {{irregex}} or {{irregex-search}} -below) - it ensures the result is an SRE. - -===== irregex? - -<procedure>(irregex? <obj>)</procedure><br> - -Returns {{#t}} iff the object is a regular expression. - -===== irregex-search - -<procedure>(irregex-search <irx> <str> [<start> <end>])</procedure> - -Searches for any instances of the pattern {{<irx>}} (a POSIX string, SRE -sexp, or pre-compiled regular expression) in {{<str>}}, optionally between -the given range. If a match is found, returns a match object, -otherwise returns {{#f}}. - -Match objects can be used to query the original range of the string or -its submatches using the {{irregex-match-*}} procedures below. - -Examples: - -<enscript highlight=scheme> -(irregex-search "foobar" "abcFOOBARdef") => #f - -(irregex-search "foobar" "abcFOOBARdef" 'i) => #<match> - -(irregex-search '(w/nocase "foobar") "abcFOOBARdef") => #<match> -</enscript> - -Note, the actual match result is represented by a vector in the -default implementation. Throughout this manual, we'll just write -{{#<match>}} to show that a successful match was returned when the -details are not important. - -Matching follows the POSIX leftmost, longest semantics, when -searching. That is, of all possible matches in the string, -{{irregex-search}} will return the match at the first position -(leftmost). If multiple matches are possible from that same first -position, the longest match is returned. - -===== irregex-match - -<procedure>(irregex-match <irx> <str>)</procedure> - -Like {{irregex-search}}, but performs an anchored match against the -beginning and end of the string, without searching. - -Examples: - -<enscript highlight=scheme> -(irregex-match '(w/nocase "foobar") "abcFOOBARdef") => #f - -(irregex-match '(w/nocase "foobar") "FOOBAR") => #<match> -</enscript> - -===== irregex-match-data? - -<procedure>(irregex-match-data? <obj>)</procedure> - -Returns {{#t}} iff the object is a successful match result from -{{irregex-search}} or {{irregex-match}}. - -===== irregex-num-submatches -===== irregex-match-num-submatches - -<procedure>(irregex-num-submatches <irx>)</procedure><br> -<procedure>(irregex-match-num-submatches <match>)</procedure> - -Returns the number of numbered submatches that are defined in the -irregex or match object. - -===== irregex-names -===== irregex-match-names - -<procedure>(irregex-names <irx>)</procedure><br> -<procedure>(irregex-match-names <match>)</procedure> - -Returns an association list of named submatches that are defined in -the irregex or match object. The {{car}} of each item in this list is -the name of a submatch, the {{cdr}} of each item is the numerical -submatch corresponding to this name. If a named submatch occurs -multiple times in the irregex, it will also occur multiple times in -this list. - -===== irregex-match-substring -===== irregex-match-start-index -===== irregex-match-end-index - -<procedure>(irregex-match-substring <match> [<index-or-name>])</procedure><br> -<procedure>(irregex-match-start-index <match> <index-or-name>)</procedure><br> -<procedure>(irregex-match-end-index <match> <index-or-name>)</procedure> - -Fetches the matched substring (or its start or end offset) at the -given submatch index, or named submatch. The entire match is index 0, -the first 1, etc. The default is index 0. - -===== irregex-match-subchunk - -<procedure>(irregex-match-subchunk <match> [<index-or-name>])</procedure> - -Generates a chunked data-type for the given match item, of the same -type as the underlying chunk type (see Chunked String Matching below). -This is only available if the chunk type specifies the get-subchunk -API, otherwise an error is raised. - -===== irregex-replace -===== irregex-replace/all - -<procedure>(irregex-replace <irx> <str> [<replacements> ...])</procedure><br> -<procedure>(irregex-replace/all <irx> <str> [<replacements> ...])</procedure> - -Matches a pattern in a string, and replaces it with a (possibly empty) -list of substitutions. Each {{<replacement>}} can be either a string -literal, a numeric index, a symbol (as a named submatch), or a -procedure which takes one argument (the match object) and returns a -string. - -Examples: - -<enscript highlight=scheme> -(irregex-replace "[aeiou]" "hello world" "*") => "h*llo world" - -(irregex-replace/all "[aeiou]" "hello world" "*") => "h*ll* w*rld" -</enscript> - -===== irregex-split -===== irregex-extract - -<procedure>(irregex-split <irx> <str> [<start> <end>])</procedure><br> -<procedure>(irregex-extract <irx> <str> [<start> <end>])</procedure> - -{{irregex-split}} splits the string {{<str>}} into substrings divided -by the pattern in {{<irx>}}. {{irregex-extract}} does the opposite, -returning a list of each instance of the pattern matched disregarding -the substrings in between. - -===== irregex-fold - -<procedure>(irregex-fold <irx> <kons> <knil> <str> [<finish> <start> <end>])</procedure> - -This performs a fold operation over every non-overlapping place -{{<irx>}} occurs in the string {{str}}. - -The {{<kons>}} procedure takes the following signature: - -<enscript highlight=scheme> -(<kons> <from-index> <match> <seed>) -</enscript> - -where {{<from-index>}} is the index from where we started searching -(initially {{<start>}} and thereafter the end index of the last -match), {{<match>}} is the resulting match-data object, and {{<seed>}} -is the accumulated fold result starting with {{<knil>}}. - -The rationale for providing the {{<from-index>}} (which is not -provided in the SCSH {{regexp-fold}} utility), is because this -information is useful (e.g. for extracting the unmatched portion of -the string before the current match, as needed in -{{irregex-replace}}), and not otherwise directly accessible. - -The optional {{<finish>}} takes two arguments: - -<enscript highlight=scheme> -(<finish> <from-index> <seed>) -</enscript> - -which simiarly allows you to pick up the unmatched tail of the string, -and defaults to just returning the {{<seed>}}. - -{{<start>}} and {{<end>}} are numeric indices letting you specify the -boundaries of the string on which you want to fold. - -To extract all instances of a match out of a string, you can use - -<enscript highlight=scheme> -(map irregex-match-substring - (irregex-fold <irx> - (lambda (i m s) (cons m s)) - '() - <str> - (lambda (i s) (reverse s)))) -</enscript> - -==== Extended SRE Syntax - -Irregex provides the first native implementation of SREs (Scheme -Regular Expressions), and includes many extensions necessary both for -minimal POSIX compatibility, as well as for modern extensions found in -libraries such as PCRE. - -The following table summarizes the SRE syntax, with detailed -explanations following. - - ;; basic patterns - <string> ; literal string - (seq <sre> ...) ; sequence - (: <sre> ...) - (or <sre> ...) ; alternation - - ;; optional/multiple patterns - (? <sre> ...) ; 0 or 1 matches - (* <sre> ...) ; 0 or more matches - (+ <sre> ...) ; 1 or more matches - (= <n> <sre> ...) ; exactly <n> matches - (>= <n> <sre> ...) ; <n> or more matches - (** <from> <to> <sre> ...) ; <n> to <m> matches - (?? <sre> ...) ; non-greedy (non-greedy) pattern: (0 or 1) - (*? <sre> ...) ; non-greedy kleene star - (**? <from> <to> <sre> ...) ; non-greedy range - - ;; submatch patterns - (submatch <sre> ...) ; numbered submatch - ($ <sre> ...) - (submatch-named <name> <sre> ...) ; named submatch - (=> <name> <sre> ...) - (backref <n-or-name>) ; match a previous submatch - - ;; toggling case-sensitivity - (w/case <sre> ...) ; enclosed <sre>s are case-sensitive - (w/nocase <sre> ...) ; enclosed <sre>s are case-insensitive - - ;; character sets - <char> ; singleton char set - (<string>) ; set of chars - (or <cset-sre> ...) ; set union - (~ <cset-sre> ...) ; set complement (i.e. [^...]) - (- <cset-sre> ...) ; set difference - (& <cset-sre> ...) ; set intersection - (/ <range-spec> ...) ; pairs of chars as ranges - - ;; named character sets - any - nonl - ascii - lower-case lower - upper-case upper - alphabetic alpha - numeric num - alphanumeric alphanum alnum - punctuation punct - graphic graph - whitespace white space - printing print - control cntrl - hex-digit xdigit - - ;; assertions and conditionals - bos eos ; beginning/end of string - bol eol ; beginning/end of line - bow eow ; beginning/end of word - nwb ; non-word-boundary - (look-ahead <sre> ...) ; zero-width look-ahead assertion - (look-behind <sre> ...) ; zero-width look-behind assertion - (neg-look-ahead <sre> ...) ; zero-width negative look-ahead assertion - (neg-look-behind <sre> ...) ; zero-width negative look-behind assertion - (atomic <sre> ...) ; for (?>...) independent patterns - (if <test> <pass> [<fail>]) ; conditional patterns - commit ; don't backtrack beyond this (i.e. cut) - - ;; backwards compatibility - (posix-string <string>) ; embed a POSIX string literal - -===== Basic SRE Patterns - -The simplest SRE is a literal string, which matches that string -exactly. - -<enscript highlight=scheme> -(irregex-search "needle" "hayneedlehay") => #<match> -</enscipt> - -By default the match is case-sensitive, though you can control this -either with the compiler flags or local overrides: - -<enscript highlight=scheme> -(irregex-search "needle" "haynEEdlehay") => #f - -(irregex-search (irregex "needle" 'i) "haynEEdlehay") => #<match> - -(irregex-search '(w/nocase "needle") "haynEEdlehay") => #<match> -</enscript> - -You can use {{w/case}} to switch back to case-sensitivity inside a -{{w/nocase}} or when the SRE was compiled with {{'i}}: - -<enscript highlight=scheme> -(irregex-search '(w/nocase "SMALL" (w/case "BIG")) "smallBIGsmall") => #<match> - -(irregex-search '(w/nocase "small" (w/case "big")) "smallBIGsmall") => #f -</enscript> - -Of course, literal strings by themselves aren't very interesting -regular expressions, so we want to be able to compose them. The most -basic way to do this is with the {{seq}} operator (or its abbreviation -{{:}}), which matches one or more patterns consecutively: - -<enscript highlight=scheme> -(irregex-search '(: "one" space "two" space "three") "one two three") => #<match> -</enscript> - -As you may have noticed above, the {{w/case}} and {{w/nocase}} -operators allowed multiple SREs in a sequence - other operators that -take any number of arguments (e.g. the repetition operators below) -allow such implicit sequences. - -To match any one of a set of patterns use the {{or}} alternation -operator: - -<enscript highlight=scheme> -(irregex-search '(or "eeney" "meeney" "miney") "meeney") => #<match> - -(irregex-search '(or "eeney" "meeney" "miney") "moe") => #f -</enscript> - -===== SRE Repetition Patterns - -There are also several ways to control the number of times a pattern -is matched. The simplest of these is {{?}} which just optionally -matches the pattern: - -<enscript highlight=scheme> -(irregex-search '(: "match" (? "es") "!") "matches!") => #<match> - -(irregex-search '(: "match" (? "es") "!") "match!") => #<match> - -(irregex-search '(: "match" (? "es") "!") "matche!") => #<match> -</enscript> - -To optionally match any number of times, use {{*}}, the Kleene star: - -<enscript highlight=scheme> -(irregex-search '(: "<" (* (~ #\>)) ">") "<html>") => #<match> - -(irregex-search '(: "<" (* (~ #\>)) ">") "<>") => #<match> - -(irregex-search '(: "<" (* (~ #\>)) ">") "<html") => #f -</enscript> - -Often you want to match any number of times, but at least one time is -required, and for that you use {{+}}: - -<enscript highlight=scheme> -(irregex-search '(: "<" (+ (~ #\>)) ">") "<html>") => #<match> - -(irregex-search '(: "<" (+ (~ #\>)) ">") "<a>") => #<match> - -(irregex-search '(: "<" (+ (~ #\>)) ">") "<>") => #f -</enscript> - -More generally, to match at least a given number of times, use {{>=}}: - -<enscript highlight=scheme> -(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<table>") => #<match> - -(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<pre>") => #<match> - -(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<tr>") => #f -</enscript> - -To match a specific number of times exactly, use {{=}}: - -<enscript highlight=scheme> -(irregex-search '(: "<" (= 4 (~ #\>)) ">") "<html>") => #<match> - -(irregex-search '(: "<" (= 4 (~ #\>)) ">") "<table>") => #f -</enscript> - -And finally, the most general form is {{**}} which specifies a range -of times to match. All of the earlier forms are special cases of this. - -<enscript highlight=scheme> -(irregex-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.168.1.10") => #<match> - -(irregex-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.0168.1.10") => #f -</enscript> - -There are also so-called "non-greedy" variants of these repetition -operators, by convention suffixed with an additional {{?}}. Since the -normal repetition patterns can match any of the allotted repetition -range, these operators will match a string if and only if the normal -versions matched. However, when the endpoints of which submatch -matched where are taken into account (specifically, all matches when -using irregex-search since the endpoints of the match itself matter), -the use of a non-greedy repetition can change the result. - -So, whereas {{?}} can be thought to mean "match or don't match," -{{??}} means "don't match or match." {{*}} typically consumes as much -as possible, but {{*?}} tries first to match zero times, and only -consumes one at a time if that fails. If you have a greedy operator -followed by a non-greedy operator in the same pattern, they can -produce surprisins results as they compete to make the match longer or -shorter. If this seems confusing, that's because it is. Non-greedy -repetitions are defined only in terms of the specific backtracking -algorithm used to implement them, which for compatibility purposes -always means the Perl algorithm. Thus, when using these patterns you -force IrRegex to use a backtracking engine, and can't rely on -efficient execution. - -===== SRE Character Sets - -Perhaps more common than matching specific strings is matching any of -a set of characters. You can use the {{or}} alternation pattern on a -list of single-character strings to simulate a character set, but this -is too clumsy for everyday use so SRE syntax allows a number of -shortcuts. - -A single character matches that character literally, a trivial -character class. More conveniently, a list holding a single element -which is a string refers to the character set composed of every -character in the string. - -<enscript highlight=scheme> -(irregex-match '(* #\-) "---") => #<match> - -(irregex-match '(* #\-) "-_-") => #f - -(irregex-match '(* ("aeiou")) "oui") => #<match> - -(irregex-match '(* ("aeiou")) "ouais") => #f -</enscript> - -Ranges are introduced with the \q{/} operator. Any strings or -characters in the \q{/} are flattened and then taken in pairs to -represent the start and end points, inclusive, of character ranges. - -<enscript highlight=scheme> -(irregex-match '(* (/ "AZ09")) "R2D2") => #<match> - -(irregex-match '(* (/ "AZ09")) "C-3PO") => #f -</enscript> - -In addition, a number of set algebra operations are provided. \q{or}, -of course, has the same meaning, but when all the options are -character sets it can be thought of as the set union operator. This -is further extended by the \q{&} set intersection, \q{-} set -difference, and \q{~} set complement operators. - -<enscript highlight=scheme> -(irregex-match '(* (& (/ "az") (~ ("aeiou")))) "xyzzy") => #<match> - -(irregex-match '(* (& (/ "az") (~ ("aeiou")))) "vowels") => #f - -(irregex-match '(* (- (/ "az") ("aeiou"))) "xyzzy") => #<match> - -(irregex-match '(* (- (/ "az") ("aeiou"))) "vowels") => #f -</enscript> - -===== SRE Assertion Patterns - -There are a number of times it can be useful to assert something about -the area around a pattern without explicitly making it part of the -pattern. The most common cases are specifically anchoring some -pattern to the beginning or end of a word or line or even the whole -string. For example, to match on the end of a word: - -<enscript highlight=scheme> -(irregex-match '(: "foo" eow) "foo") => #<match> - -(irregex-match '(: "foo" eow) "foo!") => #<match> - -(irregex-match '(: "foo" eow) "foof") => #f -</enscript> - -The {{bow}}, {{bol}}, {{eol}}, {{bos}} and {{eos}} work similarly. -{{nwb}} asserts that you are not in a word-boundary - if replaced for -{{eow}} in the above examples it would reverse all the results. - -There is no {{wb}}, since you tend to know from context whether it -would be the beginning or end of a word, but if you need it you can -always use {{(or bow eow)}}. - -Somewhat more generally, Perl introduced positive and negative -look-ahead and look-behind patterns. Perl look-behind patterns are -limited to a fixed length, however the IrRegex versions have no such -limit. - -<enscript highlight=scheme> -(irregex-match '(: "regular" (look-ahead " expression")) - "regular expression") - => #<match> -</enscript> - -The most general case, of course, would be an \q{and} pattern to -complement the \q{or} pattern - all the patterns must match or the -whole pattern fails. This may be provided in a future release, -although it (and look-ahead and look-behind assertions) are unlikely -to be compiled efficiently. - -===== SRE Utility Patterns - -The following utility regular expressions are also provided for common -patterns that people are eternally reinventing. They are not -necessarily the official patterns matching the RFC definitions of the -given data, because of the way that such patterns tend to be used. -There are three general usages for regexps: - -; searching : search for a pattern matching a desired object in a larger text - -; validation : determine whether an entire string matches a pattern - -; extraction : given a string already known to be valid, extract certain fields from it as submatches - -In some cases, but not always, these will overlap. When they are -different, {{irregex-search}} will naturally always want the searching -version, so IrRegex provides that version. - -As an example where these might be different, consider a URL. If you -want to match all the URLs in some arbitrary text, you probably want -to exclude a period or comma at the tail end of a URL, since it's more -likely being used as punctuation rather than part of the URL, despite -the fact that it would be valid URL syntax. - -Another problem with the RFC definitions is the standard itself may -have become irrelevant. For example, the pattern IrRegex provides for -email addresses doesn't match quoted local parts (e.g. -{{"first last"@domain.com}}) because these are increasingly rare, and -unsupported by enough software that it's better to discourage their use. -Conversely, technically consecutive periods -(e.g. {{first..last@domain.com}}) are not allowed in email addresses, but -most email software does allow this, and in fact such addresses are -quite common in Japan. - -The current patterns provided are: - - newline ; general newline pattern (crlf, cr, lf) - integer ; an integer - real ; a real number (including scientific) - string ; a "quoted" string - symbol ; an R5RS Scheme symbol - ipv4-address ; a numeric decimal ipv4 address - ipv6-address ; a numeric hexadecimal ipv6 address - domain ; a domain name - email ; an email address - http-url ; a URL beginning with https?:// - -Because of these issues the exact definitions of these patterns are -subject to be changed, but will be documented clearly when they are -finalized. More common patterns are also planned, but as what you -want increases in complexity it's probably better to use a real -parser. - -==== Supported PCRE Syntax - -Since the PCRE syntax is so overwhelming complex, it's easier to just -list what we *don't* support for now. Refer to the -[[http://pcre.org/pcre.txt|PCRE documentation]] for details. You -should be using the SRE syntax anyway! - -Unicode character classes ({{\P}}) are not supported, but will be -in an upcoming release. {{\C}} named characters are not supported. - -Callbacks, subroutine patterns and recursive patterns are not -supported. ({{*FOO}}) patterns are not supported and may never be. - -{{\G}} and {{\K}} are not supported. - -Octal character escapes are not supported because they are ambiguous -with back-references - just use hex character escapes. - -Other than that everything should work, including named submatches, -zero-width assertions, conditional patterns, etc. - -In addition, {{\<}} and {{\>}} act as beginning-of-word and end-of-word -marks, respectively, as in Emacs regular expressions. - -Also, two escapes are provided to embed SRE patterns inside PCRE -strings, {{"\'<sre>"}} and {{"(*'<sre>)"}}. For example, to match a -comma-delimited list of integers you could use - -<enscript highlight=scheme> -"\\'integer(,\\'integer)*" -</enscript> - -and to match a URL in angle brackets you could use - -<enscript highlight=scheme> -"<('*http-url)>" -</enscript> - -Note in the second example the enclosing {{"('*...)"}} syntax is needed -because the Scheme reader would consider the closing {{">"}} as part of -the SRE symbol. - -The following chart gives a quick reference from PCRE form to the SRE -equivalent: - - ;; basic syntax - "^" ;; bos (or eos inside (?m: ...)) - "$" ;; eos (or eos inside (?m: ...)) - "." ;; nonl - "a?" ;; (? a) - "a*" ;; (* a) - "a+" ;; (+ a) - "a??" ;; (?? a) - "a*?" ;; (*? a) - "a+?" ;; (+? a) - "a{n,m}" ;; (** n m a) - - ;; grouping - "(...)" ;; (submatch ...) - "(?:...)" ;; (: ...) - "(?i:...)" ;; (w/nocase ...) - "(?-i:...)" ;; (w/case ...) - "(?<name>...)" ;; (=> <name>...) - - ;; character classes - "[aeiou]" ;; ("aeiou") - "[^aeiou]" ;; (~ "aeiou") - "[a-z]" ;; (/ "az") or (/ "a" "z") - "[[:alpha:]]" ;; alpha - - ;; assertions - "(?=...)" ;; (look-ahead ...) - "(?!...)" ;; (neg-look-ahead ...) - "(?<=...)" ;; (look-behind ...) - "(?<!...)" ;; (neg-look-behind ...) - "(?(test)pass|fail)" ;; (if test pass fail) - "(*COMMIT)" ;; commit - -==== Chunked String Matching - -It's often desirable to perform regular expression matching over -sequences of characters not represented as a single string. The most -obvious example is a text-buffer data structure, but you may also want -to match over lists or trees of strings (i.e. ropes), over only -certain ranges within a string, over an input port, etc. With -existing regular expression libraries, the only way to accomplish this -is by converting the abstract sequence into a freshly allocated -string. This can be expensive, or even impossible if the object is a -text-buffer opened onto a 500MB file. - -IrRegex provides a chunked string API specifically for this purpose. -You define a chunking API with {{make-irregex-chunker}}: - -===== make-irregex-chunker - -<procedure>(make-irregex-chunker <get-next> <get-string> [<get-start> <get-end> <get-substring> <get-subchunk>])</procedure> - -where - -{{(<get-next> chunk) => }} returns the next chunk, or {{#f}} if there are no more chunks - -{{(<get-string> chunk) => }} a string source for the chunk - -{{(<get-start> chunk) => }} the start index of the result of {{<get-string>}} (defaults to always 0) - -{{(<get-end> chunk) => }} the end (exclusive) of the string (defaults to {{string-length}} of the source string) - -{{(<get-substring> cnk1 i cnk2 j) => }} a substring for the range between the chunk {{cnk1}} starting at index {{i}} and ending at {{cnk2}} at index {{j}} - -{{(<get-subchunk> cnk1 i cnk2 j) => }} as above but returns a new chunked data type instead of a string (optional) - -There are two important constraints on the {{<get-next>}} procedure. -It must return an {{eq?}} identical object when called multiple times -on the same chunk, and it must not return a chunk with an empty string -(start == end). This second constraint is for performance reasons - -we push the work of possibly filtering empty chunks to the chunker -since there are many chunk types for which empty strings aren't -possible, and this work is thus not needed. Note that the initial -chunk passed to match on is allowed to be empty. - -{{<get-substring>}} is provided for possible performance improvements -- without it a default is used. {{<get-subchunk>}} is optional - -without it you may not use {{irregex-match-subchunk}} described above. - -You can then match chunks of these types with the following -procedures: - -===== irregex-search/chunked -===== irregex-match/chunked - -<procedure>(irregex-search/chunked <irx> <chunker> <chunk> [<start>])</procedure><br> -<procedure>(irregex-match/chunked <irx> <chunker> <chunk> [<start>])</procedure> - -These return normal match-data objects. - -Example: - -To match against a simple, flat list of strings use: - -<enscript highlight=scheme> - (define (rope->string rope1 start rope2 end) - (if (eq? rope1 rope2) - (substring (car rope1) start end) - (let loop ((rope (cdr rope1)) - (res (list (substring (car rope1) start)))) - (if (eq? rope rope2) - (string-concatenate-reverse ; from SRFI-13 - (cons (substring (car rope) 0 end) res)) - (loop (cdr rope) (cons (car rope) res)))))) - - (define rope-chunker - (make-irregex-chunker (lambda (x) (and (pair? (cdr x)) (cdr x))) - car - (lambda (x) 0) - (lambda (x) (string-length (car x))) - rope->string)) - - (irregex-search/chunked <pat> rope-chunker <list-of-strings>) -</enscript> - -Here we are just using the default start, end and substring behaviors, -so the above chunker could simply be defined as: - -<enscript highlight=scheme> - (define rope-chunker - (make-irregex-chunker (lambda (x) (and (pair? (cdr x)) (cdr x))) car)) -</enscript> - -===== irregex-fold/chunked - -<procedure>(irregex-fold/chunked <irx> <kons> <knil> <chunker> <chunk> [<finish> [<start-index>]])</procedure> - -Chunked version of {{irregex-fold}}. - -==== Utilities - -The following procedures are also available. - -===== irregex-quote - -<procedure>(irregex-quote <str>)</procedure> - -Returns a new string with any special regular expression characters -escaped, to match the original string literally in POSIX regular -expressions. - -===== irregex-opt - -<procedure>(irregex-opt <list-of-strings>)</procedure> - -Returns an optimized SRE matching any of the literal strings -in the list, like Emacs' \q{regexp-opt}. Note this optimization -doesn't help when irregex is able to build a DFA. - -===== sre->string - -<procedure>(sre->string <sre>)</procedure> - -Convert an SRE to a POSIX-style regular expression string, if -possible. - - ---- -Previous: [[Unit extras]] - -Next: [[Unit srfi-1]] diff --git a/manual/Unit regex b/manual/Unit regex index bd6eb479..2d0c249e 100644 --- a/manual/Unit regex +++ b/manual/Unit regex @@ -3,15 +3,21 @@ == Unit regex +This library unit provides support for regular expressions. The regular +expression package used is {{irregex}} +written by Alex Shinn. Irregex supports most Perl-extensions and is +written completely in Scheme. -This library unit provides some high-level operations for regular -expression and operations that are kept for backward compatibility -to older versions of CHICKEN. +This library unit exposes two APIs: the standard Chicken API described below, and the +original irregex API. You may use either API or both: -This unit uses the {{irregex}} unit internally. It is recommended -to use the {{irregex}} API where possible, since it provides a -more featureful interface. + (require-library regex) ; required for either API, or both + (import regex) ; import the Chicken regex API + (import irregex) ; import the original irregex API +Regular expressions may be either POSIX-style strings (with most PCRE +extensions) or an SCSH-style SRE. There is no {{(rx ...)}} syntax - +just use normal Scheme lists, with quasiquote if you like. === grep @@ -190,7 +196,266 @@ into a regular expression. => "\\^\\[0-9\\]\\+:.\n.\\*\\$" </enscript> +=== Extended SRE Syntax + +The following table summarizes the SRE syntax, with detailed explanations following. + + ;; basic patterns + <string> ; literal string + (seq <sre> ...) ; sequence + (: <sre> ...) + (or <sre> ...) ; alternation + + ;; optional/multiple patterns + (? <sre> ...) ; 0 or 1 matches + (* <sre> ...) ; 0 or more matches + (+ <sre> ...) ; 1 or more matches + (= <n> <sre> ...) ; exactly <n> matches + (>= <n> <sre> ...) ; <n> or more matches + (** <from> <to> <sre> ...) ; <n> to <m> matches + (?? <sre> ...) ; non-greedy (non-greedy) pattern: (0 or 1) + (*? <sre> ...) ; non-greedy kleene star + (**? <from> <to> <sre> ...) ; non-greedy range + + ;; submatch patterns + (submatch <sre> ...) ; numbered submatch + (submatch-named <name> <sre> ...) ; named submatch + (=> <name> <sre> ...) + (backref <n-or-name>) ; match a previous submatch + + ;; toggling case-sensitivity + (w/case <sre> ...) ; enclosed <sre>s are case-sensitive + (w/nocase <sre> ...) ; enclosed <sre>s are case-insensitive + + ;; character sets + <char> ; singleton char set + (<string>) ; set of chars + (or <cset-sre> ...) ; set union + (~ <cset-sre> ...) ; set complement (i.e. [^...]) + (- <cset-sre> ...) ; set difference + (& <cset-sre> ...) ; set intersection + (/ <range-spec> ...) ; pairs of chars as ranges + + ;; named character sets + any + nonl + ascii + lower-case lower + upper-case upper + alphabetic alpha + numeric num + alphanumeric alphanum alnum + punctuation punct + graphic graph + whitespace white space + printing print + control cntrl + hex-digit xdigit + + ;; assertions and conditionals + bos eos ; beginning/end of string + bol eol ; beginning/end of line + bow eow ; beginning/end of word + nwb ; non-word-boundary + (look-ahead <sre> ...) ; zero-width look-ahead assertion + (look-behind <sre> ...) ; zero-width look-behind assertion + (neg-look-ahead <sre> ...) ; zero-width negative look-ahead assertion + (neg-look-behind <sre> ...) ; zero-width negative look-behind assertion + (atomic <sre> ...) ; for (?>...) independent patterns + (if <test> <pass> [<fail>]) ; conditional patterns + commit ; don't backtrack beyond this (i.e. cut) + + ;; backwards compatibility + (posix-string <string>) ; embed a POSIX string literal + +==== Basic SRE Patterns + +The simplest SRE is a literal string, which matches that string exactly. + + (string-search "needle" "hayneedlehay") => <match> + +By default the match is case-sensitive, though you can control this either with the compiler flags or local overrides: + + (string-search "needle" "haynEEdlehay") => #f + + (string-search (irregex "needle" 'i) "haynEEdlehay") => <match> + + (string-search '(w/nocase "needle") "haynEEdlehay") => <match> + +You can use {{w/case}} to switch back to case-sensitivity inside a {{w/nocase}}: + + (string-search '(w/nocase "SMALL" (w/case "BIG")) "smallBIGsmall") => <match> + + (string-search '(w/nocase "small" (w/case "big")) "smallBIGsmall") => #f + +Of course, literal strings by themselves aren't very interesting +regular expressions, so we want to be able to compose them. The most +basic way to do this is with the {{seq}} operator (or its abbreviation {{:}}), +which matches one or more patterns consecutively: + + (string-search '(: "one" space "two" space "three") "one two three") => <match> + +As you may have noticed above, the {{w/case}} and {{w/nocase}} operators +allowed multiple SREs in a sequence - other operators that take any +number of arguments (e.g. the repetition operators below) allow such +implicit sequences. + +To match any one of a set of patterns use the or alternation operator: + + (string-search '(or "eeney" "meeney" "miney") "meeney") => <match> + + (string-search '(or "eeney" "meeney" "miney") "moe") => #f + +==== SRE Repetition Patterns + +There are also several ways to control the number of times a pattern +is matched. The simplest of these is {{?}} which just optionally matches +the pattern: + + (string-search '(: "match" (? "es") "!") "matches!") => <match> + + (string-search '(: "match" (? "es") "!") "match!") => <match> + + (string-search '(: "match" (? "es") "!") "matche!") => #f + +To optionally match any number of times, use {{*}}, the Kleene star: + + (string-search '(: "<" (* (~ #\>)) ">") "<html>") => <match> + + (string-search '(: "<" (* (~ #\>)) ">") "<>") => <match> + + (string-search '(: "<" (* (~ #\>)) ">") "<html") => #f + +Often you want to match any number of times, but at least one time is required, and for that you use {{+}}: + + (string-search '(: "<" (+ (~ #\>)) ">") "<html>") => <match> + + (string-search '(: "<" (+ (~ #\>)) ">") "<a>") => <match> + + (string-search '(: "<" (+ (~ #\>)) ">") "<>") => #f + +More generally, to match at least a given number of times, use {{>=}}: + + (string-search '(: "<" (>= 3 (~ #\>)) ">") "<table>") => <match> + + (string-search '(: "<" (>= 3 (~ #\>)) ">") "<pre>") => <match> + + (string-search '(: "<" (>= 3 (~ #\>)) ">") "<tr>") => #f + +To match a specific number of times exactly, use {=}: + + (string-search '(: "<" (= 4 (~ #\>)) ">") "<html>") => <match> + + (string-search '(: "<" (= 4 (~ #\>)) ">") "<table>") => #f + +And finally, the most general form is {{**}} which specifies a range +of times to match. All of the earlier forms are special cases of this. + + (string-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.168.1.10") => <match> + + (string-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.0168.1.10") => #f + +There are also so-called "non-greedy" variants of these repetition +operators, by convention suffixed with an additional {{?}}. Since the +normal repetition patterns can match any of the allotted repetition +range, these operators will match a string if and only if the normal +versions matched. However, when the endpoints of which submatch +matched where are taken into account (specifically, all matches when +using string-search since the endpoints of the match itself matter), +the use of a non-greedy repetition can change the result. + +So, whereas {{?}} can be thought to mean "match or don't match," {{??}} means +"don't match or match." {{*}} typically consumes as much as possible, but +{{*?}} tries first to match zero times, and only consumes one at a time if +that fails. If you have a greedy operator followed by a non-greedy +operator in the same pattern, they can produce surprisins results as +they compete to make the match longer or shorter. If this seems +confusing, that's because it is. Non-greedy repetitions are defined +only in terms of the specific backtracking algorithm used to implement +them, which for compatibility purposes always means the Perl +algorithm. Thus, when using these patterns you force IrRegex to use a +backtracking engine, and can't rely on efficient execution. + +==== SRE Character Sets + +Perhaps more common than matching specific strings is matching any of +a set of characters. You can use the or alternation pattern on a list +of single-character strings to simulate a character set, but this is +too clumsy for everyday use so SRE syntax allows a number of +shortcuts. + +A single character matches that character literally, a trivial +character class. More conveniently, a list holding a single element +which is a string refers to the character set composed of every +character in the string. + + (string-match '(* #\-) "---") => <match> + + (string-match '(* #\-) "-_-") => #f + + (string-match '(* ("aeiou")) "oui") => <match> + + (string-match '(* ("aeiou")) "ouais") => #f + +Ranges are introduced with the {{/}} operator. Any strings or characters +in the {{/}} are flattened and then taken in pairs to represent the start +and end points, inclusive, of character ranges. + + (string-match '(* (/ "AZ09")) "R2D2") => <match> + + (string-match '(* (/ "AZ09")) "C-3PO") => #f + +In addition, a number of set algebra operations are provided. or, of +course, has the same meaning, but when all the options are character +sets it can be thought of as the set union operator. This is further +extended by the {{&}} set intersection, {{-}} set difference, and {{~}} set +complement operators. + + (string-match '(* (& (/ "az") (~ ("aeiou")))) "xyzzy") => <match> + + (string-match '(* (& (/ "az") (~ ("aeiou")))) "vowels") => #f + + (string-match '(* (- (/ "az") ("aeiou"))) "xyzzy") => <match> + + (string-match '(* (- (/ "az") ("aeiou"))) "vowels") => #f + +==== SRE Assertion Patterns + +There are a number of times it can be useful to assert something about +the area around a pattern without explicitly making it part of the +pattern. The most common cases are specifically anchoring some pattern +to the beginning or end of a word or line or even the whole +string. For example, to match on the end of a word: + + (string-match '(: "foo" eow) "foo") => <match> + + (string-match '(: "foo" eow) "foo!") => <match> + + (string-match '(: "foo" eow) "foof") => #f + +The {{bow}}, {{bol}}, {{eol}}, {{bos}} and {{eos}} work similarly. {{nwb}} asserts that you +are not in a word-boundary - if replaced for {{eow}} in the above examples +it would reverse all the results. + +There is no {{wb}}, since you tend to know from context whether it +would be the beginning or end of a word, but if you need it you can +always use (or bow eow). + +Somewhat more generally, Perl introduced positive and negative +look-ahead and look-behind patterns. Perl look-behind patterns are +limited to a fixed length, however the IrRegex versions have no such +limit. + + (string-match '(: "regular" (look-ahead " expression")) "regular expression") => <match> + +The most general case, of course, would be an and pattern to +complement the or pattern - all the patterns must match or the whole +pattern fails. This may be provided in a future release, although it +(and look-ahead and look-behind assertions) are unlikely to be +compiled efficiently. + + --- -Previous: [[Unit irregex]] +Previous: [[Unit extras]] Next: [[Unit srfi-1]] diff --git a/manual/Unit srfi-1 b/manual/Unit srfi-1 index 7a8eb917..72ac271f 100644 --- a/manual/Unit srfi-1 +++ b/manual/Unit srfi-1 @@ -1515,6 +1515,6 @@ arguments. ---- -Previous: [[Unit irregex]] +Previous: [[Unit regex]] Next: [[Unit srfi-4]] diff --git a/manual/Unit utils b/manual/Unit utils index 8c174b1d..e1af0895 100644 --- a/manual/Unit utils +++ b/manual/Unit utils @@ -88,12 +88,11 @@ The initial default options are {{-scrutinize -O2 -d2}}. <procedure>(scan-input-lines REGEXP [PORT])</procedure> Reads lines from {{PORT}} (defaults to the result of {{(current-input-port)}}) -using {{read-line}} and returns the result of {{(irregex-search REGEXP LINE)}}, +using {{read-line}} and returns the result of {{(string-search REGEXP LINE)}}, if the match succeeds. If no match could be found, {{#f}} is returned. {{REGEXP}} may also be a procedure of one argument which is called for each -input line and should return a non-false value on success, which will then -be the result of the call to {{scan-input-lines}}. +input line and should return a non-false value on success. === Asking the user for confirmation diff --git a/manual/faq b/manual/faq index 24f054af..7abf0381 100644 --- a/manual/faq +++ b/manual/faq @@ -533,7 +533,7 @@ Compile the program that uses the module: The regular expression engine has recently be replaced by [[/users/alex shinn|alex shinn]]'s excellent {{irregex}} library, which is fully implemented in Scheme. Precompiling regular expressions to internal form is somewhat slower than with the old PCRE-based -regex engine. It is advisable to use {{irregex}} to precompile regular expressions +regex engine. It is advisable to use {{regexp}} to precompile regular expressions outside of time-critical loops and use them where performance matters. diff --git a/posix-common.scm b/posix-common.scm index e77b05fb..55f9f488 100644 --- a/posix-common.scm +++ b/posix-common.scm @@ -233,54 +233,64 @@ EOF ;;; Filename globbing: (define glob - (lambda paths - (let conc-loop ((paths paths)) - (if (null? paths) - '() - (let ((path (car paths))) - (let-values (((dir fil ext) (decompose-pathname path))) - (let ((rx (##sys#glob->regexp (make-pathname #f (or fil "*") ext)))) - (let loop ((fns (directory (or dir ".") #t))) - (cond ((null? fns) (conc-loop (cdr paths))) - ((irregex-match rx (car fns)) - => (lambda (m) - (cons - (make-pathname dir (irregex-match-substring m)) - (loop (cdr fns)))) ) - (else (loop (cdr fns))) ) ) ) ) ) ) ) ) ) + (let ((regexp regexp) + (string-match string-match) + (glob->regexp glob->regexp) + (directory directory) + (make-pathname make-pathname) + (decompose-pathname decompose-pathname) ) + (lambda paths + (let conc-loop ((paths paths)) + (if (null? paths) + '() + (let ((path (car paths))) + (let-values (((dir fil ext) (decompose-pathname path))) + (let* ((patt (glob->regexp (make-pathname #f (or fil "*") ext))) + (rx (regexp patt))) + (let loop ((fns (directory (or dir ".") #t))) + (cond ((null? fns) (conc-loop (cdr paths))) + ((string-match rx (car fns)) + => (lambda (m) (cons (make-pathname dir (car m)) (loop (cdr fns)))) ) + (else (loop (cdr fns))) ) ) ) ) ) ) ) ) ) ) ;;; Find matching files: (define ##sys#find-files - (lambda (dir pred action id limit follow dot loc) - (##sys#check-string dir loc) - (let* ((depth 0) - (lproc - (cond ((not limit) (lambda _ #t)) - ((fixnum? limit) (lambda _ (fx< depth limit))) - (else limit) ) ) - (pproc - (if (or (string? pred) (irregex? pred)) - (let ((pred (irregex pred))) ; force compilation - (lambda (x) (irregex-match pred x))) - pred) ) ) - (let loop ((fs (glob (make-pathname dir (if dot "?*" "*")))) - (r id) ) - (if (null? fs) - r - (let ((f (##sys#slot fs 0)) - (rest (##sys#slot fs 1)) ) - (cond ((directory? f) - (cond ((member (pathname-file f) '("." "..")) (loop rest r)) - ((lproc f) - (loop rest - (fluid-let ((depth (fx+ depth 1))) - (loop (glob (make-pathname f "*")) - (if (pproc f) (action f r) r)) ) ) ) - (else (loop rest (if (pproc f) (action f r) r))) ) ) - ((pproc f) (loop rest (action f r))) - (else (loop rest r)) ) ) ) ) ) ) ) + (let ((glob glob) + (string-match string-match) + (make-pathname make-pathname) + (pathname-file pathname-file) + (symbolic-link? symbolic-link?) + (directory? directory?) ) + (lambda (dir pred action id limit follow dot loc) + (##sys#check-string dir loc) + (let* ((depth 0) + (lproc + (cond ((not limit) (lambda _ #t)) + ((fixnum? limit) (lambda _ (fx< depth limit))) + (else limit) ) ) + (pproc + (if (or (string? pred) (regexp? pred)) + (let ((pred (regexp pred))) ; force compilation + (lambda (x) (string-match pred x))) + pred) ) ) + (let loop ((fs (glob (make-pathname dir (if dot "?*" "*")))) + (r id) ) + (if (null? fs) + r + (let ((f (##sys#slot fs 0)) + (rest (##sys#slot fs 1)) ) + (cond ((directory? f) + (cond ((member (pathname-file f) '("." "..")) (loop rest r)) + ((lproc f) + (loop rest + (fluid-let ((depth (fx+ depth 1))) + (loop (glob (make-pathname f "*")) + (if (pproc f) (action f r) r)) ) ) ) + (else (loop rest (if (pproc f) (action f r) r))) ) ) + ((pproc f) (loop rest (action f r))) + (else (loop rest r)) ) ) ) ) ) ) ) ) (define (find-files dir . args) (cond ((or (null? args) (not (keyword? (car args)))) diff --git a/posixunix.scm b/posixunix.scm index ed83fe1b..e17f6b3e 100644 --- a/posixunix.scm +++ b/posixunix.scm @@ -27,7 +27,7 @@ (declare (unit posix) - (uses scheduler irregex extras utils files ports) + (uses scheduler regex extras utils files ports) (disable-interrupts) (hide group-member _get-groups _ensure-groups posix-error ##sys#terminal-check) (not inline ##sys#interrupt-hook ##sys#user-interrupt-hook)) diff --git a/posixwin.scm b/posixwin.scm index 97a0a232..9dee8ede 100644 --- a/posixwin.scm +++ b/posixwin.scm @@ -63,7 +63,7 @@ (declare (unit posix) - (uses scheduler irregex extras utils files ports) + (uses scheduler regex extras utils files ports) (disable-interrupts) (hide $quote-args-list $exec-setup $exec-teardown) (not inline ##sys#interrupt-hook ##sys#user-interrupt-hook) diff --git a/regex.import.scm b/regex.import.scm new file mode 100644 index 00000000..70310a22 --- /dev/null +++ b/regex.import.scm @@ -0,0 +1,41 @@ +;;;; regex.import.scm - import library for "regex" module +; +; Copyright (c) 2008-2010, The Chicken Team +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following +; conditions are met: +; +; Redistributions of source code must retain the above copyright notice, this list of conditions and the following +; disclaimer. +; Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +; disclaimer in the documentation and/or other materials provided with the distribution. +; Neither the name of the author nor the names of its contributors may be used to endorse or promote +; products derived from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS +; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +; POSSIBILITY OF SUCH DAMAGE. + + +(##sys#register-primitive-module + 'regex + '(glob->regexp + glob? + grep + regexp + regexp-escape + regexp? + string-match + string-match-positions + string-search + string-search-positions + string-split-fields + string-substitute + string-substitute*)) diff --git a/regex.scm b/regex.scm new file mode 100644 index 00000000..526e65ad --- /dev/null +++ b/regex.scm @@ -0,0 +1,360 @@ +;;;; regex.scm +; +; Copyright (c) 2008-2010, The Chicken Team +; Copyright (c) 2000-2007, Felix L. Winkelmann +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following +; conditions are met: +; +; Redistributions of source code must retain the above copyright notice, this list of conditions and the following +; disclaimer. +; Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +; disclaimer in the documentation and/or other materials provided with the distribution. +; Neither the name of the author nor the names of its contributors may be used to endorse or promote +; products derived from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS +; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +; POSSIBILITY OF SUCH DAMAGE. + + +(declare (unit regex)) + +(declare + (disable-interrupts) + (fixnum) + (export + regexp? regexp + string-match string-match-positions string-search string-search-positions + string-split-fields string-substitute string-substitute* + glob->regexp + grep + regexp-escape + + irregex string->irregex sre->irregex string->sre + irregex? irregex-match-data? + irregex-new-matches irregex-reset-matches! + irregex-match-start irregex-match-end irregex-match-substring + irregex-match-num-submatches + irregex-search irregex-search/matches irregex-match irregex-match-string + irregex-fold irregex-replace irregex-replace/all irregex-apply-match + irregex-dfa irregex-dfa/search irregex-dfa/extract + irregex-nfa irregex-flags irregex-submatches irregex-lengths irregex-names + )) + +(include "common-declarations.scm") + +(register-feature! 'regex 'irregex) + +(include "irregex.scm") + + +;;; Record `regexp' + +(define-record regexp x) + +(define-syntax (build-cache x r c) + ;; (build-cache N ARG FAIL) + (let* ((n (cadr x)) + (n2 (* n 2)) + (arg (caddr x)) + (fail (cadddr x)) + (%cache (r 'cache)) + (%index (r 'index)) + (%arg (r 'arg)) + (%let (r 'let)) + (%let* (r 'let*)) + (%if (r 'if)) + (%fx+ (r 'fx+)) + (%fxmod (r 'fxmod)) + (%equal? (r 'equal?)) + (%quote (r 'quote)) + (%tmp (r 'tmp)) + (%begin (r 'begin)) + (cache (make-vector (add1 n2) #f))) + (vector-set! cache n2 0) ; last slot: current index + `(,%let* ((,%cache (,%quote ,cache)) + (,%arg ,arg)) + ,(let fold ((i 0)) + (if (>= i n) + ;; this should be thread-safe: a context-switch can only + ;; happen before this code and in the call to FAIL. + `(,%let ((,%tmp ,fail) + (,%index (##sys#slot ,%cache ,n2))) + (##sys#setslot ,%cache ,%index ,%arg) + (##sys#setslot ,%cache (,%fx+ ,%index 1) ,%tmp) + (##sys#setislot + ,%cache ,n2 + (##core#inline "C_u_fixnum_modulo" (,%fx+ ,%index 2) ,n2)) + ,%tmp) + `(,%if (,%equal? (##sys#slot ,%cache ,(* i 2)) ,%arg) + (##sys#slot ,%cache ,(add1 (* i 2))) + ,(fold (add1 i)))))))) + +(define (regexp pat #!optional caseless extended utf8) + (if (regexp? pat) + pat + (make-regexp + (apply + irregex + pat + (let ((opts '())) + (when caseless (set! opts (cons 'i opts))) + (when extended (set! opts (cons 'x opts))) + (when utf8 (set! opts (cons 'utf8 opts))) + opts))) ) ) + +(define (unregexp x) + (cond ((regexp? x) (regexp-x x)) + ((irregex? x) x) + (else + (build-cache + 5 x + (irregex x))))) + + +;;; Basic `regexp' operations + +(define (string-match rx str) + (let ((rx (unregexp rx))) + (and-let* ((m (irregex-match rx str))) + (let loop ((i (irregex-match-num-submatches m)) + (res '())) + (if (fx<= i 0) + (cons str res) + (loop (fx- i 1) (cons (irregex-match-substring m i) res))))))) + +(define (string-match-positions rx str) + (let ((rx (unregexp rx))) + (and-let* ((m (irregex-match rx str))) + (let loop ((i (irregex-match-num-submatches m)) + (res '())) + (if (fx<= i 0) + (cons (list 0 (string-length str)) res) + (loop (fx- i 1) (cons (list (irregex-match-start-index m i) + (irregex-match-end-index m i)) + res))))))) + +(define (string-search rx str #!optional (start 0) (range (string-length str))) + (let ((rx (unregexp rx))) + (let ((n (string-length str))) + (and-let* ((m (irregex-search rx str start (min n (fx+ start range))))) + (let loop ((i (irregex-match-num-submatches m)) + (res '())) + (if (fx< i 0) + res + (loop (fx- i 1) (cons (irregex-match-substring m i) res)))))))) + +(define (string-search-positions rx str #!optional (start 0) (range (string-length str))) + (let ((rx (unregexp rx))) + (let ((n (string-length str))) + (and-let* ((m (irregex-search rx str start (min n (fx+ start range))))) + (let loop ((i (irregex-match-num-submatches m)) + (res '())) + (if (fx< i 0) + res + (loop (fx- i 1) (cons (list (irregex-match-start-index m i) + (irregex-match-end-index m i)) + res)))))))) + + +;;; Split string into fields: + +(define string-split-fields + (let ([reverse reverse] + [substring substring] + [string-search-positions string-search-positions] ) + (lambda (rx str . mode-and-start) + (##sys#check-string str 'string-split-fields) + (let* ([argc (length mode-and-start)] + [len (##sys#size str)] + [mode (if (fx> argc 0) (car mode-and-start) #t)] + [start (if (fx> argc 1) (cadr mode-and-start) 0)] + [fini (case mode + [(#:suffix) + (lambda (ms start) + (if (fx< start len) + (##sys#error 'string-split-fields + "record does not end with suffix" str rx) + (reverse ms) ) ) ] + [(#:infix) + (lambda (ms start) + (if (fx>= start len) + (reverse (cons "" ms)) + (reverse (cons (substring str start len) ms)) ) ) ] + [else (lambda (ms start) (reverse ms)) ] ) ] + [fetch (case mode + [(#:infix #:suffix) (lambda (start from to) (substring str start from))] + [else (lambda (start from to) (substring str from to))] ) ] ) + (let loop ([ms '()] [start start]) + (let ([m (string-search-positions rx str start)]) + (if m + (let* ([mp (car m)] + [from (car mp)] + [to (cadr mp)] ) + (if (fx= from to) + (if (fx= to len) + (fini ms start) + (loop (cons (fetch start (fx+ from 1) (fx+ to 2)) ms) (fx+ to 1)) ) + (loop (cons (fetch start from to) ms) to) ) ) + (fini ms start) ) ) ) ) ) ) ) + + +;;; Substitute matching strings: + +(define string-substitute + (let ([substring substring] + [reverse reverse] + [make-string make-string] + [string-search-positions string-search-positions] ) + (lambda (rx subst string . flag) + (##sys#check-string subst 'string-substitute) + (##sys#check-string string 'string-substitute) + (let* ([which (if (pair? flag) (car flag) 1)] + [substlen (##sys#size subst)] + (strlen (##sys#size string)) + [substlen-1 (fx- substlen 1)] + [result '()] + [total 0] ) + (define (push x) + (set! result (cons x result)) + (set! total (fx+ total (##sys#size x))) ) + (define (substitute matches) + (let loop ([start 0] [index 0]) + (if (fx>= index substlen-1) + (push (if (fx= start 0) subst (substring subst start substlen))) + (let ([c (##core#inline "C_subchar" subst index)] + [index+1 (fx+ index 1)] ) + (if (char=? c #\\) + (let ([c2 (##core#inline "C_subchar" subst index+1)]) + (if (and (not (char=? #\\ c2)) (char-numeric? c2)) + (let ([mi (list-ref matches (fx- (char->integer c2) 48))]) + (push (substring subst start index)) + (push (substring string (car mi) (cadr mi))) + (loop (fx+ index 2) index+1) ) + (loop start (fx+ index+1 1)) ) ) + (loop start index+1) ) ) ) ) ) + (let loop ([index 0] [count 1]) + (let ((matches (and (fx< index strlen) + (string-search-positions rx string index)))) + (cond [matches + (let* ([range (car matches)] + [upto (cadr range)] ) + (cond ((fx= 0 (fx- (cadr range) (car range))) + (##sys#error + 'string-substitute "empty substitution match" + rx) ) + ((or (not (fixnum? which)) (fx= count which)) + (push (substring string index (car range))) + (substitute matches) + (loop upto #f) ) + (else + (push (substring string index upto)) + (loop upto (fx+ count 1)) ) ) ) ] + [else + (push (substring string index (##sys#size string))) + (##sys#fragments->string total (reverse result)) ] ) ) ) ) ) ) ) + +(define string-substitute* + (let ([string-substitute string-substitute]) + (lambda (str smap . mode) + (##sys#check-string str 'string-substitute*) + (##sys#check-list smap 'string-substitute*) + (let ((mode (and (pair? mode) (car mode)))) + (let loop ((str str) (smap smap)) + (if (null? smap) + str + (let ((sm (car smap))) + (loop (string-substitute (car sm) (cdr sm) str mode) + (cdr smap) ) ) ) ) ) ) ) ) + + +;;; Glob support: + +(define glob->regexp + (let ((list->string list->string) + (string->list string->list) + (regexp regexp)) + (lambda (s #!optional sre?) + (##sys#check-string s 'glob->regexp) + (let ((sre + (cons + ': + (let loop ((cs (string->list s)) (dir #t)) + (if (null? cs) + '() + (let ((c (car cs)) + (rest (cdr cs)) ) + (cond ((char=? c #\*) + (if dir + `((or (: (~ ("./\\")) + (* (~ ("/\\")))) + (* (~ ("./\\")))) + ,@(loop rest #f)) + `((* (~ ("/\\"))) ,@(loop rest #f)))) + ((char=? c #\?) (cons 'any (loop rest #f))) + ((char=? c #\[) + (let loop2 ((rest rest) (s '())) + (cond ((not (pair? rest)) + (error 'glob->regexp "unexpected end of character class" s)) + ((char=? #\] (car rest)) + `((or ,@s) ,@(loop (cdr rest) #f))) + ((and (pair? (cdr rest)) + (pair? (cddr rest)) + (char=? #\- (cadr rest)) ) + (loop2 (cdddr rest) (cons `(/ ,(car rest) ,(caddr rest)) s))) + ((and (pair? (cdr rest)) + (char=? #\- (car rest))) + (loop2 (cddr rest) + (cons `(~ ,(cadr rest)) s))) + (else + (loop2 (cdr rest) (cons (car rest) s)))))) + (else (cons c (loop rest (memq c '(#\\ #\/)))))))))))) + (if sre? sre (regexp sre)))))) + + +;;; Grep-like function on list: + +(define grep + (let ((string-search string-search) + (regexp regexp)) + (lambda (rx lst #!optional (acc (lambda (x) x))) + (##sys#check-list lst 'grep) + (##sys#check-closure acc 'grep) + (let ((rx (regexp rx))) + (let loop ((lst lst)) + (if (null? lst) + '() + (let ((x (##sys#slot lst 0)) + (r (##sys#slot lst 1)) ) + (if (string-search rx (acc x)) + (cons x (loop r)) + (loop r) ) ) ) ) ) ) ) ) + + +;;; Escape regular expression (suggested by Peter Bex): + +(define regexp-escape + (let ([open-output-string open-output-string] + [get-output-string get-output-string] ) + (lambda (str) + (##sys#check-string str 'regexp-escape) + (let ([out (open-output-string)] + [len (##sys#size str)] ) + (let loop ([i 0]) + (cond [(fx>= i len) (get-output-string out)] + [(memq (##core#inline "C_subchar" str i) + '(#\. #\\ #\? #\* #\+ #\^ #\$ #\( #\) #\[ #\] #\| #\{ #\})) + (##sys#write-char-0 #\\ out) + (##sys#write-char-0 (##core#inline "C_subchar" str i) out) + (loop (fx+ i 1)) ] + [else + (##sys#write-char-0 (##core#inline "C_subchar" str i) out) + (loop (fx+ i 1)) ] ) ) ) ) ) ) diff --git a/rules.make b/rules.make index a2888e16..9c6ac23a 100644 --- a/rules.make +++ b/rules.make @@ -28,11 +28,16 @@ VPATH=$(SRCDIR) # object files -LIBCHICKEN_OBJECTS_1 = library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 srfi-14 srfi-18 srfi-69 $(POSIXFILE) irregex scheduler profiler stub expand chicken-syntax chicken-ffi-syntax runtime +LIBCHICKEN_OBJECTS_1 = \ + library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 \ + srfi-14 srfi-18 srfi-69 $(POSIXFILE) regex scheduler \ + profiler stub expand chicken-syntax chicken-ffi-syntax runtime LIBCHICKEN_SHARED_OBJECTS = $(LIBCHICKEN_OBJECTS_1:=$(O)) LIBCHICKEN_STATIC_OBJECTS = $(LIBCHICKEN_OBJECTS_1:=-static$(O)) -COMPILER_OBJECTS_1 = chicken batch-driver compiler optimizer compiler-syntax scrutinizer unboxing support c-platform c-backend +COMPILER_OBJECTS_1 = \ + chicken batch-driver compiler optimizer compiler-syntax scrutinizer unboxing support \ + c-platform c-backend COMPILER_OBJECTS = $(COMPILER_OBJECTS_1:=$(O)) COMPILER_STATIC_OBJECTS = $(COMPILER_OBJECTS_1:=-static$(O)) @@ -95,7 +100,7 @@ profiler$(O): profiler.c chicken.h $(CHICKEN_CONFIG_H) $(C_COMPILER) $(C_COMPILER_OPTIONS) $(INCLUDES) \ $(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) $(C_COMPILER_SHARED_OPTIONS) \ $(C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(C_COMPILER_OUTPUT) -irregex$(O): irregex.c chicken.h $(CHICKEN_CONFIG_H) +regex$(O): regex.c chicken.h $(CHICKEN_CONFIG_H) $(C_COMPILER) $(C_COMPILER_OPTIONS) \ $(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) $(C_COMPILER_SHARED_OPTIONS) \ $(C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(C_COMPILER_OUTPUT) @@ -210,7 +215,7 @@ profiler-static$(O): profiler.c chicken.h $(CHICKEN_CONFIG_H) $(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) \ $(C_COMPILER_STATIC_OPTIONS) \ $(C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(C_COMPILER_OUTPUT) -irregex-static$(O): irregex.c chicken.h $(CHICKEN_CONFIG_H) +regex-static$(O): regex.c chicken.h $(CHICKEN_CONFIG_H) $(C_COMPILER) $(C_COMPILER_OPTIONS) $(INCLUDES) \ $(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) \ $(C_COMPILER_STATIC_OPTIONS) \ @@ -321,6 +326,10 @@ extras.import$(O): extras.import.c chicken.h $(CHICKEN_CONFIG_H) $(HOST_C_COMPILER) $(HOST_C_COMPILER_OPTIONS) $(HOST_C_COMPILER_PTABLES_OPTIONS) $(INCLUDES) -DC_SHARED \ $(HOST_C_COMPILER_COMPILE_OPTION) $(HOST_C_COMPILER_OPTIMIZATION_OPTIONS) $(HOST_C_COMPILER_SHARED_OPTIONS) \ $(HOST_C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(HOST_C_COMPILER_OUTPUT) +regex.import$(O): regex.import.c chicken.h $(CHICKEN_CONFIG_H) + $(HOST_C_COMPILER) $(HOST_C_COMPILER_OPTIONS) $(HOST_C_COMPILER_PTABLES_OPTIONS) $(INCLUDES) -DC_SHARED \ + $(HOST_C_COMPILER_COMPILE_OPTION) $(HOST_C_COMPILER_OPTIMIZATION_OPTIONS) $(HOST_C_COMPILER_SHARED_OPTIONS) \ + $(HOST_C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(HOST_C_COMPILER_OUTPUT) irregex.import$(O): irregex.import.c chicken.h $(CHICKEN_CONFIG_H) $(HOST_C_COMPILER) $(HOST_C_COMPILER_OPTIONS) $(HOST_C_COMPILER_PTABLES_OPTIONS) $(INCLUDES) -DC_SHARED \ $(HOST_C_COMPILER_COMPILE_OPTION) $(HOST_C_COMPILER_OPTIMIZATION_OPTIONS) $(HOST_C_COMPILER_SHARED_OPTIONS) \ @@ -670,6 +679,7 @@ ifdef STATICBUILD $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) srfi-13.import.scm "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) srfi-69.import.scm "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) extras.import.scm "$(DESTDIR)$(IEGGDIR)" + $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) regex.import.scm "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) srfi-14.import.scm "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) tcp.import.scm "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) foreign.import.scm "$(DESTDIR)$(IEGGDIR)" @@ -692,6 +702,7 @@ else $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) srfi-13.import.so "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) srfi-69.import.so "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) extras.import.so "$(DESTDIR)$(IEGGDIR)" + $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) regex.import.so "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) srfi-14.import.so "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) tcp.import.so "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) foreign.import.so "$(DESTDIR)$(IEGGDIR)" @@ -729,6 +740,7 @@ ifneq ($(POSTINSTALL_PROGRAM),true) $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)srfi-13.import.so" $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)srfi-69.import.so" $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)extras.import.so" + $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)regex.import.so" $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)irregex.import.so" $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)srfi-14.import.so" $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)tcp.import.so" @@ -852,8 +864,8 @@ posixunix.c: $(SRCDIR)posixunix.scm $(SRCDIR)posix-common.scm $(SRCDIR)common-de $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@ posixwin.c: $(SRCDIR)posixwin.scm $(SRCDIR)posix-common.scm $(SRCDIR)common-declarations.scm $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@ -irregex.c: $(SRCDIR)irregex.scm $(SRCDIR)irregex-core.scm $(SRCDIR)irregex-utils.scm $(SRCDIR)common-declarations.scm - $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@ +regex.c: $(SRCDIR)regex.scm $(SRCDIR)irregex.scm $(SRCDIR)common-declarations.scm + $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@ scheduler.c: $(SRCDIR)scheduler.scm $(SRCDIR)common-declarations.scm $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@ profiler.c: $(SRCDIR)profiler.scm $(SRCDIR)common-declarations.scm @@ -883,6 +895,8 @@ srfi-69.import.c: $(SRCDIR)srfi-69.import.scm $(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@ extras.import.c: $(SRCDIR)extras.import.scm $(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@ +regex.import.c: $(SRCDIR)regex.import.scm + $(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@ irregex.import.c: $(SRCDIR)irregex.import.scm $(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@ srfi-14.import.c: $(SRCDIR)srfi-14.import.scm @@ -966,7 +980,7 @@ setup-download.c: $(SRCDIR)setup-download.scm setup-api.c distfiles: library.c eval.c expand.c chicken-syntax.c chicken-ffi-syntax.c \ data-structures.c ports.c files.c extras.c lolevel.c utils.c \ tcp.c srfi-1.c srfi-4.c srfi-13.c srfi-14.c srfi-18.c srfi-69.c \ - posixunix.c posixwin.c irregex.c scheduler.c profiler.c stub.c \ + posixunix.c posixwin.c regex.c scheduler.c profiler.c stub.c \ chicken-profile.c chicken-install.c chicken-uninstall.c chicken-status.c \ csc.c csi.c chicken.c batch-driver.c compiler.c optimizer.c \ compiler-syntax.c scrutinizer.c unboxing.c support.c \ @@ -1009,7 +1023,7 @@ spotless: distclean testclean -$(REMOVE_COMMAND) $(REMOVE_COMMAND_OPTIONS) library.c eval.c data-structures.c \ ports.c files.c extras.c lolevel.c utils.c chicken-syntax.c chicken-ffi-syntax.c \ tcp.c srfi-1.c srfi-4.c srfi-13.c srfi-14.c srfi-18.c srfi-69.c expand.c \ - posixunix.c posixwin.c irregex.c scheduler.c profiler.c stub.c \ + posixunix.c posixwin.c regex.c scheduler.c profiler.c stub.c \ chicken-profile.c chicken-bug.c \ csc.c csi.c chicken-install.c chicken-uninstall.c chicken-status.c \ chicken.c batch-driver.c compiler.c optimizer.c compiler-syntax.c \ diff --git a/scripts/henrietta.scm b/scripts/henrietta.scm index 3a3f829e..af227e22 100644 --- a/scripts/henrietta.scm +++ b/scripts/henrietta.scm @@ -36,7 +36,7 @@ ; list -(require-library setup-download irregex extras utils ports srfi-1 posix) +(require-library setup-download regex extras utils ports srfi-1 posix) (module main () @@ -65,8 +65,8 @@ (remove-directory tmpdir))) (define test-file? - (let ((rx (irregex "(\\./)?tests(/.*)?"))) - (lambda (path) (irregex-match rx path)))) + (let ((rx (regexp "(\\./)?tests(/.*)?"))) + (lambda (path) (string-match rx path)))) (define (retrieve name version) (let ((dir (handle-exceptions ex @@ -113,8 +113,8 @@ (display dir) (fail "unable to retrieve extension-list")))) - (define query-string-rx (irregex "[^?]+\\?(.+)")) - (define query-arg-rx (irregex "^&?(\\w+)=([^&]+)")) + (define query-string-rx (regexp "[^?]+\\?(.+)")) + (define query-arg-rx (regexp "^&?(\\w+)=([^&]+)")) (define (service) (let ((qs (getenv "QUERY_STRING")) @@ -123,13 +123,13 @@ (or ra "<unknown>") qs) (unless qs (error "no QUERY_STRING set")) - (let ((m (irregex-match query-string-rx qs)) + (let ((m (string-match query-string-rx qs)) (egg #f) (version #f)) - (let loop ((qs (if m (irregex-match-substring m 1) qs))) - (let* ((m (irregex-search query-arg-rx qs)) - (ms (and m (irregex-match-substring m 1))) - (rest (and m (substring qs (irregex-match-end-index m))))) + (let loop ((qs (if m (cadr m) qs))) + (let* ((m (string-search-positions query-arg-rx qs)) + (ms (and m (apply substring qs (cadr m)))) + (rest (and m (substring qs (cadar m))))) (cond ((not m) (headers) ; from here on use `fail' (cond (egg @@ -137,10 +137,10 @@ (cleanup) ) (else (fail "no extension name specified") ) )) ((string=? ms "version") - (set! version (irregex-match-substring m 2)) + (set! version (apply substring qs (caddr m))) (loop rest)) ((string=? ms "name") - (set! egg (irregex-match-substring m 2)) + (set! egg (apply substring qs (caddr m))) (loop rest)) ((string=? ms "tests") (set! *tests* #t) @@ -149,7 +149,7 @@ (headers) (listing)) ((string=? ms "mode") - (set! *mode* (string->symbol (irregex-match-substring m 2))) + (set! *mode* (string->symbol (apply substring qs (caddr m)))) (loop rest)) (else (warning "unrecognized query option" ms) diff --git a/scripts/make-egg-index.scm b/scripts/make-egg-index.scm index 69e12c2f..055b987e 100644 --- a/scripts/make-egg-index.scm +++ b/scripts/make-egg-index.scm @@ -2,8 +2,9 @@ (load-relative "tools.scm") -(use setup-download matchable sxml-transforms data-structures irregex) +(use setup-download matchable sxml-transforms data-structures regex) +(import irregex) (define *help* #f) (define *major-version* (##sys#fudge 41)) @@ -223,24 +224,24 @@ (let* ((end (irregex-match-end m 0)) (acc (kons i m acc))) (lp end acc)))))))) - (irregex-fold - irx - (lambda (i m s) - (cons (matched (irregex-match-substring m 1)) - (cons (did-not-match - (substring str i (irregex-match-start-index m 0))) - s))) - '() - str - (lambda (i s) - (reverse (cons (did-not-match (substring str i)) - s))))) + (let ((irregex-match-start-index irregex-match-start)) ;; upcoming API change in irregex 0.7 + (irregex-fold irx + (lambda (i m s) + (cons (matched (irregex-match-substring m 1)) + (cons (did-not-match + (substring str i (irregex-match-start-index m 0))) + s))) + '() + str + (lambda (i s) + (reverse (cons (did-not-match (substring str i)) + s)))))) (transform +link-regexp+ str (lambda (name) ;; wiki username `(a (@ (href ,(string-append "http://chicken.wiki.br/users/" - (irregex-replace/all " " name "-" name)))) + (string-substitute " " "-" name 'global)))) ,name)) (lambda (x) ;; raw HTML chunk `(literal ,x)))) diff --git a/scripts/makedist.scm b/scripts/makedist.scm index a9ca70d7..e44f234c 100644 --- a/scripts/makedist.scm +++ b/scripts/makedist.scm @@ -1,7 +1,7 @@ ;;;; makedist.scm - Make distribution tarballs -(use srfi-69 irregex) +(use srfi-69) (define *release* #f) @@ -15,7 +15,7 @@ (define *platform* (let ((sv (symbol->string (software-version)))) - (cond ((irregex-match ".*bsd" sv) "bsd") + (cond ((string-match ".*bsd" sv) "bsd") (else (case (build-platform) ((mingw32) diff --git a/scripts/setversion b/scripts/setversion index d751ee66..a46d7290 100644 --- a/scripts/setversion +++ b/scripts/setversion @@ -10,6 +10,10 @@ exec csi -s "$0" "$@" (define files '("README" "manual/The User's Manual")) +(define-syntax rx + (syntax-rules () + ((_ r) (force (delay (regexp r)))))) + (define (patch which rx subst) (cond ((and (list? which) (= 2 (length which))) (let ((from (car which)) @@ -22,17 +26,17 @@ exec csi -s "$0" "$@" (let loop () (let ((ln (read-line))) (unless (eof-object? ln) - (write-line (irregex-replace/all rx ln subst)) + (write-line (string-substitute rx subst ln #t)) (loop) ) ) ) ) - #:binary) ) - #:binary))) + binary:) ) + binary:))) (else (let ((tmp (create-temporary-file))) (patch (list which tmp) rx subst) (system* "mv ~S ~S" tmp which) ) ) ) ) (define (parse-version v) - (string-match (irregex "(\\d+)\\.(\\d+)\\.(\\d+)(.*)") v) ) + (string-match (rx "(\\d+)\\.(\\d+)\\.(\\d+)(.*)") v) ) (define (main args) (let ((major (##sys#fudge 41)) @@ -61,14 +65,14 @@ exec csi -s "$0" "$@" binary:) (system* "cat version.scm") (let ([vstr (sprintf "version ~A" buildversion)]) - (for-each (cut patch <> (irregex "version [0-9][-.0-9a-zA-Z]+") vstr) files) ) + (for-each (cut patch <> (rx "version [0-9][-.0-9a-zA-Z]+") vstr) files) ) (patch "chicken.h" - (irregex "C_MAJOR_VERSION[ \\t]+[0-9]+") + (rx "C_MAJOR_VERSION[ \\t]+[0-9]+") (sprintf "C_MAJOR_VERSION ~a" major)) (patch "chicken.h" - (irregex "C_MINOR_VERSION[ \\t]+[0-9]+") + (rx "C_MINOR_VERSION[ \\t]+[0-9]+") (sprintf "C_MINOR_VERSION ~a" minor)) 0)) diff --git a/scripts/tools.scm b/scripts/tools.scm index 23dac5e5..c809067f 100644 --- a/scripts/tools.scm +++ b/scripts/tools.scm @@ -341,19 +341,15 @@ (set! debug #t) ) (else (usage 1)) ) (loop (cdr args)) ) - ((irregex-match "([-_A-Za-z0-9]+)=(.*)" x) => + ((string-match "([-_A-Za-z0-9]+)=(.*)" x) => (lambda (m) - (let* ((sym (string->symbol (irregex-match-substring m 1)))) + (let* ((sym (string->symbol (cadr m)))) (if (##sys#symbol-has-toplevel-binding? sym) (let ((val (##sys#slot sym 0))) - (if (or (boolean? val) - (string? val) - (symbol? val) - (eq? (void) val)) - (##sys#setslot sym 0 (irregex-match-substring m 2)) - (quit "variable `~a' already has a suspicious value" - sym) ) ) - (##sys#setslot sym 0 (irregex-match-substring m 2)) ) + (if (or (boolean? val) (string? val) (symbol? val) (eq? (void) val)) + (##sys#setslot sym 0 (caddr m)) + (quit "variable `~a' already has a suspicious value" sym) ) ) + (##sys#setslot sym 0 (caddr m)) ) (loop (cdr args)) ) ) ) (else (set! targets (cons x targets)) @@ -428,24 +424,22 @@ val))) (let loop ((args args) (vals '())) (cond ((null? args) (reverse vals)) - ((irregex-match "(-{1,2})([-_A-Za-z0-9]+)(=)?\\s*(.+)?" (car args)) + ((string-match "(-{1,2})([-_A-Za-z0-9]+)(=)?\\s*(.+)?" (car args)) => (lambda (m) (let*-values (((next) (cdr args)) ((var val) - (cond ((equal? "=" (irregex-match-substring m 3)) - (let ((opt (irregex-match-substring m 2)) - (val (irregex-match-substring m 4))) + (cond ((equal? "=" (fourth m)) + (let ((opt (third m)) + (val (fifth m))) (cond (val (values opt val)) (else (when (null? next) - (error "missing argument for option" - (car args)) ) + (error "missing argument for option" (car args)) ) (let ((x (car next))) (set! next (cdr next)) (values opt x))))) ) - ((string? (irregex-match-substring m 1)) - (values (irregex-match-substring m 2) #t)) + ((string? (second m)) (values (third m) #t)) (else (values #f #f)) ) ) ) (cond (var (assign var val) diff --git a/scrutinizer.scm b/scrutinizer.scm index e9c30800..097a1e55 100644 --- a/scrutinizer.scm +++ b/scrutinizer.scm @@ -48,7 +48,6 @@ ; | (procedure [NAME] (VAL1 ... [#!optional VALOPT1 ...] [#!rest [VAL | values]]) . RESULTS) ; | BASIC ; | deprecated -; | (deprecated NAME) ; BASIC = * | string | symbol | char | number | boolean | list | pair | ; procedure | vector | null | eof | undefined | port | ; blob | noreturn | pointer | locative | fixnum | float @@ -90,14 +89,8 @@ ((eq? a 'deprecated) (report loc - (sprintf "use of deprecated library procedure `~a'" id) ) + (sprintf "use of deprecated toplevel identifier `~a'" id) ) '*) - ((and (pair? a) (eq? (car a) 'deprecated)) - (report - loc - (sprintf "use of deprecated library procedure `~a' - consider using `~a' instead" - id (cadr a))) - '*) (else (list a))))) (else '*))) (define (variable-result id e loc) @@ -476,8 +469,7 @@ (every procedure-type? (cdr t))))))) (define (procedure-argument-types t n) (cond ((or (memq t '(* procedure)) - (not-pair? t) - (eq? 'deprecated (car t))) + (not-pair? t) ) (values (make-list n '*) #f)) ((eq? 'procedure (car t)) (let* ((vf #f) @@ -581,10 +573,8 @@ (walk (first subs) e loc var) loc)) (b (assq var e)) ) - (when (and type - (not b) - (not (or (eq? type 'deprecated) - (and (pair? type) (eq? 'deprecated (car type))))) + (when (and type (not b) + (not (eq? type 'deprecated)) (not (match type rt))) (report loc diff --git a/setup-api.scm b/setup-api.scm index 02b3c12e..bffd48d7 100644 --- a/setup-api.scm +++ b/setup-api.scm @@ -24,7 +24,7 @@ ; POSSIBILITY OF SUCH DAMAGE. -(require-library srfi-1 irregex utils posix srfi-13 extras ports data-structures files) +(require-library srfi-1 regex utils posix srfi-13 extras ports data-structures files) ; This code is partially quite messy and the API is not overly consistent, ; mainly because it has grown "organically" while the old chicken-setup program @@ -66,7 +66,7 @@ shellpath) (import scheme chicken foreign - irregex utils posix ports extras data-structures + regex utils posix ports extras data-structures srfi-1 srfi-13 files) ;;; Constants, variables and parameters @@ -191,7 +191,7 @@ (let loop () (let ((ln (read-line))) (unless (eof-object? ln) - (write-line (irregex-replace/all rx ln subst)) + (write-line (string-substitute rx subst ln #t)) (loop) ) ) ) ) ) ) ) (let ((tmp (create-temporary-file))) (patch (list tmp tmp) rx subst) @@ -718,7 +718,7 @@ (define (version>=? v1 v2) (define (version->list v) (map (lambda (x) (or (string->number x) x)) - (irregex-split "[-\\._]" (->string v)))) + (string-split-fields "[-\\._]" (->string v) #:infix))) (let loop ((p1 (version->list v1)) (p2 (version->list v2))) (cond ((null? p1) (null? p2)) diff --git a/setup-download.scm b/setup-download.scm index 3508eb2f..02a687ad 100644 --- a/setup-download.scm +++ b/setup-download.scm @@ -24,7 +24,7 @@ ; POSSIBILITY OF SUCH DAMAGE. -(require-library extras irregex posix utils setup-api srfi-1 data-structures tcp srfi-13 +(require-library extras regex posix utils setup-api srfi-1 data-structures tcp srfi-13 files) @@ -37,13 +37,11 @@ temporary-directory) (import scheme chicken) - (import extras irregex posix utils srfi-1 data-structures tcp srfi-13 files setup-api) + (import extras regex posix utils srfi-1 data-structures tcp srfi-13 files setup-api) (define-constant +default-tcp-connect-timeout+ 10000) ; 10 seconds (define-constant +default-tcp-read/write-timeout+ 20000) ; 20 seconds - (define-constant +url-regex+ "(http://)?([^/:]+)(:([^:/]+))?(/.+)") - (tcp-connect-timeout +default-tcp-connect-timeout+) (tcp-read-timeout +default-tcp-read/write-timeout+) (tcp-write-timeout +default-tcp-read/write-timeout+) @@ -140,9 +138,7 @@ [tagver (existing-version egg version (filter-map - (lambda (f) - (and-let* ((m (irregex-search "^tags/([^/]+)/" f))) - (irregex-match-substring m 1))) + (lambda (f) (and-let* ((m (string-search "^tags/([^/]+)/" f))) (cadr m))) files))]) (let-values ([(filedir ver) (if tagver @@ -174,15 +170,14 @@ (conc dir #\/ egg ".meta")) (define (deconstruct-url url) - (let ([m (irregex-match +url-regex+ url)]) + (let ([m (string-match "(http://)?([^/:]+)(:([^:/]+))?(/.+)" url)]) (values - (if m (irregex-match-substring m 2) url) - (if (and m (irregex-match-substring m 3)) - (let ((port (irregex-match-substring m 4))) - (or (string->number port) - (error "not a valid port" port))) + (if m (caddr m) url) + (if (and m (cadddr m)) + (or (string->number (list-ref m 4)) + (error "not a valid port" (list-ref m 4))) 80) - (if m (irregex-match-substring m 5) "/")) ) ) + (if m (list-ref m 5) "/")) ) ) (define (locate-egg/http egg url #!optional version destination tests proxy-host proxy-port) @@ -231,13 +226,13 @@ (define (match-http-response rsp) (and (string? rsp) - (irregex-match "HTTP/[0-9.]+\\s+([0-9]+)\\s+.*" rsp)) ) + (string-match "HTTP/[0-9.]+\\s+([0-9]+)\\s+.*" rsp)) ) (define (response-match-code? mrsp code) - (and mrsp (string=? (number->string code) (irregex-match-substring mrsp 1))) ) + (and mrsp (string=? (number->string code) (cadr mrsp))) ) (define (match-chunked-transfer-encoding ln) - (irregex-match "[Tt]ransfer-[Ee]ncoding:\\s*chunked.*" ln) ) + (string-match "[Tt]ransfer-[Ee]ncoding:\\s*chunked.*" ln) ) (define (http-fetch host port locn dest proxy-host proxy-port) (d "connecting to host ~s, port ~a ~a...~%" host port diff --git a/setup.defaults b/setup.defaults index 80839dc4..3dce8dc2 100644 --- a/setup.defaults +++ b/setup.defaults @@ -24,7 +24,7 @@ (map (data-structures extras files foreign irregex lolevel ports tcp utils - posix irregex setup-api setup-download + posix regex setup-api setup-download srfi-1 srfi-4 srfi-13 srfi-14 srfi-18 srfi-69 ->) ) diff --git a/tests/re-tests.txt b/tests/re-tests.txt index 3b7bf976..a73604c6 100644 --- a/tests/re-tests.txt +++ b/tests/re-tests.txt @@ -104,9 +104,6 @@ a[bcd]*dcdcde adcdcde y & adcdcde a[bcd]+dcdcde adcdcde n - - (ab|a)b*c abc y &-\1 abc-ab ((a)(b)c)(d) abcd y \1-\2-\3-\4 abc-a-b-d -((a)(b)?c)(d) abcd y \1-\2-\3-\4 abc-a-b-d -((a)(b)?c)(d) acd y \1-\2-\3-\4 ac-a--d -((aa)(bb)?cc)(dd) aaccdd y \1-\2-\3-\4 aacc-aa--dd [ -~]* abc y & abc [ -~ -~]* abc y & abc [ -~ -~ -~]* abc y & abc @@ -121,13 +118,8 @@ a[bcd]+dcdcde adcdcde n - - (bc+d$|ef*g.|h?i(j|k)) effg n - - (bc+d$|ef*g.|h?i(j|k)) bcdd n - - (bc+d$|ef*g.|h?i(j|k)) reffgz y &-\1-\2 effgz-effgz- -((((((((((a))))))))) - c - - -((((((((((a)))))))))) a y &-\10 a-a (((((((((a))))))))) a y & a multiple words of text uh-uh n - - multiple words multiple words, yeah y & multiple words (.*)c(.*) abcde y &-\1-\2 abcde-ab-de \((.*), (.*)\) (a, b) y (\2, \1) (b, a) -(we|wee|week)(knights|night) weeknights y &-\1-\2 weeknights-wee-knights -(a([^a])*)* abcaBC y &-\1-\2 abcaBC-aBC-C -([Aa]b).*\1 abxyzab y &-\1 abxyzab-ab diff --git a/tests/runtests.sh b/tests/runtests.sh index 22481ccc..d75607f6 100644 --- a/tests/runtests.sh +++ b/tests/runtests.sh @@ -20,7 +20,7 @@ for x in setup-api.so setup-api.import.so setup-download.so \ srfi-1.import.so srfi-4.import.so data-structures.import.so \ ports.import.so files.import.so posix.import.so \ srfi-13.import.so srfi-69.import.so extras.import.so \ - irregex.import.so srfi-14.import.so tcp.import.so \ + regex.import.so srfi-14.import.so tcp.import.so \ foreign.import.so scheme.import.so srfi-18.import.so \ utils.import.so csi.import.so irregex.import.so types.db; do cp ../$x test-repository diff --git a/tests/scrutiny.expected b/tests/scrutiny.expected index cd3a5bc4..9c5c82a8 100644 --- a/tests/scrutiny.expected +++ b/tests/scrutiny.expected @@ -1,6 +1,6 @@ Warning: at toplevel: - use of deprecated library procedure `current-environment' + use of deprecated toplevel identifier `current-environment' Warning: in local procedure `c', in local procedure `b', diff --git a/tests/sgrep.scm b/tests/sgrep.scm index 555829e4..7503256a 100644 --- a/tests/sgrep.scm +++ b/tests/sgrep.scm @@ -1,7 +1,7 @@ ;;;; sgrep.scm - grepping benchmark -(use irregex extras utils posix srfi-1) +(use regex extras utils posix srfi-1) (define big-string @@ -21,7 +21,7 @@ (lambda (line) (set! c (fx+ c 1)) ;(when (zero? (fxmod c 500)) (print* ".")) - (when (irregex-search expr line) + (when (string-search expr line) (set! h (fx+ h 1))) #f)) ;(newline) diff --git a/tests/test-glob.scm b/tests/test-glob.scm index 91fc3d64..a5732384 100644 --- a/tests/test-glob.scm +++ b/tests/test-glob.scm @@ -1,20 +1,20 @@ ;;;; test-glob.scm - test glob-pattern -> regex translation -(use irregex) +(use regex) -(assert (irregex-match (##sys#glob->regexp "foo.bar") "foo.bar")) -(assert (irregex-match (##sys#glob->regexp "foo*") "foo.bar")) -(assert (irregex-match (##sys#glob->regexp "foo/*") "foo/bar")) -(assert (not (irregex-match (##sys#glob->regexp "foo/*") "foo/bar/baz"))) -(assert (irregex-match (##sys#glob->regexp "foo/*/*") "foo/bar/baz")) -(assert (not (irregex-match (##sys#glob->regexp "foo/*") "foo/.bar"))) -(assert (irregex-match (##sys#glob->regexp "*foo") "xyzfoo")) -(assert (not (irregex-match (##sys#glob->regexp "*foo") ".foo"))) -(assert (not (irregex-match (##sys#glob->regexp "*foo*") "a.fooxxx/yyy"))) -(assert (irregex-match (##sys#glob->regexp "*foo*") "fooxxx")) -(assert (irregex-match (##sys#glob->regexp "main.[ch]") "main.c")) -(assert (irregex-match (##sys#glob->regexp "main.[ch]") "main.h")) -(assert (not (irregex-match (##sys#glob->regexp "main.[ch]") "main.cpp"))) -(assert (irregex-match (##sys#glob->regexp "main.[-c]") "main.h")) -(assert (not (irregex-match (##sys#glob->regexp "main.[-h]") "main.h"))) +(assert (string-match (glob->regexp "foo.bar") "foo.bar")) +(assert (string-match (glob->regexp "foo*") "foo.bar")) +(assert (string-match (glob->regexp "foo/*") "foo/bar")) +(assert (not (string-match (glob->regexp "foo/*") "foo/bar/baz"))) +(assert (string-match (glob->regexp "foo/*/*") "foo/bar/baz")) +(assert (not (string-match (glob->regexp "foo/*") "foo/.bar"))) +(assert (string-match (glob->regexp "*foo") "xyzfoo")) +(assert (not (string-match (glob->regexp "*foo") ".foo"))) +(assert (not (string-match (glob->regexp "*foo*") "a.fooxxx/yyy"))) +(assert (string-match (glob->regexp "*foo*") "fooxxx")) +(assert (string-match (glob->regexp "main.[ch]") "main.c")) +(assert (string-match (glob->regexp "main.[ch]") "main.h")) +(assert (not (string-match (glob->regexp "main.[ch]") "main.cpp"))) +(assert (string-match (glob->regexp "main.[-c]") "main.h")) +(assert (not (string-match (glob->regexp "main.[-h]") "main.h"))) diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm index 5fdc0340..12d49adf 100644 --- a/tests/test-irregex.scm +++ b/tests/test-irregex.scm @@ -1,13 +1,15 @@ ;;;: test-irregex.scm -(use extras irregex) +(use extras regex) (include "test.scm") +(import irregex) + (define (subst-matches matches subst) (define (submatch n) - (if (irregex-match-data? matches) + (if (vector? matches) (irregex-match-substring matches n) (list-ref matches n))) (and @@ -26,41 +28,35 @@ ((#\\) (let ((c (read-char in))) (if (char-numeric? c) - (let lp ((res (list c))) - (if (and (char? (peek-char in)) - (char-numeric? (peek-char in))) - (lp (cons (read-char in) res)) - (display - (or (submatch (string->number - (list->string (reverse res)))) - "") - out))) + (display + (or (submatch (string->number (string c))) "") + out) (write-char c out)))) (else (write-char c out))) (lp))))))))))) (define (test-re matcher line) - (let ((splt (string-split line "\t" #t))) - (if (list? splt) - (apply - (lambda (pattern input result subst output) - (let ((name (sprintf "~A ~A ~A ~A" pattern input result subst))) - (cond - ((equal? "c" result) - (test-error name (matcher pattern input))) - ((equal? "n" result) - (test-assert name (not (matcher pattern input)))) - (else - (test-equal name output - (subst-matches (matcher pattern input) subst)))))) - splt) - (warning "invalid regex test line" line)))) + (apply + (lambda (pattern input result subst output) + (let ((name (sprintf "~A ~A ~A" pattern input result))) + (cond + ((equal? "c" result) + (test-error name (matcher pattern input))) + ((equal? "n" result) + (test-assert name (not (matcher pattern input)))) + ((equal? "y" result) + (test-assert name (matcher pattern input))) + (else + (test-equal name + (subst-matches (matcher pattern input) subst) + result))))) + (string-split line "\t" #t))) + (test-begin) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; basic irregex (for-each (lambda (opts) @@ -73,139 +69,9 @@ (irregex-search (apply irregex pat opts) str)) line)) read-line))))) - '((backtrack) - (fast) - )) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; chunked irregex - -(define (rope . args) - (map (lambda (x) (if (pair? x) x (list x 0 (string-length x)))) args)) - -(define rope-chunker - (make-irregex-chunker - (lambda (x) (and (pair? (cdr x)) (cdr x))) - caar - cadar - caddar - (lambda (src1 i src2 j) - (if (eq? src1 src2) - (substring (caar src1) i j) - (let lp ((src (cdr src1)) - (res (list (substring (caar src1) i (caddar src1))))) - (if (eq? src src2) - (string-intersperse - (reverse (cons (substring (caar src2) (cadar src2) j) res)) - "") - (lp (cdr src) - (cons (substring (caar src) (cadar src) (caddar src)) - res)))))))) - -(define (make-ropes str) - (let ((len (string-length str))) - (case len - ((0 1) - (list (rope str))) - ((2) - (list (rope str) - (rope (substring str 0 1) (substring str 1 2)))) - ((3) - (list (rope str) - (rope (substring str 0 1) (substring str 1 3)) - (rope (substring str 0 2) (substring str 2 3)) - (rope (substring str 0 1) - (substring str 1 2) - (substring str 2 3)))) - (else - (let ((mid (quotient (+ len 1) 2))) - (list (rope str) - (rope (substring str 0 1) (substring str 1 len)) - (rope (substring str 0 mid) (substring str mid len)) - (rope (substring str 0 (- len 1)) - (substring str (- len 1) len)) - (rope (substring str 0 1) - (substring str 1 mid) - (substring str mid len)) - )))))) - -(define (make-shared-ropes str) - (let ((len (string-length str))) - (case len - ((0 1) - '()) - ((2) - (list (list (list str 0 1) (list str 1 2)))) - ((3) - (list (list (list str 0 1) (list str 1 3)) - (list (list str 0 2) (list str 2 3)) - (list (list str 0 1) (list str 1 2) (list str 2 3)))) - (else - (let ((mid (quotient (+ len 1) 2))) - (list (list (list str 0 1) (list str 1 len)) - (list (list str 0 mid) (list str mid len)) - (list (list str 0 (- len 1)) - (list str (- len 1) len)) - (list (list str 0 1) (list str 1 mid) (list str mid len)) - )))))) - -(for-each - (lambda (opts) - (test-group - (sprintf "irregex/chunked - ~S" opts) - (with-input-from-file "re-tests.txt" - (lambda () - (port-for-each - (lambda (line) - (let ((splt (string-split line "\t" #t))) - (if (list? splt) - (apply - (lambda (pattern input result subst output) - (let ((name - (sprintf "~A ~A ~A ~A" pattern input result subst))) - (cond - ((equal? "c" result)) - ((equal? "n" result) - (for-each - (lambda (rope) - (test-assert name - (not (irregex-search/chunked pattern - rope-chunker - rope)))) - (append (make-ropes input) - (make-shared-ropes input)))) - (else - (for-each - (lambda (rope) - (test-equal - name output - (subst-matches (irregex-search/chunked pattern - rope-chunker - rope) - subst))) - (append (make-ropes input) - (make-shared-ropes input))))))) - splt) - (warning "invalid regex test line" line)))) - read-line))))) - '((backtrack) - (fast) - )) + '((small) (fast))) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; pregexp - -'(test-group "pregexp" - (with-input-from-file "re-tests.txt" - (lambda () - (port-for-each - (lambda (line) (test-re pregexp-match line)) - read-line)))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; default regex (PCRE) - -'(test-group "regex" +(test-group "regex" (with-input-from-file "re-tests.txt" (lambda () (port-for-each @@ -214,139 +80,14 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(test-group "unmatchable patterns" - (test-assert (not (irregex-search '(or) "abc"))) - (test-assert (not (irregex-search '(: "ab" (or)) "abc"))) - (test-assert (not (irregex-search '(submatch "ab" (or)) "abc"))) - (test-assert (not (irregex-search '(: "ab" (submatch (or))) "abc"))) - (test-assert (not (irregex-search '(/) "abc"))) - (test-assert (not (irregex-search '(: "ab" (/)) "abc"))) - (test-assert (not (irregex-search '(~ any) "abc"))) - (test-assert (not (irregex-search '(: "ab" (~ any)) "abc"))) - (test-assert (not (irregex-search '("") "abc"))) - (test-assert (not (irregex-search '(: "ab" ("")) "abc"))) - ) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(test-group "beginning/end of chunks" - (test-assert - (irregex-search/chunked '(: bos "foo") rope-chunker '((" foo" 0 4)) 1)) - (test-assert - (irregex-search/chunked '(: bos "foo") rope-chunker '((" foo" 1 5)) 2)) - (test-assert - (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 1 4)) 1)) - (test-assert - (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 2 5)) 2)) - (test-assert - (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 0 4)) 1)) - (test-assert - (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 1 5)) 2)) - ) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(test-group "API" - (test-assert (irregex? (irregex "a.*b"))) - (test-assert (irregex? (irregex '(: "a" (* any) "b")))) - (test-assert (not (irregex? (vector '*irregex-tag* #f #f #f #f #f #f #f)))) - (test-assert (not (irregex? (vector #f #f #f #f #f #f #f #f #f)))) - (test-assert (irregex-match-data? (irregex-search "a.*b" "axxxb"))) - (test-assert (irregex-match-data? (irregex-match "a.*b" "axxxb"))) - (test-assert (not (irregex-match-data? (vector '*irregex-match-tag* #f #f #f #f #f #f #f #f #f)))) - (test-assert (not (irregex-match-data? (vector #f #f #f #f #f #f #f #f #f #f #f)))) - (test-equal 0 (irregex-num-submatches (irregex "a.*b"))) - (test-equal 1 (irregex-num-submatches (irregex "a(.*)b"))) - (test-equal 2 (irregex-num-submatches (irregex "(a(.*))b"))) - (test-equal 2 (irregex-num-submatches (irregex "a(.*)(b)"))) - (test-equal 10 (irregex-num-submatches (irregex "((((((((((a))))))))))"))) - (test-equal 0 (irregex-match-num-submatches (irregex-search "a.*b" "axxxb"))) - (test-equal 1 (irregex-match-num-submatches (irregex-search "a(.*)b" "axxxb"))) - (test-equal 2 (irregex-match-num-submatches (irregex-search "(a(.*))b" "axxxb"))) - (test-equal 2 (irregex-match-num-submatches (irregex-search "a(.*)(b)" "axxxb"))) - (test-equal 10 (irregex-match-num-submatches (irregex-search "((((((((((a))))))))))" "a"))) - ) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - (test-group "utils" - (test-equal "h*llo world" - (irregex-replace "[aeiou]" "hello world" "*")) - (test-equal "h*ll* w*rld" - (irregex-replace/all "[aeiou]" "hello world" "*")) - (test-equal '("bob@test.com" "fred@example.com") - (irregex-fold 'email - (lambda (i m s) (cons (irregex-match-substring m) s)) - '() - "bob@test.com and fred@example.com" - (lambda (i s) (reverse s)))) - (test-equal '("bob@test.com" "fred@example.com") - (irregex-fold/chunked - 'email - (lambda (src i m s) (cons (irregex-match-substring m) s)) - '() - rope-chunker - (rope "bob@test.com and fred@example.com") - (lambda (src i s) (reverse s)))) - ) - - -(define (extract name irx str) - (irregex-match-substring (irregex-match irx str) name)) - -(test-group "named submatches" - (test-equal "matching submatch is seen and extracted" - "first" (extract 'first `(or (submatch-named first "first") - (submatch-named second "second")) - "first")) - (test-equal "nonmatching submatch is known but returns false" - #f (extract 'second `(or (submatch-named first "first") - (submatch-named second "second")) - "first")) - (test-error "nonexisting submatch is unknown and raises an error" - (extract 'third `(or (submatch-named first "first") - (submatch-named second "second")) - "first")) - (test-equal "matching alternative is used" - "first" (extract 'sub `(or (submatch-named sub "first") - (submatch-named sub "second")) - "first")) - (test-equal "matching alternative is used (second match)" - "second" (extract 'sub `(or (submatch-named sub "first") - (submatch-named sub "second")) - "second")) - (test-equal "last match is used with multiple matches for a name" - "second" (extract 'sub `(seq (submatch-named sub "first") - space - (submatch-named sub "second")) - "first second"))) + (test-equal "replace" + (irregex-replace "[aeiou]" "hello world" "*") + "h*llo world") + (test-equal "replace/all" + (irregex-replace/all "[aeiou]" "hello world" "*") + "h*ll* w*rld")) (test-end) - - -;;; UTF-8 tests - -(test-begin) - -(test-assert (irregex-search "(?u:<..>)" "<漢字>")) -(test-assert (irregex-search "(?u:<.*>)" "<漢字>")) -(test-assert (irregex-search "(?u:<.+>)" "<漢字>")) -(test-assert (not (irregex-search "(?u:<.>)" "<漢字>"))) -(test-assert (not (irregex-search "(?u:<...>)" "<漢>"))) - -(test-assert (irregex-search "(?u:<[^a-z]*>)" "<漢字>")) -(test-assert (not (irregex-search "(?u:<[^a-z]*>)" "<漢m字>"))) -(test-assert (irregex-search "(?u:<[^a-z][^a-z]>)" "<漢字>")) -(test-assert (irregex-search "(?u:<あ*>)" "<あ>")) -(test-assert (irregex-search "(?u:<あ*>)" "<ああ>")) -(test-assert (not (irregex-search "(?u:<あ*>)" "<あxあ>"))) - -(test-assert (irregex-search "(?u:<[あ-ん]*>)" "<あん>")) -(test-assert (irregex-search "(?u:<[あ-ん]*>)" "<ひらがな>")) -(test-assert (not (irregex-search "(?u:<[あ-ん]*>)" "<ひらgがな>"))) - -(test-end) - - (test-exit) diff --git a/tests/test.scm b/tests/test.scm index c16de6a5..e9b43c14 100644 --- a/tests/test.scm +++ b/tests/test.scm @@ -77,8 +77,7 @@ (define-syntax test-equal (syntax-rules () ((_ name expr value eq) (run-equal name (lambda () expr) value eq)) - ((_ name expr value) (run-equal name (lambda () expr) value equal?)) - ((_ expr value) (run-equal (->string value) (lambda () expr) value equal?)))) + ((_ name expr value) (run-equal name (lambda () expr) value equal?)))) (define-syntax test-error (syntax-rules () @@ -90,8 +89,7 @@ (define-syntax test-assert (syntax-rules () - ((_ name expr) (run-equal name (lambda () (if expr #t #f)) #t eq?)) - ((_ expr) (run-equal (->string expr) (lambda () (if expr #t #f)) #t eq?)))) + ((_ name expr) (run-equal name (lambda () (if expr #t #f)) #t eq?)))) (define-syntax test-group (syntax-rules () diff --git a/types.db b/types.db index e1b8ca57..c831ee6a 100644 --- a/types.db +++ b/types.db @@ -537,58 +537,30 @@ ;; irregex (irregex (procedure irregex (#!rest) *)) -;irregex-apply-match -(irregex-dfa (procedure irregex-dfa (*) *)) -(irregex-dfa/extract (procedure irregex-dfa/extract (*) *)) -(irregex-dfa/search (procedure irregex-dfa/search (*) *)) -(irregex-extract (procedure irregex-extract (* string #!optional fixnum fixnum) list)) -(irregex-flags (procedure irregex-flags (*) *)) -(irregex-fold (procedure irregex-fold (* (procedure (fixnum (struct regexp-match)) *) * string #!optional (procedure (fixnum *) *) fixnum fixnum) *)) -(irregex-fold/chunked (procedure irregex-fold/chunked (* (procedure (fixnum (struct regexp-match)) *) * procedure * #!optional (procedure (fixnum *) *) fixnum fixnum) *)) -(irregex-lengths (procedure irregex-lengths (*) *)) -(irregex-match (procedure irregex-match (* string) *)) -;irregex-match? +(string->irregex (procedure string->irregex (string #!rest) *)) +(sre->irregex (procedure sre->irregex (#!rest) *)) +(string->sre (procedure string->sre (string #!rest) *)) +(irregex? (procedure irregex? (*) boolean)) (irregex-match-data? (procedure irregex-match-data? (*) boolean)) -(irregex-match-end (procedure irregex-match-end (* #!optional *) *)) -;irregex-match-end-chunk -(irregex-match-end-index (procedure irregex-match-end-index ((struct regexp-match) *) fixnum)) -(irregex-match-names (procedure irregex-match-names ((struct regexp-match)) list)) -(irregex-match-num-submatches (procedure irregex-match-num-submatches ((struct regexp-match)) fixnum)) -(irregex-match-start (procedure irregex-match-start (* #!optional *) *)) -;irregex-match-start-chunk -(irregex-match-start-index (procedure irregex-match-start-index ((struct regexp-match) *) fixnum)) -(irregex-match-string (procedure irregex-match-string (*) *)) -(irregex-match-subchunk (procedure irregex-match-subchunk ((struct regexp-match) #!optional *) *)) -(irregex-match-substring (procedure irregex-match-substring (* #!optional *) *)) -(irregex-match/chunked (procedure irregex-match/chunked (* * * #!optional fixnum) *)) -(irregex-names (procedure irregex-names (*) *)) (irregex-new-matches (procedure irregex-new-matches (*) *)) -(irregex-nfa (procedure irregex-nfa (*) *)) -(irregex-num-submatches (procedure irregex-num-submatches (*) fixnum)) -(irregex-opt (procedure irregex-opt (list) *)) -(irregex-quote (procedure irregex-quote (string) string)) -(irregex-replace (procedure irregex-replace (* string #!rest) *)) -(irregex-replace/all (procedure irregex-replace/all (* string #!rest) *)) (irregex-reset-matches! (procedure irregex-reset-matches! (*) *)) +(irregex-match-start (procedure irregex-match-start (* #!optional *) *)) +(irregex-match-end (procedure irregex-match-end (* #!optional *) *)) +(irregex-match-substring (procedure irregex-match-substring (* #!optional *) *)) (irregex-search (procedure irregex-search (* string #!optional fixnum fixnum) *)) (irregex-search/matches (procedure irregex-search/matches (* string fixnum fixnum *) *)) -(irregex-split (procedure irregex-split (* string #!optional fixnum fixnum) list)) +(irregex-match (procedure irregex-match (* string) *)) +(irregex-match-string (procedure irregex-match-string (*) *)) +(irregex-replace (procedure irregex-replace (* string #!rest) *)) +(irregex-replace/all (procedure irregex-replace/all (* string #!rest) *)) +(irregex-dfa (procedure irregex-dfa (*) *)) +(irregex-dfa/search (procedure irregex-dfa/search (*) *)) +(irregex-dfa/extract (procedure irregex-dfa/extract (*) *)) +(irregex-nfa (procedure irregex-nfa (*) *)) +(irregex-flags (procedure irregex-flags (*) *)) (irregex-submatches (procedure irregex-submatches (*) *)) -(irregex? (procedure irregex? (*) boolean)) -(make-irregex-chunker - (procedure make-irregex-chunker - ((procedure (*) *) - (procedure (*) *) - #!optional - (procedure (*) *) - (procedure (*) *) - (procedure (* fixnum * fixnum) string) - (procedure (* fixnum * fixnum) *)) - *)) -(maybe-string->sre (procedure maybe-string->sre (*) *)) -(sre->irregex (procedure sre->irregex (#!rest) *)) -(string->irregex (procedure string->irregex (string #!rest) *)) -(string->sre (procedure string->sre (string #!rest) *)) +(irregex-lengths (procedure irregex-lengths (*) *)) +(irregex-names (procedure irregex-names (*) *)) ;; lolevel @@ -920,6 +892,22 @@ (with-input-from-pipe (procedure with-input-from-pipe (string (procedure () . *) #!optional symbol) . *)) (with-output-to-pipe (procedure with-output-to-pipe (string (procedure () . *) #!optional symbol) . *)) +;; regex + +(glob->regexp (procedure glob->regexp (string #!optional *) *)) +(glob? deprecated) +(grep (procedure grep (* list #!optional (procedure (*) *)) list)) +(regexp (procedure regexp (* #!optional * * *) (struct regexp))) +(regexp-escape (procedure regexp-escape (string) string)) +(regexp? (procedure regexp? (*) boolean)) +(string-match (procedure string-match (* string) *)) +(string-match-positions (procedure string-match-positions (* string) *)) +(string-search (procedure string-search (* string #!optional fixnum fixnum) *)) +(string-search-positions (procedure string-search-positions (* string #!optional fixnum fixnum) *)) +(string-split-fields (procedure string-split-fields (* string #!optional * fixnum) list)) +(string-substitute (procedure string-substitute (* string string #!optional *) string)) +(string-substitute* (procedure string-substitute* (string list #!optional *) string)) + ;; srfi-1 (alist-cons (procedure alist-cons (* * *) list)) diff --git a/utils.scm b/utils.scm index b934a550..03bd4ed6 100644 --- a/utils.scm +++ b/utils.scm @@ -27,7 +27,7 @@ (declare (unit utils) - (uses extras srfi-13 posix files irregex) + (uses extras srfi-13 posix files regex) (fixnum) (hide chop-pds) (disable-interrupts) ) @@ -115,15 +115,18 @@ ;;; Scan lines until regex or predicate matches (define scan-input-lines - (lambda (rx #!optional (port ##sys#standard-input)) - (let ((rx (if (procedure? rx) - rx - (cute irregex-search (irregex rx) <>)))) - (let loop () - (let ((ln (read-line port))) - (and (not (eof-object? ln)) - (or (rx ln) - (loop)))))))) + (let ((regexp regexp) + (read-line read-line) + (string-search string-search)) + (lambda (rx #!optional (port ##sys#standard-input)) + (let ((rx (if (procedure? rx) + rx + (cut string-search (regexp rx) <>)))) + (let loop () + (let ((ln (read-line port))) + (and (not (eof-object? ln)) + (or (rx ln) + (loop))))))))) ;; Ask for confirmationTrap