~ chicken-core (chicken-5) 8a516a042542ad4245d79eecc23643e765d61a0e
commit 8a516a042542ad4245d79eecc23643e765d61a0e Author: felix <felix@call-with-current-continuation.org> AuthorDate: Thu Aug 19 03:41:21 2010 -0400 Commit: felix <felix@call-with-current-continuation.org> CommitDate: Thu Aug 19 03:41:21 2010 -0400 manually applied patch between experimental and total-irregex diff --git a/chicken-install.scm b/chicken-install.scm index dcedaa7c..35974439 100644 --- a/chicken-install.scm +++ b/chicken-install.scm @@ -25,13 +25,13 @@ (require-library setup-download setup-api) -(require-library srfi-1 posix data-structures utils regex ports extras srfi-13 files) +(require-library srfi-1 posix data-structures utils irregex ports extras srfi-13 files) (require-library chicken-syntax) ; in case an import library reexports chicken syntax (require-library chicken-ffi-syntax) ; same reason, also for filling modules.db (module main () - (import scheme chicken srfi-1 posix data-structures utils regex ports extras + (import scheme chicken srfi-1 posix data-structures utils irregex ports extras srfi-13 files) (import setup-download setup-api) @@ -51,7 +51,6 @@ "srfi-13.import.so" "srfi-69.import.so" "extras.import.so" - "regex.import.so" "srfi-14.import.so" "tcp.import.so" "foreign.import.so" @@ -515,17 +514,17 @@ (let* ((files (glob (make-pathname (repository-path) "*.import.*"))) (tmpdir (create-temporary-directory)) (dbfile (make-pathname tmpdir +module-db+)) - (rx (regexp ".*/([^/]+)\\.import\\.(scm|so)"))) + (rx (irregex ".*/([^/]+)\\.import\\.(scm|so)"))) (print "loading import libraries ...") (fluid-let ((##sys#warnings-enabled #f)) (for-each (lambda (f) - (let ((m (string-match rx f))) + (let ((m (irregex-match rx f))) (handle-exceptions ex (print-error-message ex (current-error-port) (sprintf "Failed to import from `~a'" f)) - (eval `(import ,(string->symbol (cadr m))))))) + (eval `(import ,(string->symbol (irregex-match-substring m 1))))))) files)) (print "generating database") (let ((db @@ -613,10 +612,10 @@ EOF (define (setup-proxy uri) (if (string? uri) - (cond ((string-match "(.+)\\:([0-9]+)" uri) => + (cond ((irregex-match "(.+)\\:([0-9]+)" uri) => (lambda (m) - (set! *proxy-host* (cadr m)) - (set! *proxy-port* (string->number (caddr m)))) + (set! *proxy-host* (irregex-match-substring m 1)) + (set! *proxy-port* (string->number (irregex-match-substring m 2)))) (else (set! *proxy-host* uri) (set! *proxy-port* 80)))))) @@ -625,7 +624,7 @@ EOF (define (main args) (let ((update #f) - (rx (regexp "([^:]+):(.+)"))) + (rx (irregex "([^:]+):(.+)"))) (setup-proxy (get-environment-variable "http_proxy")) (let loop ((args args) (eggs '())) (cond ((null? args) @@ -776,9 +775,14 @@ EOF "") *eggs+dirs+vers*)) (loop (cdr args) (cons egg eggs)))) - ((string-match rx arg) => + ((irregex-match rx arg) => (lambda (m) - (loop (cdr args) (alist-cons (cadr m) (caddr m) eggs)))) + (loop + (cdr args) + (alist-cons + (irregex-match-substring m 1) + (irregex-match-substring m 2) + eggs)))) (else (loop (cdr args) (cons arg eggs)))))))))) (register-feature! 'chicken-install) diff --git a/chicken-status.scm b/chicken-status.scm index 5222ebb0..cbd246c3 100644 --- a/chicken-status.scm +++ b/chicken-status.scm @@ -24,13 +24,13 @@ ; POSSIBILITY OF SUCH DAMAGE. -(require-library setup-api srfi-1 posix data-structures utils ports regex files) +(require-library setup-api srfi-1 posix data-structures utils ports irregex files) (module main () (import scheme chicken foreign) - (import srfi-1 posix data-structures utils ports regex + (import srfi-1 posix data-structures utils ports irregex files setup-api) (define-foreign-variable C_TARGET_LIB_HOME c-string) @@ -45,6 +45,9 @@ (make-pathname C_TARGET_LIB_HOME (sprintf "chicken/~a" C_BINARY_VERSION)) (repository-path))) + (define (grep rx lst) + (filter (cut irregex-search rx <>) lst)) + (define (gather-eggs patterns) (let ((eggs (map pathname-file (glob (make-pathname (repo-path) "*" "setup-info"))))) @@ -122,11 +125,10 @@ EOF (lambda () (let* ((patterns (map - regexp + irregex (cond ((null? pats) '(".*")) - ;;XXX change for total-irregex branch: (exact (map (lambda (p) - (string-append "^" (regexp-escape p) "$")) + (string-append "^" (irregex-quote p) "$")) pats)) (else pats)))) (eggs (gather-eggs patterns))) diff --git a/chicken-uninstall.scm b/chicken-uninstall.scm index 8a074b4d..bd6b35ee 100644 --- a/chicken-uninstall.scm +++ b/chicken-uninstall.scm @@ -26,14 +26,14 @@ (require-library setup-api - srfi-1 posix data-structures utils ports regex srfi-13 files) + srfi-1 posix data-structures utils ports irregex srfi-13 files) (module main () (import scheme chicken foreign) (import setup-api) - (import srfi-1 posix data-structures utils ports regex srfi-13 files) + (import srfi-1 posix data-structures utils ports irregex srfi-13 files) (define-foreign-variable C_TARGET_LIB_HOME c-string) (define-foreign-variable C_BINARY_VERSION int) @@ -49,6 +49,9 @@ (define *force* #f) + (define (grep rx lst) + (filter (cut irregex-search rx <>) lst)) + (define (gather-eggs patterns) (let ((eggs (map pathname-file (glob (make-pathname (repo-path) "*" "setup-info"))))) @@ -117,8 +120,8 @@ EOF (map (lambda (p) (if exact - (regexp (string-append "^" (regexp-escape p) "$")) - (glob->regexp p))) + (irregex (string-append "^" (irregex-quote p) "$")) + (##sys#glob->regexp p))) pats)))) (else (let ((arg (car args))) diff --git a/defaults.make b/defaults.make index e4e8537b..4564b952 100644 --- a/defaults.make +++ b/defaults.make @@ -293,7 +293,7 @@ CSI ?= csi$(EXE) # Scheme compiler flags -CHICKEN_OPTIONS = -optimize-level 2 -include-path . -include-path $(SRCDIR) -inline -ignore-repository +CHICKEN_OPTIONS = -optimize-level 2 -include-path . -include-path $(SRCDIR) -inline -ignore-repository -feature building-chicken ifdef DEBUGBUILD CHICKEN_OPTIONS += -feature debugbuild -scrutinize -types $(SRCDIR)types.db else @@ -321,12 +321,9 @@ CHICKEN_INSTALL_PROGRAM = $(PROGRAM_PREFIX)chicken-install$(PROGRAM_SUFFIX) CHICKEN_UNINSTALL_PROGRAM = $(PROGRAM_PREFIX)chicken-uninstall$(PROGRAM_SUFFIX) CHICKEN_STATUS_PROGRAM = $(PROGRAM_PREFIX)chicken-status$(PROGRAM_SUFFIX) CHICKEN_BUG_PROGRAM = $(PROGRAM_PREFIX)chicken-bug$(PROGRAM_SUFFIX) -IMPORT_LIBRARIES = chicken lolevel srfi-1 srfi-4 data-structures ports files posix srfi-13 srfi-69 extras \ - regex srfi-14 tcp foreign scheme srfi-18 utils csi irregex +IMPORT_LIBRARIES = chicken lolevel srfi-1 srfi-4 data-structures ports files posix srfi-13 srfi-69 extras srfi-14 tcp foreign scheme srfi-18 utils csi irregex IMPORT_LIBRARIES += setup-api setup-download -SCRUTINIZED_LIBRARIES = library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 \ - srfi-14 srfi-18 srfi-69 $(POSIXFILE) regex scheduler \ - profiler stub expand chicken-syntax chicken-ffi-syntax +SCRUTINIZED_LIBRARIES = library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 srfi-14 srfi-18 srfi-69 $(POSIXFILE) irregex scheduler profiler stub expand chicken-syntax chicken-ffi-syntax ifdef STATICBUILD CHICKEN_STATIC_EXECUTABLE = $(CHICKEN_PROGRAM)$(EXE) diff --git a/distribution/manifest b/distribution/manifest index 6c2ceb82..1c7a9a8e 100644 --- a/distribution/manifest +++ b/distribution/manifest @@ -28,7 +28,7 @@ optimizer.c compiler-syntax.c scrutinizer.c unboxing.c -regex.c +irregex.c posixunix.c posixwin.c profiler.c @@ -77,8 +77,9 @@ optimizer.scm compiler-syntax.scm scrutinizer.scm unboxing.scm -regex.scm irregex.scm +irregex-core.scm +irregex-utils.scm posixunix.scm posixwin.scm posix-common.scm @@ -210,7 +211,6 @@ posix.import.scm srfi-13.import.scm srfi-69.import.scm extras.import.scm -regex.import.scm irregex.import.scm srfi-14.import.scm tcp.import.scm @@ -224,7 +224,6 @@ posix.import.c srfi-13.import.c srfi-69.import.c extras.import.c -regex.import.c irregex.import.c srfi-14.import.c tcp.import.c @@ -290,7 +289,7 @@ manual/Unit library manual/Unit lolevel manual/Unit ports manual/Unit posix -manual/Unit regex +manual/Unit irregex manual/Unit srfi-1 manual/Unit srfi-13 manual/Unit srfi-14 diff --git a/eval.scm b/eval.scm index 0d8bb4fb..c3d98c6c 100644 --- a/eval.scm +++ b/eval.scm @@ -55,7 +55,7 @@ (define-foreign-variable binary-version int "C_BINARY_VERSION") (define ##sys#core-library-modules - '(extras lolevel utils files tcp regex posix srfi-1 srfi-4 srfi-13 + '(extras lolevel utils files tcp irregex posix srfi-1 srfi-4 srfi-13 srfi-14 srfi-18 srfi-69 data-structures ports chicken-syntax chicken-ffi-syntax)) diff --git a/files.scm b/files.scm index 3e8b0d63..630808a4 100644 --- a/files.scm +++ b/files.scm @@ -36,7 +36,8 @@ (declare (unit files) - (uses regex data-structures) + (uses irregex data-structures) + (fixnum) (hide chop-pds absolute-pathname-root root-origin root-directory split-directory) (disable-interrupts) (foreign-declare #<<EOF @@ -171,20 +172,19 @@ EOF (define root-origin) (define root-directory) -(let ((string-match string-match)) - (if ##sys#windows-platform - (let ((rx (regexp "([A-Za-z]:)?([\\/\\\\]).*"))) - (set! absolute-pathname-root (lambda (pn) (string-match rx pn))) - (set! root-origin (lambda (rt) (and rt (cadr rt)))) - (set! root-directory (lambda (rt) (and rt (caddr rt)))) ) - (let ((rx (regexp "([\\/\\\\]).*"))) - (set! absolute-pathname-root (lambda (pn) (string-match rx pn))) - (set! root-origin (lambda (rt) #f)) - (set! root-directory (lambda (rt) (and rt (cadr rt)))) ) ) ) +(if ##sys#windows-platform + (let ((rx (irregex "([A-Za-z]:)?([\\/\\\\]).*"))) + (set! absolute-pathname-root (lambda (pn) (irregex-match rx pn))) + (set! root-origin (lambda (rt) (and rt (irregex-match-substring rt 1)))) + (set! root-directory (lambda (rt) (and rt (irregex-match-substring rt 2)))) ) + (let ((rx (irregex "([\\/\\\\]).*"))) + (set! absolute-pathname-root (lambda (pn) (irregex-match rx pn))) + (set! root-origin (lambda (rt) #f)) + (set! root-directory (lambda (rt) (and rt (irregex-match-substring rt 1)))) ) ) (define (absolute-pathname? pn) (##sys#check-string pn 'absolute-pathname?) - (pair? (absolute-pathname-root pn)) ) + (irregex-match-data? (absolute-pathname-root pn)) ) (define-inline (*char-pds? ch) (memq ch '(#\\ #\/))) @@ -261,28 +261,33 @@ EOF file ext def-pds) ) ) ) (define decompose-pathname - (let ((string-match string-match)) - (let* ([patt1 "^(.*[\\/\\\\])?([^\\/\\\\]+)(\\.([^\\/\\\\.]+))$"] - [patt2 "^(.*[\\/\\\\])?((\\.)?[^\\/\\\\]+)$"] - [rx1 (regexp patt1)] - [rx2 (regexp patt2)] - [strip-pds - (lambda (dir) - (and dir - (if (member dir '("/" "\\")) - dir - (chop-pds dir #f) ) ) )] ) - (lambda (pn) - (##sys#check-string pn 'decompose-pathname) - (if (fx= 0 (##sys#size pn)) - (values #f #f #f) - (let ([ms (string-match rx1 pn)]) - (if ms - (values (strip-pds (cadr ms)) (caddr ms) (car (cddddr ms))) - (let ([ms (string-match rx2 pn)]) - (if ms - (values (strip-pds (cadr ms)) (caddr ms) #f) - (values (strip-pds pn) #f #f) ) ) ) ) ) ) ) ) ) + (let* ([patt1 "^(.*[\\/\\\\])?([^\\/\\\\]+)(\\.([^\\/\\\\.]+))$"] + [patt2 "^(.*[\\/\\\\])?((\\.)?[^\\/\\\\]+)$"] + [rx1 (irregex patt1)] + [rx2 (irregex patt2)] + [strip-pds + (lambda (dir) + (and dir + (if (member dir '("/" "\\")) + dir + (chop-pds dir #f) ) ) )] ) + (lambda (pn) + (##sys#check-string pn 'decompose-pathname) + (if (fx= 0 (##sys#size pn)) + (values #f #f #f) + (let ([ms (irregex-search rx1 pn)]) + (if ms + (values + (strip-pds (irregex-match-substring ms 1)) + (irregex-match-substring ms 2) + (irregex-match-substring ms 4)) + (let ([ms (irregex-search rx2 pn)]) + (if ms + (values + (strip-pds (irregex-match-substring ms 1)) + (irregex-match-substring ms 2) + #f) + (values (strip-pds pn) #f #f) ) ) ) ) ) ) ) ) (define pathname-directory) (define pathname-file) @@ -337,7 +342,11 @@ EOF (define create-temporary-file) (define create-temporary-directory) -(let ((call-with-output-file call-with-output-file) +(let ((get-environment-variable get-environment-variable) + (make-pathname make-pathname) + (file-exists? file-exists?) + (directory-exists? directory-exists?) + (call-with-output-file call-with-output-file) (temp #f) (temp-prefix "temp")) (define (tempdir) diff --git a/irregex-core.scm b/irregex-core.scm new file mode 100644 index 00000000..0baeb1a3 --- /dev/null +++ b/irregex-core.scm @@ -0,0 +1,3896 @@ +;;;; irregex.scm -- IrRegular Expressions +;; +;; Copyright (c) 2005-2010 Alex Shinn. All rights reserved. +;; BSD-style license: http://synthcode.com/license.txt + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; At this moment there was a loud ring at the bell, and I could +;; hear Mrs. Hudson, our landlady, raising her voice in a wail of +;; expostulation and dismay. +;; +;; "By heaven, Holmes," I said, half rising, "I believe that +;; they are really after us." +;; +;; "No, it's not quite so bad as that. It is the unofficial +;; force, -- the Baker Street irregulars." + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Notes +;; +;; This code should not require any porting - it should work out of +;; the box in any R[45]RS Scheme implementation. Slight modifications +;; are needed for R6RS (a separate R6RS-compatible version is included +;; in the distribution as irregex-r6rs.scm). +;; +;; The goal of portability makes this code a little clumsy and +;; inefficient. Future versions will include both cleanup and +;; performance tuning, but you can only go so far while staying +;; portable. AND-LET*, SRFI-9 records and custom macros would've been +;; nice. + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; History +;; +;; 0.8.2: 2010/08/03 - (...)? submatch extraction fix and alternate +;; named submatches from Peter Bex +;; Added irregex-match-valid-index? to export list +;; and made it accept named submatches. The procedures +;; irregex-match-{start,end}-{index,chunk} now also +;; accept named submatches, with the index argument +;; made optional. Improved argument type checks. +;; Disallow negative submatch index. +;; 0.8.1: 2010/03/09 - backtracking irregex-match fix and other small fixes +;; 0.8.0: 2010/01/20 - optimizing DFA compilation, adding SRE escapes +;; inside PCREs, adding utility SREs +;; 0.7.5: 2009/08/31 - adding irregex-extract and irregex-split +;; *-fold copies match data (use *-fold/fast for speed) +;; irregex-opt now returns an SRE +;; 0.7.4: 2009/05/14 - empty alternates (or) and empty csets always fail, +;; bugfix in default finalizer for irregex-fold/chunked +;; 0.7.3: 2009/04/14 - adding irregex-fold/chunked, minor doc fixes +;; 0.7.2: 2009/02/11 - some bugfixes, much improved documentation +;; 0.7.1: 2008/10/30 - several bugfixes (thanks to Derick Eddington) +;; 0.7.0: 2008/10/20 - support abstract chunked strings +;; 0.6.2: 2008/07/26 - minor bugfixes, allow global disabling of utf8 mode, +;; friendlier error messages in parsing, \Q..\E support +;; 0.6.1: 2008/07/21 - added utf8 mode, more utils, bugfixes +;; 0.6: 2008/05/01 - most of PCRE supported +;; 0.5: 2008/04/24 - fully portable R4RS, many PCRE features implemented +;; 0.4: 2008/04/17 - rewriting NFA to use efficient closure compilation, +;; normal strings only, but all of the spencer tests pass +;; 0.3: 2008/03/10 - adding DFA converter (normal strings only) +;; 0.2: 2005/09/27 - adding irregex-opt (like elisp's regexp-opt) utility +;; 0.1: 2005/08/18 - simple NFA interpreter over abstract chunked strings + + +(define (%irregex-error arg1 . args) + (apply + error + (if (symbol? loc1) + (cons (string-append (symbol->string arg1) ": " (car args)) + (cdr args)) + args))) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Data Structures + +(cond-expand + (building-chicken + (begin + (define-syntax (internal x r c) + `(,(with-input-from-string (cadr x) read) ,@(cddr x))) + ;; make-irregex defined elsewhere + (define (irregex? x) + (internal "##sys#structure?" x 'regexp)) + (define (irregex-dfa x) + (internal "##sys#check-structure" x 'regexp 'irregex-dfa) + (internal "##sys#slot" x 1)) + (define (irregex-dfa/search x) + (internal "##sys#check-structure" x 'regexp 'irregex-dfa/search) + (internal "##sys#slot" x 2)) + (define (irregex-dfa/extract x) + (internal "##sys#check-structure" x 'regexp 'irregex-dfa/extract) + (internal "##sys#slot" x 3)) + (define (irregex-nfa x) + (internal "##sys#check-structure" x 'regexp 'irregex-nfa) + (internal "##sys#slot" x 4)) + (define (irregex-flags x) + (internal "##sys#check-structure" x 'regexp 'irregex-flags) + (internal "##sys#slot" x 5)) + (define (irregex-num-submatches x) + (internal "##sys#check-structure" x 'regexp 'irregex-num-submatches) + (internal "##sys#slot" x 6)) + (define (irregex-lengths x) + (internal "##sys#check-structure" x 'regexp 'irregex-lengths) + (internal "##sys#slot" x 7)) + (define (irregex-names x) + (internal "##sys#check-structure" x 'regexp 'irregex-names) + (internal "##sys#slot" x 8)) + ;; make-irregex-match defined elsewhere + (define (irregex-new-matches irx) + (make-irregex-match (irregex-num-submatches irx) (irregex-names irx))) + (define (irregex-reset-matches! m) + (let ((v (internal "##sys#slot" m 1))) + (vector-fill! v #f) + m)) + (define (irregex-copy-matches m) + (and (internal "##sys#structure?" m 'regexp-match) + (internal + "##sys#make-structure" + 'regexp-match + (let* ((v (internal "##sys#slot" m 1)) + (v2 (make-vector (internal "##sys#size" v)))) + (vector-copy! v v2) + v2) + (internal "##sys#slot" m 2) + (internal "##sys#slot" m 3) + (internal "##sys#slot" m 4)))) + (define (irregex-match-data? obj) + (internal "##sys#structure?" obj 'regexp-match)) + (define (irregex-match-num-submatches m) + (internal "##sys#check-structure" m 'regexp-match 'irregex-match-num-submatches) + (- (fx/ (internal "##sys#size" (internal "##sys#slot" m 1)) 4) 2)) + (define (irregex-match-chunker m) + (internal "##sys#slot" m 3)) + (define (irregex-match-names m) + (internal "##sys#check-structure" m 'regexp-match 'irregex-match-names) + (internal "##sys#slot" m 2)) + (define (irregex-match-chunker-set! m str) + (internal "##sys#setslot" m 3 str)) + (define-inline (%irregex-match-start-chunk m n) + (internal "##sys#slot" (internal "##sys#slot" m 1) (* n 4))) + (define-inline (%irregex-match-start-index m n) + (internal "##sys#slot" (internal "##sys#slot" m 1) (+ 1 (* n 4)))) + (define-inline (%irregex-match-end-chunk m n) + (internal "##sys#slot" (internal "##sys#slot" m 1) (+ 2 (* n 4)))) + (define (%irregex-match-end-index m n) + (internal "##sys#slot" (internal "##sys#slot" m 1) (+ 3 (* n 4)))) + (define (%irregex-match-fail m) (internal "##sys#slot" m 4)) + (define (%irregex-match-fail-set! m x) (internal "##sys#setslot" m 4 x)) + (define-record-printer (regexp-match m out) + (let ((n (irregex-match-num-submatches m))) + (display "#<regexp-match (" out) + (display n out) + (display " submatches)>" out))) + (define-inline (irregex-match-valid-numeric-index? m n) + (let ((v (internal "##sys#slot" m 1))) + (and (>= n 0) (< (* n 4) (internal "##sys#size" v)) + (internal "##sys#slot" v (+ 1 (* n 4)))))))) + (else + (begin + (define irregex-tag '*irregex-tag*) + (define (make-irregex dfa dfa/search dfa/extract nfa flags + submatches lengths names) + (vector irregex-tag dfa dfa/search dfa/extract nfa flags + submatches lengths names)) + (define (irregex? obj) + (and (vector? obj) + (= 9 (vector-length obj)) + (eq? irregex-tag (vector-ref obj 0)))) + (define (irregex-dfa x) (vector-ref x 1)) + (define (irregex-dfa/search x) (vector-ref x 2)) + (define (irregex-dfa/extract x) (vector-ref x 3)) + (define (irregex-nfa x) (vector-ref x 4)) + (define (irregex-flags x) (vector-ref x 5)) + (define (irregex-num-submatches x) (vector-ref x 6)) + (define (irregex-lengths x) (vector-ref x 7)) + (define (irregex-names x) (vector-ref x 8)) + (define (irregex-new-matches irx) + (make-irregex-match (irregex-num-submatches irx) (irregex-names irx))) + (define (irregex-reset-matches! m) + (do ((i (- (vector-length m) 1) (- i 1))) + ((<= i 3) m) + (vector-set! m i #f))) + (define (irregex-copy-matches m) + (and (vector? m) + (let ((r (make-vector (vector-length m)))) + (do ((i (- (vector-length m) 1) (- i 1))) + ((< i 0) r) + (vector-set! r i (vector-ref m i)))))) + (define irregex-match-tag '*irregex-match-tag*) + (define (irregex-match-data? obj) + (and (vector? obj) + (>= (vector-length obj) 11) + (eq? irregex-match-tag (vector-ref obj 0)))) + (define (make-irregex-match count names) + (let ((res (make-vector (+ (* 4 (+ 2 count)) 4) #f))) + (vector-set! res 0 irregex-match-tag) + (vector-set! res 2 names) + res)) + (define (irregex-match-num-submatches m) + (- (quotient (- (vector-length m) 3) 4) 2)) + (define (irregex-match-chunker m) + (vector-ref m 1)) + (define (irregex-match-names m) + (vector-ref m 2)) + (define (irregex-match-chunker-set! m str) + (vector-set! m 1 str)) + (define (%irregex-match-start-chunk m n) (vector-ref m (+ 3 (* n 4)))) + (define (%irregex-match-start-index m n) (vector-ref m (+ 4 (* n 4)))) + (define (%irregex-match-end-chunk m n) (vector-ref m (+ 5 (* n 4)))) + (define (%irregex-match-end-index m n) (vector-ref m (+ 6 (* n 4)))) + (define (%irregex-match-fail m) (vector-ref m (- (vector-length m) 1))) + (define (%irregex-match-fail-set! m x) (vector-set! m (- (vector-length m) 1) x)) + (define (irregex-match-valid-numeric-index? m n) + (and (>= n 0) (< (+ 3 (* n 4)) (vector-length m)) + (vector-ref m (+ 4 (* n 4)))))))) + +(define (irregex-match-valid-named-index? m n) + (and (assq n (irregex-match-names m)) + #t)) + +;; public interface with error checking +(define (irregex-match-start-chunk m . opt) + (let ((n (irregex-match-numeric-index 'irregex-match-start-chunk m opt #t))) + (%irregex-match-start-chunk m n))) +(define (irregex-match-start-index m . opt) + (let ((n (irregex-match-numeric-index 'irregex-match-start-index m opt #t))) + (%irregex-match-start-index m n))) +(define (irregex-match-end-chunk m . opt) + (let ((n (irregex-match-numeric-index 'irregex-match-end-chunk m opt #t))) + (%irregex-match-end-chunk m n))) +(define (irregex-match-end-index m . opt) + (let ((n (irregex-match-numeric-index 'irregex-match-end-index m opt #t))) + (%irregex-match-end-index m n))) + +(define (irregex-match-start-chunk-set! m n start) + (vector-set! m (+ 3 (* n 4)) start)) +(define (irregex-match-start-index-set! m n start) + (vector-set! m (+ 4 (* n 4)) start)) +(define (irregex-match-end-chunk-set! m n end) + (vector-set! m (+ 5 (* n 4)) end)) +(define (irregex-match-end-index-set! m n end) + (vector-set! m (+ 6 (* n 4)) end)) + +;; Helper procedure to convert any type of index from a rest args list +;; to a numeric index. Named submatches are converted to their corresponding +;; numeric index, and numeric submatches are checked for validity. +;; If strict? is true, an error is raised for invalid numeric indices. +;; #f is returned if strict? is false, but unknown named submatches always +;; cause an error, regardless of strict?ness +(define (irregex-match-numeric-index location m opt strict?) + (cond + ((not (irregex-match-data? m)) + (%irregex-error location "not match data" m)) + ((not (pair? opt)) 0) + ((pair? (cdr opt)) + (apply %irregex-error location "too many arguments" m opt)) + (else + (let ((n (car opt))) + (if (number? n) + (or (and (irregex-match-valid-numeric-index? m n) n) + (and strict? + (%irregex-error location "not a valid index" m n))) + (let lp ((ls (irregex-match-names m)) + (unknown? #t)) + (cond + ((null? ls) + (and unknown? + (%irregex-error location "unknown match name" n))) + ((eq? n (caar ls)) + (if (%irregex-match-start-chunk m (cdar ls)) + (cdar ls) + (lp (cdr ls) #f))) + (else (lp (cdr ls) unknown?))))))))) + +(define (irregex-match-valid-index? m n) + (if (not (irregex-match-data? m)) + (%irregex-error 'irregex-match-valid-index? "not match data" m)) + (if (integer? n) + (irregex-match-valid-numeric-index? m n) + (irregex-match-valid-named-index? m n))) + +(define (irregex-match-substring m . opt) + (let* ((n (irregex-match-numeric-index 'irregex-match-substring m opt #f)) + (cnk (irregex-match-chunker m))) + (and n + ((chunker-get-substring cnk) + (%irregex-match-start-chunk m n) + (%irregex-match-start-index m n) + (%irregex-match-end-chunk m n) + (%irregex-match-end-index m n))))) + +(define (irregex-match-subchunk m . opt) + (let* ((n (irregex-match-numeric-index 'irregex-match-subchunk m opt #f)) + (cnk (irregex-match-chunker m)) + (get-subchunk (chunker-get-subchunk cnk))) + (if (not get-subchunk) + (%irregex-error "this chunk type does not support match subchunks" m n) + (and n (get-subchunk + (%irregex-match-start-chunk m n) + (%irregex-match-start-index m n) + (%irregex-match-end-chunk m n) + (%irregex-match-end-index m n)))))) + +;; chunkers tell us how to navigate through chained chunks of strings + +(define (make-irregex-chunker get-next get-str . o) + (let* ((get-start (or (and (pair? o) (car o)) (lambda (cnk) 0))) + (o (if (pair? o) (cdr o) o)) + (get-end (or (and (pair? o) (car o)) + (lambda (cnk) (string-length (get-str cnk))))) + (o (if (pair? o) (cdr o) o)) + (get-substr + (or (and (pair? o) (car o)) + (lambda (cnk1 start cnk2 end) + (if (eq? cnk1 cnk2) + (substring (get-str cnk1) start end) + (let loop ((cnk (get-next cnk1)) + (res (list (substring (get-str cnk1) + start + (get-end cnk1))))) + (if (eq? cnk cnk2) + (string-cat-reverse + (cons (substring (get-str cnk) + (get-start cnk) + end) + res)) + (loop (get-next cnk) + (cons (substring (get-str cnk) + (get-start cnk) + (get-end cnk)) + res)))))))) + (o (if (pair? o) (cdr o) o)) + (get-subchunk (and (pair? o) (car o)))) + (if (not (and (procedure? get-next) (procedure? get-str) + (procedure? get-start) (procedure? get-substr))) + (%irregex-error 'make-irregex-chunker "expected a procdure")) + (vector get-next get-str get-start get-end get-substr get-subchunk))) + +(define (chunker-get-next cnk) (vector-ref cnk 0)) +(define (chunker-get-str cnk) (vector-ref cnk 1)) +(define (chunker-get-start cnk) (vector-ref cnk 2)) +(define (chunker-get-end cnk) (vector-ref cnk 3)) +(define (chunker-get-substring cnk) (vector-ref cnk 4)) +(define (chunker-get-subchunk cnk) (vector-ref cnk 5)) + +(define (chunker-prev-chunk cnk start end) + (if (eq? start end) + #f + (let ((get-next (chunker-get-next cnk))) + (let lp ((start start)) + (let ((next (get-next start))) + (if (eq? next end) + start + (and next (lp next)))))))) + +(define (chunker-prev-char cnk start end) + (let ((prev (chunker-prev-chunk cnk start end))) + (and prev + (string-ref ((chunker-get-str cnk) prev) + (- ((chunker-get-end cnk) prev) 1))))) + +(define (chunker-next-char cnk src) + (let ((next ((chunker-get-next cnk) src))) + (and next + (string-ref ((chunker-get-str cnk) next) + ((chunker-get-start cnk) next))))) + +(define (chunk-before? cnk a b) + (and (not (eq? a b)) + (let ((next ((chunker-get-next cnk) a))) + (and next + (if (eq? next b) + #t + (chunk-before? cnk next b)))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; String Utilities + +;; Unicode version (skip surrogates) +(define *all-chars* + `(/ ,(integer->char 0) ,(integer->char #xD7FF) + ,(integer->char #xE000) ,(integer->char #x10FFFF))) + +;; ASCII version, offset to not assume 0-255 +;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223)))) + +;; set to #f to ignore even an explicit request for utf8 handling +(define *allow-utf8-mode?* #t) + +;; (define *named-char-properties* '()) + +(define (string-scan-char str c . o) + (let ((end (string-length str))) + (let scan ((i (if (pair? o) (car o) 0))) + (cond ((= i end) #f) + ((eqv? c (string-ref str i)) i) + (else (scan (+ i 1))))))) + +(define (string-scan-char-escape str c . o) + (let ((end (string-length str))) + (let scan ((i (if (pair? o) (car o) 0))) + (cond ((= i end) #f) + ((eqv? c (string-ref str i)) i) + ((eqv? c #\\) (scan (+ i 2))) + (else (scan (+ i 1))))))) + +(define (string-scan-pred str pred . o) + (let ((end (string-length str))) + (let scan ((i (if (pair? o) (car o) 0))) + (cond ((= i end) #f) + ((pred (string-ref str i)) i) + (else (scan (+ i 1))))))) + +(define (string-split-char str c) + (let ((end (string-length str))) + (let lp ((i 0) (from 0) (res '())) + (define (collect) (cons (substring str from i) res)) + (cond ((>= i end) (reverse (collect))) + ((eqv? c (string-ref str i)) (lp (+ i 1) (+ i 1) (collect))) + (else (lp (+ i 1) from res)))))) + +(define (char-alphanumeric? c) + (or (char-alphabetic? c) (char-numeric? c))) + +(define (%substring=? a b start1 start2 len) + (let lp ((i 0)) + (cond ((>= i len) + #t) + ((char=? (string-ref a (+ start1 i)) (string-ref b (+ start2 i))) + (lp (+ i 1))) + (else + #f)))) + +;; SRFI-13 extracts + +(define (%%string-copy! to tstart from fstart fend) + (do ((i fstart (+ i 1)) + (j tstart (+ j 1))) + ((>= i fend)) + (string-set! to j (string-ref from i)))) + +(define (string-cat-reverse string-list) + (string-cat-reverse/aux + (fold (lambda (s a) (+ (string-length s) a)) 0 string-list) + string-list)) + +(define (string-cat-reverse/aux len string-list) + (let ((res (make-string len))) + (let lp ((i len) (ls string-list)) + (if (pair? ls) + (let* ((s (car ls)) + (slen (string-length s)) + (i (- i slen))) + (%%string-copy! res i s 0 slen) + (lp i (cdr ls))))) + res)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; List Utilities + +;; like the one-arg IOTA case +(define (zero-to n) + (if (<= n 0) + '() + (let lp ((i (- n 1)) (res '())) + (if (zero? i) (cons 0 res) (lp (- i 1) (cons i res)))))) + +;; take the head of list FROM up to but not including TO, which must +;; be a tail of the list +(define (take-up-to from to) + (let lp ((ls from) (res '())) + (if (and (pair? ls) (not (eq? ls to))) + (lp (cdr ls) (cons (car ls) res)) + (reverse res)))) + +;; SRFI-1 extracts (simplified 1-ary versions) + +(define (find pred ls) + (let lp ((ls ls)) + (cond ((null? ls) #f) + ((pred (car ls)) (car ls)) + (else (lp (cdr ls)))))) + +(define (find-tail pred ls) + (let lp ((ls ls)) + (cond ((null? ls) #f) + ((pred (car ls)) ls) + (else (lp (cdr ls)))))) + +(define (last ls) + (if (not (pair? ls)) + (%irregex-error "can't take last of empty list") + (let lp ((ls ls)) + (if (pair? (cdr ls)) + (lp (cdr ls)) + (car ls))))) + +(define (any pred ls) + (and (pair? ls) + (let lp ((head (car ls)) (tail (cdr ls))) + (if (null? tail) + (pred head) + (or (pred head) (lp (car tail) (cdr tail))))))) + +(define (every pred ls) + (or (null? ls) + (let lp ((head (car ls)) (tail (cdr ls))) + (if (null? tail) + (pred head) + (and (pred head) (lp (car tail) (cdr tail))))))) + +(define (fold kons knil ls) + (let lp ((ls ls) (res knil)) + (if (null? ls) + res + (lp (cdr ls) (kons (car ls) res))))) + +(define (filter pred ls) + (let lp ((ls ls) (res '())) + (if (null? ls) + (reverse res) + (lp (cdr ls) (if (pred (car ls)) (cons (car ls) res) res))))) + +(define (remove pred ls) + (let lp ((ls ls) (res '())) + (if (null? ls) + (reverse res) + (lp (cdr ls) (if (pred (car ls)) res (cons (car ls) res)))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Flags + +(define (bit-shr n i) + (quotient n (expt 2 i))) + +(define (bit-shl n i) + (* n (expt 2 i))) + +(define (bit-not n) (- #xFFFF n)) + +(define (bit-ior a b) + (cond + ((zero? a) b) + ((zero? b) a) + (else + (+ (if (or (odd? a) (odd? b)) 1 0) + (* 2 (bit-ior (quotient a 2) (quotient b 2))))))) + +(define (bit-and a b) + (cond + ((zero? a) 0) + ((zero? b) 0) + (else + (+ (if (and (odd? a) (odd? b)) 1 0) + (* 2 (bit-and (quotient a 2) (quotient b 2))))))) + +(define (integer-log n) + (define (b8 n r) + (if (>= n (bit-shl 1 8)) (b4 (bit-shr n 8) (+ r 8)) (b4 n r))) + (define (b4 n r) + (if (>= n (bit-shl 1 4)) (b2 (bit-shr n 4) (+ r 4)) (b2 n r))) + (define (b2 n r) + (if (>= n (bit-shl 1 2)) (b1 (bit-shr n 2) (+ r 2)) (b1 n r))) + (define (b1 n r) (if (>= n (bit-shl 1 1)) (+ r 1) r)) + (if (>= n (bit-shl 1 16)) (b8 (bit-shr n 16) 16) (b8 n 0))) + +(define (flag-set? flags i) + (= i (bit-and flags i))) +(define (flag-join a b) + (if b (bit-ior a b) a)) +(define (flag-clear a b) + (bit-and a (bit-not b))) + +(define ~none 0) +(define ~searcher? 1) +(define ~consumer? 2) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Parsing Embedded SREs in PCRE Strings + +;; (define (with-read-from-string str i proc) +;; (define (port-size in) +;; (let lp ((i 0)) (if (eof-object? (read-char in)) i (lp (+ i 1))))) +;; (let* ((len (string-length str)) +;; (tail-len (- len i)) +;; (in (open-input-string (substring str i len))) +;; (sre (read in)) +;; (unused-len (port-size in))) +;; (close-input-port in) +;; (proc sre (- tail-len unused-len)))) + +(define close-token (list 'close)) +(define dot-token (string->symbol ".")) + +(define (with-read-from-string str i proc) + (define end (string-length str)) + (define (read i k) + (cond + ((>= i end) (%irregex-error "unterminated embedded SRE" str)) + (else + (case (string-ref str i) + ((#\() + (let lp ((i (+ i 1)) (ls '())) + (read + i + (lambda (x j) + (cond + ((eq? x close-token) + (k (reverse ls) j)) + ((eq? x dot-token) + (if (null? ls) + (%irregex-error "bad dotted form" str) + (read j (lambda (y j2) + (read j2 (lambda (z j3) + (if (not (eq? z close-token)) + (%irregex-error "bad dotted form" str) + (k (append (reverse (cdr ls)) + (cons (car ls) y)) + j3)))))))) + (else + (lp j (cons x ls)))))))) + ((#\)) + (k close-token (+ i 1))) + ((#\;) + (let skip ((i (+ i 1))) + (if (or (>= i end) (eqv? #\newline (string-ref str i))) + (read (+ i 1) k) + (skip (+ i 1))))) + ((#\' #\`) + (read (+ i 1) + (lambda (sexp j) + (let ((q (if (eqv? #\' (string-ref str i)) 'quote 'quasiquote))) + (k (list q sexp) j))))) + ((#\,) + (let* ((at? (and (< (+ i 1) end) (eqv? #\@ (string-ref str (+ i 1))))) + (u (if at? 'uquote-splicing 'unquote)) + (j (if at? (+ i 2) (+ i 1)))) + (read j (lambda (sexp j) (k (list u sexp) j))))) + ((#\") + (let scan ((from (+ i 1)) (i (+ i 1)) (res '())) + (define (collect) + (if (= from i) res (cons (substring str from i) res))) + (if (>= i end) + (%irregex-error "unterminated string in embedded SRE" str) + (case (string-ref str i) + ((#\") (k (string-cat-reverse (collect)) (+ i 1))) + ((#\\) (scan (+ i 1) (+ i 2) (collect))) + (else (scan from (+ i 1) res)))))) + ((#\#) + (case (string-ref str (+ i 1)) + ((#\;) + (read (+ i 2) (lambda (sexp j) (read j k)))) + ((#\\) + (read (+ i 2) + (lambda (sexp j) + (k (case sexp + ((space) #\space) + ((newline) #\newline) + (else (let ((s (if (number? sexp) + (number->string sexp) + (symbol->string sexp)))) + (string-ref s 0)))) + j)))) + ((#\t #\f) + (k (eqv? #\t (string-ref str (+ i 1))) (+ i 2))) + (else + (%irregex-error "bad # syntax in simplified SRE" i)))) + (else + (cond + ((char-whitespace? (string-ref str i)) + (read (+ i 1) k)) + (else ;; symbol/number + (let scan ((j (+ i 1))) + (cond + ((or (>= j end) + (let ((c (string-ref str j))) + (or (char-whitespace? c) + (memv c '(#\; #\( #\) #\" #\# #\\))))) + (let ((str2 (substring str i j))) + (k (or (string->number str2) (string->symbol str2)) j))) + (else (scan (+ j 1)))))))))))) + (read i (lambda (res j) + (if (eq? res 'close-token) + (%irregex-error "unexpected ')' in SRE" str j) + (proc res j))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Parsing PCRE Strings + +(define ~save? 1) +(define ~case-insensitive? 2) +(define ~multi-line? 4) +(define ~single-line? 8) +(define ~ignore-space? 16) +(define ~utf8? 32) + +(define (symbol-list->flags ls) + (let lp ((ls ls) (res ~none)) + (if (not (pair? ls)) + res + (lp (cdr ls) + (flag-join + res + (case (car ls) + ((i ci case-insensitive) ~case-insensitive?) + ((m multi-line) ~multi-line?) + ((s single-line) ~single-line?) + ((x ignore-space) ~ignore-space?) + ((u utf8) (if *allow-utf8-mode?* ~utf8? ~none)) + (else #f))))))) + +(define (maybe-string->sre obj) + (if (string? obj) (string->sre obj) obj)) + +(define (string->sre str . o) + (if (not (string? str)) (%irregex-error 'string->sre "expected a string" str)) + (let ((end (string-length str)) + (flags (symbol-list->flags o))) + + (let lp ((i 0) (from 0) (flags flags) (res '()) (st '())) + + ;; handle case sensitivity at the literal char/string level + (define (cased-char ch) + (if (and (flag-set? flags ~case-insensitive?) + (char-alphabetic? ch)) + `(or ,ch ,(char-altcase ch)) + ch)) + (define (cased-string str) + (if (flag-set? flags ~case-insensitive?) + (sre-sequence (map cased-char (string->list str))) + str)) + ;; accumulate the substring from..i as literal text + (define (collect) + (if (= i from) res (cons (cased-string (substring str from i)) res))) + ;; like collect but breaks off the last single character when + ;; collecting literal data, as the argument to ?/*/+ etc. + (define (collect/single) + (let* ((utf8? (flag-set? flags ~utf8?)) + (j (if (and utf8? (> i 1)) + (utf8-backup-to-initial-char str (- i 1)) + (- i 1)))) + (cond + ((< j from) + res) + (else + (let ((c (cased-char (if utf8? + (utf8-string-ref str j (- i j)) + (string-ref str j))))) + (cond + ((= j from) + (cons c res)) + (else + (cons c + (cons (cased-string (substring str from j)) + res))))))))) + ;; collects for use as a result, reversing and grouping OR + ;; terms, and some ugly tweaking of `function-like' groups and + ;; conditionals + (define (collect/terms) + (let* ((ls (collect)) + (func + (and (pair? ls) + (memq (last ls) + '(atomic if look-ahead neg-look-ahead + look-behind neg-look-behind + => submatch-named + w/utf8 w/noutf8)))) + (prefix (if (and func (memq (car func) '(=> submatch-named))) + (list 'submatch-named (cadr (reverse ls))) + (and func (list (car func))))) + (ls (if func + (if (memq (car func) '(=> submatch-named)) + (reverse (cddr (reverse ls))) + (reverse (cdr (reverse ls)))) + ls))) + (let lp ((ls ls) (term '()) (res '())) + (define (shift) + (cons (sre-sequence term) res)) + (cond + ((null? ls) + (let* ((res (sre-alternate (shift))) + (res (if (flag-set? flags ~save?) + (list 'submatch res) + res))) + (if prefix + (if (eq? 'if (car prefix)) + (cond + ((not (pair? res)) + 'epsilon) + ((memq (car res) + '(look-ahead neg-look-ahead + look-behind neg-look-behind)) + res) + ((eq? 'seq (car res)) + `(if ,(cadr res) + ,(if (pair? (cdr res)) + (sre-sequence (cddr res)) + 'epsilon))) + (else + `(if ,(cadadr res) + ,(if (pair? (cdr res)) + (sre-sequence (cddadr res)) + 'epsilon) + ,(sre-alternate + (if (pair? (cdr res)) (cddr res) '()))))) + `(,@prefix ,res)) + res))) + ((eq? 'or (car ls)) (lp (cdr ls) '() (shift))) + (else (lp (cdr ls) (cons (car ls) term) res)))))) + (define (save) + (cons (cons flags (collect)) st)) + + ;; main parsing + (if (>= i end) + (if (pair? st) + (%irregex-error "unterminated parenthesis in regexp" str) + (collect/terms)) + (let ((c (string-ref str i))) + (case c + ((#\.) + (lp (+ i 1) (+ i 1) flags + (cons (if (flag-set? flags ~single-line?) 'any 'nonl) + (collect)) + st)) + ((#\?) + (let ((res (collect/single))) + (if (null? res) + (%irregex-error "? can't follow empty pattern" str res) + (let ((x (car res))) + (lp (+ i 1) + (+ i 1) + flags + (cons + (if (pair? x) + (case (car x) + ((*) `(*? ,@(cdr x))) + ((+) `(**? 1 #f ,@(cdr x))) + ((?) `(?? ,@(cdr x))) + ((**) `(**? ,@(cdr x))) + ((=) `(**? ,(cadr x) ,@(cdr x))) + ((>=) `(**? ,(cadr x) #f ,@(cddr x))) + (else `(? ,x))) + `(? ,x)) + (cdr res)) + st))))) + ((#\+ #\*) + (let* ((res (collect/single)) + (x (if (pair? res) (car res) 'epsilon)) + (op (string->symbol (string c)))) + (cond + ((sre-repeater? x) + (%irregex-error "duplicate repetition (e.g. **) in pattern" str res)) + ((sre-empty? x) + (%irregex-error "can't repeat empty pattern (e.g. ()*)" str res)) + (else + (lp (+ i 1) (+ i 1) flags + (cons (list op x) (cdr res)) + st))))) + ((#\() + (cond + ((>= (+ i 1) end) + (%irregex-error "unterminated parenthesis in regexp" str)) + ((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case + (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save))) + ((>= (+ i 2) end) + (%irregex-error "unterminated parenthesis in regexp" str)) + ((eqv? (string-ref str (+ i 1)) #\*) + (if (eqv? #\' (string-ref str (+ i 2))) + (with-read-from-string str (+ i 3) + (lambda (sre j) + (if (or (>= j end) (not (eqv? #\) (string-ref str j)))) + (%irregex-error "unterminated (*'...) SRE escape" str) + (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)))) + (%irregex-error "bad regexp syntax: (*FOO) not supported" str))) + (else ;; (?...) case + (case (string-ref str (+ i 2)) + ((#\#) + (let ((j (string-scan-char str #\) (+ i 3)))) + (lp (+ j i) (+ j 1) flags (collect) st))) + ((#\:) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save))) + ((#\=) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) + '(look-ahead) (save))) + ((#\!) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) + '(neg-look-ahead) (save))) + ((#\<) + (cond + ((>= (+ i 3) end) + (%irregex-error "unterminated parenthesis in regexp" str)) + (else + (case (string-ref str (+ i 3)) + ((#\=) + (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) + '(look-behind) (save))) + ((#\!) + (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) + '(neg-look-behind) (save))) + (else + (let ((j (and (char-alphabetic? + (string-ref str (+ i 3))) + (string-scan-char str #\> (+ i 4))))) + (if j + (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) + `(,(string->symbol (substring str (+ i 3) j)) + submatch-named) + (save)) + (%irregex-error "invalid (?< sequence" str)))))))) + ((#\>) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) + '(atomic) (save))) + ;;((#\' #\P) ; named subpatterns + ;; ) + ;;((#\R) ; recursion + ;; ) + ((#\() + (cond + ((>= (+ i 3) end) + (%irregex-error "unterminated parenthesis in regexp" str)) + ((char-numeric? (string-ref str (+ i 3))) + (let* ((j (string-scan-char str #\) (+ i 3))) + (n (string->number (substring str (+ i 3) j)))) + (if (not n) + (%irregex-error "invalid conditional reference" str) + (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) + `(,n if) (save))))) + ((char-alphabetic? (string-ref str (+ i 3))) + (let* ((j (string-scan-char str #\) (+ i 3))) + (s (string->symbol (substring str (+ i 3) j)))) + (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) + `(,s if) (save)))) + (else + (lp (+ i 2) (+ i 2) (flag-clear flags ~save?) + '(if) (save))))) + ((#\{) + (%irregex-error "unsupported Perl-style cluster" str)) + (else + (let ((old-flags flags)) + (let lp2 ((j (+ i 2)) (flags flags) (invert? #f)) + (define (join x) + ((if invert? flag-clear flag-join) flags x)) + (define (new-res res) + (let ((before (flag-set? old-flags ~utf8?)) + (after (flag-set? flags ~utf8?))) + (if (eq? before after) + res + (cons (if after 'w/utf8 'w/noutf8) res)))) + (cond + ((>= j end) + (%irregex-error "incomplete cluster" str i)) + (else + (case (string-ref str j) + ((#\i) + (lp2 (+ j 1) (join ~case-insensitive?) invert?)) + ((#\m) + (lp2 (+ j 1) (join ~multi-line?) invert?)) + ((#\x) + (lp2 (+ j 1) (join ~ignore-space?) invert?)) + ((#\u) + (if *allow-utf8-mode?* + (lp2 (+ j 1) (join ~utf8?) invert?) + (lp2 (+ j 1) flags invert?))) + ((#\-) + (lp2 (+ j 1) flags (not invert?))) + ((#\)) + (lp (+ j 1) (+ j 1) flags (new-res (collect)) + st)) + ((#\:) + (lp (+ j 1) (+ j 1) flags (new-res '()) + (cons (cons old-flags (collect)) st))) + (else + (%irregex-error "unknown regex cluster modifier" str) + ))))))))))) + ((#\)) + (if (null? st) + (%irregex-error "too many )'s in regexp" str) + (lp (+ i 1) + (+ i 1) + (caar st) + (cons (collect/terms) (cdar st)) + (cdr st)))) + ((#\[) + (apply + (lambda (sre j) + (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)) + (string-parse-cset str (+ i 1) flags))) + ((#\{) + (cond + ((or (>= (+ i 1) end) + (not (or (char-numeric? (string-ref str (+ i 1))) + (eqv? #\, (string-ref str (+ i 1)))))) + (lp (+ i 1) from flags res st)) + (else + (let ((res (collect/single))) + (cond + ((null? res) + (%irregex-error "{ can't follow empty pattern")) + (else + (let* ((x (car res)) + (tail (cdr res)) + (j (string-scan-char str #\} (+ i 1))) + (s2 (string-split-char (substring str (+ i 1) j) + #\,)) + (n (string->number (car s2))) + (m (and (pair? (cdr s2)) + (string->number (cadr s2))))) + (cond + ((or (not n) + (and (pair? (cdr s2)) + (not (equal? "" (cadr s2))) + (not m))) + (%irregex-error "invalid {n} repetition syntax" s2)) + ((null? (cdr s2)) + (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st)) + (m + (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st)) + (else + (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st) + ))))))))) + ((#\\) + (cond + ((>= (+ i 1) end) + (%irregex-error "incomplete escape sequence" str)) + (else + (let ((c (string-ref str (+ i 1)))) + (case c + ((#\d) + (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st)) + ((#\D) + (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st)) + ((#\s) + (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st)) + ((#\S) + (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st)) + ((#\w) + (lp (+ i 2) (+ i 2) flags + `((or alphanumeric ("_")) ,@(collect)) st)) + ((#\W) + (lp (+ i 2) (+ i 2) flags + `((~ (or alphanumeric ("_"))) ,@(collect)) st)) + ((#\b) + (lp (+ i 2) (+ i 2) flags + `((or bow eow) ,@(collect)) st)) + ((#\B) + (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st)) + ((#\A) + (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st)) + ((#\Z) + (lp (+ i 2) (+ i 2) flags + `((? #\newline) eos ,@(collect)) st)) + ((#\z) + (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st)) + ((#\R) + (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st)) + ((#\K) + (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st)) + ;; these two are from Emacs and TRE, but not in PCRE + ((#\<) + (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st)) + ((#\>) + (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st)) + ((#\x) + (apply + (lambda (ch j) + (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st)) + (string-parse-hex-escape str (+ i 2) end))) + ((#\k) + (let ((c (string-ref str (+ i 2)))) + (if (not (memv c '(#\< #\{ #\'))) + (%irregex-error "bad \\k usage, expected \\k<...>" str) + (let* ((terminal (char-mirror c)) + (j (string-scan-char str terminal (+ i 2))) + (s (and j (substring str (+ i 3) j))) + (backref + (if (flag-set? flags ~case-insensitive?) + 'backref-ci + 'backref))) + (if (not j) + (%irregex-error "unterminated named backref" str) + (lp (+ j 1) (+ j 1) flags + `((,backref ,(string->symbol s)) + ,@(collect)) + st)))))) + ((#\Q) ;; \Q..\E escapes + (let ((res (collect))) + (let lp2 ((j (+ i 2))) + (cond + ((>= j end) + (lp j (+ i 2) flags res st)) + ((eqv? #\\ (string-ref str j)) + (cond + ((>= (+ j 1) end) + (lp (+ j 1) (+ i 2) flags res st)) + ((eqv? #\E (string-ref str (+ j 1))) + (lp (+ j 2) (+ j 2) flags + (cons (substring str (+ i 2) j) res) st)) + (else + (lp2 (+ j 2))))) + (else + (lp2 (+ j 1))))))) + ((#\') + (with-read-from-string str (+ i 2) + (lambda (sre j) + (lp j j flags (cons sre (collect)) st)))) + ;;((#\p) ; XXXX unicode properties + ;; ) + ;;((#\P) + ;; ) + (else + (cond + ((char-numeric? c) + (let* ((j (or (string-scan-pred + str + (lambda (c) (not (char-numeric? c))) + (+ i 2)) + end)) + (backref + (if (flag-set? flags ~case-insensitive?) + 'backref-ci + 'backref)) + (res `((,backref ,(string->number + (substring str (+ i 1) j))) + ,@(collect)))) + (lp j j flags res st))) + ((char-alphabetic? c) + (let ((cell (assv c posix-escape-sequences))) + (if cell + (lp (+ i 2) (+ i 2) flags + (cons (cdr cell) (collect)) st) + (%irregex-error "unknown escape sequence" str c)))) + (else + (lp (+ i 2) (+ i 1) flags (collect) st))))))))) + ((#\|) + (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st)) + ((#\^) + (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos))) + (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) + ((#\$) + (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos))) + (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) + ((#\space) + (if (flag-set? flags ~ignore-space?) + (lp (+ i 1) (+ i 1) flags (collect) st) + (lp (+ i 1) from flags res st))) + ((#\#) + (if (flag-set? flags ~ignore-space?) + (let ((j (or (string-scan-char str #\newline (+ i 1)) + (- end 1)))) + (lp (+ j 1) (+ j 1) flags (collect) st)) + (lp (+ i 1) from flags res st))) + (else + (lp (+ i 1) from flags res st)))))))) + +(define posix-escape-sequences + `((#\n . #\newline) + (#\r . ,(integer->char (+ (char->integer #\newline) 3))) + (#\t . ,(integer->char (- (char->integer #\newline) 1))) + (#\a . ,(integer->char (- (char->integer #\newline) 3))) + (#\e . ,(integer->char (+ (char->integer #\newline) #x11))) + (#\f . ,(integer->char (+ (char->integer #\newline) 2))) + )) + +(define (char-altcase c) + (if (char-upper-case? c) (char-downcase c) (char-upcase c))) + +(define (char-mirror c) + (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c))) + +(define (string-parse-hex-escape str i end) + (cond + ((>= i end) + (%irregex-error "incomplete hex escape" str i)) + ((eqv? #\{ (string-ref str i)) + (let ((j (string-scan-char-escape str #\} (+ i 1)))) + (if (not j) + (%irregex-error "incomplete hex brace escape" str i) + (let* ((s (substring str (+ i 1) j)) + (n (string->number s 16))) + (if n + (list (integer->char n) j) + (%irregex-error "bad hex brace escape" s)))))) + ((>= (+ i 1) end) + (%irregex-error "incomplete hex escape" str i)) + (else + (let* ((s (substring str i (+ i 2))) + (n (string->number s 16))) + (if n + (list (integer->char n) (+ i 2)) + (%irregex-error "bad hex escape" s)))))) + +(define (string-parse-cset str start flags) + (let* ((end (string-length str)) + (invert? (and (< start end) (eqv? #\^ (string-ref str start)))) + (utf8? (flag-set? flags ~utf8?))) + (define (go i chars ranges) + (if (>= i end) + (%irregex-error "incomplete char set" str i end) + (let ((c (string-ref str i))) + (case c + ((#\]) + (if (and (null? chars) (null? ranges)) + (go (+ i 1) (cons #\] chars) ranges) + (let ((ci? (flag-set? flags ~case-insensitive?)) + (hi-chars (if utf8? (filter high-char? chars) '())) + (chars (if utf8? (remove high-char? chars) chars))) + (list + ((lambda (res) + (if invert? (cons '~ res) (sre-alternate res))) + (append + hi-chars + (if (pair? chars) + (list + (list (list->string + ((if ci? + cset-case-insensitive + (lambda (x) x)) + (reverse chars))))) + '()) + (if (pair? ranges) + (let ((res (if ci? + (cset-case-insensitive + (reverse ranges)) + (reverse ranges)))) + (list (cons '/ (alist->plist res)))) + '()))) + i)))) + ((#\-) + (cond + ((or (= i start) + (and (= i (+ start 1)) (eqv? #\^ (string-ref str start))) + (eqv? #\] (string-ref str (+ i 1)))) + (go (+ i 1) (cons c chars) ranges)) + ((null? chars) + (%irregex-error "bad char-set")) + (else + (let* ((c1 (car chars)) + (c2 (string-ref str (+ i 1)))) + (apply + (lambda (c2 j) + (if (char<? c2 c1) + (%irregex-error "inverted range in char-set" c1 c2) + (go j (cdr chars) (cons (cons c1 c2) ranges)))) + (cond + ((and (eqv? #\\ c2) (assv c2 posix-escape-sequences)) + => (lambda (x) (list (cdr x) (+ i 3)))) + ((and (eqv? #\\ c2) + (eqv? (string-ref str (+ i 2)) #\x)) + (string-parse-hex-escape str (+ i 3) end)) + ((and utf8? (<= #x80 (char->integer c2) #xFF)) + (let ((len (utf8-start-char->length c2))) + (list (utf8-string-ref str (+ i 1) len) (+ i 1 len)))) + (else + (list c2 (+ i 2))))))))) + ((#\[) + (let* ((inv? (eqv? #\^ (string-ref str (+ i 1)))) + (i2 (if inv? (+ i 2) (+ i 1)))) + (case (string-ref str i2) + ((#\:) + (let ((j (string-scan-char str #\: (+ i2 1)))) + (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1))))) + (%irregex-error "incomplete character class" str) + (let* ((cset (sre->cset + (string->symbol + (substring str (+ i2 1) j)))) + (cset (if inv? (cset-complement cset) cset))) + (go (+ j 2) + (append (filter char? cset) chars) + (append (filter pair? cset) ranges)))))) + ((#\= #\.) + (%irregex-error "collating sequences not supported" str)) + (else + (go (+ i 1) (cons #\[ chars) ranges))))) + ((#\\) + (let ((c (string-ref str (+ i 1)))) + (case c + ((#\d #\D #\s #\S #\w #\W) + (let ((cset (sre->cset (string->sre (string #\\ c))))) + (go (+ i 2) + (append (filter char? cset) chars) + (append (filter pair? cset) ranges)))) + ((#\x) + (apply + (lambda (ch j) + (go j (cons ch chars) ranges)) + (string-parse-hex-escape str (+ i 2) end))) + (else + (let ((c (cond ((assv c posix-escape-sequences) => cdr) + (else c)))) + (go (+ i 2) + (cons (string-ref str (+ i 1)) (cons c chars)) + ranges)))))) + (else + (if (and utf8? (<= #x80 (char->integer c) #xFF)) + (let ((len (utf8-start-char->length c))) + (go (+ i len) + (cons (utf8-string-ref str i len) chars) + ranges)) + (go (+ i 1) (cons c chars) ranges))))))) + (if invert? + (go (+ start 1) + (if (flag-set? flags ~multi-line?) '(#\newline) '()) + '()) + (go start '() '())))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; UTF-8 Utilities + +;; Here are some hairy optimizations that need to be documented +;; better. Thanks to these, we never do any utf8 processing once the +;; regexp is compiled. + +;; two chars: ab..ef +;; a[b..xFF]|[b-d][x80..xFF]|e[x80..xFF] + +;; three chars: abc..ghi +;; ab[c..xFF]|a[d..xFF][x80..xFF]| +;; [b..f][x80..xFF][x80..xFF]| +;; g[x80..g][x80..xFF]|gh[x80..i] + +;; four chars: abcd..ghij +;; abc[d..xFF]|ab[d..xFF][x80..xFF]|a[c..xFF][x80..xFF][x80..xFF]| +;; [b..f][x80..xFF][x80..xFF][x80..xFF]| +;; g[x80..g][x80..xFF][x80..xFF]|gh[x80..h][x80..xFF]|ghi[x80..j] + +(define (high-char? c) (<= #x80 (char->integer c))) + +;; number of total bytes in a utf8 char given the 1st byte + +(define utf8-start-char->length + (let ((table '#( +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx +2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx +2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx +3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex +4 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx +))) + (lambda (c) (vector-ref table (char->integer c))))) + +(define (utf8-string-ref str i len) + (define (byte n) (char->integer (string-ref str n))) + (case len + ((1) ; shouldn't happen in this module + (string-ref str i)) + ((2) + (integer->char + (+ (bit-shl (bit-and (byte i) #b00011111) 6) + (bit-and (byte (+ i 1)) #b00111111)))) + ((3) + (integer->char + (+ (bit-shl (bit-and (byte i) #b00001111) 12) + (bit-shl (bit-and (byte (+ i 1)) #b00111111) 6) + (bit-and (byte (+ i 2)) #b00111111)))) + ((4) + (integer->char + (+ (bit-shl (bit-and (byte i) #b00000111) 18) + (bit-shl (bit-and (byte (+ i 1)) #b00111111) 12) + (bit-shl (bit-and (byte (+ i 2)) #b00111111) 6) + (bit-and (byte (+ i 3)) #b00111111)))) + (else + (%irregex-error "invalid utf8 length" str len i)))) + +(define (utf8-backup-to-initial-char str i) + (let lp ((i i)) + (if (= i 0) + 0 + (let ((c (char->integer (string-ref str i)))) + (if (or (< c #x80) (>= c #xC0)) + i + (lp (- i 1))))))) + +(define (utf8-lowest-digit-of-length len) + (case len + ((1) 0) ((2) #xC0) ((3) #xE0) ((4) #xF0) + (else (%irregex-error "invalid utf8 length" len)))) + +(define (utf8-highest-digit-of-length len) + (case len + ((1) #x7F) ((2) #xDF) ((3) #xEF) ((4) #xF7) + (else (%irregex-error "invalid utf8 length" len)))) + +(define (char->utf8-list c) + (let ((i (char->integer c))) + (cond + ((<= i #x7F) (list i)) + ((<= i #x7FF) + (list (bit-ior #b11000000 (bit-shr i 6)) + (bit-ior #b10000000 (bit-and i #b111111)))) + ((<= i #xFFFF) + (list (bit-ior #b11100000 (bit-shr i 12)) + (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111)) + (bit-ior #b10000000 (bit-and i #b111111)))) + ((<= i #x1FFFFF) + (list (bit-ior #b11110000 (bit-shr i 18)) + (bit-ior #b10000000 (bit-and (bit-shr i 12) #b111111)) + (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111)) + (bit-ior #b10000000 (bit-and i #b111111)))) + (else (%irregex-error "unicode codepoint out of range:" i))))) + +(define (unicode-range->utf8-pattern lo hi) + (let ((lo-ls (char->utf8-list lo)) + (hi-ls (char->utf8-list hi))) + (if (not (= (length lo-ls) (length hi-ls))) + (sre-alternate (list (unicode-range-climb-digits lo-ls hi-ls) + (unicode-range-up-to hi-ls))) + (let lp ((lo-ls lo-ls) (hi-ls hi-ls)) + (cond + ((null? lo-ls) + '()) + ((= (car lo-ls) (car hi-ls)) + (sre-sequence + (list (integer->char (car lo-ls)) + (lp (cdr lo-ls) (cdr hi-ls))))) + ((= (+ (car lo-ls) 1) (car hi-ls)) + (sre-alternate (list (unicode-range-up-from lo-ls) + (unicode-range-up-to hi-ls)))) + (else + (sre-alternate (list (unicode-range-up-from lo-ls) + (unicode-range-middle lo-ls hi-ls) + (unicode-range-up-to hi-ls))))))))) + +(define (unicode-range-helper one ls prefix res) + (if (null? ls) + res + (unicode-range-helper + one + (cdr ls) + (cons (car ls) prefix) + (cons (sre-sequence + `(,@(map integer->char prefix) + ,(one (car ls)) + ,@(map (lambda (_) + `(/ ,(integer->char #x80) + ,(integer->char #xFF))) + (cdr ls)))) + res)))) + +(define (unicode-range-up-from lo-ls) + (sre-sequence + (list (integer->char (car lo-ls)) + (sre-alternate + (unicode-range-helper + (lambda (c) + `(/ ,(integer->char (+ (car lo-ls) 1)) ,(integer->char #xFF))) + (cdr (reverse (cdr lo-ls))) + '() + (list + (sre-sequence + (append + (map integer->char (reverse (cdr (reverse (cdr lo-ls))))) + `((/ ,(integer->char (last lo-ls)) + ,(integer->char #xFF))))))))))) + +(define (unicode-range-up-to hi-ls) + (sre-sequence + (list (integer->char (car hi-ls)) + (sre-alternate + (unicode-range-helper + (lambda (c) + `(/ ,(integer->char #x80) ,(integer->char (- (car hi-ls) 1)))) + (cdr (reverse (cdr hi-ls))) + '() + (list + (sre-sequence + (append + (map integer->char (reverse (cdr (reverse (cdr hi-ls))))) + `((/ ,(integer->char #x80) + ,(integer->char (last hi-ls)))))))))))) + +(define (unicode-range-climb-digits lo-ls hi-ls) + (let ((lo-len (length lo-ls))) + (sre-alternate + (append + (list + (sre-sequence + (cons `(/ ,(integer->char (car lo-ls)) + ,(integer->char (if (<= (car lo-ls) #x7F) #x7F #xFF))) + (map (lambda (_) + `(/ ,(integer->char #x80) ,(integer->char #xFF))) + (cdr lo-ls))))) + (map + (lambda (i) + (sre-sequence + (cons + `(/ ,(integer->char (utf8-lowest-digit-of-length (+ i lo-len 1))) + ,(integer->char (utf8-highest-digit-of-length (+ i lo-len 1)))) + (map (lambda (_) + `(/ ,(integer->char #x80) ,(integer->char #xFF))) + (zero-to (+ i lo-len)))))) + (zero-to (- (length hi-ls) lo-len 1))) + (list + (sre-sequence + (cons `(/ ,(integer->char + (utf8-lowest-digit-of-length + (utf8-start-char->length + (integer->char (- (car hi-ls) 1))))) + ,(integer->char (- (car hi-ls) 1))) + (map (lambda (_) + `(/ ,(integer->char #x80) ,(integer->char #xFF))) + (cdr hi-ls))))))))) + +(define (unicode-range-middle lo-ls hi-ls) + (let ((lo (integer->char (+ (car lo-ls) 1))) + (hi (integer->char (- (car hi-ls) 1)))) + (sre-sequence + (cons (if (char=? lo hi) lo `(/ ,lo ,hi)) + (map (lambda (_) `(/ ,(integer->char #x80) ,(integer->char #xFF))) + (cdr lo-ls)))))) + +(define (cset->utf8-pattern cset) + (let lp ((ls cset) (alts '()) (lo-cset '())) + (cond + ((null? ls) + (sre-alternate (append (reverse alts) + (if (null? lo-cset) + '() + (list (cons '/ (reverse lo-cset))))))) + ((char? (car ls)) + (if (high-char? (car ls)) + (lp (cdr ls) (cons (car ls) alts) lo-cset) + (lp (cdr ls) alts (cons (car ls) lo-cset)))) + (else + (if (or (high-char? (caar ls)) (high-char? (cdar ls))) + (lp (cdr ls) + (cons (unicode-range->utf8-pattern (caar ls) (cdar ls)) alts) + lo-cset) + (lp (cdr ls) alts (cons (cdar ls) (cons (caar ls) lo-cset)))))))) + +(define (sre-adjust-utf8 sre flags) + (let adjust ((sre sre) + (utf8? (flag-set? flags ~utf8?)) + (ci? (flag-set? flags ~case-insensitive?))) + (define (rec sre) (adjust sre utf8? ci?)) + (cond + ((pair? sre) + (case (car sre) + ((w/utf8) (adjust (sre-sequence (cdr sre)) #t ci?)) + ((w/noutf8) (adjust (sre-sequence (cdr sre)) #f ci?)) + ((w/case) + (cons (car sre) (map (lambda (s) (adjust s utf8? #f)) (cdr sre)))) + ((w/nocase) + (cons (car sre) (map (lambda (s) (adjust s utf8? #t)) (cdr sre)))) + ((/ ~ & -) + (if (not utf8?) + sre + (let ((cset (sre->cset sre ci?))) + (if (any (lambda (x) + (if (pair? x) + (or (high-char? (car x)) (high-char? (cdr x))) + (high-char? x))) + cset) + (if ci? + (list 'w/case (cset->utf8-pattern cset)) + (cset->utf8-pattern cset)) + sre)))) + ((*) + (case (sre-sequence (cdr sre)) + ;; special case optimization: .* w/utf8 == .* w/noutf8 + ((any) '(* any)) + ((nonl) '(* nonl)) + (else (cons '* (map rec (cdr sre)))))) + (else + (cons (car sre) (map rec (cdr sre)))))) + (else + (case sre + ((any) 'utf8-any) + ((nonl) 'utf8-nonl) + (else + (if (and utf8? (char? sre) (high-char? sre)) + (sre-sequence (map integer->char (char->utf8-list sre))) + sre))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Compilation + +(cond-expand + (building-chicken + (define-syntax cached + (syntax-rules () + ((_ arg fail) (build-cache 5 arg fail))))) + (else + (define-syntax cached + (syntax-rules () + ((_ arg fail) fail))))) + +(define (irregex x . o) + (cond ((irregex? x) x) + ((null? o) + (cached + x + (if (string? x) + (string->irregex x) + (sre->irregex x)))) + (else + (if (string? x) + (apply string->irregex x o) + (apply sre->irregex x o))))) + +(define (string->irregex str . o) + (apply sre->irregex (apply string->sre str o) o)) + +(define (sre->irregex sre . o) + (let* ((pat-flags (symbol-list->flags o)) + (sre (if *allow-utf8-mode?* + (sre-adjust-utf8 sre pat-flags) + sre)) + (searcher? (sre-searcher? sre)) + (sre-dfa (if searcher? (sre-remove-initial-bos sre) sre)) + (dfa-limit (cond ((memq 'small o) 1) ((memq 'fast o) 50) (else 10))) + (dfa/search + (cond ((memq 'backtrack o) #f) + (searcher? #t) + ((sre->nfa `(seq (* any) ,sre-dfa) pat-flags) + => (lambda (nfa) + (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa))))) + (else #f))) + (dfa (cond ((and dfa/search (sre->nfa sre-dfa pat-flags)) + => (lambda (nfa) + (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa))))) + (else #f))) + (submatches (sre-count-submatches sre-dfa)) + (extractor + (and dfa dfa/search (sre-match-extractor sre-dfa submatches))) + (names (sre-names sre-dfa 1 '())) + (lens (sre-length-ranges sre-dfa names)) + (flags (flag-join + (flag-join ~none (and searcher? ~searcher?)) + (and (sre-consumer? sre) ~consumer?)))) + (cond + (dfa + (make-irregex dfa dfa/search extractor #f flags submatches lens names)) + (else + (let ((f (sre->procedure sre pat-flags names))) + (make-irregex #f #f #f f flags submatches lens names)))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; SRE Analysis + +;; returns #t if the sre can ever be empty +(define (sre-empty? sre) + (if (pair? sre) + (case (car sre) + ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t) + ((**) (or (not (number? (cadr sre))) (zero? (cadr sre)))) + ((or) (any sre-empty? (cdr sre))) + ((: seq $ submatch => submatch-named + atomic) + (every sre-empty? (cdr sre))) + (else #f)) + (memq sre '(epsilon bos eos bol eol bow eow commit)))) + +(define (sre-any? sre) + (or (eq? sre 'any) + (and (pair? sre) + (case (car sre) + ((seq : $ submatch => submatch-named) + (and (pair? (cdr sre)) (null? (cddr sre)) (sre-any? (cadr sre)))) + ((or) (every sre-any? (cdr sre))) + (else #f))))) + +(define (sre-repeater? sre) + (and (pair? sre) + (or (memq (car sre) '(* +)) + (and (memq (car sre) '($ submatch => submatch-named seq :)) + (pair? (cdr sre)) + (null? (cddr sre)) + (sre-repeater? (cadr sre)))))) + +(define (sre-searcher? sre) + (if (pair? sre) + (case (car sre) + ((* +) (sre-any? (sre-sequence (cdr sre)))) + ((seq : $ submatch => submatch-named) + (and (pair? (cdr sre)) (sre-searcher? (cadr sre)))) + ((or) (every sre-searcher? (cdr sre))) + (else #f)) + (eq? 'bos sre))) + +(define (sre-consumer? sre) + (if (pair? sre) + (case (car sre) + ((* +) (sre-any? (sre-sequence (cdr sre)))) + ((seq : $ submatch => submatch-named) + (and (pair? (cdr sre)) (sre-consumer? (last sre)))) + ((or) (every sre-consumer? (cdr sre))) + (else #f)) + (eq? 'eos sre))) + +(define (sre-has-submatches? sre) + (and (pair? sre) + (or (memq (car sre) '($ submatch => submatch-named)) + (if (eq? 'posix-string (car sre)) + (sre-has-submatches? (string->sre (cadr sre))) + (any sre-has-submatches? (cdr sre)))))) + +(define (sre-count-submatches sre) + (let count ((sre sre) (sum 0)) + (if (pair? sre) + (fold count + (+ sum (case (car sre) + (($ submatch => submatch-named) 1) + ((dsm) (+ (cadr sre) (caddr sre))) + ((posix-string) + (sre-count-submatches (string->sre (cadr sre)))) + (else 0))) + (cdr sre)) + sum))) + +(define (sre-length-ranges sre . o) + (let ((names (if (pair? o) (car o) (sre-names sre 1 '()))) + (sublens (make-vector (+ 1 (sre-count-submatches sre)) #f))) + (vector-set! + sublens + 0 + (let lp ((sre sre) (n 1) (lo 0) (hi 0) (return cons)) + (define (grow i) (return (+ lo i) (and hi (+ hi i)))) + (cond + ((pair? sre) + (if (string? (car sre)) + (grow 1) + (case (car sre) + ((/ ~ & -) + (grow 1)) + ((posix-string) + (lp (string->sre (cadr sre)) n lo hi return)) + ((seq : w/case w/nocase atomic) + (let lp2 ((ls (cdr sre)) (n n) (lo2 0) (hi2 0)) + (if (null? ls) + (return (+ lo lo2) (and hi hi2 (+ hi hi2))) + (lp (car ls) n 0 0 + (lambda (lo3 hi3) + (lp2 (cdr ls) + (+ n (sre-count-submatches (car ls))) + (+ lo2 lo3) + (and hi2 hi3 (+ hi2 hi3)))))))) + ((or) + (let lp2 ((ls (cdr sre)) (n n) (lo2 #f) (hi2 0)) + (if (null? ls) + (return (+ lo (or lo2 1)) (and hi hi2 (+ hi hi2))) + (lp (car ls) n 0 0 + (lambda (lo3 hi3) + (lp2 (cdr ls) + (+ n (sre-count-submatches (car ls))) + (if lo2 (min lo2 lo3) lo3) + (and hi2 hi3 (max hi2 hi3)))))))) + ((if) + (cond + ((or (null? (cdr sre)) (null? (cddr sre))) + (return lo hi)) + (else + (let ((n1 (sre-count-submatches (car sre))) + (n2 (sre-count-submatches (cadr sre)))) + (lp (if (or (number? (cadr sre)) (symbol? (cadr sre))) + 'epsilon + (cadr sre)) + n lo hi + (lambda (lo2 hi2) + (lp (caddr sre) (+ n n1) 0 0 + (lambda (lo3 hi3) + (lp (if (pair? (cdddr sre)) + (cadddr sre) + 'epsilon) + (+ n n1 n2) 0 0 + (lambda (lo4 hi4) + (return (+ lo2 (min lo3 lo4)) + (and hi2 hi3 hi4 + (+ hi2 (max hi3 hi4)) + )))))))))))) + ((dsm) + (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) lo hi return)) + (($ submatch => submatch-named) + (lp (sre-sequence + (if (eq? 'submatch (car sre)) (cdr sre) (cddr sre))) + (+ n 1) lo hi + (lambda (lo2 hi2) + (vector-set! sublens n (cons lo2 hi2)) + (return lo2 hi2)))) + ((backref backref-ci) + (let ((n (cond + ((number? (cadr sre)) (cadr sre)) + ((assq (cadr sre) names) => cdr) + (else (%irregex-error "unknown backreference" (cadr sre)))))) + (cond + ((or (not (integer? n)) + (not (< 0 n (vector-length sublens)))) + (%irregex-error 'sre-length "invalid backreference" sre)) + ((not (vector-ref sublens n)) + (%irregex-error 'sre-length "invalid forward backreference" sre)) + (else + (let ((lo2 (car (vector-ref sublens n))) + (hi2 (cdr (vector-ref sublens n)))) + (return (+ lo lo2) (and hi hi2 (+ hi hi2)))))))) + ((* *?) + (lp (sre-sequence (cdr sre)) n lo hi (lambda (lo hi) #f)) + (return lo #f)) + ((** **?) + (cond + ((or (and (number? (cadr sre)) + (number? (caddr sre)) + (> (cadr sre) (caddr sre))) + (and (not (cadr sre)) (caddr sre))) + (return lo hi)) + (else + (if (caddr sre) + (lp (sre-sequence (cdddr sre)) n 0 0 + (lambda (lo2 hi2) + (return (+ lo (* (cadr sre) lo2)) + (and hi hi2 (+ hi (* (caddr sre) hi2)))))) + (lp (sre-sequence (cdddr sre)) n 0 0 + (lambda (lo2 hi2) + (return (+ lo (* (cadr sre) lo2)) #f))))))) + ((+) + (lp (sre-sequence (cdr sre)) n lo hi + (lambda (lo2 hi2) + (return (+ lo lo2) #f)))) + ((? ??) + (lp (sre-sequence (cdr sre)) n lo hi + (lambda (lo2 hi2) + (return lo (and hi hi2 (+ hi hi2)))))) + ((= =? >= >=?) + (lp `(** ,(cadr sre) + ,(if (memq (car sre) '(>= >=?)) #f (cadr sre)) + ,@(cddr sre)) + n lo hi return)) + ((look-ahead neg-look-ahead look-behind neg-look-behind) + (return lo hi)) + (else + (cond + ((assq (car sre) sre-named-definitions) + => (lambda (cell) + (lp (apply (cdr cell) (cdr sre)) n lo hi return))) + (else + (%irregex-error 'sre-length-ranges "unknown sre operator" sre))))))) + ((char? sre) + (grow 1)) + ((string? sre) + (grow (string-length sre))) + ((memq sre '(any nonl)) + (grow 1)) + ((memq sre '(epsilon bos eos bol eol bow eow nwb commit)) + (return lo hi)) + (else + (let ((cell (assq sre sre-named-definitions))) + (if cell + (lp (if (procedure? (cdr cell)) ((cdr cell)) (cdr cell)) + n lo hi return) + (%irregex-error 'sre-length-ranges "unknown sre" sre))))))) + sublens)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; SRE Manipulation + +;; build a (seq ls ...) sre from a list +(define (sre-sequence ls) + (cond + ((null? ls) 'epsilon) + ((null? (cdr ls)) (car ls)) + (else (cons 'seq ls)))) + +;; build a (or ls ...) sre from a list +(define (sre-alternate ls) + (cond + ((null? ls) '(or)) + ((null? (cdr ls)) (car ls)) + (else (cons 'or ls)))) + +;; returns an equivalent SRE without any match information +(define (sre-strip-submatches sre) + (if (not (pair? sre)) + sre + (case (car sre) + (($ submatch) (sre-strip-submatches (sre-sequence (cdr sre)))) + ((=> submatch-named) (sre-strip-submatches (sre-sequence (cddr sre)))) + ((dsm) (sre-strip-submatches (sre-sequence (cdddr sre)))) + (else (map sre-strip-submatches sre))))) + +;; given a char-set list of chars and strings, flattens them into +;; chars only +(define (sre-flatten-ranges ls) + (let lp ((ls ls) (res '())) + (cond + ((null? ls) + (reverse res)) + ((string? (car ls)) + (lp (append (string->list (car ls)) (cdr ls)) res)) + (else + (lp (cdr ls) (cons (car ls) res)))))) + +(define (sre-names sre n names) + (if (not (pair? sre)) + names + (case (car sre) + (($ submatch) + (sre-names (sre-sequence (cdr sre)) (+ n 1) names)) + ((=> submatch-named) + (sre-names (sre-sequence (cddr sre)) + (+ n 1) + (cons (cons (cadr sre) n) names))) + ((dsm) + (sre-names (sre-sequence (cdddr sre)) (+ n (cadr sre)) names)) + ((seq : or * + ? *? ?? w/case w/nocase atomic + look-ahead look-behind neg-look-ahead neg-look-behind) + (sre-sequence-names (cdr sre) n names)) + ((= >=) + (sre-sequence-names (cddr sre) n names)) + ((** **?) + (sre-sequence-names (cdddr sre) n names)) + (else + names)))) + +(define (sre-sequence-names ls n names) + (if (null? ls) + names + (sre-sequence-names (cdr ls) + (+ n (sre-count-submatches (car ls))) + (sre-names (car ls) n names)))) + +(define (sre-remove-initial-bos sre) + (cond + ((pair? sre) + (case (car sre) + ((seq : $ submatch => submatch-named * +) + (cond + ((not (pair? (cdr sre))) + sre) + ((eq? 'bos (cadr sre)) + (cons (car sre) (cddr sre))) + (else + (cons (car sre) + (cons (sre-remove-initial-bos (cadr sre)) (cddr sre)))))) + ((or) + (sre-alternate (map sre-remove-initial-bos (cdr sre)))) + (else + sre))) + (else + sre))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Basic Matching + +(define irregex-basic-string-chunker + (make-irregex-chunker (lambda (x) #f) + car + cadr + caddr + (lambda (src1 i src2 j) + (substring (car src1) i j)))) + +(define (irregex-search x str . o) + (if (not (string? str)) (%irregex-error 'irregex-search "not a string" str)) + (let ((start (or (and (pair? o) (car o)) 0)) + (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str)))) + (if (not (integer? start)) (%irregex-error 'irregex-search "not an integer" start)) + (if (not (integer? end)) (%irregex-error 'irregex-search "not an integer" end)) + (irregex-search/chunked x + irregex-basic-string-chunker + (list str start end) + start))) + +(define (irregex-search/chunked x cnk src . o) + (let* ((irx (irregex x)) + (matches (irregex-new-matches irx)) + (i (if (pair? o) (car o) ((chunker-get-start cnk) src)))) + (if (not (integer? i)) (%irregex-error 'irregex-search "not an integer" i)) + (irregex-match-chunker-set! matches cnk) + (irregex-search/matches irx cnk src i matches))) + +;; internal routine, can be used in loops to avoid reallocating the +;; match vector +(define (irregex-search/matches irx cnk src i matches) + (cond + ((irregex-dfa irx) + (cond + ((flag-set? (irregex-flags irx) ~searcher?) + (cond + ((dfa-match/longest (irregex-dfa irx) cnk src i #f #f matches 0) + (irregex-match-start-chunk-set! matches 0 src) + (irregex-match-start-index-set! matches 0 i) + ((irregex-dfa/extract irx) + cnk src i + (%irregex-match-end-chunk matches 0) + (%irregex-match-end-index matches 0) + matches) + matches) + (else + #f))) + ((dfa-match/shortest + (irregex-dfa/search irx) cnk src i matches 0) + (let ((dfa (irregex-dfa irx)) + (get-start (chunker-get-start cnk)) + (get-end (chunker-get-end cnk)) + (get-next (chunker-get-next cnk))) + (let lp1 ((src src) (i i)) + (let ((end (get-end src))) + (let lp2 ((i i)) + (cond + ((dfa-match/longest dfa cnk src i #f #f matches 0) + (irregex-match-start-chunk-set! matches 0 src) + (irregex-match-start-index-set! matches 0 i) + ((irregex-dfa/extract irx) + cnk src i + (%irregex-match-end-chunk matches 0) + (%irregex-match-end-index matches 0) + matches) + matches) + ((>= i end) + (let ((next (get-next src))) + (and next (lp1 next (get-start next))))) + (else + (lp2 (+ i 1))))))))) + (else + #f))) + (else + (let ((res (irregex-search/backtrack irx cnk src i matches))) + (if res (%irregex-match-fail-set! res #f)) + res)))) + +(define (irregex-search/backtrack irx cnk src i matches) + (let ((matcher (irregex-nfa irx)) + (str ((chunker-get-str cnk) src)) + (end ((chunker-get-end cnk) src)) + (get-next (chunker-get-next cnk)) + (init (cons src i))) + (if (flag-set? (irregex-flags irx) ~searcher?) + (matcher cnk init src str i end matches (lambda () #f)) + (let lp ((src2 src) + (str str) + (i i) + (end end)) + (cond + ((matcher cnk init src2 str i end matches (lambda () #f)) + (irregex-match-start-chunk-set! matches 0 src2) + (irregex-match-start-index-set! matches 0 i) + matches) + ((< i end) + (lp src2 str (+ i 1) end)) + (else + (let ((src2 (get-next src2))) + (if src2 + (lp src2 + ((chunker-get-str cnk) src2) + ((chunker-get-start cnk) src2) + ((chunker-get-end cnk) src2)) + #f)))))))) + +(define (irregex-match irx str . o) + (if (not (string? str)) (%irregex-error 'irregex-match "not a string" str)) + (let ((start (or (and (pair? o) (car o)) 0)) + (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str)))) + (irregex-match/chunked irx + irregex-basic-string-chunker + (list str start end)))) + +(define (irregex-match/chunked irx cnk src) + (let* ((irx (irregex irx)) + (matches (irregex-new-matches irx))) + (irregex-match-chunker-set! matches cnk) + (cond + ((irregex-dfa irx) + (and + (dfa-match/longest + (irregex-dfa irx) cnk src ((chunker-get-start cnk) src) #f #f matches 0) + (= ((chunker-get-end cnk) (%irregex-match-end-chunk matches 0)) + (%irregex-match-end-index matches 0)) + (begin + (irregex-match-start-chunk-set! matches 0 src) + (irregex-match-start-index-set! matches + 0 + ((chunker-get-start cnk) src)) + ((irregex-dfa/extract irx) + cnk src ((chunker-get-start cnk) src) + (%irregex-match-end-chunk matches 0) + (%irregex-match-end-index matches 0) + matches) + matches))) + (else + (let* ((matcher (irregex-nfa irx)) + (str ((chunker-get-str cnk) src)) + (i ((chunker-get-start cnk) src)) + (end ((chunker-get-end cnk) src)) + (init (cons src i))) + (let lp ((m (matcher cnk init src str i end matches (lambda () #f)))) + (and m + (cond + ((and (not ((chunker-get-next cnk) + (%irregex-match-end-chunk m 0))) + (= ((chunker-get-end cnk) + (%irregex-match-end-chunk m 0)) + (%irregex-match-end-index m 0))) + (%irregex-match-fail-set! m #f) + m) + ((%irregex-match-fail m) + (lp ((%irregex-match-fail m)))) + (else + #f))))))))) + +(define (irregex-match? . args) + (and (apply irregex-match args) #t)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; DFA Matching + +;; inline these +(define (dfa-init-state dfa) + (vector-ref dfa 0)) +(define (dfa-next-state dfa node) + (vector-ref dfa (cdr node))) +(define (dfa-final-state? dfa state) + (car state)) + +;; this searches for the first end index for which a match is possible +(define (dfa-match/shortest dfa cnk src start matches index) + (let ((get-str (chunker-get-str cnk)) + (get-start (chunker-get-start cnk)) + (get-end (chunker-get-end cnk)) + (get-next (chunker-get-next cnk))) + (let lp1 ((src src) (start start) (state (dfa-init-state dfa))) + (and + src + (let ((str (get-str src)) + (end (get-end src))) + (let lp2 ((i start) (state state)) + (cond + ((dfa-final-state? dfa state) + (cond + (index + (irregex-match-end-chunk-set! matches index src) + (irregex-match-end-index-set! matches index i))) + #t) + ((< i end) + (let* ((ch (string-ref str i)) + (next (find (lambda (x) + (if (eqv? ch (car x)) + #t + (and (pair? (car x)) + (char<=? (caar x) ch) + (char<=? ch (cdar x))))) + (cdr state)))) + (and next (lp2 (+ i 1) (dfa-next-state dfa next))))) + (else + (let ((next (get-next src))) + (and next (lp1 next (get-start next) state))))))))))) + +;; this finds the longest match starting at a given index +(define (dfa-match/longest dfa cnk src start end-src end matches index) + (let ((get-str (chunker-get-str cnk)) + (get-start (chunker-get-start cnk)) + (get-end (chunker-get-end cnk)) + (get-next (chunker-get-next cnk)) + (start-is-final? (dfa-final-state? dfa (dfa-init-state dfa)))) + (cond + (index + (irregex-match-end-chunk-set! matches index #f) + (irregex-match-end-index-set! matches index #f))) + (let lp1 ((src src) + (start start) + (state (dfa-init-state dfa)) + (res-src (and start-is-final? src)) + (res-index (and start-is-final? start))) + (let ((str (get-str src)) + (end (if (eq? src end-src) end (get-end src)))) + (let lp2 ((i start) + (state state) + (res-src res-src) + (res-index res-index)) + (cond + ((>= i end) + (cond + ((and index res-src) + (irregex-match-end-chunk-set! matches index res-src) + (irregex-match-end-index-set! matches index res-index))) + (let ((next (and (not (eq? src end-src)) (get-next src)))) + (if next + (lp1 next (get-start next) state res-src res-index) + (and index + (%irregex-match-end-chunk matches index) + #t)))) + (else + (let* ((ch (string-ref str i)) + (cell (find (lambda (x) + (if (eqv? ch (car x)) + #t + (and (pair? (car x)) + (char<=? (caar x) ch) + (char<=? ch (cdar x))))) + (cdr state)))) + (cond + (cell + (let ((next (dfa-next-state dfa cell))) + (if (dfa-final-state? dfa next) + (lp2 (+ i 1) next src (+ i 1)) + (lp2 (+ i 1) next res-src res-index)))) + (res-src + (cond + (index + (irregex-match-end-chunk-set! matches index res-src) + (irregex-match-end-index-set! matches index res-index))) + #t) + ((and index (%irregex-match-end-chunk matches index)) + #t) + (else + #f)))))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Named Definitions + +(define sre-named-definitions + `((any . ,*all-chars*) + (nonl . (- ,*all-chars* (,(string #\newline)))) + (alphabetic . (/ #\a #\z #\A #\Z)) + (alpha . alphabetic) + (alphanumeric . (/ #\a #\z #\A #\Z #\0 #\9)) + (alphanum . alphanumeric) + (alnum . alphanumeric) + (lower-case . (/ #\a #\z)) + (lower . lower-case) + (upper-case . (/ #\A #\Z)) + (upper . upper-case) + (numeric . (/ #\0 #\9)) + (num . numeric) + (digit . numeric) + (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\. + #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\})) + (punct . punctuation) + (graphic + . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~)) + (graph . graphic) + (blank . (or #\space ,(integer->char (- (char->integer #\space) 23)))) + (whitespace . (or blank #\newline)) + (space . whitespace) + (white . whitespace) + (printing or graphic whitespace) + (print . printing) + + ;; XXXX we assume a (possibly shifted) ASCII-based ordering + (control . (/ ,(integer->char (- (char->integer #\space) 32)) + ,(integer->char (- (char->integer #\space) 1)))) + (cntrl . control) + (hex-digit . (or numeric (/ #\a #\f #\A #\F))) + (xdigit . hex-digit) + (ascii . (/ ,(integer->char (- (char->integer #\space) 32)) + ,(integer->char (+ (char->integer #\space) 95)))) + (ascii-nonl . (/ ,(integer->char (- (char->integer #\space) 32)) + ,(integer->char (- (char->integer #\newline) 1)) + ,(integer->char (+ (char->integer #\newline) 1)) + ,(integer->char (+ (char->integer #\space) 95)))) + (newline . (or (seq ,(integer->char (+ (char->integer #\newline) 3)) + #\newline) + (/ #\newline + ,(integer->char (+ (char->integer #\newline) 3))))) + + ;; ... it's really annoying to support old Scheme48 + (word . (seq bow (+ (or alphanumeric #\_)) eow)) + (utf8-tail-char . (/ ,(integer->char (+ (char->integer #\space) #x60)) + ,(integer->char (+ (char->integer #\space) #xA1)))) + (utf8-2-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xA2)) + ,(integer->char (+ (char->integer #\space) #xBF))) + utf8-tail-char)) + (utf8-3-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xC0)) + ,(integer->char (+ (char->integer #\space) #xCF))) + utf8-tail-char + utf8-tail-char)) + (utf8-4-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xD0)) + ,(integer->char (+ (char->integer #\space) #xD7))) + utf8-tail-char + utf8-tail-char + utf8-tail-char)) + (utf8-any . (or ascii utf8-2-char utf8-3-char utf8-4-char)) + (utf8-nonl . (or ascii-nonl utf8-2-char utf8-3-char utf8-4-char)) + + ;; extended library patterns + (integer . (seq (? (or #\+ #\-)) (+ numeric))) + (real . (seq (+ numeric) (? #\. (+ numeric)) (? (or #\e #\E) integer))) + ;; slightly more lax than R5RS, allow ->foo, etc. + (symbol-initial . (or alpha ("!$%&*/:<=>?^_~"))) + (symbol-subsequent . (or symbol-initial digit ("+-.@"))) + (symbol . (or (seq symbol-initial (* symbol-subsequent)) + (seq ("+-") (? symbol-initial (* symbol-subsequent))) + (seq ".." (* ".")))) + (sexp-space . (seq (* (* space) ";" (* nonl) newline) (+ space))) + (string . (seq #\" (escape #\\ #\") #\")) + (escape . ,(lambda (esc . o) `(* (or (~ ,esc ,@o) (seq ,esc any))))) + + (ipv4-digit . (seq (? (/ "12")) (? numeric) numeric)) + (ipv4-address . (seq ipv4-digit (= 3 #\. ipv4-digit))) + ;; XXXX lax, allows multiple double-colons or < 8 terms w/o a :: + (ipv6-address . (seq (** 0 4 hex-digit) + (** 1 7 #\: (? #\:) (** 0 4 hex-digit)))) + (ip-address . (or ipv4-address ipv6-address)) + (domain-atom . (+ (or alphanumeric #\_ #\-))) + (domain . (seq domain-atom (+ #\. domain-atom))) + ;; XXXX now anything can be a top-level domain, but this is still handy + (top-level-domain . (w/nocase (or "arpa" "com" "gov" "mil" "net" "org" + "aero" "biz" "coop" "info" "museum" + "name" "pro" (= 2 alpha)))) + (domain/common . (seq (+ domain-atom #\.) top-level-domain)) + ;;(email-local-part . (seq (+ (or (~ #\") string)))) + (email-local-part . (+ (or alphanumeric #\_ #\- #\. #\+))) + (email . (seq email-local-part #\@ domain)) + (url-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\. #\, #\& #\; + (seq "%" hex-digit hex-digit))) + (url-final-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\& + (seq "%" hex-digit hex-digit))) + (http-url . (w/nocase + "http" (? "s") "://" + (or domain/common ipv4-address) ;; (seq "[" ipv6-address "]") + (? ":" (+ numeric)) ;; port + ;; path + (? "/" (* url-char) + (? "?" (* url-char)) ;; query + (? "#" (? (* url-char) url-final-char)) ;; fragment + ))) + + )) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; SRE->NFA compilation +;; +;; An NFA state is a numbered node with a list of patter->number +;; transitions, where pattern is either a character, (lo . hi) +;; character range, or epsilon (indicating an empty transition). +;; There may be duplicate characters and overlapping ranges - since +;; it's an NFA we process it by considering all possible transitions. + +(define *nfa-presize* 128) ;; constant +(define *nfa-num-fields* 4) ;; constant + +(define (nfa-num-states nfa) (quotient (vector-length nfa) *nfa-num-fields*)) +(define (nfa-start-state nfa) (- (nfa-num-states nfa) 1)) + +(define (nfa-get-state-trans nfa i) + (vector-ref nfa (* i *nfa-num-fields*))) +(define (nfa-set-state-trans! nfa i x) + (vector-set! nfa (* i *nfa-num-fields*) x)) +(define (nfa-push-state-trans! nfa i x) + (nfa-set-state-trans! nfa i (cons x (nfa-get-state-trans nfa i)))) + +(define (nfa-get-epsilons nfa i) + (vector-ref nfa (+ (* i *nfa-num-fields*) 1))) +(define (nfa-set-epsilons! nfa i x) + (vector-set! nfa (+ (* i *nfa-num-fields*) 1) x)) +(define (nfa-add-epsilon! nfa i x) + (let ((eps (nfa-get-epsilons nfa i))) + (if (not (memq x eps)) + (nfa-set-epsilons! nfa i (cons x eps))))) + +(define (nfa-get-state-closure nfa i) + (vector-ref nfa (+ (* i *nfa-num-fields*) 2))) +(define (nfa-set-state-closure! nfa i x) + (vector-set! nfa (+ (* i *nfa-num-fields*) 2) x)) + +(define (nfa-get-closure nfa mst) + (cond ((assoc mst + (vector-ref nfa (+ (* (nfa-multi-state-hash nfa mst) + *nfa-num-fields*) + (- *nfa-num-fields* 1)))) + => cdr) + (else #f))) +(define (nfa-add-closure! nfa mst x) + (let ((i (+ (* (nfa-multi-state-hash nfa mst) *nfa-num-fields*) + (- *nfa-num-fields* 1)))) + (vector-set! nfa i (cons (cons mst x) (vector-ref nfa i))))) + +;; Compile and return the vector of NFA states (in groups of +;; *nfa-num-fields* packed elements). The start state will be the +;; last element(s) of the vector, and all remaining states will be in +;; descending numeric order, with state 0 being the unique accepting +;; state. +(define (sre->nfa sre init-flags) + (let ((buf (make-vector (* *nfa-presize* *nfa-num-fields*) '()))) + ;; we loop over an implicit sequence list + (define (lp ls n flags next) + (define (new-state-number state) + (max n (+ 1 state))) + (define (add-state! n2 trans-ls) + (if (>= (* n2 *nfa-num-fields*) (vector-length buf)) + (let ((tmp (make-vector (* 2 (vector-length buf)) '()))) + (do ((i (- (vector-length buf) 1) (- i 1))) + ((< i 0)) + (vector-set! tmp i (vector-ref buf i))) + (set! buf tmp))) + (nfa-set-state-trans! buf n2 trans-ls) + n2) + (define (extend-state! next . trans) + (and next + (add-state! (new-state-number next) + (map (lambda (x) (cons x next)) trans)))) + (define (add-char-state! next ch) + (let ((alt (char-altcase ch))) + (if (and (flag-set? flags ~case-insensitive?) (not (eqv? ch alt))) + (extend-state! next ch alt) + (extend-state! next ch)))) + (if (null? ls) + next + (cond + ((or (eq? 'epsilon (car ls)) (equal? "" (car ls))) + ;; chars and epsilons go directly into the transition table + (let ((next (lp (cdr ls) n flags next))) + (and next + (let ((new (add-state! (new-state-number next) '()))) + (nfa-add-epsilon! buf new next) + new)))) + ((string? (car ls)) + ;; process literal strings a char at a time + (let ((next (lp (cdr ls) n flags next))) + (and next + (let lp2 ((i (- (string-length (car ls)) 1)) + (next next)) + (if (< i 0) + next + (lp2 (- i 1) + (add-char-state! next (string-ref (car ls) i)))) + )))) + ((char? (car ls)) + (add-char-state! (lp (cdr ls) n flags next) (car ls))) + ((symbol? (car ls)) + (let ((cell (assq (car ls) sre-named-definitions))) + (and cell + (lp (cons (if (procedure? (cdr cell)) + ((cdr cell)) + (cdr cell)) + (cdr ls)) + n + flags + next)))) + ((pair? (car ls)) + (cond + ((string? (caar ls)) + ;; enumerated character set + (lp (cons (sre-alternate (string->list (caar ls))) (cdr ls)) + n + flags + next)) + (else + (case (caar ls) + ((seq :) + ;; for an explicit sequence, just append to the list + (lp (append (cdar ls) (cdr ls)) n flags next)) + ((w/case w/nocase w/utf8 w/noutf8) + (let* ((next (lp (cdr ls) n flags next)) + (flags ((if (memq (caar ls) '(w/case w/utf8)) + flag-clear + flag-join) + flags + (if (memq (caar ls) '(w/case w/nocase)) + ~case-insensitive? + ~utf8?)))) + (and next + (lp (cdar ls) (new-state-number next) flags next)))) + ((/ - & ~) + (let ((ranges + (sre->cset (car ls) + (flag-set? flags ~case-insensitive?)))) + (case (length ranges) + ((1) + (extend-state! (lp (cdr ls) n flags next) (car ranges))) + (else + (let ((next (lp (cdr ls) n flags next))) + (and + next + (lp (list (sre-alternate + (map (lambda (x) (if (pair? x) + (list '/ (car x) (cdr x)) + x)) + ranges))) + (new-state-number next) + (flag-clear flags ~case-insensitive?) + next))))))) + ((or) + (let ((next (lp (cdr ls) n flags next))) + (and + next + (if (null? (cdar ls)) + ;; empty (or) always fails + (add-state! (new-state-number next) '()) + ;; compile both branches and insert epsilon + ;; transitions to either + (let* ((b (lp (list (sre-alternate (cddar ls))) + (new-state-number next) + flags + next)) + (a (and b + (lp (list (cadar ls)) + (new-state-number (max b next)) + flags + next)))) + (and a + (let ((c (add-state! (new-state-number a) '()))) + (nfa-add-epsilon! buf c a) + (nfa-add-epsilon! buf c b) + c))))))) + ((?) + (let ((next (lp (cdr ls) n flags next))) + ;; insert an epsilon transition directly to next + (and + next + (let ((a (lp (cdar ls) (new-state-number next) flags next))) + (if a + (nfa-add-epsilon! buf a next)) + a)))) + ((+ *) + (let ((next (lp (cdr ls) n flags next))) + (and + next + (let* ((new (lp '(epsilon) + (new-state-number next) + flags + next)) + (a (lp (cdar ls) (new-state-number new) flags new))) + (cond + (a + ;; for *, insert an epsilon transition as in ? above + (if (eq? '* (caar ls)) + (nfa-add-epsilon! buf a new)) + ;; for both, insert a loop back to self + (nfa-add-epsilon! buf new a))) + a)))) + ;; need to add these to the match extractor first, + ;; but they tend to generate large DFAs + ;;((=) + ;; (lp (append (vector->list + ;; (make-vector (cadar ls) + ;; (sre-sequence (cddar ls)))) + ;; (cdr ls)) + ;; n flags next)) + ;;((>=) + ;; (lp (append (vector->list + ;; (make-vector (- (cadar ls) 1) + ;; (sre-sequence (cddar ls)))) + ;; (cons `(+ ,@(cddar ls)) (cdr ls))) + ;; n flags next)) + ;;((**) + ;; (lp (append (vector->list + ;; (make-vector (cadar ls) + ;; (sre-sequence (cdddar ls)))) + ;; (map + ;; (lambda (x) `(? ,x)) + ;; (vector->list + ;; (make-vector (- (caddar ls) (cadar ls)) + ;; (sre-sequence (cdddar ls))))) + ;; (cdr ls)) + ;; n flags next)) + ;; ignore submatches altogether + (($ submatch) + (lp (cons (sre-sequence (cdar ls)) (cdr ls)) n flags next)) + ((=> submatch-named) + (lp (cons (sre-sequence (cddar ls)) (cdr ls)) n flags next)) + (else + (cond + ((assq (caar ls) sre-named-definitions) + => (lambda (cell) + (if (procedure? (cdr cell)) + (lp (cons (apply (cdr cell) (cdar ls)) (cdr ls)) + n flags next) + (%irregex-error "non-procedure in op position" (caar ls))))) + (else #f))))))) + (else + #f)))) + (let ((len (lp (list sre) 1 init-flags 0))) + (and len + (let ((nfa (make-vector (* *nfa-num-fields* (+ len 1))))) + (do ((i (- (vector-length nfa) 1) (- i 1))) + ((< i 0)) + (vector-set! nfa i (vector-ref buf i))) + nfa))))) + +;; We don't really want to use this, we use the closure compilation +;; below instead, but this is included for reference and testing the +;; sre->nfa conversion. + +;; (define (nfa-match nfa str) +;; (let lp ((ls (string->list str)) (state (car nfa)) (epsilons '())) +;; (if (null? ls) +;; (zero? (car state)) +;; (any (lambda (m) +;; (if (eq? 'epsilon (car m)) +;; (and (not (memv (cdr m) epsilons)) +;; (lp ls (assv (cdr m) nfa) (cons (cdr m) epsilons))) +;; (and (or (eqv? (car m) (car ls)) +;; (and (pair? (car m)) +;; (char<=? (caar m) (car ls)) +;; (char<=? (car ls) (cdar m)))) +;; (lp (cdr ls) (assv (cdr m) nfa) '())))) +;; (cdr state))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; NFA multi-state representation + +;; Cache closures in a simple hash-table keyed on the smallest state +;; (define (nfa-multi-state-hash nfa mst) +;; (car mst)) + +;; Original sorted list-based representation + +;; (define (make-nfa-multi-state nfa) +;; '()) + +;; (define (nfa-state->multi-state nfa state) +;; (list state)) + +;; (define (nfa-multi-state-copy mst) +;; (map (lambda (x) x) mst)) + +;; (define (list->nfa-multi-state nfa ls) +;; (nfa-multi-state-copy ls)) + +;; (define (nfa-multi-state-contains? mst i) +;; (memq i mst)) + +;; (define (nfa-multi-state-fold mst kons knil) +;; (fold kons knil mst)) + +;; (define (nfa-multi-state-add! mst i) +;; (insert-sorted i mst)) + +;; (define (nfa-multi-state-add mst i) +;; (insert-sorted i mst)) + +;; (define (nfa-multi-state-union a b) +;; (merge-sorted a b)) + +;; Sorted List Utilities + +;; (define (insert-sorted n ls) +;; (cond +;; ((null? ls) +;; (cons n '())) +;; ((<= n (car ls)) +;; (if (= n (car ls)) +;; ls +;; (cons n ls))) +;; (else +;; (cons (car ls) (insert-sorted n (cdr ls)))))) + +;; (define (insert-sorted! n ls) +;; (cond +;; ((null? ls) +;; (cons n '())) +;; ((<= n (car ls)) +;; (if (= n (car ls)) +;; ls +;; (cons n ls))) +;; (else +;; (let lp ((head ls) (tail (cdr ls))) +;; (cond ((or (null? tail) (< n (car tail))) +;; (set-cdr! head (cons n tail))) +;; ((> n (car tail)) +;; (lp tail (cdr tail))))) +;; ls))) + +;; (define (merge-sorted a b) +;; (cond ((null? a) b) +;; ((null? b) a) +;; ((< (car a) (car b)) +;; (cons (car a) (merge-sorted (cdr a) b))) +;; ((> (car a) (car b)) +;; (cons (car b) (merge-sorted a (cdr b)))) +;; (else (merge-sorted (cdr a) b)))) + +;; ========================================================= ;; + +;; Presized bit-vector based + +(define (nfa-multi-state-hash nfa mst) + (modulo (vector-ref mst 0) (nfa-num-states nfa))) + +(define (make-nfa-multi-state nfa) + (make-vector (quotient (+ (nfa-num-states nfa) 24 -1) 24) 0)) + +(define (nfa-state->multi-state nfa state) + (nfa-multi-state-add! (make-nfa-multi-state nfa) state)) + +(define (nfa-multi-state-copy mst) + (let ((res (make-vector (vector-length mst)))) + (do ((i (- (vector-length mst) 1) (- i 1))) + ((< i 0) res) + (vector-set! res i (vector-ref mst i))))) + +(define (nfa-multi-state-contains? mst i) + (let ((cell (quotient i 24)) + (bit (remainder i 24))) + (not (zero? (bit-and (vector-ref mst cell) (bit-shl 1 bit)))))) + +(define (nfa-multi-state-add! mst i) + (let ((cell (quotient i 24)) + (bit (remainder i 24))) + (vector-set! mst cell (bit-ior (vector-ref mst cell) (bit-shl 1 bit))) + mst)) + +(define (nfa-multi-state-add mst i) + (nfa-multi-state-add! (nfa-multi-state-copy mst) i)) + +(define (nfa-multi-state-union! a b) + (do ((i (- (vector-length a) 1) (- i 1))) + ((< i 0) a) + (vector-set! a i (bit-ior (vector-ref a i) (vector-ref b i))))) + +(define (nfa-multi-state-union a b) + (nfa-multi-state-union! (nfa-multi-state-copy a) b)) + +(define (nfa-multi-state-fold mst kons knil) + (let ((limit (vector-length mst))) + (let lp1 ((i 0) + (acc knil)) + (if (>= i limit) + acc + (let lp2 ((n (vector-ref mst i)) + (acc acc)) + (if (zero? n) + (lp1 (+ i 1) acc) + (let* ((n2 (bit-and n (- n 1))) + (n-tail (- n n2)) + (bit (+ (* i 24) (integer-log n-tail)))) + (lp2 n2 (kons bit acc))))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; NFA->DFA compilation +;; +;; During processing, the DFA is a list of the form: +;; +;; ((NFA-states ...) accepting-state? transitions ...) +;; +;; where the transitions are as in the NFA, except there are no +;; epsilons, duplicate characters or overlapping char-set ranges, and +;; the states moved to are closures (sets of NFA states). Multiple +;; DFA states may be accepting states. + +(define (nfa->dfa nfa . o) + (let ((max-states (and (pair? o) (car o)))) + (let lp ((ls (list (nfa-cache-state-closure! nfa (nfa-start-state nfa)))) + (i 0) + (res '())) + (cond + ((null? ls) + (dfa-renumber nfa (reverse res))) + ((assoc (car ls) res) ;; already seen this combination of states + (lp (cdr ls) i res)) + ((and max-states (> i max-states)) ;; too many DFA states + #f) + (else + (let* ((states (car ls)) + (trans (nfa-state-transitions nfa states)) + (accept? (and (nfa-multi-state-contains? states 0) #t))) + (lp (append (map cdr trans) (cdr ls)) + (+ i 1) + `((,states ,accept? ,@trans) ,@res)))))))) + +;; When the conversion is complete we renumber the DFA sets-of-states +;; in order and convert the result to a vector for fast lookup. +(define (dfa-renumber nfa dfa) + (let* ((len (length dfa)) + (states (make-vector (nfa-num-states nfa) '())) + (res (make-vector len))) + (define (renumber mst) + (cdr (assoc mst (vector-ref states (nfa-multi-state-hash nfa mst))))) + (let lp ((ls dfa) (i 0)) + (cond ((pair? ls) + (let ((j (nfa-multi-state-hash nfa (caar ls)))) + (vector-set! states j (cons (cons (caar ls) i) + (vector-ref states j)))) + (lp (cdr ls) (+ i 1))))) + (let lp ((ls dfa) (i 0)) + (cond ((pair? ls) + (for-each + (lambda (x) (set-cdr! x (renumber (cdr x)))) + (cddar ls)) + (vector-set! res i (cdar ls)) + (lp (cdr ls) (+ i 1))))) + res)) + +;; Extract all distinct characters or ranges and the potential states +;; they can transition to from a given set of states. Any ranges that +;; would overlap with distinct characters are split accordingly. +(define (nfa-state-transitions nfa states) + (let ((res (nfa-multi-state-fold + states + (lambda (st res) + (fold (lambda (trans res) + (nfa-join-transitions! nfa res (car trans) (cdr trans))) + res + (nfa-get-state-trans nfa st))) + '()))) + (for-each (lambda (x) (set-cdr! x (nfa-closure nfa (cdr x)))) res) + res)) + +(define (nfa-join-transitions! nfa existing elt state) + (define (join! ls elt state) + (if (not elt) + ls + (nfa-join-transitions! nfa ls elt state))) + (cond + ((char? elt) + (let lp ((ls existing) (res '())) + (cond + ((null? ls) + ;; done, just cons this on to the original list + (cons (cons elt (nfa-state->multi-state nfa state)) existing)) + ((eq? elt (caar ls)) + ;; add a new state to an existing char + (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) + existing) + ((and (pair? (caar ls)) + (char<=? (caaar ls) elt) + (char<=? elt (cdaar ls))) + ;; split a range + (apply + (lambda (left right) + (let ((left-copy (nfa-multi-state-copy (cdar ls))) + (right-copy (nfa-multi-state-copy (cdar ls)))) + (cons (cons elt (nfa-multi-state-add! (cdar ls) state)) + (append (if left (list (cons left left-copy)) '()) + (if right (list (cons right right-copy)) '()) + res + (cdr ls))))) + (split-char-range (caar ls) elt))) + (else + ;; keep looking + (lp (cdr ls) (cons (car ls) res)))))) + (else + (let ((lo (car elt)) + (hi (cdr elt))) + (let lp ((ls existing) (res '())) + (cond + ((null? ls) + ;; done, just cons this on to the original list + (cons (cons elt (nfa-state->multi-state nfa state)) existing)) + ((and (char? (caar ls)) (char<=? lo (caar ls)) (char<=? (caar ls) hi)) + ;; range enclosing a character + (apply + (lambda (left right) + (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) + (join! (join! existing left state) right state)) + (split-char-range elt (caar ls)))) + ((and (pair? (caar ls)) + (or (and (char<=? (caaar ls) hi) (char<=? lo (cdaar ls))) + (and (char<=? hi (caaar ls)) (char<=? (cdaar ls) lo)))) + ;; overlapping ranges + (apply + (lambda (left1 left2 same right1 right2) ;; 5 regions + (let ((right1-copy (nfa-multi-state-copy (cdar ls))) + (right2-copy (nfa-multi-state-copy (cdar ls)))) + (set-car! (car ls) same) + (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) + (let* ((res (if right1 + (cons (cons right1 right1-copy) existing) + existing)) + (res (if right2 + (cons (cons right2 right2-copy) res) + res))) + (join! (join! res left1 state) left2 state)))) + (intersect-char-ranges elt (caar ls)))) + (else + (lp (cdr ls) (cons (car ls) res))))))))) + +(define (char-range c1 c2) + (if (eqv? c1 c2) c1 (cons c1 c2))) + +;; assumes ch is included in the range +(define (split-char-range range ch) + (list + (and (not (eqv? ch (car range))) + (char-range (car range) (integer->char (- (char->integer ch) 1)))) + (and (not (eqv? ch (cdr range))) + (char-range (integer->char (+ (char->integer ch) 1)) (cdr range))))) + +;; returns 5 (possibly #f) char ranges: +;; a-only-1 a-only-2 a-and-b b-only-1 b-only-2 +(define (intersect-char-ranges a b) + (if (char>? (car a) (car b)) + (reverse (intersect-char-ranges b a)) + (let ((a-lo (car a)) + (a-hi (cdr a)) + (b-lo (car b)) + (b-hi (cdr b))) + (list + (and (char<? a-lo b-lo) + (char-range a-lo (integer->char (- (char->integer b-lo) 1)))) + (and (char>? a-hi b-hi) + (char-range (integer->char (+ (char->integer b-hi) 1)) a-hi)) + (char-range b-lo (if (char<? b-hi a-hi) b-hi a-hi)) + #f + (and (char>? b-hi a-hi) + (char-range (integer->char (+ (char->integer a-hi) 1)) b-hi)))))) + +(define (nfa-cache-state-closure! nfa state) + (let ((cached (nfa-get-state-closure nfa state))) + (cond + ((not (null? cached)) + cached) + (else + (let ((res (nfa-state-closure-internal nfa state))) + (nfa-set-state-closure! nfa state res) + res))))) + +;; The `closure' of a list of NFA states - all states that can be +;; reached from any of them using any number of epsilon transitions. +(define (nfa-state-closure-internal nfa state) + (let lp ((ls (list state)) + (res (make-nfa-multi-state nfa))) + (cond + ((null? ls) + res) + ((nfa-multi-state-contains? res (car ls)) + (lp (cdr ls) res)) + (else + (lp (append (nfa-get-epsilons nfa (car ls)) (cdr ls)) + (nfa-multi-state-add! res (car ls))))))) + +(define (nfa-closure-internal nfa states) + (nfa-multi-state-fold + states + (lambda (st res) + (nfa-multi-state-union! res (nfa-cache-state-closure! nfa st))) + (make-nfa-multi-state nfa))) + +(define (nfa-closure nfa states) + (or (nfa-get-closure nfa states) + (let ((res (nfa-closure-internal nfa states))) + (nfa-add-closure! nfa states res) + res))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Match Extraction +;; +;; DFAs don't give us match information, so once we match and +;; determine the start and end, we need to recursively break the +;; problem into smaller DFAs to get each submatch. +;; +;; See http://compilers.iecc.com/comparch/article/07-10-026 + +(define (match-vector-ref v i) (vector-ref v (+ 3 i))) + +(define (match-vector-set! v i x) (vector-set! v (+ 3 i) x)) + +(define (sre-match-extractor sre num-submatches) + (let* ((tmp (+ num-submatches 1)) + (tmp-end-src-offset (+ 2 (* tmp 4))) + (tmp-end-index-offset (+ 3 (* tmp 4)))) + (let lp ((sre sre) (n 1) (submatch-deps? #f)) + (cond + ((not (sre-has-submatches? sre)) + (if (not submatch-deps?) + (lambda (cnk start i end j matches) #t) + (let ((dfa (nfa->dfa (sre->nfa sre ~none)))) + (lambda (cnk start i end j matches) + (dfa-match/longest dfa cnk start i end j matches tmp))))) + ((pair? sre) + (case (car sre) + ((: seq) + (let* ((right (sre-sequence (cddr sre))) + (match-left (lp (cadr sre) n #t)) + (match-right + (lp right (+ n (sre-count-submatches (cadr sre))) #t))) + (lambda (cnk start i end j matches) + (let lp1 ((end2 end) (j2 j) (best-src #f) (best-index #f)) + (let ((limit (if (eq? start end2) + i + ((chunker-get-start cnk) end2)))) + (let lp2 ((k j2) (best-src best-src) (best-index best-index)) + (if (< k limit) + (cond + ((not (eq? start end2)) + (let ((prev (chunker-prev-chunk cnk start end2))) + (lp1 prev + ((chunker-get-end cnk) prev) + best-src + best-index))) + (best-src + (match-vector-set! matches tmp-end-src-offset best-src) + (match-vector-set! matches tmp-end-index-offset best-index) + #t) + (else + #f)) + (if (and (match-left cnk start i end2 k matches) + (eq? end2 (match-vector-ref matches + tmp-end-src-offset)) + (eqv? k (match-vector-ref matches + tmp-end-index-offset)) + (match-right cnk end2 k end j matches)) + (let ((right-src + (match-vector-ref matches tmp-end-src-offset)) + (right + (match-vector-ref matches tmp-end-index-offset))) + (cond + ((and (eq? end right-src) (eqv? j right)) + (match-vector-set! matches tmp-end-src-offset end) + (match-vector-set! matches tmp-end-index-offset j) + #t) + ((or (not best-src) + (if (eq? best-src right-src) + (> right best-index) + (chunk-before? cnk + best-src + right-src))) + (lp2 (- k 1) right-src right)) + (else + (lp2 (- k 1) best-src best-index)))) + (lp2 (- k 1) best-src best-index))))))))) + ((or) + (if (null? (cdr sre)) + (lambda (cnk start i end j matches) #f) + (let* ((rest (sre-alternate (cddr sre))) + (match-first + (lp (cadr sre) n #t)) + (match-rest + (lp rest + (+ n (sre-count-submatches (cadr sre))) + submatch-deps?))) + (lambda (cnk start i end j matches) + (or (and (match-first cnk start i end j matches) + (eq? end (match-vector-ref matches tmp-end-src-offset)) + (eqv? j (match-vector-ref matches tmp-end-index-offset))) + (match-rest cnk start i end j matches)))))) + ((* +) + (letrec ((match-once + (lp (sre-sequence (cdr sre)) n #t)) + (match-all + (lambda (cnk start i end j matches) + (if (match-once cnk start i end j matches) + (let ((src (match-vector-ref matches tmp-end-src-offset)) + (k (match-vector-ref matches tmp-end-index-offset))) + (if (and src (or (not (eq? start src)) (< i k))) + (match-all cnk src k end j matches) + #t)) + (begin + (match-vector-set! matches tmp-end-src-offset start) + (match-vector-set! matches tmp-end-index-offset i) + #t))))) + (if (eq? '* (car sre)) + match-all + (lambda (cnk start i end j matches) + (and (match-once cnk start i end j matches) + (let ((src (match-vector-ref matches tmp-end-src-offset)) + (k (match-vector-ref matches tmp-end-index-offset))) + (match-all cnk src k end j matches))))))) + ((?) + (let ((match-once (lp (sre-sequence (cdr sre)) n #t))) + (lambda (cnk start i end j matches) + (cond + ((match-once cnk start i end j matches) + #t) + (else + (match-vector-set! matches tmp-end-src-offset start) + (match-vector-set! matches tmp-end-index-offset i) + #t))))) + (($ submatch => submatch-named) + (let ((match-one + (lp (sre-sequence (if (memq (car sre) '($ submatch)) + (cdr sre) + (cddr sre))) + (+ n 1) + #t)) + (start-src-offset (* n 4)) + (start-index-offset (+ 1 (* n 4))) + (end-src-offset (+ 2 (* n 4))) + (end-index-offset (+ 3 (* n 4)))) + (lambda (cnk start i end j matches) + (cond + ((match-one cnk start i end j matches) + (match-vector-set! matches start-src-offset start) + (match-vector-set! matches start-index-offset i) + (match-vector-set! matches end-src-offset + (match-vector-ref matches tmp-end-src-offset)) + (match-vector-set! matches end-index-offset + (match-vector-ref matches tmp-end-index-offset)) + #t) + (else + #f))))) + (else + (%irregex-error "unknown regexp operator" (car sre))))) + (else + (%irregex-error "unknown regexp" sre)))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Closure Compilation +;; +;; We use this for non-regular expressions instead of an interpreted +;; NFA matcher. We use backtracking anyway, but this gives us more +;; freedom of implementation, allowing us to support patterns that +;; can't be represented in the above NFA representation. + +(define (sre->procedure sre . o) + (define names + (if (and (pair? o) (pair? (cdr o))) (cadr o) (sre-names sre 1 '()))) + (let lp ((sre sre) + (n 1) + (flags (if (pair? o) (car o) ~none)) + (next (lambda (cnk init src str i end matches fail) + (irregex-match-start-chunk-set! matches 0 (car init)) + (irregex-match-start-index-set! matches 0 (cdr init)) + (irregex-match-end-chunk-set! matches 0 src) + (irregex-match-end-index-set! matches 0 i) + (%irregex-match-fail-set! matches fail) + matches))) + ;; XXXX this should be inlined + (define (rec sre) (lp sre n flags next)) + (cond + ((pair? sre) + (if (string? (car sre)) + (sre-cset->procedure + (sre->cset (car sre) (flag-set? flags ~case-insensitive?)) + next) + (case (car sre) + ((~ - & /) + (sre-cset->procedure + (sre->cset sre (flag-set? flags ~case-insensitive?)) + next)) + ((or) + (case (length (cdr sre)) + ((0) (lambda (cnk init src str i end matches fail) (fail))) + ((1) (rec (cadr sre))) + (else + (let* ((first (rec (cadr sre))) + (rest (lp (sre-alternate (cddr sre)) + (+ n (sre-count-submatches (cadr sre))) + flags + next))) + (lambda (cnk init src str i end matches fail) + (first cnk init src str i end matches + (lambda () + (rest cnk init src str i end matches fail)))))))) + ((w/case) + (lp (sre-sequence (cdr sre)) + n + (flag-clear flags ~case-insensitive?) + next)) + ((w/nocase) + (lp (sre-sequence (cdr sre)) + n + (flag-join flags ~case-insensitive?) + next)) + ((w/utf8) + (lp (sre-sequence (cdr sre)) n (flag-join flags ~utf8?) next)) + ((w/noutf8) + (lp (sre-sequence (cdr sre)) n (flag-clear flags ~utf8?) next)) + ((seq :) + (case (length (cdr sre)) + ((0) next) + ((1) (rec (cadr sre))) + (else + (let ((rest (lp (sre-sequence (cddr sre)) + (+ n (sre-count-submatches (cadr sre))) + flags + next))) + (lp (cadr sre) n flags rest))))) + ((?) + (let ((body (rec (sre-sequence (cdr sre))))) + (lambda (cnk init src str i end matches fail) + (body cnk init src str i end matches + (lambda () (next cnk init src str i end matches fail)))))) + ((??) + (let ((body (rec (sre-sequence (cdr sre))))) + (lambda (cnk init src str i end matches fail) + (next cnk init src str i end matches + (lambda () (body cnk init src str i end matches fail)))))) + ((*) + (cond + ((sre-empty? (sre-sequence (cdr sre))) + (%irregex-error "invalid sre: empty *" sre)) + (else + (letrec + ((body + (lp (sre-sequence (cdr sre)) + n + flags + (lambda (cnk init src str i end matches fail) + (body cnk init src str i end matches + (lambda () + (next cnk init src str i end matches fail) + )))))) + (lambda (cnk init src str i end matches fail) + (body cnk init src str i end matches + (lambda () + (next cnk init src str i end matches fail)))))))) + ((*?) + (cond + ((sre-empty? (sre-sequence (cdr sre))) + (%irregex-error "invalid sre: empty *?" sre)) + (else + (letrec + ((body + (lp (sre-sequence (cdr sre)) + n + flags + (lambda (cnk init src str i end matches fail) + (next cnk init src str i end matches + (lambda () + (body cnk init src str i end matches fail) + )))))) + (lambda (cnk init src str i end matches fail) + (next cnk init src str i end matches + (lambda () + (body cnk init src str i end matches fail)))))))) + ((+) + (lp (sre-sequence (cdr sre)) + n + flags + (rec (list '* (sre-sequence (cdr sre)))))) + ((=) + (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre)))) + ((>=) + (rec `(** ,(cadr sre) #f ,@(cddr sre)))) + ((** **?) + (cond + ((or (and (number? (cadr sre)) + (number? (caddr sre)) + (> (cadr sre) (caddr sre))) + (and (not (cadr sre)) (caddr sre))) + (lambda (cnk init src str i end matches fail) (fail))) + (else + (let* ((from (cadr sre)) + (to (caddr sre)) + (? (if (eq? '** (car sre)) '? '??)) + (* (if (eq? '** (car sre)) '* '*?)) + (sre (sre-sequence (cdddr sre))) + (x-sre (sre-strip-submatches sre)) + (next (if to + (if (= from to) + next + (fold (lambda (x next) + (lp `(,? ,sre) n flags next)) + next + (zero-to (- to from)))) + (rec `(,* ,sre))))) + (if (zero? from) + next + (lp `(seq ,@(map (lambda (x) x-sre) (zero-to (- from 1))) + ,sre) + n + flags + next)))))) + ((word) + (rec `(seq bow ,@(cdr sre) eow))) + ((word+) + (rec `(seq bow (+ (& (or alphanumeric "_") + (or ,@(cdr sre)))) eow))) + ((posix-string) + (rec (string->sre (cadr sre)))) + ((look-ahead) + (let ((check + (lp (sre-sequence (cdr sre)) + n + flags + (lambda (cnk init src str i end matches fail) i)))) + (lambda (cnk init src str i end matches fail) + (if (check cnk init src str i end matches (lambda () #f)) + (next cnk init src str i end matches fail) + (fail))))) + ((neg-look-ahead) + (let ((check + (lp (sre-sequence (cdr sre)) + n + flags + (lambda (cnk init src str i end matches fail) i)))) + (lambda (cnk init src str i end matches fail) + (if (check cnk init src str i end matches (lambda () #f)) + (fail) + (next cnk init src str i end matches fail))))) + ((look-behind neg-look-behind) + (let ((check + (lp (sre-sequence + (cons '(* any) (append (cdr sre) '(eos)))) + n + flags + (lambda (cnk init src str i end matches fail) i)))) + (lambda (cnk init src str i end matches fail) + (let* ((prev ((chunker-get-substring cnk) + (car init) + (cdr init) + src + i)) + (len (string-length prev)) + (src2 (list prev 0 len))) + (if ((if (eq? (car sre) 'look-behind) (lambda (x) x) not) + (check irregex-basic-string-chunker + (cons src2 0) src2 prev 0 len matches (lambda () #f))) + (next cnk init src str i end matches fail) + (fail)))))) + ((atomic) + (let ((once + (lp (sre-sequence (cdr sre)) + n + flags + (lambda (cnk init src str i end matches fail) i)))) + (lambda (cnk init src str i end matches fail) + (let ((j (once cnk init src str i end matches (lambda () #f)))) + (if j + (next cnk init src str j end matches fail) + (fail)))))) + ((if) + (let* ((test-submatches (sre-count-submatches (cadr sre))) + (pass (lp (caddr sre) flags (+ n test-submatches) next)) + (fail (if (pair? (cdddr sre)) + (lp (cadddr sre) + (+ n test-submatches + (sre-count-submatches (caddr sre))) + flags + next) + (lambda (cnk init src str i end matches fail) + (fail))))) + (cond + ((or (number? (cadr sre)) (symbol? (cadr sre))) + (let ((index + (if (symbol? (cadr sre)) + (cond + ((assq (cadr sre) names) => cdr) + (else + (%irregex-error "unknown named backref in SRE IF" sre))) + (cadr sre)))) + (lambda (cnk init src str i end matches fail2) + (if (%irregex-match-end-chunk matches index) + (pass cnk init src str i end matches fail2) + (fail cnk init src str i end matches fail2))))) + (else + (let ((test (lp (cadr sre) n flags pass))) + (lambda (cnk init src str i end matches fail2) + (test cnk init src str i end matches + (lambda () (fail cnk init src str i end matches fail2))) + )))))) + ((backref backref-ci) + (let ((n (cond ((number? (cadr sre)) (cadr sre)) + ((assq (cadr sre) names) => cdr) + (else (%irregex-error "unknown backreference" (cadr sre))))) + (compare (if (or (eq? (car sre) 'backref-ci) + (flag-set? flags ~case-insensitive?)) + string-ci=? + string=?))) + (lambda (cnk init src str i end matches fail) + (let ((s (irregex-match-substring matches n))) + (if (not s) + (fail) + ;; XXXX create an abstract subchunk-compare + (let lp ((src src) + (str str) + (i i) + (end end) + (j 0) + (len (string-length s))) + (cond + ((<= len (- end i)) + (cond + ((compare (substring s j (string-length s)) + (substring str i (+ i len))) + (next cnk init src str (+ i len) end matches fail)) + (else + (fail)))) + (else + (cond + ((compare (substring s j (+ j (- end i))) + (substring str i end)) + (let ((src2 ((chunker-get-next cnk) src))) + (if src2 + (lp src2 + ((chunker-get-str cnk) src2) + ((chunker-get-start cnk) src2) + ((chunker-get-end cnk) src2) + (+ j (- end i)) + (- len (- end i))) + (fail)))) + (else + (fail))))))))))) + ((dsm) + (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) flags next)) + (($ submatch) + (let ((body + (lp (sre-sequence (cdr sre)) + (+ n 1) + flags + (lambda (cnk init src str i end matches fail) + (let ((old-source + (%irregex-match-end-chunk matches n)) + (old-index + (%irregex-match-end-index matches n))) + (irregex-match-end-chunk-set! matches n src) + (irregex-match-end-index-set! matches n i) + (next cnk init src str i end matches + (lambda () + (irregex-match-end-chunk-set! + matches n old-source) + (irregex-match-end-index-set! + matches n old-index) + (fail)))))))) + (lambda (cnk init src str i end matches fail) + (let ((old-source (%irregex-match-start-chunk matches n)) + (old-index (%irregex-match-start-index matches n))) + (irregex-match-start-chunk-set! matches n src) + (irregex-match-start-index-set! matches n i) + (body cnk init src str i end matches + (lambda () + (irregex-match-start-chunk-set! + matches n old-source) + (irregex-match-start-index-set! + matches n old-index) + (fail))))))) + ((=> submatch-named) + (rec `(submatch ,@(cddr sre)))) + (else + (%irregex-error "unknown regexp operator" sre))))) + ((symbol? sre) + (case sre + ((any) + (lambda (cnk init src str i end matches fail) + (if (< i end) + (next cnk init src str (+ i 1) end matches fail) + (let ((src2 ((chunker-get-next cnk) src))) + (if src2 + (let ((str2 ((chunker-get-str cnk) src2)) + (i2 ((chunker-get-start cnk) src2)) + (end2 ((chunker-get-end cnk) src2))) + (next cnk init src2 str2 (+ i2 1) end2 matches fail)) + (fail)))))) + ((nonl) + (lambda (cnk init src str i end matches fail) + (if (< i end) + (if (not (eqv? #\newline (string-ref str i))) + (next cnk init src str (+ i 1) end matches fail) + (fail)) + (let ((src2 ((chunker-get-next cnk) src))) + (if src2 + (let ((str2 ((chunker-get-str cnk) src2)) + (i2 ((chunker-get-start cnk) src2)) + (end2 ((chunker-get-end cnk) src2))) + (if (not (eqv? #\newline (string-ref str2 i2))) + (next cnk init src2 str2 (+ i2 1) end2 matches fail) + (fail))) + (fail)))))) + ((bos) + (lambda (cnk init src str i end matches fail) + (if (and (eq? src (car init)) (eqv? i (cdr init))) + (next cnk init src str i end matches fail) + (fail)))) + ((bol) + (lambda (cnk init src str i end matches fail) + (if (or (and (eq? src (car init)) (eqv? i (cdr init))) + (and (> i ((chunker-get-start cnk) src)) + (eqv? #\newline (string-ref str (- i 1))))) + (next cnk init src str i end matches fail) + (fail)))) + ((bow) + (lambda (cnk init src str i end matches fail) + (if (and (or (if (> i ((chunker-get-start cnk) src)) + (not (char-alphanumeric? (string-ref str (- i 1)))) + (let ((ch (chunker-prev-char cnk src end))) + (and ch (not (char-alphanumeric? ch))))) + (and (eq? src (car init)) (eqv? i (cdr init)))) + (if (< i end) + (char-alphanumeric? (string-ref str i)) + (let ((next ((chunker-get-next cnk) src))) + (and next + (char-alphanumeric? + (string-ref ((chunker-get-str cnk) next) + ((chunker-get-start cnk) next))))))) + (next cnk init src str i end matches fail) + (fail)))) + ((eos) + (lambda (cnk init src str i end matches fail) + (if (and (>= i end) (not ((chunker-get-next cnk) src))) + (next cnk init src str i end matches fail) + (fail)))) + ((eol) + (lambda (cnk init src str i end matches fail) + (if (if (< i end) + (eqv? #\newline (string-ref str i)) + (let ((src2 ((chunker-get-next cnk) src))) + (if (not src2) + #t + (eqv? #\newline + (string-ref ((chunker-get-str cnk) src2) + ((chunker-get-start cnk) src2)))))) + (next cnk init src str i end matches fail) + (fail)))) + ((eow) + (lambda (cnk init src str i end matches fail) + (if (and (if (< i end) + (not (char-alphanumeric? (string-ref str i))) + (let ((ch (chunker-next-char cnk src))) + (or (not ch) (not (char-alphanumeric? ch))))) + (if (> i ((chunker-get-start cnk) src)) + (char-alphanumeric? (string-ref str (- i 1))) + (let ((prev (chunker-prev-char cnk init src))) + (or (not prev) (char-alphanumeric? prev))))) + (next cnk init src str i end matches fail) + (fail)))) + ((nwb) ;; non-word-boundary + (lambda (cnk init src str i end matches fail) + (let ((c1 (if (< i end) + (string-ref str i) + (chunker-next-char cnk src))) + (c2 (if (> i ((chunker-get-start cnk) src)) + (string-ref str (- i 1)) + (chunker-prev-char cnk init src)))) + (if (and c1 c2 + (if (char-alphanumeric? c1) + (char-alphanumeric? c2) + (not (char-alphanumeric? c2)))) + (next cnk init src str i end matches fail) + (fail))))) + ((epsilon) + next) + (else + (let ((cell (assq sre sre-named-definitions))) + (if cell + (rec (cdr cell)) + (%irregex-error "unknown regexp" sre)))))) + ((char? sre) + (if (flag-set? flags ~case-insensitive?) + ;; case-insensitive + (lambda (cnk init src str i end matches fail) + (if (>= i end) + (let lp ((src2 ((chunker-get-next cnk) src))) + (if src2 + (let ((str2 ((chunker-get-str cnk) src2)) + (i2 ((chunker-get-start cnk) src2)) + (end2 ((chunker-get-end cnk) src2))) + (if (>= i2 end2) + (lp ((chunker-get-next cnk) src2)) + (if (char-ci=? sre (string-ref str2 i2)) + (next cnk init src2 str2 (+ i2 1) end2 + matches fail) + (fail)))) + (fail))) + (if (char-ci=? sre (string-ref str i)) + (next cnk init src str (+ i 1) end matches fail) + (fail)))) + ;; case-sensitive + (lambda (cnk init src str i end matches fail) + (if (>= i end) + (let lp ((src2 ((chunker-get-next cnk) src))) + (if src2 + (let ((str2 ((chunker-get-str cnk) src2)) + (i2 ((chunker-get-start cnk) src2)) + (end2 ((chunker-get-end cnk) src2))) + (if (>= i2 end2) + (lp ((chunker-get-next cnk) src2)) + (if (char=? sre (string-ref str2 i2)) + (next cnk init src2 str2 (+ i2 1) end2 + matches fail) + (fail)))) + (fail))) + (if (char=? sre (string-ref str i)) + (next cnk init src str (+ i 1) end matches fail) + (fail)))) + )) + ((string? sre) + (rec (sre-sequence (string->list sre))) +;; XXXX reintroduce faster string matching on chunks +;; (if (flag-set? flags ~case-insensitive?) +;; (rec (sre-sequence (string->list sre))) +;; (let ((len (string-length sre))) +;; (lambda (cnk init src str i end matches fail) +;; (if (and (<= (+ i len) end) +;; (%substring=? sre str 0 i len)) +;; (next str (+ i len) matches fail) +;; (fail))))) + ) + (else + (%irregex-error "unknown regexp" sre))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Character Sets +;; +;; Simple character sets as lists of ranges, as used in the NFA/DFA +;; compilation. This is not especially efficient, but is portable and +;; scalable for any range of character sets. + +(define (sre-cset->procedure cset next) + (lambda (cnk init src str i end matches fail) + (if (< i end) + (if (cset-contains? cset (string-ref str i)) + (next cnk init src str (+ i 1) end matches fail) + (fail)) + (let ((src2 ((chunker-get-next cnk) src))) + (if src2 + (let ((str2 ((chunker-get-str cnk) src2)) + (i2 ((chunker-get-start cnk) src2)) + (end2 ((chunker-get-end cnk) src2))) + (if (cset-contains? cset (string-ref str2 i2)) + (next cnk init src2 str2 (+ i2 1) end2 matches fail) + (fail))) + (fail)))))) + +(define (plist->alist ls) + (let lp ((ls ls) (res '())) + (if (null? ls) + (reverse res) + (lp (cddr ls) (cons (cons (car ls) (cadr ls)) res))))) + +(define (alist->plist ls) + (let lp ((ls ls) (res '())) + (if (null? ls) + (reverse res) + (lp (cdr ls) (cons (cdar ls) (cons (caar ls) res)))))) + +(define (sre->cset sre . o) + (let lp ((sre sre) (ci? (and (pair? o) (car o)))) + (define (rec sre) (lp sre ci?)) + (cond + ((pair? sre) + (if (string? (car sre)) + (if ci? + (cset-case-insensitive (string->list (car sre))) + (string->list (car sre))) + (case (car sre) + ((~) + (cset-complement + (fold cset-union (rec (cadr sre)) (map rec (cddr sre))))) + ((&) + (fold cset-intersection (rec (cadr sre)) (map rec (cddr sre)))) + ((-) + (fold (lambda (x res) (cset-difference res x)) + (rec (cadr sre)) + (map rec (cddr sre)))) + ((/) + (let ((res (plist->alist (sre-flatten-ranges (cdr sre))))) + (if ci? + (cset-case-insensitive res) + res))) + ((or) + (fold cset-union (rec (cadr sre)) (map rec (cddr sre)))) + ((w/case) + (lp (sre-alternate (cdr sre)) #f)) + ((w/nocase) + (lp (sre-alternate (cdr sre)) #t)) + (else + (%irregex-error "not a valid sre char-set operator" sre))))) + ((char? sre) (rec (list (string sre)))) + ((string? sre) (rec (list sre))) + (else + (let ((cell (assq sre sre-named-definitions))) + (if cell + (rec (cdr cell)) + (%irregex-error "not a valid sre char-set" sre))))))) + +;; another debugging utility +;; (define (cset->sre cset) +;; (let lp ((ls cset) (chars '()) (ranges '())) +;; (cond +;; ((null? ls) +;; (sre-alternate +;; (append +;; (if (pair? chars) (list (list (list->string chars))) '()) +;; (if (pair? ranges) (list (cons '/ (alist->plist ranges))) '())))) +;; ((char? (car ls)) (lp (cdr ls) (cons (car ls) chars) ranges)) +;; (else (lp (cdr ls) chars (cons (car ls) ranges)))))) + +(define (cset-contains? cset ch) + (find (lambda (x) + (or (eqv? x ch) + (and (pair? x) (char<=? (car x) ch) (char<=? ch (cdr x))))) + cset)) + +(define (cset-range x) + (if (char? x) (cons x x) x)) + +(define (char-ranges-overlap? a b) + (if (pair? a) + (if (pair? b) + (or (and (char<=? (car a) (cdr b)) (char<=? (car b) (cdr a))) + (and (char<=? (cdr b) (car a)) (char<=? (cdr a) (car b)))) + (and (char<=? (car a) b) (char<=? b (cdr a)))) + (if (pair? b) + (char-ranges-overlap? b a) + (eqv? a b)))) + +(define (char-ranges-union a b) + (cons (if (char<=? (car a) (car b)) (car a) (car b)) + (if (char>=? (cdr a) (cdr b)) (cdr a) (cdr b)))) + +(define (cset-union a b) + (cond ((null? b) a) + ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) + => (lambda (ls) + (cset-union + (cset-union (append (take-up-to a ls) (cdr ls)) + (list (char-ranges-union (cset-range (car ls)) + (cset-range (car b))))) + (cdr b)))) + (else (cset-union (cons (car b) a) (cdr b))))) + +(define (cset-difference a b) + (cond ((null? b) a) + ((not (car b)) (cset-difference a (cdr b))) + ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) + => (lambda (ls) + (apply + (lambda (left1 left2 same right1 right2) + (let* ((a (append (take-up-to a ls) (cdr ls))) + (a (if left1 (cons left1 a) a)) + (a (if left2 (cons left2 a) a)) + (b (if right1 (cset-union b (list right1)) b)) + (b (if right2 (cset-union b (list right2)) b))) + (cset-difference a b))) + (intersect-char-ranges (cset-range (car ls)) + (cset-range (car b)))))) + (else (cset-difference a (cdr b))))) + +(define (cset-intersection a b) + (let intersect ((a a) (b b) (res '())) + (cond ((null? b) res) + ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) + => (lambda (ls) + (apply + (lambda (left1 left2 same right1 right2) + (let* ((a (append (take-up-to a ls) (cdr ls))) + (a (if left1 (cons left1 a) a)) + (a (if left2 (cons left2 a) a)) + (b (if right1 (cset-union b (list right1)) b)) + (b (if right2 (cset-union b (list right2)) b))) + (intersect a b (cset-union res (list same))))) + (intersect-char-ranges (cset-range (car ls)) + (cset-range (car b)))))) + (else (intersect a (cdr b) res))))) + +(define (cset-complement a) + (cset-difference (sre->cset *all-chars*) a)) + +(define (cset-case-insensitive a) + (let lp ((ls a) (res '())) + (cond ((null? ls) (reverse res)) + ((and (char? (car ls)) (char-alphabetic? (car ls))) + (let ((c2 (char-altcase (car ls))) + (res (cons (car ls) res))) + (lp (cdr ls) (if (cset-contains? res c2) res (cons c2 res))))) + ((and (pair? (car ls)) + (char-alphabetic? (caar ls)) + (char-alphabetic? (cdar ls))) + (lp (cdr ls) + (cset-union (cset-union res (list (car ls))) + (list (cons (char-altcase (caar ls)) + (char-altcase (cdar ls))))))) + (else (lp (cdr ls) (cset-union res (list (car ls)))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Match and Replace Utilities + +(define (irregex-fold/fast irx kons knil str . o) + (if (not (string? str)) (%irregex-error 'irregex-fold "not a string" str)) + (let* ((irx (irregex irx)) + (matches (irregex-new-matches irx)) + (finish (or (and (pair? o) (car o)) (lambda (i acc) acc))) + (start (if (and (pair? o) (pair? (cdr o))) (cadr o) 0)) + (end (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o))) + (caddr o) + (string-length str)))) + (if (not (integer? start)) (%irregex-error 'irregex-fold "not an integer" start)) + (if (not (integer? end)) (%irregex-error 'irregex-fold "not an integer" end)) + (irregex-match-chunker-set! matches irregex-basic-string-chunker) + (let lp ((i start) (acc knil)) + (if (>= i end) + (finish i acc) + (let ((m (irregex-search/matches + irx + irregex-basic-string-chunker + (list str i end) + i + matches))) + (if (not m) + (finish i acc) + (let* ((end (%irregex-match-end-index m 0)) + (acc (kons i m acc))) + (irregex-reset-matches! matches) + (lp end acc)))))))) + +(define (irregex-fold irx kons . args) + (if (not (procedure? kons)) (%irregex-error 'irregex-fold "not a procedure" kons)) + (let ((kons2 (lambda (i m acc) (kons i (irregex-copy-matches m) acc)))) + (apply irregex-fold/fast irx kons2 args))) + +(define (irregex-fold/chunked/fast irx kons knil cnk start . o) + (let* ((irx (irregex irx)) + (matches (irregex-new-matches irx)) + (finish (or (and (pair? o) (car o)) (lambda (src i acc) acc))) + (i (if (and (pair? o) (pair? (cdr o))) + (cadr o) + ((chunker-get-start cnk) start)))) + (if (not (integer? i)) (%irregex-error 'irregex-fold/chunked "not an integer" i)) + (irregex-match-chunker-set! matches cnk) + (let lp ((start start) (i i) (acc knil)) + (if (not start) + (finish start i acc) + (let ((m (irregex-search/matches irx cnk start i matches))) + (if (not m) + (finish start i acc) + (let* ((acc (kons start i m acc)) + (end-src (%irregex-match-end-chunk m 0)) + (end-index (%irregex-match-end-index m 0))) + (irregex-reset-matches! matches) + (lp end-src end-index acc)))))))) + +(define (irregex-fold/chunked irx kons . args) + (if (not (procedure? kons)) (%irregex-error 'irregex-fold/chunked "not a procedure" kons)) + (let ((kons2 (lambda (s i m acc) (kons s i (irregex-copy-matches m) acc)))) + (apply irregex-fold/chunked/fast irx kons2 args))) + +(define (irregex-replace irx str . o) + (if (not (string? str)) (%irregex-error 'irregex-replace "not a string" str)) + (let ((m (irregex-search irx str))) + (and + m + (string-cat-reverse + (cons (substring str (%irregex-match-end-index m 0) (string-length str)) + (append (irregex-apply-match m o) + (list (substring str 0 (%irregex-match-start-index m 0))) + )))))) + +(define (irregex-replace/all irx str . o) + (if (not (string? str)) (%irregex-error 'irregex-replace/all "not a string" str)) + (irregex-fold/fast + irx + (lambda (i m acc) + (let ((m-start (%irregex-match-start-index m 0))) + (append (irregex-apply-match m o) + (if (>= i m-start) + acc + (cons (substring str i m-start) acc))))) + '() + str + (lambda (i acc) + (let ((end (string-length str))) + (string-cat-reverse (if (>= i end) + acc + (cons (substring str i end) acc))))))) + +(define (irregex-apply-match m ls) + (let lp ((ls ls) (res '())) + (if (null? ls) + res + (cond + ((integer? (car ls)) + (lp (cdr ls) + (cons (or (irregex-match-substring m (car ls)) "") res))) + ((procedure? (car ls)) + (lp (cdr ls) (cons ((car ls) m) res))) + ((symbol? (car ls)) + (case (car ls) + ((pre) + (lp (cdr ls) + (cons (substring (car (%irregex-match-start-chunk m 0)) + 0 + (%irregex-match-start-index m 0)) + res))) + ((post) + (let ((str (car (%irregex-match-start-chunk m 0)))) + (lp (cdr ls) + (cons (substring str + (%irregex-match-end-index m 0) + (string-length str)) + res)))) + (else + (cond + ((assq (car ls) (irregex-match-names m)) + => (lambda (x) (lp (cons (cdr x) (cdr ls)) res))) + (else + (%irregex-error "unknown match replacement" (car ls))))))) + (else + (lp (cdr ls) (cons (car ls) res))))))) + +(define (irregex-extract irx str . o) + (if (not (string? str)) (%irregex-error 'irregex-extract "not a string" str)) + (apply irregex-fold/fast + irx + (lambda (i m a) (cons (irregex-match-substring m) a)) + '() + str + (lambda (i a) (reverse a)) + o)) + +(define (irregex-split irx str . o) + (if (not (string? str)) (%irregex-error 'irregex-split "not a string" str)) + (let ((start (if (pair? o) (car o) 0)) + (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str)))) + (irregex-fold/fast + irx + (lambda (i m a) + (if (= i (%irregex-match-start-index m 0)) + a + (cons (substring str i (%irregex-match-start-index m 0)) a))) + '() + str + (lambda (i a) + (reverse (if (= i end) a (cons (substring str i end) a)))) + start + end))) diff --git a/irregex-utils.scm b/irregex-utils.scm new file mode 100644 index 00000000..8332791d --- /dev/null +++ b/irregex-utils.scm @@ -0,0 +1,154 @@ +;;;; irregex-utils.scm +;; +;; Copyright (c) 2010 Alex Shinn. All rights reserved. +;; BSD-style license: http://synthcode.com/license.txt + +(define rx-special-chars + "\\|[](){}.*+?^$#") + +(define (string-scan-char str c . o) + (let ((end (string-length str))) + (let scan ((i (if (pair? o) (car o) 0))) + (cond ((= i end) #f) + ((eqv? c (string-ref str i)) i) + (else (scan (+ i 1))))))) + +(define (irregex-quote str) + (list->string + (let loop ((ls (string->list str)) (res '())) + (if (null? ls) + (reverse res) + (let ((c (car ls))) + (if (string-scan-char rx-special-chars c) + (loop (cdr ls) (cons c (cons #\\ res))) + (loop (cdr ls) (cons c res)))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define (irregex-opt ls) + (define (make-alt ls) + (cond ((null? (cdr ls)) (car ls)) + ((every char? ls) (list (list->string ls))) + (else (cons 'or ls)))) + (define (make-seq ls) + (cond ((null? (cdr ls)) (car ls)) + ((every (lambda (x) (or (string? x) (char? x))) ls) + (apply string-append (map (lambda (x) (if (char? x) (string x) x)) ls))) + (else (cons 'seq ls)))) + (cond + ((null? ls) "") + ((null? (cdr ls)) (car ls)) + (else + (let ((chars (make-vector 256 '()))) + (let lp1 ((ls ls) (empty? #f)) + (if (null? ls) + (let lp2 ((i 0) (res '())) + (if (= i 256) + (let ((res (make-alt (reverse res)))) + (if empty? `(? ,res) res)) + (let ((c (integer->char i)) + (opts (vector-ref chars i))) + (lp2 (+ i 1) + (cond + ((null? opts) res) + ((equal? opts '("")) `(,c ,@res)) + (else `(,(make-seq (list c (irregex-opt opts))) + ,@res))))))) + (let* ((str (car ls)) + (len (string-length str))) + (if (zero? len) + (lp1 (cdr ls) #t) + (let ((i (char->integer (string-ref str 0)))) + (vector-set! + chars + i + (cons (substring str 1 len) (vector-ref chars i))) + (lp1 (cdr ls) empty?)))))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define (cset->string ls) + (let ((out (open-output-string))) + (let lp ((ls ls)) + (cond + ((pair? ls) + (cond + ((pair? (car ls)) + (display (irregex-quote (string (caar ls))) out) + (write-char #\- out) + (display (irregex-quote (string (cdar ls))) out)) + (else (display (irregex-quote (string (car ls))) out))) + (lp (cdr ls))))) + (get-output-string out))) + +(define (sre->string obj) + (let ((out (open-output-string))) + (let lp ((x obj)) + (cond + ((pair? x) + (case (car x) + ((: seq) + (cond + ((and (pair? (cddr x)) (pair? (cddr x)) (not (eq? x obj))) + (display "(?:" out) (for-each lp (cdr x)) (display ")" out)) + (else (for-each lp (cdr x))))) + ((submatch) + (display "(" out) (for-each lp (cdr x)) (display ")" out)) + ((submatch-named) + (display "(?<" out) (display (cadr x) out) (display ">" out) + (for-each lp (cddr x)) (display ")" out)) + ((or) + (display "(?:" out) + (lp (cadr x)) + (for-each (lambda (x) (display "|" out) (lp x)) (cddr x)) + (display ")" out)) + ((* + ? *? ??) + (cond + ((pair? (cddr x)) + (display "(?:" out) (for-each lp (cdr x)) (display ")" out)) + (else (lp (cadr x)))) + (display (car x) out)) + ((not) + (cond + ((and (pair? (cadr x)) (eq? 'cset (caadr x))) + (display "[^" out) + (display (cset->string (cdadr x)) out) + (display "]" out)) + (else (error "can't represent general 'not' in strings" x)))) + ((cset) + (display "[" out) + (display (cset->string (cdr x)) out) + (display "]" out)) + ((- & / ~) + (cond + ((or (eq? #\~ (car x)) + (and (eq? '- (car x)) (pair? (cdr x)) (eq? 'any (cadr x)))) + (display "[^" out) + (display (cset->string (if (eq? #\~ (car x)) (cdr x) (cddr x))) out) + (display "]" out)) + (else + (lp `(cset ,@(sre->cset x)))))) + ((w/case w/nocase) + (display "(?" out) + (if (eq? (car x) 'w/case) (display "-" out)) + (display ":" out) + (for-each lp (cdr x)) + (display ")" out)) + (else + (if (string? (car x)) + (lp `(cset ,@(string->list (car x)))) + (error "unknown sre operator" x))))) + ((symbol? x) + (case x + ((bos bol) (display "^" out)) + ((eos eol) (display "$" out)) + ((any nonl) (display "." out)) + (else (error "unknown sre symbol" x)))) + ((string? x) + (display (irregex-quote x) out)) + ((char? x) + (display (irregex-quote (string x)) out)) + (else + (error "unknown sre pattern" x)))) + (get-output-string out))) + diff --git a/irregex.import.scm b/irregex.import.scm index 7fc3bde7..4f2a81a6 100644 --- a/irregex.import.scm +++ b/irregex.import.scm @@ -26,11 +26,50 @@ (##sys#register-primitive-module 'irregex - '(irregex string->irregex sre->irregex string->sre - irregex? irregex-match-data? - irregex-new-matches irregex-reset-matches! - irregex-match-start irregex-match-end irregex-match-substring - irregex-search irregex-search/matches irregex-match irregex-match-string - irregex-fold irregex-replace irregex-replace/all irregex-apply-match - irregex-dfa irregex-dfa/search irregex-dfa/extract - irregex-nfa irregex-flags irregex-submatches irregex-lengths irregex-names)) + '(irregex + irregex-apply-match + irregex-dfa + irregex-dfa/extract + irregex-dfa/search + irregex-extract + irregex-flags + irregex-fold + irregex-fold/chunked + irregex-lengths + irregex-match + irregex-match? + irregex-match-data? + irregex-match-end + irregex-match-end-chunk + irregex-match-end-index + irregex-match-names + irregex-match-num-submatches + irregex-match-start + irregex-match-start-chunk + irregex-match-start-index + irregex-match-string + irregex-match-subchunk + irregex-match-substring + irregex-match-valid-index? + irregex-match/chunked + irregex-names + irregex-new-matches + irregex-nfa + irregex-num-submatches + irregex-opt + irregex-quote + irregex-replace + irregex-replace/all + irregex-reset-matches! + irregex-search + irregex-search/chunked + irregex-search/matches + irregex-split + irregex? + make-irregex-chunker + maybe-string->sre + sre->irregex + sre->string + string->irregex + string->sre + )) diff --git a/irregex.scm b/irregex.scm index 5d0f77e7..fb4cf21e 100644 --- a/irregex.scm +++ b/irregex.scm @@ -1,2718 +1,254 @@ -;;;; irregex.scm -- IrRegular Expressions -;; -;; Copyright (c) 2005-2008 Alex Shinn. All rights reserved. -;; BSD-style license: http://synthcode.com/license.txt - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; At this moment there was a loud ring at the bell, and I could -;; hear Mrs. Hudson, our landlady, raising her voice in a wail of -;; expostulation and dismay. -;; -;; "By heaven, Holmes," I said, half rising, "I believe that -;; they are really after us." -;; -;; "No, it's not quite so bad as that. It is the unofficial -;; force, -- the Baker Street irregulars." - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; History -;; -;; 0.6.2: 2008/07/26 - minor bugfixes, allow global disabling of utf8 mode, -;; friendlier error messages in parsing, \Q..\E support -;; 0.6.1: 2008/07/21 - added utf8 mode, more utils, bugfixes -;; 0.6: 2008/05/01 - most of PCRE supported -;; 0.5: 2008/04/24 - fully portable R4RS, many PCRE features implemented -;; 0.4: 2008/04/17 - rewriting NFA to use efficient closure compilation, -;; normal strings only, but all of the spencer tests pass -;; 0.3: 2008/03/10 - adding DFA converter (normal strings only) -;; 0.2: 2005/09/27 - adding irregex-opt (like elisp's regexp-opt) utility -;; 0.1: 2005/08/18 - simple NFA interpreter over abstract chunked strings - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(define irregex-tag '*irregex-tag*) - -(define (make-irregex dfa dfa/search dfa/extract nfa flags - submatches lengths names) - (vector irregex-tag dfa dfa/search dfa/extract nfa flags - submatches lengths names)) - -(define (irregex? obj) - (and (vector? obj) - (= 9 (vector-length obj)) - (eq? irregex-tag (vector-ref obj 0)))) - -(define (irregex-dfa x) (vector-ref x 1)) -(define (irregex-dfa/search x) (vector-ref x 2)) -(define (irregex-dfa/extract x) (vector-ref x 3)) -(define (irregex-nfa x) (vector-ref x 4)) -(define (irregex-flags x) (vector-ref x 5)) -(define (irregex-submatches x) (vector-ref x 6)) -(define (irregex-lengths x) (vector-ref x 7)) -(define (irregex-names x) (vector-ref x 8)) - -(define (irregex-new-matches irx) - (make-irregex-match #f (irregex-submatches irx) (irregex-names irx))) -(define (irregex-reset-matches! m) - (do ((i (- (vector-length m) 1) (- i 1))) - ((<= i 3) m) - (vector-set! m i #f))) - -(define irregex-match-tag '*irregex-match-tag*) - -(define (irregex-match-data? obj) - (and (vector? obj) - (>= (vector-length obj) 5) - (eq? irregex-match-tag (vector-ref obj 0)))) - -(define (make-irregex-match str count names) - (let ((res (make-vector (+ (* 2 (+ 1 count)) 3) #f))) - (vector-set! res 0 irregex-match-tag) - (vector-set! res 1 str) - (vector-set! res 2 names) - res)) - -(define (irregex-match-num-submatches m) - (- (quotient (- (vector-length m) 3) 2) 1)) - -(define (irregex-match-string m) - (vector-ref m 1)) -(define (irregex-match-names m) - (vector-ref m 2)) -(define (irregex-match-string-set! m str) - (vector-set! m 1 str)) - -(define (irregex-match-start-index m n) - (vector-ref m (+ 3 (* n 2)))) -(define (irregex-match-end-index m n) - (vector-ref m (+ 4 (* n 2)))) - -(define (irregex-match-start-index-set! m n start) - (vector-set! m (+ 3 (* n 2)) start)) -(define (irregex-match-end-index-set! m n end) - (vector-set! m (+ 4 (* n 2)) end)) - -(define (irregex-match-index m opt) - (if (pair? opt) - (cond ((number? (car opt)) (car opt)) - ((assq (car opt) (irregex-match-names m)) => cdr) - (else (error "unknown match name" (car opt)))) - 0)) - -(define (irregex-match-valid-index? m n) - (and (< (+ 3 (* n 2)) (vector-length m)) - (vector-ref m (+ 4 (* n 2))))) - -(define (irregex-match-substring m . opt) - (let ((n (irregex-match-index m opt))) - (and (irregex-match-valid-index? m n) - (substring (irregex-match-string m) - (vector-ref m (+ 3 (* n 2))) - (vector-ref m (+ 4 (* n 2))))))) - -(define (irregex-match-start m . opt) - (let ((n (irregex-match-index m opt))) - (and (irregex-match-valid-index? m n) - (vector-ref m (+ 3 (* n 2)))))) - -(define (irregex-match-end m . opt) - (irregex-match-valid-index? m (irregex-match-index m opt))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; string utilities - -;;;; Unicode version (skip surrogates) -(define *all-chars* - `(/ ,(integer->char 0) ,(integer->char #xD7FF) - ,(integer->char #xE000) ,(integer->char #x10FFFF))) - -;;;; ASCII version, offset to not assume 0-255 -;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223)))) - -;; set to #f to ignore even an explicit request for utf8 handling -(define *allow-utf8-mode?* #t) - -;; (define *named-char-properties* '()) - -(define (string-scan-char str c . o) - (let ((end (string-length str))) - (let scan ((i (if (pair? o) (car o) 0))) - (cond ((= i end) #f) - ((eqv? c (string-ref str i)) i) - (else (scan (+ i 1))))))) - -(define (string-scan-char-escape str c . o) - (let ((end (string-length str))) - (let scan ((i (if (pair? o) (car o) 0))) - (cond ((= i end) #f) - ((eqv? c (string-ref str i)) i) - ((eqv? c #\\) (scan (+ i 2))) - (else (scan (+ i 1))))))) - -(define (string-scan-pred str pred . o) - (let ((end (string-length str))) - (let scan ((i (if (pair? o) (car o) 0))) - (cond ((= i end) #f) - ((pred (string-ref str i)) i) - (else (scan (+ i 1))))))) - -(define (string-split-char str c) - (let ((end (string-length str))) - (let lp ((i 0) (from 0) (res '())) - (define (collect) (cons (substring str from i) res)) - (cond ((>= i end) (reverse (collect))) - ((eqv? c (string-ref str i)) (lp (+ i 1) (+ i 1) (collect))) - (else (lp (+ i 1) from res)))))) - -(define (char-alphanumeric? c) - (or (char-alphabetic? c) (char-numeric? c))) - -;; SRFI-13 extracts - -(define (%%string-copy! to tstart from fstart fend) - (do ((i fstart (+ i 1)) - (j tstart (+ j 1))) - ((>= i fend)) - (string-set! to j (string-ref from i)))) - -(define (string-cat-reverse string-list) - (string-cat-reverse/aux - (fold (lambda (s a) (+ (string-length s) a)) 0 string-list) - string-list)) - -(define (string-cat-reverse/aux len string-list) - (let ((res (make-string len))) - (let lp ((i len) (ls string-list)) - (if (pair? ls) - (let* ((s (car ls)) - (slen (string-length s)) - (i (- i slen))) - (%%string-copy! res i s 0 slen) - (lp i (cdr ls))))) - res)) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; list utilities - -;; like the one-arg IOTA case -(define (zero-to n) - (if (<= n 0) - '() - (let lp ((i (- n 1)) (res '())) - (if (zero? i) (cons 0 res) (lp (- i 1) (cons i res)))))) - -;; take the head of list FROM up to but not including TO, which must -;; be a tail of the list -(define (take-up-to from to) - (let lp ((ls from) (res '())) - (if (and (pair? ls) (not (eq? ls to))) - (lp (cdr ls) (cons (car ls) res)) - (reverse res)))) - -;; SRFI-1 extracts (simplified 1-ary versions) - -(define (find pred ls) - (cond ((find-tail pred ls) => car) - (else #f))) - -(define (find-tail pred ls) - (let lp ((ls ls)) - (cond ((null? ls) #f) - ((pred (car ls)) ls) - (else (lp (cdr ls)))))) - -(define (last ls) - (if (not (pair? ls)) - (error "can't take last of empty list" ls) - (let lp ((ls ls)) - (if (pair? (cdr ls)) - (lp (cdr ls)) - (car ls))))) - -(define (any pred ls) - (and (pair? ls) - (let lp ((head (car ls)) (tail (cdr ls))) - (if (null? tail) - (pred head) - (or (pred head) (lp (car tail) (cdr tail))))))) - -(define (every pred ls) - (or (null? ls) - (let lp ((head (car ls)) (tail (cdr ls))) - (if (null? tail) - (pred head) - (and (pred head) (lp (car tail) (cdr tail))))))) - -(define (fold kons knil ls) - (let lp ((ls ls) (res knil)) - (if (null? ls) - res - (lp (cdr ls) (kons (car ls) res))))) - -(define (filter pred ls) - (let lp ((ls ls) (res '())) - (if (null? ls) - (reverse res) - (lp (cdr ls) (if (pred (car ls)) (cons (car ls) res) res))))) - -(define (remove pred ls) - (let lp ((ls ls) (res '())) - (if (null? ls) - (reverse res) - (lp (cdr ls) (if (pred (car ls)) res (cons (car ls) res)))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; flags - -(define (bit-shr n i) - (quotient n (expt 2 i))) - -(define (bit-shl n i) - (* n (expt 2 i))) - -(define (bit-not n) (- #xFFFF n)) - -(define (bit-ior a b) - (cond - ((zero? a) b) - ((zero? b) a) - (else - (+ (if (or (odd? a) (odd? b)) 1 0) - (* 2 (bit-ior (quotient a 2) (quotient b 2))))))) - -(define (bit-and a b) - (cond - ((zero? a) 0) - ((zero? b) 0) - (else - (+ (if (and (odd? a) (odd? b)) 1 0) - (* 2 (bit-and (quotient a 2) (quotient b 2))))))) - -(define (flag-set? flags i) - (= i (bit-and flags i))) -(define (flag-join a b) - (if b (bit-ior a b) a)) -(define (flag-clear a b) - (bit-and a (bit-not b))) - -(define ~none 0) -(define ~searcher? 1) -(define ~consumer? 2) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; parsing - -(define ~save? 1) -(define ~case-insensitive? 2) -(define ~multi-line? 4) -(define ~single-line? 8) -(define ~ignore-space? 16) -(define ~utf8? 32) - -(define (symbol-list->flags ls) - (let lp ((ls ls) (res ~none)) - (if (not (pair? ls)) - res - (lp (cdr ls) - (flag-join - res - (case (car ls) - ((i ci case-insensitive) ~case-insensitive?) - ((m multi-line) ~multi-line?) - ((s single-line) ~single-line?) - ((x ignore-space) ~ignore-space?) - ((u utf8) ~utf8?) - (else #f))))))) - -(define (string->sre str . o) - (let ((end (string-length str)) - (flags (symbol-list->flags o))) - - (let lp ((i 0) (from 0) (flags flags) (res '()) (st '())) - - ;; handle case sensitivity at the literal char/string level - (define (cased-char ch) - (if (and (flag-set? flags ~case-insensitive?) - (char-alphabetic? ch)) - `(or ,ch ,(char-altcase ch)) - ch)) - (define (cased-string str) - (if (flag-set? flags ~case-insensitive?) - (sre-sequence (map cased-char (string->list str))) - str)) - ;; accumulate the substring from..i as literal text - (define (collect) - (if (= i from) res (cons (cased-string (substring str from i)) res))) - ;; like collect but breaks off the last single character when - ;; collecting literal data, as the argument to ?/*/+ etc. - (define (collect/single) - (let* ((utf8? (flag-set? flags ~utf8?)) - (j (if (and utf8? (> i 1)) - (utf8-backup-to-initial-char str (- i 1)) - (- i 1)))) - (cond - ((< j from) - res) - (else - (let ((c (cased-char (if utf8? - (utf8-string-ref str j (- i j) ) - (string-ref str j))))) - (cond - ((= j from) - (cons c res)) - (else - (cons c - (cons (cased-string (substring str from j)) - res))))))))) - ;; collects for use as a result, reversing and grouping OR - ;; terms, and some ugly tweaking of `function-like' groups and - ;; conditionals - (define (collect/terms) - (let* ((ls (collect)) - (func - (and (pair? ls) - (memq (last ls) - '(atomic if look-ahead neg-look-ahead - look-behind neg-look-behind submatch-named - w/utf8 w/noutf8)))) - (prefix (if (and func (eq? 'submatch-named (car func))) - (list 'submatch-named (cadr (reverse ls))) - (and func (list (car func))))) - (ls (if func - (if (eq? 'submatch-named (car func)) - (reverse (cddr (reverse ls))) - (reverse (cdr (reverse ls)))) - ls))) - (let lp ((ls ls) (term '()) (res '())) - (define (shift) - (cons (sre-sequence term) res)) - (cond - ((null? ls) - (let* ((res (sre-alternate (shift))) - (res (if (flag-set? flags ~save?) - (list 'submatch res) - res))) - (if prefix - (if (eq? 'if (car prefix)) - (cond - ((not (pair? res)) - 'epsilon) - ((memq (car res) - '(look-ahead neg-look-ahead - look-behind neg-look-behind)) - res) - ((eq? 'seq (car res)) - `(if ,(cadr res) - ,(if (pair? (cdr res)) - (sre-sequence (cddr res)) - 'epsilon))) - (else - `(if ,(cadadr res) - ,(if (pair? (cdr res)) - (sre-sequence (cddadr res)) - 'epsilon) - ,(sre-alternate - (if (pair? (cdr res)) (cddr res) '()))))) - `(,@prefix ,res)) - res))) - ((eq? 'or (car ls)) (lp (cdr ls) '() (shift))) - (else (lp (cdr ls) (cons (car ls) term) res)))))) - (define (save) - (cons (cons flags (collect)) st)) - - ;; main parsing - (if (>= i end) - (if (pair? st) - (error "unterminated parenthesis in regexp" str) - (collect/terms)) - (let ((c (string-ref str i))) - (case c - ((#\.) - (lp (+ i 1) (+ i 1) flags - (cons (if (flag-set? flags ~single-line?) 'any 'nonl) - (collect)) - st)) - ((#\?) - (let ((res (collect/single))) - (if (null? res) - (error "? can't follow empty sre" str res) - (let ((x (car res))) - (lp (+ i 1) - (+ i 1) - flags - (cons - (if (pair? x) - (case (car x) - ((*) `(*? ,@(cdr x))) - ((+) `(**? 1 #f ,@(cdr x))) - ((?) `(?? ,@(cdr x))) - ((**) `(**? ,@(cdr x))) - ((=) `(**? ,(cadr x) ,@(cdr x))) - ((>=) `(**? ,(cadr x) #f ,@(cddr x))) - (else `(? ,x))) - `(? ,x)) - (cdr res)) - st))))) - ((#\+ #\*) - (let* ((res (collect/single)) - (x (car res)) - (op (string->symbol (string c)))) - (cond - ((sre-repeater? x) - (error "duplicate repetition (e.g. **) in sre" str res)) - ((sre-empty? x) - (error "can't repeat empty sre (e.g. ()*)" str res)) - (else - (lp (+ i 1) (+ i 1) flags - (cons (list op x) (cdr res)) - st))))) - ((#\() - (cond - ((>= (+ i 1) end) - (error "unterminated parenthesis in regexp" str)) - ((not (eqv? #\? (string-ref str (+ i 1)))) - (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save))) - ((>= (+ i 2) end) - (error "unterminated parenthesis in regexp" str)) - (else - (case (string-ref str (+ i 2)) - ((#\#) - (let ((j (string-scan-char str #\) (+ i 3)))) - (lp (+ j i) (+ j 1) flags (collect) st))) - ((#\:) - (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save))) - ((#\=) - (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) - '(look-ahead) (save))) - ((#\!) - (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) - '(neg-look-ahead) (save))) - ((#\<) - (cond - ((>= (+ i 3) end) - (error "unterminated parenthesis in regexp" str)) - (else - (case (string-ref str (+ i 3)) - ((#\=) - (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) - '(look-behind) (save))) - ((#\!) - (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) - '(neg-look-behind) (save))) - (else - (let ((j (and (char-alphabetic? - (string-ref str (+ i 3))) - (string-scan-char str #\> (+ i 4))))) - (if j - (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) - `(,(string->symbol (substring str (+ i 3) j)) - submatch-named) - (save)) - (error "invalid (?< sequence" str)))))))) - ((#\>) - (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) - '(atomic) (save))) - ;;((#\' #\P) ; named subpatterns - ;; ) - ;;((#\R) ; recursion - ;; ) - ((#\() - (cond - ((>= (+ i 3) end) - (error "unterminated parenthesis in regexp" str)) - ((char-numeric? (string-ref str (+ i 3))) - (let* ((j (string-scan-char str #\) (+ i 3))) - (n (string->number (substring str (+ i 3) j)))) - (if (not n) - (error "invalid conditional reference" str) - (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) - `(,n if) (save))))) - ((char-alphabetic? (string-ref str (+ i 3))) - (let* ((j (string-scan-char str #\) (+ i 3))) - (s (string->symbol (substring str (+ i 3) j)))) - (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) - `(,s if) (save)))) - (else - (lp (+ i 2) (+ i 2) (flag-clear flags ~save?) - '(if) (save))))) - ((#\{) - (error "unsupported Perl-style cluster" str)) - (else - (let ((old-flags flags)) - (let lp2 ((j (+ i 2)) (flags flags) (invert? #f)) - (define (join x) - ((if invert? flag-clear flag-join) flags x)) - (define (new-res res) - (let ((before (flag-set? old-flags ~utf8?)) - (after (flag-set? flags ~utf8?))) - (if (eq? before after) - res - (cons (if after 'w/utf8 'w/noutf8) res)))) - (cond - ((>= j end) - (error "incomplete cluster" str i)) - (else - (case (string-ref str j) - ((#\i) - (lp2 (+ j 1) (join ~case-insensitive?) invert?)) - ((#\m) - (lp2 (+ j 1) (join ~multi-line?) invert?)) - ((#\x) - (lp2 (+ j 1) (join ~ignore-space?) invert?)) - ((#\u) - (if *allow-utf8-mode?* - (lp2 (+ j 1) (join ~utf8?) invert?) - (lp2 (+ j 1) flags invert?))) - ((#\-) - (lp2 (+ j 1) flags (not invert?))) - ((#\)) - (lp (+ j 1) (+ j 1) flags (new-res (collect)) - st)) - ((#\:) - (lp (+ j 1) (+ j 1) flags (new-res '()) - (cons (cons old-flags (collect)) st))) - (else - (error "unknown regex cluster modifier" str) - ))))))))))) - ((#\)) - (if (null? st) - (error "too many )'s in regexp" str) - (lp (+ i 1) - (+ i 1) - (caar st) - (cons (collect/terms) (cdar st)) - (cdr st)))) - ((#\[) - (apply - (lambda (sre j) - (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)) - (string-parse-cset str (+ i 1) flags))) - ((#\{) - (if (or (>= (+ i 1) end) - (not (or (char-numeric? (string-ref str (+ i 1))) - (eqv? #\, (string-ref str (+ i 1)))))) - (lp (+ i 1) from flags res st) - (let* ((res (collect/single)) - (x (car res)) - (tail (cdr res)) - (j (string-scan-char str #\} (+ i 1))) - (s2 (string-split-char (substring str (+ i 1) j) #\,)) - (n (or (string->number (car s2)) 0)) - (m (and (pair? (cdr s2)) (string->number (cadr s2))))) - (cond - ((null? (cdr s2)) - (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st)) - (m - (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st)) - (else - (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st) - ))))) - ((#\\) - (cond - ((>= (+ i 1) end) - (error "incomplete escape sequence" str)) - (else - (let ((c (string-ref str (+ i 1)))) - (case c - ((#\d) - (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st)) - ((#\D) - (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st)) - ((#\s) - (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st)) - ((#\S) - (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st)) - ((#\w) - (lp (+ i 2) (+ i 2) flags - `((or alphanumeric ("_")) ,@(collect)) st)) - ((#\W) - (lp (+ i 2) (+ i 2) flags - `((~ (or alphanumeric ("_"))) ,@(collect)) st)) - ((#\b) - (lp (+ i 2) (+ i 2) flags - `((or bow eow) ,@(collect)) st)) - ((#\B) - (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st)) - ((#\A) - (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st)) - ((#\Z) - (lp (+ i 2) (+ i 2) flags - `((? #\newline) eos ,@(collect)) st)) - ((#\z) - (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st)) - ((#\R) - (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st)) - ((#\K) - (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st)) - ;; these two are from Emacs and TRE, but not PCRE - ((#\<) - (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st)) - ((#\>) - (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st)) - ((#\x) - (apply - (lambda (ch j) - (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st)) - (string-parse-hex-escape str (+ i 2) end))) - ((#\k) - (let ((c (string-ref str (+ i 2)))) - (if (not (memv c '(#\< #\{ #\'))) - (error "bad \\k usage, expected \\k<...>" str) - (let* ((terminal (char-mirror c)) - (j (string-scan-char str terminal (+ i 2))) - (s (and j (substring str (+ i 3) j))) - (backref - (if (flag-set? flags ~case-insensitive?) - 'backref-ci - 'backref))) - (if (not j) - (error "interminated named backref" str) - (lp (+ j 1) (+ j 1) flags - `((,backref ,(string->symbol s)) - ,@(collect)) - st)))))) - ((#\Q) ;; \Q..\E escapes - (let ((res (collect))) - (let lp2 ((j (+ i 2))) - (cond - ((>= j end) - (lp j (+ i 2) flags res st)) - ((eqv? #\\ (string-ref str j)) - (cond - ((>= (+ j 1) end) - (lp (+ j 1) (+ i 2) flags res st)) - ((eqv? #\E (string-ref str (+ j 1))) - (lp (+ j 2) (+ j 2) flags - (cons (substring str (+ i 2) j) res) st)) - (else - (lp2 (+ j 2))))) - (else - (lp2 (+ j 1))))))) - ;;((#\p) ; XXXX unicode properties - ;; ) - ;;((#\P) - ;; ) - (else - (cond - ((char-numeric? c) - (let* ((j (or (string-scan-pred - str - (lambda (c) (not (char-numeric? c))) - (+ i 2)) - end)) - (backref - (if (flag-set? flags ~case-insensitive?) - 'backref-ci - 'backref)) - (res `((,backref ,(string->number - (substring str (+ i 1) j))) - ,@(collect)))) - (lp j j flags res st))) - ((char-alphabetic? c) - (let ((cell (assv c posix-escape-sequences))) - (if cell - (lp (+ i 2) (+ i 2) flags - (cons (cdr cell) (collect)) st) - (error "unknown escape sequence" str c)))) - (else - (lp (+ i 2) (+ i 1) flags (collect) st))))))))) - ((#\|) - (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st)) - ((#\^) - (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos))) - (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) - ((#\$) - (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos))) - (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) - ((#\space) - (if (flag-set? flags ~ignore-space?) - (lp (+ i 1) (+ i 1) flags (collect) st) - (lp (+ i 1) from flags res st))) - ((#\#) - (if (flag-set? flags ~ignore-space?) - (let ((j (or (string-scan-char str #\newline (+ i 1)) - (- end 1)))) - (lp (+ j 1) (+ j 1) flags (collect) st)) - (lp (+ i 1) from flags res st))) - (else - (lp (+ i 1) from flags res st)))))))) - -(define posix-escape-sequences - `((#\n . #\newline) - (#\r . ,(integer->char (+ (char->integer #\newline) 3))) - (#\t . ,(integer->char (- (char->integer #\newline) 1))) - (#\a . ,(integer->char (- (char->integer #\newline) 3))) - (#\e . ,(integer->char (+ (char->integer #\newline) #x11))) - (#\f . ,(integer->char (+ (char->integer #\newline) 2))) - )) - -(define (char-altcase c) - (if (char-upper-case? c) (char-downcase c) (char-upcase c))) - -(define (char-mirror c) - (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c))) - -(define (string-parse-hex-escape str i end) - (cond - ((>= i end) - (error "incomplete hex escape" str i)) - ((eqv? #\{ (string-ref str i)) - (let ((j (string-scan-char-escape str #\} (+ i 1)))) - (if (not j) - (error "incomplete hex brace escape" str i) - (let* ((s (substring str (+ i 1) j)) - (n (string->number s 16))) - (if n - (list (integer->char n) j) - (error "bad hex brace escape" s)))))) - ((>= (+ i 1) end) - (error "incomplete hex escape" str i)) - (else - (let* ((s (substring str i (+ i 2))) - (n (string->number s 16))) - (if n - (list (integer->char n) (+ i 2)) - (error "bad hex escape" s)))))) - -(define (string-parse-cset str start flags) - (let ((end (string-length str)) - (invert? (eqv? #\^ (string-ref str start))) - (utf8? (flag-set? flags ~utf8?))) - (define (go i chars ranges) - (if (>= i end) - (error "incomplete char set") - (let ((c (string-ref str i))) - (case c - ((#\]) - (if (and (null? chars) (null? ranges)) - (go (+ i 1) (cons #\] chars) ranges) - (let ((ci? (flag-set? flags ~case-insensitive?)) - (hi-chars (if utf8? (filter high-char? chars) '())) - (chars (if utf8? (remove high-char? chars) chars))) - (list - ((lambda (res) - (if invert? (cons '~ res) (sre-alternate res))) - (append - hi-chars - (if (pair? chars) - (list - (list (list->string - ((if ci? - cset-case-insensitive - (lambda (x) x)) - (reverse chars))))) - '()) - (if (pair? ranges) - (let ((res (if ci? - (cset-case-insensitive - (reverse ranges)) - (reverse ranges)))) - (list (cons '/ (alist->plist res)))) - '()))) - i)))) - ((#\-) - (cond - ((or (= i start) - (and (= i (+ start 1)) (eqv? #\^ (string-ref str start))) - (eqv? #\] (string-ref str (+ i 1)))) - (go (+ i 1) (cons c chars) ranges)) - ((null? chars) - (error "bad char-set")) - (else - (let* ((c1 (car chars)) - (c2 (string-ref str (+ i 1)))) - (apply - (lambda (c2 j) - (if (char<? c2 c1) - (error "inverted range in char-set" c1 c2) - (go j (cdr chars) (cons (cons c1 c2) ranges)))) - (cond - ((and (eqv? #\\ c2) (assv c2 posix-escape-sequences)) - => (lambda (x) (list (cdr x) (+ i 3)))) - ((and (eqv? #\\ c2) - (eqv? (string-ref str (+ i 2)) #\x)) - (string-parse-hex-escape str (+ i 3) end)) - ((and utf8? (<= #x80 (char->integer c2) #xFF)) - (let ((len (utf8-start-char->length c2))) - (list (utf8-string-ref str (+ i 1) len) (+ i 1 len)))) - (else - (list c2 (+ i 2))))))))) - ((#\[) - (let* ((inv? (eqv? #\^ (string-ref str (+ i 1)))) - (i2 (if inv? (+ i 2) (+ i 1)))) - (case (string-ref str i2) - ((#\:) - (let ((j (string-scan-char str #\: (+ i2 1)))) - (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1))))) - (error "incomplete character class" str) - (let* ((cset (sre->cset - (string->symbol - (substring str (+ i2 1) j)))) - (cset (if inv? (cset-complement cset) cset))) - (go (+ j 2) - (append (filter char? cset) chars) - (append (filter pair? cset) ranges)))))) - ((#\= #\.) - (error "collating sequences not supported" str)) - (else - (go (+ i 1) (cons #\[ chars) ranges))))) - ((#\\) - (let ((c (string-ref str (+ i 1)))) - (case c - ((#\d #\D #\s #\S #\w #\W) - (let ((cset (sre->cset (string->sre (string #\\ c))))) - (go (+ i 2) - (append (filter char? cset) chars) - (append (filter pair? cset) ranges)))) - ((#\x) - (apply - (lambda (ch j) - (go j (cons ch chars) ranges)) - (string-parse-hex-escape str (+ i 2) end))) - (else - (let ((c (cond ((assv c posix-escape-sequences) => cdr) - (else c)))) - (go (+ i 2) - (cons (string-ref str (+ i 1)) (cons c chars)) - ranges)))))) - (else - (if (and utf8? (<= #x80 (char->integer c) #xFF)) - (let ((len (utf8-start-char->length c))) - (go (+ i len) - (cons (utf8-string-ref str i len) chars) - ranges)) - (go (+ i 1) (cons c chars) ranges))))))) - (if invert? - (go (+ start 1) - (if (flag-set? flags ~multi-line?) '(#\newline) '()) - '()) - (go start '() '())))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; utf8 utilities - -;; Here are some hairy optimizations that need to be documented -;; better. Thanks to these, we never do any utf8 processing once the -;; regexp is compiled. - -;; two chars: ab..ef -;; a[b..xFF]|[b-d][x80..xFF]|e[x80..xFF] - -;; three chars: abc..ghi -;; ab[c..xFF]|a[d..xFF][x80..xFF]| -;; [b..f][x80..xFF][x80..xFF]| -;; g[x80..g][x80..xFF]|gh[x80..i] - -;; four chars: abcd..ghij -;; abc[d..xFF]|ab[d..xFF][x80..xFF]|a[c..xFF][x80..xFF][x80..xFF]| -;; [b..f][x80..xFF][x80..xFF][x80..xFF]| -;; g[x80..g][x80..xFF][x80..xFF]|gh[x80..h][x80..xFF]|ghi[x80..j] - -(define (high-char? c) (<= #x80 (char->integer c))) - -;; number of total bytes in a utf8 char given the 1st byte - -(define utf8-start-char->length - (let ((table '#( -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx -2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx -2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx -3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex -4 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx -))) - (lambda (c) (vector-ref table (char->integer c))))) - -(define (utf8-string-ref str i len) - (define (byte n) (char->integer (string-ref str n))) - (case len - ((1) ; shouldn't happen in this module - (string-ref str i)) - ((2) - (integer->char - (+ (bit-shl (bit-and (byte i) #b00011111) 6) - (bit-and (byte (+ i 1)) #b00111111)))) - ((3) - (integer->char - (+ (bit-shl (bit-and (byte i) #b00001111) 12) - (bit-shl (bit-and (byte (+ i 1)) #b00111111) 6) - (bit-and (byte (+ i 2)) #b00111111)))) - ((4) - (integer->char - (+ (bit-shl (bit-and (byte i) #b00000111) 18) - (bit-shl (bit-and (byte (+ i 1)) #b00111111) 12) - (bit-shl (bit-and (byte (+ i 2)) #b00111111) 6) - (bit-and (byte (+ i 3)) #b00111111)))) - (else - (error "invalid utf8 length" str len i)))) - -(define (utf8-backup-to-initial-char str i) - (let lp ((i i)) - (if (= i 0) - 0 - (let ((c (char->integer (string-ref str i)))) - (if (or (< c #x80) (>= c #xC0)) - i - (lp (- i 1))))))) - -(define (utf8-lowest-digit-of-length len) - (case len - ((1) 0) ((2) #xC0) ((3) #xE0) ((4) #xF0) - (else (error "invalid utf8 length" len)))) - -(define (utf8-highest-digit-of-length len) - (case len - ((1) #x7F) ((2) #xDF) ((3) #xEF) ((4) #xF7) - (else (error "invalid utf8 length" len)))) - -(define (char->utf8-list c) - (let ((i (char->integer c))) - (cond - ((<= i #x7F) (list i)) - ((<= i #x7FF) - (list (bit-ior #b11000000 (bit-shr i 6)) - (bit-ior #b10000000 (bit-and i #b111111)))) - ((<= i #xFFFF) - (list (bit-ior #b11100000 (bit-shr i 12)) - (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111)) - (bit-ior #b10000000 (bit-and i #b111111)))) - ((<= i #x1FFFFF) - (list (bit-ior #b11110000 (bit-shr i 18)) - (bit-ior #b10000000 (bit-and (bit-shr i 12) #b111111)) - (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111)) - (bit-ior #b10000000 (bit-and i #b111111)))) - (else (error "unicode codepoint out of range:" i))))) - -(define (unicode-range->utf8-pattern lo hi) - (let ((lo-ls (char->utf8-list lo)) - (hi-ls (char->utf8-list hi))) - (if (not (= (length lo-ls) (length hi-ls))) - (sre-alternate (list (unicode-range-climb-digits lo-ls hi-ls) - (unicode-range-up-to hi-ls))) - (let lp ((lo-ls lo-ls) (hi-ls hi-ls)) - (cond - ((null? lo-ls) - '()) - ((= (car lo-ls) (car hi-ls)) - (sre-sequence - (list (integer->char (car lo-ls)) - (lp (cdr lo-ls) (cdr hi-ls))))) - ((= (+ (car lo-ls) 1) (car hi-ls)) - (sre-alternate (list (unicode-range-up-from lo-ls) - (unicode-range-up-to hi-ls)))) - (else - (sre-alternate (list (unicode-range-up-from lo-ls) - (unicode-range-middle lo-ls hi-ls) - (unicode-range-up-to hi-ls))))))))) - -(define (unicode-range-helper one ls prefix res) - (if (null? ls) - res - (unicode-range-helper - one - (cdr ls) - (cons (car ls) prefix) - (cons (sre-sequence - `(,@(map integer->char prefix) - ,(one (car ls)) - ,@(map (lambda (_) - `(/ ,(integer->char #x80) - ,(integer->char #xFF))) - (cdr ls)))) - res)))) - -(define (unicode-range-up-from lo-ls) - (sre-sequence - (list (integer->char (car lo-ls)) - (sre-alternate - (unicode-range-helper - (lambda (c) - `(/ ,(integer->char (+ (car lo-ls) 1)) ,(integer->char #xFF))) - (cdr (reverse (cdr lo-ls))) - '() - (list - (sre-sequence - (append - (map integer->char (reverse (cdr (reverse (cdr lo-ls))))) - `((/ ,(integer->char (last lo-ls)) - ,(integer->char #xFF))))))))))) - -(define (unicode-range-up-to hi-ls) - (sre-sequence - (list (integer->char (car hi-ls)) - (sre-alternate - (unicode-range-helper - (lambda (c) - `(/ ,(integer->char #x80) ,(integer->char (- (car hi-ls) 1)))) - (cdr (reverse (cdr hi-ls))) - '() - (list - (sre-sequence - (append - (map integer->char (reverse (cdr (reverse (cdr hi-ls))))) - `((/ ,(integer->char #x80) - ,(integer->char (last hi-ls)))))))))))) - -(define (unicode-range-climb-digits lo-ls hi-ls) - (let ((lo-len (length lo-ls))) - (sre-alternate - (append - (list - (sre-sequence - (cons `(/ ,(integer->char (car lo-ls)) - ,(integer->char (if (<= (car lo-ls) #x7F) #x7F #xFF))) - (map (lambda (_) - `(/ ,(integer->char #x80) ,(integer->char #xFF))) - (cdr lo-ls))))) - (map - (lambda (i) - (sre-sequence - (cons - `(/ ,(integer->char (utf8-lowest-digit-of-length (+ i lo-len 1))) - ,(integer->char (utf8-highest-digit-of-length (+ i lo-len 1)))) - (map (lambda (_) - `(/ ,(integer->char #x80) ,(integer->char #xFF))) - (zero-to (+ i lo-len)))))) - (zero-to (- (length hi-ls) lo-len 1))) - (list - (sre-sequence - (cons `(/ ,(integer->char - (utf8-lowest-digit-of-length - (utf8-start-char->length - (integer->char (- (car hi-ls) 1))))) - ,(integer->char (- (car hi-ls) 1))) - (map (lambda (_) - `(/ ,(integer->char #x80) ,(integer->char #xFF))) - (cdr hi-ls))))))))) - -(define (unicode-range-middle lo-ls hi-ls) - (let ((lo (integer->char (+ (car lo-ls) 1))) - (hi (integer->char (- (car hi-ls) 1)))) - (sre-sequence - (cons (if (char=? lo hi) lo `(/ ,lo ,hi)) - (map (lambda (_) `(/ ,(integer->char #x80) ,(integer->char #xFF))) - (cdr lo-ls)))))) - -(define (cset->utf8-pattern cset) - (let lp ((ls cset) (alts '()) (lo-cset '())) - (cond - ((null? ls) - (sre-alternate (append (reverse alts) - (if (null? lo-cset) - '() - (list (cons '/ (reverse lo-cset))))))) - ((char? (car ls)) - (if (high-char? (car ls)) - (lp (cdr ls) (cons (car ls) alts) lo-cset) - (lp (cdr ls) alts (cons (car ls) lo-cset)))) - (else - (if (or (high-char? (caar ls)) (high-char? (cdar ls))) - (lp (cdr ls) - (cons (unicode-range->utf8-pattern (caar ls) (cdar ls)) alts) - lo-cset) - (lp (cdr ls) alts (cons (cdar ls) (cons (caar ls) lo-cset)))))))) - -(define (sre-adjust-utf8 sre flags) - (let adjust ((sre sre) - (utf8? (flag-set? flags ~utf8?)) - (ci? (flag-set? flags ~case-insensitive?))) - (define (rec sre) (adjust sre utf8? ci?)) - (cond - ((pair? sre) - (case (car sre) - ((w/utf8) (adjust (sre-sequence (cdr sre)) #t ci?)) - ((w/noutf8) (adjust (sre-sequence (cdr sre)) #f ci?)) - ((w/case) - (cons (car sre) (map (lambda (s) (adjust s utf8? #f)) (cdr sre)))) - ((w/nocase) - (cons (car sre) (map (lambda (s) (adjust s utf8? #t)) (cdr sre)))) - ((/ ~ & -) - (if (not utf8?) - sre - (let ((cset (sre->cset sre ci?))) - (if (any (lambda (x) - (if (pair? x) - (or (high-char? (car x)) (high-char? (cdr x))) - (high-char? x))) - cset) - (if ci? - (list 'w/case (cset->utf8-pattern cset)) - (cset->utf8-pattern cset)) - sre)))) - ((*) - (case (sre-sequence (cdr sre)) - ;; special case optimization: .* w/utf8 == .* w/noutf8 - ((any) '(* any)) - ((nonl) '(* nonl)) - (else (cons '* (map rec (cdr sre)))))) - (else - (cons (car sre) (map rec (cdr sre)))))) - (else - (case sre - ((any) 'utf8-any) - ((nonl) 'utf8-nonl) - (else - (if (and utf8? (char? sre) (high-char? sre)) - (sre-sequence (map integer->char (char->utf8-list sre))) - sre))))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; compilation - -(define (irregex x . o) - (cond - ((irregex? x) x) - ((string? x) (apply string->irregex x o)) - (else (apply sre->irregex x o)))) - -(define (string->irregex str . o) - (apply sre->irregex (apply string->sre str o) o)) - -(define (sre->irregex sre . o) - (let* ((pat-flags (symbol-list->flags o)) - (sre (if *allow-utf8-mode?* - (sre-adjust-utf8 sre pat-flags) - sre)) - (searcher? (sre-searcher? sre)) - (sre-dfa (if searcher? (sre-remove-initial-bos sre) sre)) - (dfa-limit (cond ((memq 'small o) 1) ((memq 'fast o) 50) (else 10))) - (dfa/search - (if searcher? - #t - (cond ((sre->nfa `(seq (* any) ,sre-dfa) pat-flags) - => (lambda (nfa) (nfa->dfa nfa (* dfa-limit (length nfa))))) - (else #f)))) - (dfa (cond ((and dfa/search (sre->nfa sre-dfa pat-flags)) - => (lambda (nfa) (nfa->dfa nfa (* dfa-limit (length nfa))))) - (else #f))) - (extractor (and dfa dfa/search (sre-match-extractor sre-dfa))) - (submatches (sre-count-submatches sre-dfa)) - (names (sre-names sre-dfa 1 '())) - (lens (sre-length-ranges sre-dfa names)) - (flags (flag-join - (flag-join ~none (and searcher? ~searcher?)) - (and (sre-consumer? sre) ~consumer?)))) - (cond - (dfa - (make-irregex dfa dfa/search extractor #f flags submatches lens names)) - (else - (let ((f (sre->procedure sre pat-flags names))) - (make-irregex #f #f #f f flags submatches lens names)))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; sre analysis - -;; returns #t if the sre can ever be empty -(define (sre-empty? sre) - (if (pair? sre) - (case (car sre) - ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t) - ((**) (or (not (number? (cadr sre))) (zero? (cadr sre)))) - ((or) (any sre-empty? (cdr sre))) - ((: seq submatch + atomic) (every sre-empty? (cdr sre))) - (else #f)) - (memq sre '(epsilon bos eos bol eol bow eow commit)))) - -(define (sre-any? sre) - (or (eq? sre 'any) - (and (pair? sre) - (case (car sre) - ((seq : submatch) - (and (pair? (cdr sre)) (null? (cddr sre)) (sre-any? (cadr sre)))) - ((or) (every sre-any? (cdr sre))) - (else #f))))) - -(define (sre-repeater? sre) - (and (pair? sre) - (or (memq (car sre) '(* +)) - (and (memq (car sre) '(submatch seq :)) - (pair? (cdr sre)) - (null? (cddr sre)) - (sre-repeater? (cadr sre)))))) - -(define (sre-searcher? sre) - (if (pair? sre) - (case (car sre) - ((* +) (sre-any? (sre-sequence (cdr sre)))) - ((seq : submatch) (and (pair? (cdr sre)) (sre-searcher? (cadr sre)))) - ((or) (every sre-searcher? (cdr sre))) - (else #f)) - (eq? 'bos sre))) - -(define (sre-consumer? sre) - (if (pair? sre) - (case (car sre) - ((* +) (sre-any? (sre-sequence (cdr sre)))) - ((seq : submatch) (and (pair? (cdr sre)) (sre-consumer? (last sre)))) - ((or) (every sre-consumer? (cdr sre))) - (else #f)) - (eq? 'eos sre))) - -(define (sre-has-submatchs? sre) - (and (pair? sre) - (or (eq? 'submatch (car sre)) - (any sre-has-submatchs? (cdr sre))))) - -(define (sre-count-submatches sre) - (let count ((sre sre) (sum 0)) - (if (pair? sre) - (fold count - (+ sum (case (car sre) - ((submatch submatch-named) 1) - ((dsm) (+ (cadr sre) (caddr sre))) - (else 0))) - (cdr sre)) - sum))) - -(define (sre-length-ranges sre . o) - (let ((names (if (pair? o) (car o) (sre-names sre 1 '()))) - (sublens (make-vector (+ 1 (sre-count-submatches sre)) #f))) - (vector-set! - sublens - 0 - (let lp ((sre sre) (n 1) (lo 0) (hi 0) (return cons)) - (define (grow i) (return (+ lo i) (and hi (+ hi i)))) - (cond - ((pair? sre) - (if (string? (car sre)) - (grow 1) - (case (car sre) - ((/ ~ & -) - (grow 1)) - ((posix-string) - (lp (string->sre (cadr sre)) n lo hi return)) - ((seq : w/case w/nocase atomic) - (let lp2 ((ls (cdr sre)) (n n) (lo2 0) (hi2 0)) - (if (null? ls) - (return (+ lo lo2) (and hi hi2 (+ hi hi2))) - (lp (car ls) n 0 0 - (lambda (lo3 hi3) - (lp2 (cdr ls) - (+ n (sre-count-submatches (car ls))) - (+ lo2 lo3) - (and hi2 hi3 (+ hi2 hi3)))))))) - ((or) - (let lp2 ((ls (cdr sre)) (n n) (lo2 #f) (hi2 0)) - (if (null? ls) - (return (+ lo lo2) (and hi hi2 (+ hi hi2))) - (lp (car ls) n 0 0 - (lambda (lo3 hi3) - (lp2 (cdr ls) - (+ n (sre-count-submatches (car ls))) - (if lo2 (min lo2 lo3) lo3) - (and hi2 hi3 (max hi2 hi3)))))))) - ((if) - (cond - ((or (null? (cdr sre)) (null? (cddr sre))) - (return lo hi)) - (else - (let ((n1 (sre-count-submatches (car sre))) - (n2 (sre-count-submatches (cadr sre)))) - (lp (if (or (number? (cadr sre)) (symbol? (cadr sre))) - 'epsilon - (cadr sre)) - n lo hi - (lambda (lo2 hi2) - (lp (caddr sre) (+ n n1) 0 0 - (lambda (lo3 hi3) - (lp (if (pair? (cdddr sre)) - (cadddr sre) - 'epsilon) - (+ n n1 n2) 0 0 - (lambda (lo4 hi4) - (return (+ lo2 (min lo3 lo4)) - (and hi2 hi3 hi4 - (+ hi2 (max hi3 hi4)) - )))))))))))) - ((dsm) - (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) lo hi return)) - ((submatch submatch-named) - (lp (sre-sequence - (if (eq? 'submatch (car sre)) (cdr sre) (cddr sre))) - (+ n 1) lo hi - (lambda (lo2 hi2) - (vector-set! sublens n (cons lo2 hi2)) - (return lo2 hi2)))) - ((backref backref-ci) - (let ((n (cond - ((number? (cadr sre)) (cadr sre)) - ((assq (cadr sre) names) => cdr) - (else (error "unknown backreference" (cadr sre)))))) - (cond - ((or (not (integer? n)) - (not (< 0 n (vector-length sublens)))) - (error "sre-length: invalid backreference" sre)) - ((not (vector-ref sublens n)) - (error "sre-length: invalid forward backreference" sre)) - (else - (let ((lo2 (car (vector-ref sublens n))) - (hi2 (cdr (vector-ref sublens n)))) - (return (+ lo lo2) (and hi hi2 (+ hi hi2)))))))) - ((* *?) - (lp (sre-sequence (cdr sre)) n lo hi (lambda (lo hi) #f)) - (return lo #f)) - ((** **?) - (cond - ((or (and (number? (cadr sre)) - (number? (caddr sre)) - (> (cadr sre) (caddr sre))) - (and (not (cadr sre)) (caddr sre))) - (return lo hi)) - (else - (if (caddr sre) - (lp (sre-sequence (cdddr sre)) n 0 0 - (lambda (lo2 hi2) - (return (+ lo (* (cadr sre) lo2)) - (and hi hi2 (+ hi (* (caddr sre) hi2)))))) - (lp (sre-sequence (cdddr sre)) n 0 0 - (lambda (lo2 hi2) - (return (+ lo (* (cadr sre) lo2)) #f))))))) - ((+) - (lp (sre-sequence (cdr sre)) n lo hi - (lambda (lo2 hi2) - (return (+ lo lo2) #f)))) - ((? ??) - (lp (sre-sequence (cdr sre)) n lo hi - (lambda (lo2 hi2) - (return lo (and hi hi2 (+ hi hi2)))))) - ((= =? >= >=?) - (lp `(** ,(cadr sre) - ,(if (memq (car sre) '(>= >=?)) #f (cadr sre)) - ,@(cddr sre)) - n lo hi return)) - ((look-ahead neg-look-ahead look-behind neg-look-behind) - (return lo hi)) - (else - (error "sre-length-ranges: unknown sre operator" sre))))) - ((char? sre) - (grow 1)) - ((string? sre) - (grow (string-length sre))) - ((memq sre '(any nonl)) - (grow 1)) - ((memq sre '(epsilon bos eos bol eol bow eow nwb commit)) - (return lo hi)) - (else - (let ((cell (assq sre sre-named-definitions))) - (if cell - (lp (cdr cell) n lo hi return) - (error "sre-length-ranges: unknown sre" sre))))))) - sublens)) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; sre manipulation - -;; build a (seq ls ...) sre from a list -(define (sre-sequence ls) - (cond - ((null? ls) 'epsilon) - ((null? (cdr ls)) (car ls)) - (else (cons 'seq ls)))) - -;; build a (or ls ...) sre from a list -(define (sre-alternate ls) - (cond - ((null? ls) 'epsilon) - ((null? (cdr ls)) (car ls)) - (else (cons 'or ls)))) - -;; returns an equivalent SRE without any match information -(define (sre-strip-submatches sre) - (if (not (pair? sre)) - sre - (case (car sre) - ((submatch) (sre-strip-submatches (sre-sequence (cdr sre)))) - ((dsm) (sre-strip-submatches (sre-sequence (cdddr sre)))) - (else (map sre-strip-submatches sre))))) - -;; given a char-set list of chars and strings, flattens them into -;; chars only -(define (sre-flatten-ranges ls) - (let lp ((ls ls) (res '())) - (cond - ((null? ls) - (reverse res)) - ((string? (car ls)) - (lp (append (string->list (car ls)) (cdr ls)) res)) - (else - (lp (cdr ls) (cons (car ls) res)))))) - -(define (sre-names sre n names) - (if (not (pair? sre)) - names - (case (car sre) - ((submatch) - (sre-names (sre-sequence (cdr sre)) (+ n 1) names)) - ((submatch-named) - (sre-names (sre-sequence (cddr sre)) - (+ n 1) - (cons (cons (cadr sre) n) names))) - ((dsm) - (sre-names (sre-sequence (cdddr sre)) (+ n (cadr sre)) names)) - ((seq : or * + ? *? ?? w/case w/nocase atomic - look-ahead look-behind neg-look-ahead neg-look-behind) - (sre-sequence-names (cdr sre) n names)) - ((= >=) - (sre-sequence-names (cddr sre) n names)) - ((** **?) - (sre-sequence-names (cdddr sre) n names)) - (else - names)))) - -(define (sre-sequence-names ls n names) - (if (null? ls) - names - (sre-sequence-names (cdr ls) - (+ n (sre-count-submatches (car ls))) - (sre-names (car ls) n names)))) - -(define (sre-remove-initial-bos sre) - (cond - ((pair? sre) - (case (car sre) - ((seq : submatch * +) - (cond - ((not (pair? (cdr sre))) - sre) - ((eq? 'bos (cadr sre)) - (cons (car sre) (cddr sre))) - (else - (cons (car sre) - (cons (sre-remove-initial-bos (cadr sre)) (cddr sre)))))) - ((or) - (sre-alternate (map sre-remove-initial-bos (cdr sre)))) - (else - sre))) - (else - sre))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; matching - -(define (irregex-search x str . o) - (let ((irx (irregex x))) - (let ((start (if (pair? o) (car o) 0)) - (end (if (and (pair? o) (pair? (cdr o))) - (cadr o) (string-length str))) - (matches (irregex-new-matches irx))) - (irregex-match-string-set! matches str) - (irregex-search/matches irx str start end matches)))) - -;; internal routine, can be used in loops to avoid reallocating the -;; match vector -(define (irregex-search/matches irx str start end matches) - (cond - ((irregex-dfa irx) - (cond - ((flag-set? (irregex-flags irx) ~searcher?) - (let ((m-end (dfa-match/longest (irregex-dfa irx) str start end))) - (cond - (m-end - (irregex-match-start-index-set! matches 0 start) - (irregex-match-end-index-set! matches 0 m-end) - ((irregex-dfa/extract irx) str start m-end matches) - matches) - (else - #f)))) - (else - (let ((first-match - (dfa-match/shortest (irregex-dfa/search irx) str start end))) - (and - first-match - (let* ((lo+hi (vector-ref (irregex-lengths irx) 0)) - (m-start (if (cdr lo+hi) - (max start (- first-match (cdr lo+hi))) - start)) - (m-limit (- first-match (car lo+hi))) - (dfa (irregex-dfa irx))) - (let lp ((m-start m-start)) - (and (<= m-start m-limit) - (let ((m-end (dfa-match/longest dfa str m-start end))) - (cond - (m-end - (irregex-match-start-index-set! matches 0 m-start) - (irregex-match-end-index-set! matches 0 m-end) - ((irregex-dfa/extract irx) str m-start m-end matches) - matches) - (else - (lp (+ m-start 1))))))))))))) - (else - (let ((matcher (irregex-nfa irx))) - (let lp ((start start)) - (and (<= start end) - (let ((i (matcher str start matches (lambda () #f)))) - (cond - (i - (irregex-match-start-index-set! matches 0 start) - (irregex-match-end-index-set! matches 0 i) - matches) - (else - (lp (+ start 1))))))))))) - -(define (irregex-match irx str) - (let* ((irx (irregex irx)) - (matches (irregex-new-matches irx)) - (start 0) - (end (string-length str))) - (irregex-match-string-set! matches str) - (cond - ((irregex-dfa irx) - (let ((m-end (dfa-match/longest (irregex-dfa irx) str start end))) - (cond - ((equal? m-end end) - (irregex-match-start-index-set! matches 0 start) - (irregex-match-end-index-set! matches 0 m-end) - ((irregex-dfa/extract irx) str start m-end matches) - matches) - (else - #f)))) - (else - (let* ((matcher (irregex-nfa irx)) - (i (matcher str start matches (lambda () #f)))) - (cond - ((equal? i end) - (irregex-match-start-index-set! matches 0 start) - (irregex-match-end-index-set! matches 0 i) - matches) - (else - #f))))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; DFA matching - -;; inline these -(define (dfa-init-state dfa) - (vector-ref dfa 0)) -(define (dfa-next-state dfa node) - (vector-ref dfa (cdr node))) -(define (dfa-final-state? dfa state) - (car state)) - -;; this searches for the first end index for which a match is possible -(define (dfa-match/shortest dfa str start end) - (let lp ((i start) (state (dfa-init-state dfa))) - (if (dfa-final-state? dfa state) - i - (and (< i end) - (let* ((ch (string-ref str i)) - (next (find (lambda (x) - (or (eqv? ch (car x)) - (and (pair? (car x)) - (char<=? (caar x) ch) - (char<=? ch (cdar x))))) - (cdr state)))) - (and next (lp (+ i 1) (dfa-next-state dfa next)))))))) - -;; this finds the longest match starting at a given index -(define (dfa-match/longest dfa str start end) - (let lp ((i start) - (state (dfa-init-state dfa)) - (res (and (dfa-final-state? dfa (dfa-init-state dfa)) start))) - (if (>= i end) - res - (let* ((ch (string-ref str i)) - (cell (find (lambda (x) - (or (eqv? ch (car x)) - (and (pair? (car x)) - (char<=? (caar x) ch) - (char<=? ch (cdar x))))) - (cdr state)))) - (if cell - (let ((next (dfa-next-state dfa cell))) - (lp (+ i 1) - next - (if (dfa-final-state? dfa next) (+ i 1) res))) - res))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; SRE->NFA compilation -;; -;; An NFA state is a numbered node with a list of patter->number -;; transitions, where pattern is either a character, (lo . hi) -;; character range, or epsilon (indicating an empty transition). -;; There may be duplicate characters and overlapping ranges - since -;; it's an NFA we process it by considering all possible transitions. - -(define sre-named-definitions - `((any . ,*all-chars*) - (nonl . (- ,*all-chars* (,(string #\newline)))) - (alphabetic . (/ #\a #\z #\A #\Z)) - (alpha . alphabetic) - (alphanumeric . (/ #\a #\z #\A #\Z #\0 #\9)) - (alphanum . alphanumeric) - (alnum . alphanumeric) - (lower-case . (/ #\a #\z)) - (lower . lower-case) - (upper-case . (/ #\A #\Z)) - (upper . upper-case) - (numeric . (/ #\0 #\9)) - (num . numeric) - (digit . numeric) - (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\. - #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\})) - (punct . punctuation) - (graphic - . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~)) - (graph . graphic) - (blank . (or #\space ,(integer->char (- (char->integer #\space) 23)))) - (whitespace . (or blank #\newline)) - (space . whitespace) - (white . whitespace) - (printing or graphic whitespace) - (print . printing) - ;; XXXX we assume a (possibly shifted) ASCII-based ordering - (control . (/ ,(integer->char (- (char->integer #\space) 32)) - ,(integer->char (- (char->integer #\space) 1)))) - (cntrl . control) - (hex-digit . (or numeric (/ #\a #\f #\A #\F))) - (xdigit . hex-digit) - (ascii . (/ ,(integer->char (- (char->integer #\space) 32)) - ,(integer->char (+ (char->integer #\space) 95)))) - (ascii-nonl . (/ ,(integer->char (- (char->integer #\space) 32)) - ,(integer->char (- (char->integer #\newline) 1)) - ,(integer->char (+ (char->integer #\newline) 1)) - ,(integer->char (+ (char->integer #\space) 95)))) - (newline . (or (seq ,(integer->char (+ (char->integer #\newline) 3)) - #\newline) - (/ #\newline - ,(integer->char (+ (char->integer #\newline) 3))))) - - ;; ... it's really annoying to support scheme48 - (word . (seq bow (+ (or alphanumeric #\_)) eow)) - (utf8-tail-char . (/ ,(integer->char (+ (char->integer #\space) #x60)) - ,(integer->char (+ (char->integer #\space) #xA1)))) - (utf8-2-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xA2)) - ,(integer->char (+ (char->integer #\space) #xBF))) - utf8-tail-char)) - (utf8-3-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xC0)) - ,(integer->char (+ (char->integer #\space) #xCF))) - utf8-tail-char - utf8-tail-char)) - (utf8-4-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xD0)) - ,(integer->char (+ (char->integer #\space) #xD7))) - utf8-tail-char - utf8-tail-char - utf8-tail-char)) - (utf8-any . (or ascii utf8-2-char utf8-3-char utf8-4-char)) - (utf8-nonl . (or ascii-nonl utf8-2-char utf8-3-char utf8-4-char)) - )) - -;; Compile and return the list of NFA states. The start state will be -;; at the head of the list, and all remaining states will be in -;; descending numeric order, with state 0 being the unique accepting -;; state. -(define (sre->nfa sre . o) - ;; we loop over an implicit sequence list - (let lp ((ls (list sre)) - (n 1) - (flags (if (pair? o) (car o) ~none)) - (next (list (list 0)))) - (define (new-state-number state) - (max n (+ 1 (caar state)))) - (define (extend-state next . trans) - (and next - (cons (cons (new-state-number next) - (map (lambda (x) (cons x (caar next))) trans)) - next))) - (if (null? ls) - next - (cond - ((string? (car ls)) - ;; process literal strings a char at a time - (lp (append (string->list (car ls)) (cdr ls)) n flags next)) - ((eq? 'epsilon (car ls)) - ;; chars and epsilons go directly into the transition table - (extend-state (lp (cdr ls) n flags next) (car ls))) - ((char? (car ls)) - (let ((alt (char-altcase (car ls)))) - (if (and (flag-set? flags ~case-insensitive?) - (not (eqv? (car ls) alt))) - (extend-state (lp (cdr ls) n flags next) (car ls) alt) - (extend-state (lp (cdr ls) n flags next) (car ls))))) - ((symbol? (car ls)) - (let ((cell (assq (car ls) sre-named-definitions))) - (and cell (lp (cons (cdr cell) (cdr ls)) n flags next)))) - ((pair? (car ls)) - (cond - ((string? (caar ls)) - ;; enumerated character set - (lp (cons (sre-alternate (string->list (caar ls))) (cdr ls)) - n - flags - next)) - (else - (case (caar ls) - ((seq :) - ;; for an explicit sequence, just append to the list - (lp (append (cdar ls) (cdr ls)) n flags next)) - ((w/case w/nocase w/utf8 w/noutf8) - (let* ((next (lp (cdr ls) n flags next)) - (flags ((if (memq (caar ls) '(w/case w/utf8)) - flag-clear - flag-join) - flags - (if (memq (caar ls) '(w/case w/nocase)) - ~case-insensitive? - ~utf8?)))) - (and next (lp (cdar ls) (new-state-number next) flags next)))) - ((/ - & ~) - (let ((ranges (sre->cset (car ls) - (flag-set? flags ~case-insensitive?)))) - (case (length ranges) - ((1) - (extend-state (lp (cdr ls) n flags next) (car ranges))) - (else - (let ((next (lp (cdr ls) n flags next))) - (and - next - (lp (list (sre-alternate - (map (lambda (x) (if (pair? x) - (list '/ (car x) (cdr x)) - x)) - ranges))) - (new-state-number next) - (flag-clear flags ~case-insensitive?) - next))))))) - ((or) - (let* ((next (lp (cdr ls) n flags next)) - (b (and next - (lp (list (sre-alternate (cddar ls))) - (new-state-number next) - flags - next))) - (a (and b (lp (list (cadar ls)) - (new-state-number b) - flags - next)))) - ;; compile both branches and insert epsilon - ;; transitions to either - (and a - `((,(new-state-number a) - (epsilon . ,(caar a)) - (epsilon . ,(caar b))) - ,@(take-up-to a next) - ,@b)))) - ((?) - (let ((next (lp (cdr ls) n flags next))) - ;; insert an epsilon transition directly to next - (and - next - (let ((a (lp (cdar ls) (new-state-number next) flags next))) - (cond - (a - (set-cdr! (car a) `((epsilon . ,(caar next)) ,@(cdar a))) - a) - (else - #f)))))) - ((+ *) - (let ((next (lp (cdr ls) n flags next))) - (and - next - (let* ((new (lp '(epsilon) - (new-state-number next) - flags - next)) - (a (lp (cdar ls) (new-state-number new) flags new))) - (and - a - (begin - ;; for *, insert an epsilon transition as in ? above - (if (eq? '* (caar ls)) - (set-cdr! (car a) - `((epsilon . ,(caar new)) ,@(cdar a)))) - ;; for both, insert a loop back to self - (set-cdr! (car new) - `((epsilon . ,(caar a)) ,@(cdar new))) - a)))))) - ((submatch submatch-named) - ;; ignore submatches altogether - (lp (cons (sre-sequence (cdar ls)) (cdr ls)) n flags next)) - (else - #f))))) - (else - #f))))) - -;; We don't really want to use this, we use the closure compilation -;; below instead, but this is included for reference and testing the -;; sre->nfa conversion. - -;; (define (nfa-match nfa str) -;; (let lp ((ls (string->list str)) (state (car nfa)) (epsilons '())) -;; (if (null? ls) -;; (zero? (car state)) -;; (any (lambda (m) -;; (if (eq? 'epsilon (car m)) -;; (and (not (memv (cdr m) epsilons)) -;; (lp ls (assv (cdr m) nfa) (cons (cdr m) epsilons))) -;; (and (or (eqv? (car m) (car ls)) -;; (and (pair? (car m)) -;; (char<=? (caar m) (car ls)) -;; (char<=? (car ls) (cdar m)))) -;; (lp (cdr ls) (assv (cdr m) nfa) '())))) -;; (cdr state))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; NFA->DFA compilation -;; -;; During processing, the DFA is a list of the form: -;; -;; ((NFA-states ...) accepting-state? transitions ...) -;; -;; where the transitions are as in the NFA, except there are no -;; epsilons, duplicate characters or overlapping char-set ranges, and -;; the states moved to are closures (sets of NFA states). Multiple -;; DFA states may be accepting states. - -(define (nfa->dfa nfa . o) - (let ((max-states (and (pair? o) (car o)))) - (let lp ((ls (list (nfa-closure nfa (list (caar nfa))))) - (i 0) - (res '())) - (cond - ((null? ls) - (dfa-renumber (reverse res))) - ((assoc (car ls) res) - (lp (cdr ls) i res)) - (else - (let* ((states (car ls)) - (trans (nfa-state-transitions nfa states)) - (accept? (and (memv 0 states) #t))) - (and (or (not max-states) (< (+ i 1) max-states)) - (lp (append (map cdr trans) (cdr ls)) - (+ i 1) - `((,states ,accept? ,@trans) ,@res))))))))) - -;; When the conversion is complete we renumber the DFA sets-of-states -;; in order and convert the result to a vector for fast lookup. -(define (dfa-renumber dfa) - (let ((states (map cons (map car dfa) (zero-to (length dfa))))) - (define (renumber state) - (cdr (assoc state states))) - (list->vector - (map - (lambda (node) - (cons (cadr node) - (map (lambda (x) (cons (car x) (renumber (cdr x)))) - (cddr node)))) - dfa)))) - -;; Extract all distinct characters or ranges and the potential states -;; they can transition to from a given set of states. Any ranges that -;; would overlap with distinct characters are split accordingly. -(define (nfa-state-transitions nfa states) - (let lp ((trans '()) ;; list of (char . state) or ((char . char) . state) - (ls states) ;; list of integers (remaining state numbers) - (res '())) ;; (char state ...) or ((char . char) state ...) - (cond - ((null? trans) - (if (null? ls) - (map (lambda (x) (cons (car x) (nfa-closure nfa (cdr x)))) - res) - (let ((node (assv (car ls) nfa))) - (lp (if node (cdr node) '()) (cdr ls) res)))) - ((eq? 'epsilon (caar trans)) - (lp (cdr trans) ls res)) - (else - (lp (cdr trans) ls (nfa-join-transitions! res (car trans))))))) - -(define (nfa-join-transitions! existing new) - (define (join ls elt state) - (if (not elt) - ls - (nfa-join-transitions! ls (cons elt state)))) - (cond - ((char? (car new)) - (let ((ch (car new))) - (let lp ((ls existing) (res '())) - (cond - ((null? ls) - ;; done, just cons this on to the original list - (cons (list ch (cdr new)) existing)) - ((eqv? ch (caar ls)) - ;; add a new state to an existing char - (set-cdr! (car ls) (insert-sorted (cdr new) (cdar ls))) - existing) - ((and (pair? (caar ls)) - (char<=? (caaar ls) ch) - (char<=? ch (cdaar ls))) - ;; split a range - (apply - (lambda (left right) - (cons (cons ch (insert-sorted (cdr new) (cdar ls))) - (append (if left (list (cons left (cdar ls))) '()) - (if right (list (cons right (cdar ls))) '()) - res - (cdr ls)))) - (split-char-range (caar ls) (car new)))) - (else - ;; keep looking - (lp (cdr ls) (cons (car ls) res))))))) - (else - (let ((lo (caar new)) - (hi (cdar new))) - (let lp ((ls existing) (res '())) - (cond - ((null? ls) - (cons (list (car new) (cdr new)) existing)) - ((and (char? (caar ls)) (char<=? lo (caar ls)) (char<=? (caar ls) hi)) - ;; range enclosing a character - (apply - (lambda (left right) - (set-cdr! (car ls) (insert-sorted (cdr new) (cdar ls))) - (join (join existing left (cdr new)) right (cdr new))) - (split-char-range (car new) (caar ls)))) - ((and (pair? (caar ls)) - (or (and (char<=? (caaar ls) hi) (char<=? lo (cdaar ls))) - (and (char<=? hi (caaar ls)) (char<=? (cdaar ls) lo)))) - ;; overlapping ranges - (apply - (lambda (left1 left2 same right1 right2) - (let ((old-states (cdar ls))) - (set-car! (car ls) same) - (set-cdr! (car ls) (insert-sorted (cdr new) old-states)) - (let* ((res (if right1 - (cons (cons right1 old-states) existing) - existing)) - (res (if right2 (cons (cons right2 old-states) res) res))) - (join (join res left1 (cdr new)) left2 (cdr new))))) - (intersect-char-ranges (car new) (caar ls)))) - (else - (lp (cdr ls) (cons (car ls) res))))))))) - -(define (char-range c1 c2) - (if (eqv? c1 c2) c1 (cons c1 c2))) - -;; assumes ch is included in the range -(define (split-char-range range ch) - (list - (and (not (eqv? ch (car range))) - (char-range (car range) (integer->char (- (char->integer ch) 1)))) - (and (not (eqv? ch (cdr range))) - (char-range (integer->char (+ (char->integer ch) 1)) (cdr range))))) - -;; returns (possibly #f) char ranges: -;; a-only-1 a-only-2 a-and-b b-only-1 b-only-2 -(define (intersect-char-ranges a b) - (if (char>? (car a) (car b)) - (reverse (intersect-char-ranges b a)) - (let ((a-lo (car a)) - (a-hi (cdr a)) - (b-lo (car b)) - (b-hi (cdr b))) - (list - (and (char<? a-lo b-lo) - (char-range a-lo (integer->char (- (char->integer b-lo) 1)))) - (and (char>? a-hi b-hi) - (char-range (integer->char (+ (char->integer b-hi) 1)) a-hi)) - (char-range b-lo (if (char<? b-hi a-hi) b-hi a-hi)) - #f - (and (char>? b-hi a-hi) - (char-range (integer->char (+ (char->integer a-hi) 1)) b-hi)))))) - -;; The `closure' of a list of NFA states - all states that can be -;; reached from any of them using any number of epsilon transitions. -(define (nfa-closure nfa states) - (let lp ((ls states) - (res '())) - (cond - ((null? ls) - res) - ((memv (car ls) res) - (lp (cdr ls) res)) - (else - (lp (append (map cdr - (filter (lambda (trans) (eq? 'epsilon (car trans))) - (cdr (assv (car ls) nfa)))) - (cdr ls)) - (insert-sorted (car ls) res)))))) - -;; insert an integer uniquely into a sorted list -(define (insert-sorted n ls) - (cond - ((null? ls) - (cons n '())) - ((<= n (car ls)) - (if (= n (car ls)) - ls - (cons n ls))) - (else - (cons (car ls) (insert-sorted n (cdr ls)))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; DFAs don't give us match information, so once we match and -;; determine the start and end, we need to recursively break the -;; problem into smaller DFAs to get each submatch. -;; -;; See http://compilers.iecc.com/comparch/article/07-10-026 - -(define (sre-match-extractor sre) - (let lp ((sre sre) (n 1) (submatch-deps? #f)) - (cond - ((not (sre-has-submatchs? sre)) - (if (not submatch-deps?) - (lambda (str i j matches) j) - (let ((dfa (nfa->dfa (sre->nfa sre)))) - (lambda (str i j matches) - (dfa-match/longest dfa str i j))))) - ((pair? sre) - (case (car sre) - ((: seq) - (let* ((right (sre-sequence (cddr sre))) - (match-left (lp (cadr sre) n #t)) - (match-right - (lp right (+ n (sre-count-submatches (cadr sre))) #t))) - (lambda (str i j matches) - (let lp ((k j) (best #f)) - (if (< k i) - best - (let* ((middle (match-left str i k matches)) - (end (and middle - (eqv? middle k) - (match-right str middle j matches)))) - (if (eqv? end j) - end - (lp (- k 1) - (if (or (not best) (and end (> end best))) - end - best))))))))) - ((or) - (let* ((rest (sre-alternate (cddr sre))) - (match-first - (lp (cadr sre) n #t)) - (match-rest - (lp rest - (+ n (sre-count-submatches (cadr sre))) - submatch-deps?))) - (lambda (str i j matches) - (let ((k (match-first str i j matches))) - (if (eqv? k j) - k - (match-rest str i j matches)))))) - ((* +) - (letrec ((match-once - (lp (sre-sequence (cdr sre)) n #t)) - (match-all - (lambda (str i j matches) - (let ((k (match-once str i j matches))) - (if (and k (< i k)) - (match-all str k j matches) - i))))) - (if (eq? '* (car sre)) - match-all - (lambda (str i j matches) - (let ((k (match-once str i j matches))) - (and k - (match-all str k j matches))))))) - ((?) - (let ((match-once (lp (sre-sequence (cdr sre)) n #t))) - (lambda (str i j matches) - (let ((k (match-once str i j matches))) - (or k i))))) - ((submatch) - (let ((match-one - (lp (sre-sequence (cdr sre)) (+ n 1) #t))) - (lambda (str i j matches) - (let ((res (match-one str i j matches))) - (cond - ((number? res) - (irregex-match-start-index-set! matches n i) - (irregex-match-end-index-set! matches n res))) - res)))) - (else - (error "unknown regexp operator" (car sre))))) - (else - (error "unknown regexp" sre))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; closure compilation - we use this for non-regular expressions -;; instead of an interpreted NFA matcher - -(define (sre->procedure sre . o) - (define names - (if (and (pair? o) (pair? (cdr o))) (cadr o) (sre-names sre 1 '()))) - (let lp ((sre sre) - (n 1) - (flags (if (pair? o) (car o) ~none)) - (next (lambda (str i matches fail) i))) - (define (rec sre) (lp sre n flags next)) - (cond - ((pair? sre) - (if (string? (car sre)) - (sre-cset->procedure - (sre->cset (car sre) (flag-set? flags ~case-insensitive?)) - next) - (case (car sre) - ((~ - & /) - (sre-cset->procedure - (sre->cset sre (flag-set? flags ~case-insensitive?)) - next)) - ((or) - (case (length (cdr sre)) - ((0) (lambda (str i matches fail) (fail))) - ((1) (rec (cadr sre))) - (else - (let* ((first (rec (cadr sre))) - (rest (lp (sre-alternate (cddr sre)) - (+ n (sre-count-submatches (cadr sre))) - flags - next))) - (lambda (str i matches fail) - (first str i matches (lambda () (rest str i matches fail)))))))) - ((w/case) - (lp (sre-sequence (cdr sre)) - n - (flag-clear flags ~case-insensitive?) - next)) - ((w/nocase) - (lp (sre-sequence (cdr sre)) - n - (flag-join flags ~case-insensitive?) - next)) - ((w/utf8) - (lp (sre-sequence (cdr sre)) n (flag-join flags ~utf8?) next)) - ((w/noutf8) - (lp (sre-sequence (cdr sre)) n (flag-clear flags ~utf8?) next)) - ((seq :) - (case (length (cdr sre)) - ((0) next) - ((1) (rec (cadr sre))) - (else - (let ((rest (lp (sre-sequence (cddr sre)) - (+ n (sre-count-submatches (cadr sre))) - flags - next))) - (lp (cadr sre) n flags rest))))) - ((?) - (let ((body (rec (sre-sequence (cdr sre))))) - (lambda (str i matches fail) - (body str i matches (lambda () (next str i matches fail)))))) - ((??) - (let ((body (rec (sre-sequence (cdr sre))))) - (lambda (str i matches fail) - (next str i matches (lambda () (body str i matches fail)))))) - ((*) - (cond - ((sre-empty? (sre-sequence (cdr sre))) - (error "invalid sre: empty *" sre)) - (else - (letrec ((body - (lp (sre-sequence (cdr sre)) - n - flags - (lambda (str i matches fail) - (body str - i - matches - (lambda () (next str i matches fail))))))) - (lambda (str i matches fail) - (body str i matches (lambda () (next str i matches fail)))))))) - ((*?) - (cond - ((sre-empty? (sre-sequence (cdr sre))) - (error "invalid sre: empty *?" sre)) - (else - (letrec ((body - (lp (sre-sequence (cdr sre)) - n - flags - (lambda (str i matches fail) - (next str - i - matches - (lambda () (body str i matches fail))))))) - (lambda (str i matches fail) - (next str i matches (lambda () (body str i matches fail)))))))) - ((+) - (lp (sre-sequence (cdr sre)) - n - flags - (rec (list '* (sre-sequence (cdr sre)))))) - ((=) - (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre)))) - ((>=) - (rec `(** ,(cadr sre) #f ,@(cddr sre)))) - ((** **?) - (cond - ((or (and (number? (cadr sre)) - (number? (caddr sre)) - (> (cadr sre) (caddr sre))) - (and (not (cadr sre)) (caddr sre))) - (lambda (str i matches fail) (fail))) - (else - (let* ((from (cadr sre)) - (to (caddr sre)) - (? (if (eq? '** (car sre)) '? '??)) - (* (if (eq? '** (car sre)) '* '*?)) - (sre (sre-sequence (cdddr sre))) - (x-sre (sre-strip-submatches sre)) - (next (if to - (if (= from to) - next - (fold (lambda (x next) - (lp `(,? ,sre) n flags next)) - next - (zero-to (- to from)))) - (rec `(,* ,sre))))) - (if (zero? from) - next - (lp `(seq ,@(map (lambda (x) x-sre) (zero-to (- from 1))) - ,sre) - n - flags - next)))))) - ((word) - (rec `(seq bow ,@(cdr sre) eow))) - ((word+) - (rec `(seq bow (+ (& (or alphanumeric "_") - (or ,@(cdr sre)))) eow))) - ((posix-string) - (rec (string->sre (cadr sre)))) - ((look-ahead) - (let ((check - (lp (sre-sequence (cdr sre)) - n - flags - (lambda (str i matches fail) i)))) - (lambda (str i matches fail) - (if (check str i matches (lambda () #f)) - (next str i matches fail) - (fail))))) - ((neg-look-ahead) - (let ((check - (lp (sre-sequence (cdr sre)) - n - flags - (lambda (str i matches fail) i)))) - (lambda (str i matches fail) - (if (check str i matches (lambda () #f)) - (fail) - (next str i matches fail))))) - ((look-behind) - (let ((check - (lp (sre-sequence (cons '(* any) (cdr sre))) - n - flags - (lambda (str i matches fail) i)))) - (lambda (str i matches fail) - (if (eqv? i (check (substring str 0 i) 0 matches (lambda () #f))) - (next str i matches fail) - (fail))))) - ((neg-look-behind) - (let ((check - (lp (sre-sequence (cons '(* any) (cdr sre))) - n - flags - (lambda (str i matches fail) i)))) - (lambda (str i matches fail) - (if (eqv? i (check (substring str 0 i) 0 matches (lambda () #f))) - (fail) - (next str i matches fail))))) - ((atomic) - (let ((once - (lp (sre-sequence (cdr sre)) - n - flags - (lambda (str i matches fail) i)))) - (lambda (str i matches fail) - (let ((j (once str i matches (lambda () #f)))) - (if j - (next str j matches fail) - (fail)))))) - ((if) - (let* ((test-submatches (sre-count-submatches (cadr sre))) - (pass (lp (caddr sre) flags (+ n test-submatches) next)) - (fail (if (pair? (cdddr sre)) - (lp (cadddr sre) - (+ n test-submatches - (sre-count-submatches (caddr sre))) - flags - next) - (lambda (str i matches fail) (fail))))) - (cond - ((or (number? (cadr sre)) (symbol? (cadr sre))) - (let ((index - (if (symbol? (cadr sre)) - (cond - ((assq (cadr sre) names) => cdr) - (else - (error "unknown named backref in SRE IF" sre))) - (cadr sre)))) - (lambda (str i matches fail2) - (if (irregex-match-end-index matches index) - (pass str i matches fail2) - (fail str i matches fail2))))) - (else - (let ((test (lp (cadr sre) n flags pass))) - (lambda (str i matches fail2) - (test str i matches (lambda () (fail str i matches fail2))) - )))))) - ((backref backref-ci) - (let ((n (cond ((number? (cadr sre)) (cadr sre)) - ((assq (cadr sre) names) => cdr) - (else (error "unknown backreference" (cadr sre))))) - (compare (if (or (eq? (car sre) 'backref-ci) - (flag-set? flags ~case-insensitive?)) - string-ci=? - string=?))) - (lambda (str i matches fail) - (let ((s (irregex-match-substring matches n))) - (if (not s) - (fail) - (let ((j (+ i (string-length s)))) - (if (and (<= j (string-length str)) - (compare s (substring str i j))) - (next str j matches fail) - (fail)))))))) - ((dsm) - (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) flags next)) - ((submatch) - (let ((body - (lp (sre-sequence (cdr sre)) - (+ n 1) - flags - (lambda (str i matches fail) - (let ((old (irregex-match-end-index matches n))) - (irregex-match-end-index-set! matches n i) - (next str i matches - (lambda () - (irregex-match-end-index-set! matches n old) - (fail)))))))) - (lambda (str i matches fail) - (let ((old (irregex-match-start-index matches n))) - (irregex-match-start-index-set! matches n i) - (body str i matches - (lambda () - (irregex-match-start-index-set! matches n old) - (fail))))))) - ((submatch-named) - (rec `(submatch ,@(cddr sre)))) - (else - (error "unknown regexp operator" sre))))) - ((symbol? sre) - (case sre - ((any) - (lambda (str i matches fail) - (if (< i (string-length str)) - (next str (+ i 1) matches fail) - (fail)))) - ((nonl) - (lambda (str i matches fail) - (if (and (< i (string-length str)) - (not (eqv? #\newline (string-ref str i)))) - (next str (+ i 1) matches fail) - (fail)))) - ((bos) - (lambda (str i matches fail) - (if (zero? i) (next str i matches fail) (fail)))) - ((bol) - (lambda (str i matches fail) - (if (or (zero? i) (eqv? #\newline (string-ref str (- i 1)))) - (next str i matches fail) - (fail)))) - ((bow) - (lambda (str i matches fail) - (if (and (or (zero? i) - (not (char-alphanumeric? (string-ref str (- i 1))))) - (< i (string-length str)) - (char-alphanumeric? (string-ref str i))) - (next str i matches fail) - (fail)))) - ((eos) - (lambda (str i matches fail) - (if (>= i (string-length str)) (next str i matches fail) (fail)))) - ((eol) - (lambda (str i matches fail) - (if (or (>= i (string-length str)) - (eqv? #\newline (string-ref str i))) - (next str i matches fail) - (fail)))) - ((eow) - (lambda (str i matches fail) - (if (and (or (>= i (string-length str)) - (not (char-alphanumeric? (string-ref str i)))) - (> i 0) - (char-alphanumeric? (string-ref str (- i 1)))) - (next str i matches fail) - (fail)))) - ((nwb) ;; non-word-boundary - (lambda (str i matches fail) - (if (and (not (zero? i)) - (< i (string-length str)) - (if (char-alphanumeric? (string-ref str (- i 1))) - (char-alphanumeric? (string-ref str i)) - (not (char-alphanumeric? (string-ref str i))))) - (next str i matches fail) - (fail)))) - ((epsilon) - next) - (else - (let ((cell (assq sre sre-named-definitions))) - (if cell - (rec (cdr cell)) - (error "unknown regexp" sre)))))) - ((char? sre) - (if (flag-set? flags ~case-insensitive?) - (lambda (str i matches fail) - (if (and (< i (string-length str)) - (char-ci=? sre (string-ref str i))) - (next str (+ i 1) matches fail) - (fail))) - (lambda (str i matches fail) - (if (and (< i (string-length str)) - (eqv? sre (string-ref str i))) - (next str (+ i 1) matches fail) - (fail))))) - ((string? sre) - (rec (sre-sequence (string->list sre)))) - (else - (error "unknown regexp" sre))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Simple character sets as lists of ranges, as used in the NFA/DFA -;; compilation. This is not especially efficient, but is portable and -;; scalable for any range of character sets. - -(define (sre-cset->procedure cset next) - (lambda (str i matches fail) - (if (and (< i (string-length str)) - (cset-contains? cset (string-ref str i))) - (next str (+ i 1) matches fail) - (fail)))) - -(define (plist->alist ls) - (let lp ((ls ls) (res '())) - (if (null? ls) - (reverse res) - (lp (cddr ls) (cons (cons (car ls) (cadr ls)) res))))) - -(define (alist->plist ls) - (let lp ((ls ls) (res '())) - (if (null? ls) - (reverse res) - (lp (cdr ls) (cons (cdar ls) (cons (caar ls) res)))))) - -(define (sre->cset sre . o) - (let lp ((sre sre) (ci? (and (pair? o) (car o)))) - (define (rec sre) (lp sre ci?)) - (cond - ((pair? sre) - (if (string? (car sre)) - (if ci? - (cset-case-insensitive (string->list (car sre))) - (string->list (car sre))) - (case (car sre) - ((~) - (cset-complement - (fold cset-union (rec (cadr sre)) (map rec (cddr sre))))) - ((&) - (fold cset-intersection (rec (cadr sre)) (map rec (cddr sre)))) - ((-) - (fold (lambda (x res) (cset-difference res x)) - (rec (cadr sre)) - (map rec (cddr sre)))) - ((/) - (let ((res (plist->alist (sre-flatten-ranges (cdr sre))))) - (if ci? - (cset-case-insensitive res) - res))) - ((or) - (fold cset-union (rec (cadr sre)) (map rec (cddr sre)))) - ((w/case) - (lp (sre-alternate (cdr sre)) #f)) - ((w/nocase) - (lp (sre-alternate (cdr sre)) #t)) - (else - (error "not a valid sre char-set operator" sre))))) - ((char? sre) (rec (list (string sre)))) - ((string? sre) (rec (list sre))) - (else - (let ((cell (assq sre sre-named-definitions))) - (if cell - (rec (cdr cell)) - (error "not a valid sre char-set" sre))))))) - -;;;; another debugging utility -;; (define (cset->sre cset) -;; (let lp ((ls cset) (chars '()) (ranges '())) -;; (cond -;; ((null? ls) -;; (sre-alternate -;; (append -;; (if (pair? chars) (list (list (list->string chars))) '()) -;; (if (pair? ranges) (list (cons '/ (alist->plist ranges))) '())))) -;; ((char? (car ls)) (lp (cdr ls) (cons (car ls) chars) ranges)) -;; (else (lp (cdr ls) chars (cons (car ls) ranges)))))) - -(define (cset-contains? cset ch) - (find (lambda (x) - (or (eqv? x ch) - (and (pair? x) (char<=? (car x) ch) (char<=? ch (cdr x))))) - cset)) - -(define (cset-range x) - (if (char? x) (cons x x) x)) - -(define (char-ranges-overlap? a b) - (if (pair? a) - (if (pair? b) - (or (and (char<=? (car a) (cdr b)) (char<=? (car b) (cdr a))) - (and (char<=? (cdr b) (car a)) (char<=? (cdr a) (car b)))) - (and (char<=? (car a) b) (char<=? b (cdr a)))) - (if (pair? b) - (char-ranges-overlap? b a) - (eqv? a b)))) - -(define (char-ranges-union a b) - (cons (if (char<=? (car a) (car b)) (car a) (car b)) - (if (char>=? (cdr a) (cdr b)) (cdr a) (cdr b)))) - -(define (cset-union a b) - (cond ((null? b) a) - ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) - => (lambda (ls) - (cset-union - (cset-union (append (take-up-to a ls) (cdr ls)) - (list (char-ranges-union (cset-range (car ls)) - (cset-range (car b))))) - (cdr b)))) - (else (cset-union (cons (car b) a) (cdr b))))) - -(define (cset-difference a b) - (cond ((null? b) a) - ((not (car b)) (cset-difference a (cdr b))) - ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) - => (lambda (ls) - (apply - (lambda (left1 left2 same right1 right2) - (let* ((a (append (take-up-to a ls) (cdr ls))) - (a (if left1 (cons left1 a) a)) - (a (if left2 (cons left2 a) a)) - (b (if right1 (cset-union b (list right1)) b)) - (b (if right2 (cset-union b (list right2)) b))) - (cset-difference a b))) - (intersect-char-ranges (cset-range (car ls)) - (cset-range (car b)))))) - (else (cset-difference a (cdr b))))) - -(define (cset-intersection a b) - (let intersect ((a a) (b b) (res '())) - (cond ((null? b) res) - ((find-tail (lambda (x) (char-ranges-overlap? x (car b))) a) - => (lambda (ls) - (apply - (lambda (left1 left2 same right1 right2) - (let* ((a (append (take-up-to a ls) (cdr ls))) - (a (if left1 (cons left1 a) a)) - (a (if left2 (cons left2 a) a)) - (b (if right1 (cset-union b (list right1)) b)) - (b (if right2 (cset-union b (list right2)) b))) - (intersect a b (cset-union res (list same))))) - (intersect-char-ranges (cset-range (car ls)) - (cset-range (car b)))))) - (else (intersect a (cdr b) res))))) - -(define (cset-complement a) - (cset-difference (sre->cset *all-chars*) a)) - -(define (cset-case-insensitive a) - (let lp ((ls a) (res '())) - (cond ((null? ls) (reverse res)) - ((and (char? (car ls)) (char-alphabetic? (car ls))) - (let ((c2 (char-altcase (car ls))) - (res (cons (car ls) res))) - (lp (cdr ls) (if (cset-contains? res c2) res (cons c2 res))))) - ((and (pair? (car ls)) - (char-alphabetic? (caar ls)) - (char-alphabetic? (cdar ls))) - (lp (cdr ls) - (cset-union (cset-union res (list (car ls))) - (list (cons (char-altcase (caar ls)) - (char-altcase (cdar ls))))))) - (else (lp (cdr ls) (cset-union res (list (car ls)))))))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; match and replace utilities - -(define (irregex-fold irx kons knil str . o) - (let* ((irx (irregex irx)) - (matches (irregex-new-matches irx)) - (finish (if (pair? o) (car o) (lambda (i acc) acc))) - (start (if (and (pair? o) (pair? (cdr o))) (cadr o) 0)) - (end (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o))) - (caddr o) - (string-length str)))) - (irregex-match-string-set! matches str) - (let lp ((i start) (acc knil)) - (if (>= i end) - (finish i acc) - (let ((m (irregex-search/matches irx str i end matches))) - (if (not m) - (finish i acc) - (let* ((end (irregex-match-end m 0)) - (acc (kons i m acc))) - (irregex-reset-matches! matches) - (lp end acc)))))))) - -(define (irregex-replace irx str . o) - (let ((m (irregex-search (irregex irx) str))) - (and - m - (string-cat-reverse - (cons (substring str (irregex-match-end m 0) (string-length str)) - (append (irregex-apply-match m o) - (list (substring str 0 (irregex-match-start m 0))))))))) - -(define (irregex-replace/all irx str . o) - (irregex-fold - irx - (lambda (i m acc) - (let ((m-start (irregex-match-start m 0))) - (append (irregex-apply-match m o) - (if (= i m-start) - acc - (cons (substring str i m-start) acc))))) - '() - str - (lambda (i acc) - (let ((end (string-length str))) - (string-cat-reverse (if (= i end) - acc - (cons (substring str i end) acc))))))) - -(define (irregex-apply-match m ls) - (let lp ((ls ls) (res '())) - (if (null? ls) - res - (cond - ((integer? (car ls)) - (lp (cdr ls) - (cons (or (irregex-match-substring m (car ls)) "") res))) - ((procedure? (car ls)) - (lp (cdr ls) (cons ((car ls) m) res))) - ((symbol? (car ls)) - (case (car ls) - ((pre) - (lp (cdr ls) - (cons (substring (irregex-match-string m) - 0 - (irregex-match-start m 0)) - res))) - ((post) - (lp (cdr ls) - (cons (substring (irregex-match-string m) - (irregex-match-end m 0) - (string-length (irregex-match-string m))) - res))) - (else (error "unknown match replacement" (car ls))))) - (else - (lp (cdr ls) (cons (car ls) res))))))) +;;;; irregex.scm - container for irregex-core.scm +; +; Copyright (c) 2010, The Chicken Team +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following +; conditions are met: +; +; Redistributions of source code must retain the above copyright notice, this list of conditions and the following +; disclaimer. +; Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +; disclaimer in the documentation and/or other materials provided with the distribution. +; Neither the name of the author nor the names of its contributors may be used to endorse or promote +; products derived from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS +; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +; POSSIBILITY OF SUCH DAMAGE. + + + +(declare (unit irregex)) + +(declare + (no-procedure-checks) + (fixnum) + (export + ##sys#glob->regexp + irregex + irregex-apply-match + irregex-dfa + irregex-dfa/extract + irregex-dfa/search + irregex-extract + irregex-flags + irregex-fold + irregex-fold/chunked + irregex-lengths + irregex-match + irregex-match? + irregex-match-data? + irregex-match-end + irregex-match-end-chunk + irregex-match-end-index + irregex-match-names + irregex-match-num-submatches + irregex-match-start + irregex-match-start-chunk + irregex-match-start-index + irregex-match-string + irregex-match-subchunk + irregex-match-substring + irregex-match-valid-index? + irregex-match/chunked + irregex-names + irregex-new-matches + irregex-nfa + irregex-num-submatches + irregex-opt + irregex-quote + irregex-replace + irregex-replace/all + irregex-reset-matches! + irregex-search + irregex-search/chunked + irregex-search/matches + irregex-split + irregex-submatches + irregex? + make-irregex-chunker + maybe-string->sre + irregex-search/chunked + sre->irregex + sre->string + string->irregex + string->sre + )) + +(include "common-declarations.scm") + +(register-feature! 'irregex) + +(define-syntax (build-cache x r c) + ;; (build-cache N ARG FAIL) + (let* ((n (cadr x)) + (n2 (* n 2)) + (arg (caddr x)) + (fail (cadddr x)) + (%cache (r 'cache)) + (%index (r 'index)) + (%arg (r 'arg)) + (%let (r 'let)) + (%let* (r 'let*)) + (%if (r 'if)) + (%fx+ (r 'fx+)) + (%fxmod (r 'fxmod)) + (%equal? (r 'equal?)) + (%quote (r 'quote)) + (%tmp (r 'tmp)) + (%begin (r 'begin)) + (cache (make-vector (add1 n2) #f))) + (##sys#setslot cache n2 0) ; last slot: current index + `(,%let* ((,%cache (,%quote ,cache)) ; we mutate a literal vector + (,%arg ,arg)) + ,(let fold ((i 0)) + (if (fx>= i n) + ;; this should be thread-safe: a context-switch can only + ;; happen before this code and in the call to FAIL. + `(,%let ((,%tmp ,fail) + (,%index (##sys#slot ,%cache ,n2))) + (##sys#setslot ,%cache ,%index ,%arg) + (##sys#setslot ,%cache (,%fx+ ,%index 1) ,%tmp) + (##sys#setislot + ,%cache ,n2 + (##core#inline "C_u_fixnum_modulo" (,%fx+ ,%index 2) ,n2)) + ,%tmp) + `(,%if (,%equal? (##sys#slot ,%cache ,(* i 2)) ,%arg) + (##sys#slot ,%cache ,(add1 (* i 2))) + ,(fold (add1 i)))))))) + +(define-compiler-syntax %%string-copy! + (syntax-rules () + ((_ to tstart from fstart fend) + (let ((x to) + (y tstart) + (z from) + (u fstart) + (v fend)) + (##core#inline "C_substring_copy" z x u v y))))) + +(define-compiler-syntax %substring=? + (syntax-rules () + ((_ a b start1 start2 len) + (##core#inline "C_substring_compare" a b start1 start2 len)))) + +(define-compiler-syntax make-irregex + (syntax-rules () + ((_ dfa dfa/search dfa/extract nfa flags submatches lengths names) + (##sys#make-structure + 'regexp dfa dfa/search dfa/extract nfa flags submatches lengths names)))) + +(define-compiler-syntax make-irregex-match + (syntax-rules () + ((_ count names) + (##sys#make-structure + 'regexp-match + (make-vector (+ (* 4 (+ 2 count)) 3) #f) ; #1: submatches + names ; #2: (guess) + #f ; #3: chunka + #f)))) ; #4: fail + +(define-compiler-syntax bit-shl + (syntax-rules () + ((_ n i) (fxshl n i)))) + +(define-compiler-syntax bit-shr + (syntax-rules () + ((_ n i) (fxshr n i)))) + +(define-compiler-syntax bit-not + (syntax-rules () + ((_ n) (fxnot n)))) + +(define-compiler-syntax bit-ior + (syntax-rules () + ((_ a b) (fxior a b)))) + +(define-compiler-syntax bit-and + (syntax-rules () + ((_ a b) (fxand a b)))) + +(define-compiler-syntax match-vector-ref + (syntax-rules () + ((_ m i) (##sys#slot (##sys#slot m 1) i)))) + +(define-compiler-syntax match-vector-set! + (syntax-rules () + ((_ m i x) (##sys#setslot (##sys#slot m 1) i x)))) + +(define-compiler-syntax irregex-match-start-chunk-set! + (syntax-rules () + ((_ m n start) + (vector-set! (##sys#slot m 1) (* n 4) start)))) + +(define-compiler-syntax irregex-match-start-index-set! + (syntax-rules () + ((_ m n start) + (vector-set! (##sys#slot m 1) (+ 1 (* n 4)) start)))) + +(define-compiler-syntax irregex-match-end-chunk-set! + (syntax-rules () + ((_ m n end) + (vector-set! (##sys#slot m 1) (+ 2 (* n 4)) end)))) + +(define-compiler-syntax irregex-match-end-index-set! + (syntax-rules () + ((_ m n end) + (vector-set! (##sys#slot m 1) (+ 3 (* n 4)) end)))) + +(define-compiler-syntax %irregex-error + (syntax-rules () + ((_ args ...) + (error args ...)))) + +(include "irregex-core.scm") +(include "irregex-utils.scm") + +(define ##sys#glob->regexp + (let ((list->string list->string) + (string->list string->list)) + (lambda (s #!optional sre?) + (##sys#check-string s 'glob->regexp) + (let ((sre + (cons + ': + (let loop ((cs (string->list s)) (dir #t)) + (if (null? cs) + '() + (let ((c (car cs)) + (rest (cdr cs)) ) + (cond ((char=? c #\*) + (if dir + `((or (: (~ ("./\\")) + (* (~ ("/\\")))) + (* (~ ("./\\")))) + ,@(loop rest #f)) + `((* (~ ("/\\"))) ,@(loop rest #f)))) + ((char=? c #\?) (cons 'any (loop rest #f))) + ((char=? c #\[) + (let loop2 ((rest rest) (s '())) + (cond ((not (pair? rest)) + (error 'glob->regexp + "unexpected end of character class" s)) + ((char=? #\] (car rest)) + `((or ,@s) ,@(loop (cdr rest) #f))) + ((and (pair? (cdr rest)) + (pair? (cddr rest)) + (char=? #\- (cadr rest)) ) + (loop2 (cdddr rest) + (cons `(/ ,(car rest) ,(caddr rest)) s))) + ((and (pair? (cdr rest)) + (char=? #\- (car rest))) + (loop2 (cddr rest) + (cons `(~ ,(cadr rest)) s))) + (else + (loop2 (cdr rest) (cons (car rest) s)))))) + (else (cons c (loop rest (memq c '(#\\ #\/)))))))))))) + (if sre? sre (irregex sre)))))) diff --git a/library.scm b/library.scm index c4b99b0d..d601a02f 100644 --- a/library.scm +++ b/library.scm @@ -76,7 +76,8 @@ #define C_a_get_current_seconds(ptr, c, dummy) C_flonum(ptr, time(NULL)) #define C_peek_c_string_at(ptr, i) ((C_char *)(((C_char **)ptr)[ i ])) -static C_word fast_read_line_from_file(C_word str, C_word port, C_word size) { +static C_word +fast_read_line_from_file(C_word str, C_word port, C_word size) { int n = C_unfix(size); int i; int c; @@ -101,7 +102,7 @@ static C_word fast_read_line_from_file(C_word str, C_word port, C_word size) { } static C_word -fast_read_string_from_file (C_word dest, C_word port, C_word len, C_word pos) +fast_read_string_from_file(C_word dest, C_word port, C_word len, C_word pos) { int n = C_unfix (len); char * buf = ((char *)C_data_pointer (dest) + C_unfix (pos)); @@ -1739,7 +1740,8 @@ EOF (define (##sys#check-port x . loc) (unless (%port? x) - (##sys#signal-hook #:type-error (and (pair? loc) (car loc)) "argument is not a port" x) ) ) + (##sys#signal-hook + #:type-error (and (pair? loc) (car loc)) "argument is not a port" x) ) ) (define (##sys#check-port-mode port mode . loc) (unless (eq? mode (##sys#slot port 1)) diff --git a/manual/Supported language b/manual/Supported language index 3fe014cd..d7be9c82 100644 --- a/manual/Supported language +++ b/manual/Supported language @@ -19,7 +19,7 @@ * [[Unit ports]] I/O ports * [[Unit files]] File and pathname operations * [[Unit extras]] Useful utility definitions -* [[Unit regex]] Regular expressions +* [[Unit irregex]] Regular expressions * [[Unit srfi-1]] List Library * [[Unit srfi-4]] Homogeneous numeric vectors * [[Unit srfi-13]] String library diff --git a/manual/Unit extras b/manual/Unit extras index f7f28cca..3d0163a8 100644 --- a/manual/Unit extras +++ b/manual/Unit extras @@ -196,4 +196,4 @@ false. Returns a string with the accumulated characters. --- Previous: [[Unit files]] -Next: [[Unit regex]] +Next: [[Unit irregex]] diff --git a/manual/Unit irregex b/manual/Unit irregex new file mode 100644 index 00000000..387b8422 --- /dev/null +++ b/manual/Unit irregex @@ -0,0 +1,826 @@ +[[tags: manual]] +[[toc:]] + +== Unit irregex + +This library unit provides support for regular expressions, using the +powerful ''irregex'' regular expression engine by Alex Shinn. It +supports both POSIX syntax with various (irregular) PCRE extensions, +as well as SCSH's SRE syntax, with various aliases for commonly used +patterns. DFA matching is used when possible, otherwise a +closure-compiled NFA approach is used. Matching may be performed over +standard Scheme strings, or over arbitrarily chunked streams of +strings. + +On systems that support dynamic loading, the {{irregex}} unit can +be made available in the Chicken interpreter ({{csi}}) by entering + +<enscript highlight=scheme> +(require-extension irregex) +</enscript> + +[[toc:]] + +=== Specification + +==== Procedures + +===== irregex +===== string->irregex +===== sre->irregex + +<procedure>(irregex <posix-string-or-sre> [<options> ...])</procedure><br> +<procedure>(string->irregex <posix-string> [<options> ...])</procedure><br> +<procedure>(sre->irregex <sre> [<options> ...])</procedure><br> + +Compiles a regular expression from either a POSIX-style regular +expression string (with most PCRE extensions) or an SCSH-style SRE. +There is no {{(rx ...)}} syntax - just use normal Scheme lists, with +{{quasiquote}} if you like. + +Technically a string by itself could be considered a valid (though +rather silly) SRE, so if you want to just match a literal string you +should use something like {{(irregex `(: ,str))}}, or use the explicit +{{(sre->irregex str)}}. + +The options are a list of any of the following symbols: + +; {{'i}}, {{'case-insensitive}} : match case-insensitively +; {{'m}}, {{'multi-line}} : treat string as multiple lines (effects {{^}} and {{$}}) +; {{'s}}, {{'single-line}} : treat string as a single line ({{.}} can match newline) +; {{'utf8}} : utf8-mode (assumes strings are byte-strings) +; {{'fast}} : try to optimize the regular expression +; {{'small}} : try to compile a smaller regular expression +; {{'backtrack}} : enforce a backtracking implementation + +The {{'fast}} and {{'small}} options are heuristic guidelines and will +not necessarily make the compiled expression faster or smaller. + +===== string->sre +===== maybe-string->sre + +<procedure>(string->sre <str>)</procedure><br> +<procedure>(maybe-string->sre <obj>)</procedure><br> + +For backwards compatibility, procedures to convert a POSIX string into +an SRE. + +{{maybe-string->sre}} does the same thing, but only if the argument is +a string, otherwise it assumes {{<obj>}} is an SRE and returns it +as-is. This is useful when you want to provide an API that allows +either a POSIX string or SRE (like {{irregex}} or {{irregex-search}} +below) - it ensures the result is an SRE. + +===== irregex? + +<procedure>(irregex? <obj>)</procedure><br> + +Returns {{#t}} iff the object is a regular expression. + +===== irregex-search + +<procedure>(irregex-search <irx> <str> [<start> <end>])</procedure> + +Searches for any instances of the pattern {{<irx>}} (a POSIX string, SRE +sexp, or pre-compiled regular expression) in {{<str>}}, optionally between +the given range. If a match is found, returns a match object, +otherwise returns {{#f}}. + +Match objects can be used to query the original range of the string or +its submatches using the {{irregex-match-*}} procedures below. + +Examples: + +<enscript highlight=scheme> +(irregex-search "foobar" "abcFOOBARdef") => #f + +(irregex-search "foobar" "abcFOOBARdef" 'i) => #<match> + +(irregex-search '(w/nocase "foobar") "abcFOOBARdef") => #<match> +</enscript> + +Note, the actual match result is represented by a vector in the +default implementation. Throughout this manual, we'll just write +{{#<match>}} to show that a successful match was returned when the +details are not important. + +Matching follows the POSIX leftmost, longest semantics, when +searching. That is, of all possible matches in the string, +{{irregex-search}} will return the match at the first position +(leftmost). If multiple matches are possible from that same first +position, the longest match is returned. + +===== irregex-match + +<procedure>(irregex-match <irx> <str>)</procedure> + +Like {{irregex-search}}, but performs an anchored match against the +beginning and end of the string, without searching. + +Examples: + +<enscript highlight=scheme> +(irregex-match '(w/nocase "foobar") "abcFOOBARdef") => #f + +(irregex-match '(w/nocase "foobar") "FOOBAR") => #<match> +</enscript> + +===== irregex-match-data? + +<procedure>(irregex-match-data? <obj>)</procedure> + +Returns {{#t}} iff the object is a successful match result from +{{irregex-search}} or {{irregex-match}}. + +===== irregex-num-submatches +===== irregex-match-num-submatches + +<procedure>(irregex-num-submatches <irx>)</procedure><br> +<procedure>(irregex-match-num-submatches <match>)</procedure> + +Returns the number of numbered submatches that are defined in the +irregex or match object. + +===== irregex-names +===== irregex-match-names + +<procedure>(irregex-names <irx>)</procedure><br> +<procedure>(irregex-match-names <match>)</procedure> + +Returns an association list of named submatches that are defined in +the irregex or match object. The {{car}} of each item in this list is +the name of a submatch, the {{cdr}} of each item is the numerical +submatch corresponding to this name. If a named submatch occurs +multiple times in the irregex, it will also occur multiple times in +this list. + +===== irregex-match-valid-index? + +<procedure>(irregex-match-valid-index? <match> <index-or-name>)</procedure><br> + +Returns {{#t}} iff the {{index-or-name}} named submatch or index is +defined in the {{match}} object. + +===== irregex-match-substring +===== irregex-match-start-index +===== irregex-match-end-index + +<procedure>(irregex-match-substring <match> [<index-or-name>])</procedure><br> +<procedure>(irregex-match-start-index <match> [<index-or-name>])</procedure><br> +<procedure>(irregex-match-end-index <match> [<index-or-name>])</procedure> + +Fetches the matched substring (or its start or end offset) at the +given submatch index, or named submatch. The entire match is index 0, +the first 1, etc. The default is index 0. + +===== irregex-match-subchunk + +<procedure>(irregex-match-subchunk <match> [<index-or-name>])</procedure> + +Generates a chunked data-type for the given match item, of the same +type as the underlying chunk type (see Chunked String Matching below). +This is only available if the chunk type specifies the get-subchunk +API, otherwise an error is raised. + +===== irregex-replace +===== irregex-replace/all + +<procedure>(irregex-replace <irx> <str> [<replacements> ...])</procedure><br> +<procedure>(irregex-replace/all <irx> <str> [<replacements> ...])</procedure> + +Matches a pattern in a string, and replaces it with a (possibly empty) +list of substitutions. Each {{<replacement>}} can be either a string +literal, a numeric index, a symbol (as a named submatch), or a +procedure which takes one argument (the match object) and returns a +string. + +Examples: + +<enscript highlight=scheme> +(irregex-replace "[aeiou]" "hello world" "*") => "h*llo world" + +(irregex-replace/all "[aeiou]" "hello world" "*") => "h*ll* w*rld" +</enscript> + +===== irregex-split +===== irregex-extract + +<procedure>(irregex-split <irx> <str> [<start> <end>])</procedure><br> +<procedure>(irregex-extract <irx> <str> [<start> <end>])</procedure> + +{{irregex-split}} splits the string {{<str>}} into substrings divided +by the pattern in {{<irx>}}. {{irregex-extract}} does the opposite, +returning a list of each instance of the pattern matched disregarding +the substrings in between. + +===== irregex-fold + +<procedure>(irregex-fold <irx> <kons> <knil> <str> [<finish> <start> <end>])</procedure> + +This performs a fold operation over every non-overlapping place +{{<irx>}} occurs in the string {{str}}. + +The {{<kons>}} procedure takes the following signature: + +<enscript highlight=scheme> +(<kons> <from-index> <match> <seed>) +</enscript> + +where {{<from-index>}} is the index from where we started searching +(initially {{<start>}} and thereafter the end index of the last +match), {{<match>}} is the resulting match-data object, and {{<seed>}} +is the accumulated fold result starting with {{<knil>}}. + +The rationale for providing the {{<from-index>}} (which is not +provided in the SCSH {{regexp-fold}} utility), is because this +information is useful (e.g. for extracting the unmatched portion of +the string before the current match, as needed in +{{irregex-replace}}), and not otherwise directly accessible. + +The optional {{<finish>}} takes two arguments: + +<enscript highlight=scheme> +(<finish> <from-index> <seed>) +</enscript> + +which simiarly allows you to pick up the unmatched tail of the string, +and defaults to just returning the {{<seed>}}. + +{{<start>}} and {{<end>}} are numeric indices letting you specify the +boundaries of the string on which you want to fold. + +To extract all instances of a match out of a string, you can use + +<enscript highlight=scheme> +(map irregex-match-substring + (irregex-fold <irx> + (lambda (i m s) (cons m s)) + '() + <str> + (lambda (i s) (reverse s)))) +</enscript> + +==== Extended SRE Syntax + +Irregex provides the first native implementation of SREs (Scheme +Regular Expressions), and includes many extensions necessary both for +minimal POSIX compatibility, as well as for modern extensions found in +libraries such as PCRE. + +The following table summarizes the SRE syntax, with detailed +explanations following. + + ;; basic patterns + <string> ; literal string + (seq <sre> ...) ; sequence + (: <sre> ...) + (or <sre> ...) ; alternation + + ;; optional/multiple patterns + (? <sre> ...) ; 0 or 1 matches + (* <sre> ...) ; 0 or more matches + (+ <sre> ...) ; 1 or more matches + (= <n> <sre> ...) ; exactly <n> matches + (>= <n> <sre> ...) ; <n> or more matches + (** <from> <to> <sre> ...) ; <n> to <m> matches + (?? <sre> ...) ; non-greedy (non-greedy) pattern: (0 or 1) + (*? <sre> ...) ; non-greedy kleene star + (**? <from> <to> <sre> ...) ; non-greedy range + + ;; submatch patterns + (submatch <sre> ...) ; numbered submatch + ($ <sre> ...) + (submatch-named <name> <sre> ...) ; named submatch + (=> <name> <sre> ...) + (backref <n-or-name>) ; match a previous submatch + + ;; toggling case-sensitivity + (w/case <sre> ...) ; enclosed <sre>s are case-sensitive + (w/nocase <sre> ...) ; enclosed <sre>s are case-insensitive + + ;; character sets + <char> ; singleton char set + (<string>) ; set of chars + (or <cset-sre> ...) ; set union + (~ <cset-sre> ...) ; set complement (i.e. [^...]) + (- <cset-sre> ...) ; set difference + (& <cset-sre> ...) ; set intersection + (/ <range-spec> ...) ; pairs of chars as ranges + + ;; named character sets + any + nonl + ascii + lower-case lower + upper-case upper + alphabetic alpha + numeric num + alphanumeric alphanum alnum + punctuation punct + graphic graph + whitespace white space + printing print + control cntrl + hex-digit xdigit + + ;; assertions and conditionals + bos eos ; beginning/end of string + bol eol ; beginning/end of line + bow eow ; beginning/end of word + nwb ; non-word-boundary + (look-ahead <sre> ...) ; zero-width look-ahead assertion + (look-behind <sre> ...) ; zero-width look-behind assertion + (neg-look-ahead <sre> ...) ; zero-width negative look-ahead assertion + (neg-look-behind <sre> ...) ; zero-width negative look-behind assertion + (atomic <sre> ...) ; for (?>...) independent patterns + (if <test> <pass> [<fail>]) ; conditional patterns + commit ; don't backtrack beyond this (i.e. cut) + + ;; backwards compatibility + (posix-string <string>) ; embed a POSIX string literal + +===== Basic SRE Patterns + +The simplest SRE is a literal string, which matches that string +exactly. + +<enscript highlight=scheme> +(irregex-search "needle" "hayneedlehay") => #<match> +</enscipt> + +By default the match is case-sensitive, though you can control this +either with the compiler flags or local overrides: + +<enscript highlight=scheme> +(irregex-search "needle" "haynEEdlehay") => #f + +(irregex-search (irregex "needle" 'i) "haynEEdlehay") => #<match> + +(irregex-search '(w/nocase "needle") "haynEEdlehay") => #<match> +</enscript> + +You can use {{w/case}} to switch back to case-sensitivity inside a +{{w/nocase}} or when the SRE was compiled with {{'i}}: + +<enscript highlight=scheme> +(irregex-search '(w/nocase "SMALL" (w/case "BIG")) "smallBIGsmall") => #<match> + +(irregex-search '(w/nocase "small" (w/case "big")) "smallBIGsmall") => #f +</enscript> + +Of course, literal strings by themselves aren't very interesting +regular expressions, so we want to be able to compose them. The most +basic way to do this is with the {{seq}} operator (or its abbreviation +{{:}}), which matches one or more patterns consecutively: + +<enscript highlight=scheme> +(irregex-search '(: "one" space "two" space "three") "one two three") => #<match> +</enscript> + +As you may have noticed above, the {{w/case}} and {{w/nocase}} +operators allowed multiple SREs in a sequence - other operators that +take any number of arguments (e.g. the repetition operators below) +allow such implicit sequences. + +To match any one of a set of patterns use the {{or}} alternation +operator: + +<enscript highlight=scheme> +(irregex-search '(or "eeney" "meeney" "miney") "meeney") => #<match> + +(irregex-search '(or "eeney" "meeney" "miney") "moe") => #f +</enscript> + +===== SRE Repetition Patterns + +There are also several ways to control the number of times a pattern +is matched. The simplest of these is {{?}} which just optionally +matches the pattern: + +<enscript highlight=scheme> +(irregex-search '(: "match" (? "es") "!") "matches!") => #<match> + +(irregex-search '(: "match" (? "es") "!") "match!") => #<match> + +(irregex-search '(: "match" (? "es") "!") "matche!") => #<match> +</enscript> + +To optionally match any number of times, use {{*}}, the Kleene star: + +<enscript highlight=scheme> +(irregex-search '(: "<" (* (~ #\>)) ">") "<html>") => #<match> + +(irregex-search '(: "<" (* (~ #\>)) ">") "<>") => #<match> + +(irregex-search '(: "<" (* (~ #\>)) ">") "<html") => #f +</enscript> + +Often you want to match any number of times, but at least one time is +required, and for that you use {{+}}: + +<enscript highlight=scheme> +(irregex-search '(: "<" (+ (~ #\>)) ">") "<html>") => #<match> + +(irregex-search '(: "<" (+ (~ #\>)) ">") "<a>") => #<match> + +(irregex-search '(: "<" (+ (~ #\>)) ">") "<>") => #f +</enscript> + +More generally, to match at least a given number of times, use {{>=}}: + +<enscript highlight=scheme> +(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<table>") => #<match> + +(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<pre>") => #<match> + +(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<tr>") => #f +</enscript> + +To match a specific number of times exactly, use {{=}}: + +<enscript highlight=scheme> +(irregex-search '(: "<" (= 4 (~ #\>)) ">") "<html>") => #<match> + +(irregex-search '(: "<" (= 4 (~ #\>)) ">") "<table>") => #f +</enscript> + +And finally, the most general form is {{**}} which specifies a range +of times to match. All of the earlier forms are special cases of this. + +<enscript highlight=scheme> +(irregex-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.168.1.10") => #<match> + +(irregex-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.0168.1.10") => #f +</enscript> + +There are also so-called "non-greedy" variants of these repetition +operators, by convention suffixed with an additional {{?}}. Since the +normal repetition patterns can match any of the allotted repetition +range, these operators will match a string if and only if the normal +versions matched. However, when the endpoints of which submatch +matched where are taken into account (specifically, all matches when +using irregex-search since the endpoints of the match itself matter), +the use of a non-greedy repetition can change the result. + +So, whereas {{?}} can be thought to mean "match or don't match," +{{??}} means "don't match or match." {{*}} typically consumes as much +as possible, but {{*?}} tries first to match zero times, and only +consumes one at a time if that fails. If you have a greedy operator +followed by a non-greedy operator in the same pattern, they can +produce surprisins results as they compete to make the match longer or +shorter. If this seems confusing, that's because it is. Non-greedy +repetitions are defined only in terms of the specific backtracking +algorithm used to implement them, which for compatibility purposes +always means the Perl algorithm. Thus, when using these patterns you +force IrRegex to use a backtracking engine, and can't rely on +efficient execution. + +===== SRE Character Sets + +Perhaps more common than matching specific strings is matching any of +a set of characters. You can use the {{or}} alternation pattern on a +list of single-character strings to simulate a character set, but this +is too clumsy for everyday use so SRE syntax allows a number of +shortcuts. + +A single character matches that character literally, a trivial +character class. More conveniently, a list holding a single element +which is a string refers to the character set composed of every +character in the string. + +<enscript highlight=scheme> +(irregex-match '(* #\-) "---") => #<match> + +(irregex-match '(* #\-) "-_-") => #f + +(irregex-match '(* ("aeiou")) "oui") => #<match> + +(irregex-match '(* ("aeiou")) "ouais") => #f +</enscript> + +Ranges are introduced with the \q{/} operator. Any strings or +characters in the \q{/} are flattened and then taken in pairs to +represent the start and end points, inclusive, of character ranges. + +<enscript highlight=scheme> +(irregex-match '(* (/ "AZ09")) "R2D2") => #<match> + +(irregex-match '(* (/ "AZ09")) "C-3PO") => #f +</enscript> + +In addition, a number of set algebra operations are provided. \q{or}, +of course, has the same meaning, but when all the options are +character sets it can be thought of as the set union operator. This +is further extended by the \q{&} set intersection, \q{-} set +difference, and \q{~} set complement operators. + +<enscript highlight=scheme> +(irregex-match '(* (& (/ "az") (~ ("aeiou")))) "xyzzy") => #<match> + +(irregex-match '(* (& (/ "az") (~ ("aeiou")))) "vowels") => #f + +(irregex-match '(* (- (/ "az") ("aeiou"))) "xyzzy") => #<match> + +(irregex-match '(* (- (/ "az") ("aeiou"))) "vowels") => #f +</enscript> + +===== SRE Assertion Patterns + +There are a number of times it can be useful to assert something about +the area around a pattern without explicitly making it part of the +pattern. The most common cases are specifically anchoring some +pattern to the beginning or end of a word or line or even the whole +string. For example, to match on the end of a word: + +<enscript highlight=scheme> +(irregex-match '(: "foo" eow) "foo") => #<match> + +(irregex-match '(: "foo" eow) "foo!") => #<match> + +(irregex-match '(: "foo" eow) "foof") => #f +</enscript> + +The {{bow}}, {{bol}}, {{eol}}, {{bos}} and {{eos}} work similarly. +{{nwb}} asserts that you are not in a word-boundary - if replaced for +{{eow}} in the above examples it would reverse all the results. + +There is no {{wb}}, since you tend to know from context whether it +would be the beginning or end of a word, but if you need it you can +always use {{(or bow eow)}}. + +Somewhat more generally, Perl introduced positive and negative +look-ahead and look-behind patterns. Perl look-behind patterns are +limited to a fixed length, however the IrRegex versions have no such +limit. + +<enscript highlight=scheme> +(irregex-match '(: "regular" (look-ahead " expression")) + "regular expression") + => #<match> +</enscript> + +The most general case, of course, would be an \q{and} pattern to +complement the \q{or} pattern - all the patterns must match or the +whole pattern fails. This may be provided in a future release, +although it (and look-ahead and look-behind assertions) are unlikely +to be compiled efficiently. + +===== SRE Utility Patterns + +The following utility regular expressions are also provided for common +patterns that people are eternally reinventing. They are not +necessarily the official patterns matching the RFC definitions of the +given data, because of the way that such patterns tend to be used. +There are three general usages for regexps: + +; searching : search for a pattern matching a desired object in a larger text + +; validation : determine whether an entire string matches a pattern + +; extraction : given a string already known to be valid, extract certain fields from it as submatches + +In some cases, but not always, these will overlap. When they are +different, {{irregex-search}} will naturally always want the searching +version, so IrRegex provides that version. + +As an example where these might be different, consider a URL. If you +want to match all the URLs in some arbitrary text, you probably want +to exclude a period or comma at the tail end of a URL, since it's more +likely being used as punctuation rather than part of the URL, despite +the fact that it would be valid URL syntax. + +Another problem with the RFC definitions is the standard itself may +have become irrelevant. For example, the pattern IrRegex provides for +email addresses doesn't match quoted local parts (e.g. +{{"first last"@domain.com}}) because these are increasingly rare, and +unsupported by enough software that it's better to discourage their use. +Conversely, technically consecutive periods +(e.g. {{first..last@domain.com}}) are not allowed in email addresses, but +most email software does allow this, and in fact such addresses are +quite common in Japan. + +The current patterns provided are: + + newline ; general newline pattern (crlf, cr, lf) + integer ; an integer + real ; a real number (including scientific) + string ; a "quoted" string + symbol ; an R5RS Scheme symbol + ipv4-address ; a numeric decimal ipv4 address + ipv6-address ; a numeric hexadecimal ipv6 address + domain ; a domain name + email ; an email address + http-url ; a URL beginning with https?:// + +Because of these issues the exact definitions of these patterns are +subject to be changed, but will be documented clearly when they are +finalized. More common patterns are also planned, but as what you +want increases in complexity it's probably better to use a real +parser. + +==== Supported PCRE Syntax + +Since the PCRE syntax is so overwhelming complex, it's easier to just +list what we *don't* support for now. Refer to the +[[http://pcre.org/pcre.txt|PCRE documentation]] for details. You +should be using the SRE syntax anyway! + +Unicode character classes ({{\P}}) are not supported, but will be +in an upcoming release. {{\C}} named characters are not supported. + +Callbacks, subroutine patterns and recursive patterns are not +supported. ({{*FOO}}) patterns are not supported and may never be. + +{{\G}} and {{\K}} are not supported. + +Octal character escapes are not supported because they are ambiguous +with back-references - just use hex character escapes. + +Other than that everything should work, including named submatches, +zero-width assertions, conditional patterns, etc. + +In addition, {{\<}} and {{\>}} act as beginning-of-word and end-of-word +marks, respectively, as in Emacs regular expressions. + +Also, two escapes are provided to embed SRE patterns inside PCRE +strings, {{"\'<sre>"}} and {{"(*'<sre>)"}}. For example, to match a +comma-delimited list of integers you could use + +<enscript highlight=scheme> +"\\'integer(,\\'integer)*" +</enscript> + +and to match a URL in angle brackets you could use + +<enscript highlight=scheme> +"<('*http-url)>" +</enscript> + +Note in the second example the enclosing {{"('*...)"}} syntax is needed +because the Scheme reader would consider the closing {{">"}} as part of +the SRE symbol. + +The following chart gives a quick reference from PCRE form to the SRE +equivalent: + + ;; basic syntax + "^" ;; bos (or eos inside (?m: ...)) + "$" ;; eos (or eos inside (?m: ...)) + "." ;; nonl + "a?" ;; (? a) + "a*" ;; (* a) + "a+" ;; (+ a) + "a??" ;; (?? a) + "a*?" ;; (*? a) + "a+?" ;; (+? a) + "a{n,m}" ;; (** n m a) + + ;; grouping + "(...)" ;; (submatch ...) + "(?:...)" ;; (: ...) + "(?i:...)" ;; (w/nocase ...) + "(?-i:...)" ;; (w/case ...) + "(?<name>...)" ;; (=> <name>...) + + ;; character classes + "[aeiou]" ;; ("aeiou") + "[^aeiou]" ;; (~ "aeiou") + "[a-z]" ;; (/ "az") or (/ "a" "z") + "[[:alpha:]]" ;; alpha + + ;; assertions + "(?=...)" ;; (look-ahead ...) + "(?!...)" ;; (neg-look-ahead ...) + "(?<=...)" ;; (look-behind ...) + "(?<!...)" ;; (neg-look-behind ...) + "(?(test)pass|fail)" ;; (if test pass fail) + "(*COMMIT)" ;; commit + +==== Chunked String Matching + +It's often desirable to perform regular expression matching over +sequences of characters not represented as a single string. The most +obvious example is a text-buffer data structure, but you may also want +to match over lists or trees of strings (i.e. ropes), over only +certain ranges within a string, over an input port, etc. With +existing regular expression libraries, the only way to accomplish this +is by converting the abstract sequence into a freshly allocated +string. This can be expensive, or even impossible if the object is a +text-buffer opened onto a 500MB file. + +IrRegex provides a chunked string API specifically for this purpose. +You define a chunking API with {{make-irregex-chunker}}: + +===== make-irregex-chunker + +<procedure>(make-irregex-chunker <get-next> <get-string> [<get-start> <get-end> <get-substring> <get-subchunk>])</procedure> + +where + +{{(<get-next> chunk) => }} returns the next chunk, or {{#f}} if there are no more chunks + +{{(<get-string> chunk) => }} a string source for the chunk + +{{(<get-start> chunk) => }} the start index of the result of {{<get-string>}} (defaults to always 0) + +{{(<get-end> chunk) => }} the end (exclusive) of the string (defaults to {{string-length}} of the source string) + +{{(<get-substring> cnk1 i cnk2 j) => }} a substring for the range between the chunk {{cnk1}} starting at index {{i}} and ending at {{cnk2}} at index {{j}} + +{{(<get-subchunk> cnk1 i cnk2 j) => }} as above but returns a new chunked data type instead of a string (optional) + +There are two important constraints on the {{<get-next>}} procedure. +It must return an {{eq?}} identical object when called multiple times +on the same chunk, and it must not return a chunk with an empty string +(start == end). This second constraint is for performance reasons - +we push the work of possibly filtering empty chunks to the chunker +since there are many chunk types for which empty strings aren't +possible, and this work is thus not needed. Note that the initial +chunk passed to match on is allowed to be empty. + +{{<get-substring>}} is provided for possible performance improvements +- without it a default is used. {{<get-subchunk>}} is optional - +without it you may not use {{irregex-match-subchunk}} described above. + +You can then match chunks of these types with the following +procedures: + +===== irregex-search/chunked +===== irregex-match/chunked + +<procedure>(irregex-search/chunked <irx> <chunker> <chunk> [<start>])</procedure><br> +<procedure>(irregex-match/chunked <irx> <chunker> <chunk> [<start>])</procedure> + +These return normal match-data objects. + +Example: + +To match against a simple, flat list of strings use: + +<enscript highlight=scheme> + (define (rope->string rope1 start rope2 end) + (if (eq? rope1 rope2) + (substring (car rope1) start end) + (let loop ((rope (cdr rope1)) + (res (list (substring (car rope1) start)))) + (if (eq? rope rope2) + (string-concatenate-reverse ; from SRFI-13 + (cons (substring (car rope) 0 end) res)) + (loop (cdr rope) (cons (car rope) res)))))) + + (define rope-chunker + (make-irregex-chunker (lambda (x) (and (pair? (cdr x)) (cdr x))) + car + (lambda (x) 0) + (lambda (x) (string-length (car x))) + rope->string)) + + (irregex-search/chunked <pat> rope-chunker <list-of-strings>) +</enscript> + +Here we are just using the default start, end and substring behaviors, +so the above chunker could simply be defined as: + +<enscript highlight=scheme> + (define rope-chunker + (make-irregex-chunker (lambda (x) (and (pair? (cdr x)) (cdr x))) car)) +</enscript> + +===== irregex-fold/chunked + +<procedure>(irregex-fold/chunked <irx> <kons> <knil> <chunker> <chunk> [<finish> [<start-index>]])</procedure> + +Chunked version of {{irregex-fold}}. + +==== Utilities + +The following procedures are also available. + +===== irregex-quote + +<procedure>(irregex-quote <str>)</procedure> + +Returns a new string with any special regular expression characters +escaped, to match the original string literally in POSIX regular +expressions. + +===== irregex-opt + +<procedure>(irregex-opt <list-of-strings>)</procedure> + +Returns an optimized SRE matching any of the literal strings +in the list, like Emacs' \q{regexp-opt}. Note this optimization +doesn't help when irregex is able to build a DFA. + +===== sre->string + +<procedure>(sre->string <sre>)</procedure> + +Convert an SRE to a POSIX-style regular expression string, if +possible. + + +--- +Previous: [[Unit extras]] + +Next: [[Unit srfi-1]] diff --git a/manual/Unit posix b/manual/Unit posix index f236e3d6..5627f61b 100644 --- a/manual/Unit posix +++ b/manual/Unit posix @@ -1104,7 +1104,7 @@ Recursively traverses the contents of {{DIRECTORY}} (which should be a string) and invokes the procedure {{action}} for all files in which the procedure {{test}} is true. {{test}} may be a procedure of one argument or a regular-expression string that will be matched with a -full pathname using {{string-match}}. {{action}} should be a +full pathname using {{irregex-match}}. {{action}} should be a procedure of two arguments: the currently encountered file and the result of the previous invocation of {{action}}, or, if this is the first invocation, the value of {{seed}}. {{test}} defaults to diff --git a/manual/Unit regex b/manual/Unit regex deleted file mode 100644 index 2d0c249e..00000000 --- a/manual/Unit regex +++ /dev/null @@ -1,461 +0,0 @@ -[[tags: manual]] -[[toc:]] - -== Unit regex - -This library unit provides support for regular expressions. The regular -expression package used is {{irregex}} -written by Alex Shinn. Irregex supports most Perl-extensions and is -written completely in Scheme. - -This library unit exposes two APIs: the standard Chicken API described below, and the -original irregex API. You may use either API or both: - - (require-library regex) ; required for either API, or both - (import regex) ; import the Chicken regex API - (import irregex) ; import the original irregex API - -Regular expressions may be either POSIX-style strings (with most PCRE -extensions) or an SCSH-style SRE. There is no {{(rx ...)}} syntax - -just use normal Scheme lists, with quasiquote if you like. - -=== grep - -<procedure>(grep REGEX LIST [ACCESSOR])</procedure> - -Returns all items of {{LIST}} that match the regular expression -{{REGEX}}. This procedure could be defined as follows: - -<enscript highlight=scheme> -(define (grep regex lst) - (filter (lambda (x) (string-search regex x)) lst) ) -</enscript> - -{{ACCESSOR}} is an optional accessor-procedure applied to each -element before doing the match. It should take a single argument -and return a string that will then be used in the regular expression -matching. {{ACCESSOR}} defaults to the identity function. - - -=== glob->regexp - -<procedure>(glob->regexp PATTERN [SRE?])</procedure> - -Converts the file-pattern {{PATTERN}} into a regular expression. - -<enscript highlight=scheme> -(glob->regexp "foo.*") -=> "foo\..*" -</enscript> - -{{PATTERN}} should follow "glob" syntax. Allowed wildcards are - - * - [C...] - [C1-C2] - [-C...] - ? - -{{glob->regexp}} returns a regular expression object if the optional -argument {{SRE?}} is false or not given, otherwise the SRE of the -computed regular expression is returned. - - -=== regexp - -<procedure>(regexp STRING [IGNORECASE [IGNORESPACE [UTF8]]])</procedure> - -Returns a precompiled regular expression object for {{string}}. -The optional arguments {{IGNORECASE}}, {{IGNORESPACE}} and {{UTF8}} -specify whether the regular expression should be matched with case- or whitespace-differences -ignored, or whether the string should be treated as containing UTF-8 encoded -characters, respectively. - -Note that code that uses regular expressions heavily should always -use them in precompiled form, which is likely to be much faster than -passing strings to any of the regular-expression routines described -below. - - -=== regexp? - -<procedure>(regexp? X)</procedure> - -Returns {{#t}} if {{X}} is a precompiled regular expression, -or {{#f}} otherwise. - - -=== string-match -=== string-match-positions - -<procedure>(string-match REGEXP STRING)</procedure><br> -<procedure>(string-match-positions REGEXP STRING)</procedure> - -Matches the regular expression in {{REGEXP}} (a string or a precompiled -regular expression) with -{{STRING}} and returns either {{#f}} if the match failed, -or a list of matching groups, where the first element is the complete -match. For each matching group the -result-list contains either: {{#f}} for a non-matching but optional -group; a list of start- and end-position of the match in {{STRING}} -(in the case of {{string-match-positions}}); or the matching -substring (in the case of {{string-match}}). Note that the exact string -is matched. For searching a pattern inside a string, see below. -Note also that {{string-match}} is implemented by calling -{{string-search}} with the regular expression wrapped in {{^ ... $}}. -If invoked with a precompiled regular expression argument (by using -{{regexp}}), {{string-match}} is identical to {{string-search}}. - - -=== string-search -=== string-search-positions - -<procedure>(string-search REGEXP STRING [START [RANGE]])</procedure><br> -<procedure>(string-search-positions REGEXP STRING [START [RANGE]])</procedure> - -Searches for the first match of the regular expression in -{{REGEXP}} with {{STRING}}. The search can be limited to -{{RANGE}} characters. - - -=== string-split-fields - -<procedure>(string-split-fields REGEXP STRING [MODE [START]])</procedure> - -Splits {{STRING}} into a list of fields according to {{MODE}}, -where {{MODE}} can be the keyword {{#:infix}} ({{REGEXP}} -matches field separator), the keyword {{#:suffix}} ({{REGEXP}} -matches field terminator) or {{#t}} ({{REGEXP}} matches field), -which is the default. - -<enscript highlight=scheme> -(define s "this is a string 1, 2, 3,") - -(string-split-fields "[^ ]+" s) - - => ("this" "is" "a" "string" "1," "2," "3,") - -(string-split-fields " " s #:infix) - - => ("this" "is" "a" "string" "1," "2," "3,") - -(string-split-fields "," s #:suffix) - - => ("this is a string 1" " 2" " 3") -</enscript> - - -=== string-substitute - -<procedure>(string-substitute REGEXP SUBST STRING [MODE])</procedure> - -Searches substrings in {{STRING}} that match {{REGEXP}} -and substitutes them with the string {{SUBST}}. The substitution -can contain references to subexpressions in -{{REGEXP}} with the {{\NUM}} notation, where {{NUM}} -refers to the NUMth parenthesized expression. The optional argument -{{MODE}} defaults to 1 and specifies the number of the match to -be substituted. Any non-numeric index specifies that all matches are to -be substituted. - -<enscript highlight=scheme> -(string-substitute "([0-9]+) (eggs|chicks)" "\\2 (\\1)" "99 eggs or 99 chicks" 2) -=> "99 eggs or chicks (99)" -</enscript> - -Note that a regular expression that matches an empty string will -signal an error. - - -=== string-substitute* - -<procedure>(string-substitute* STRING SMAP [MODE])</procedure> - -Substitutes elements of {{STRING}} with {{string-substitute}} according to {{SMAP}}. -{{SMAP}} should be an association-list where each element of the list -is a pair of the form {{(MATCH . REPLACEMENT)}}. Every occurrence of -the regular expression {{MATCH}} in {{STRING}} will be replaced by the string -{{REPLACEMENT}} - -<enscript highlight=scheme> -(string-substitute* "<h1>Hello, world!</h1>" '(("<[/A-Za-z0-9]+>" . ""))) - -=> "Hello, world!" -</enscript> - - -=== regexp-escape - -<procedure>(regexp-escape STRING)</procedure> - -Escapes all special characters in {{STRING}} with {{\}}, so that the string can be embedded -into a regular expression. - -<enscript highlight=scheme> -(regexp-escape "^[0-9]+:.*$") -=> "\\^\\[0-9\\]\\+:.\n.\\*\\$" -</enscript> - -=== Extended SRE Syntax - -The following table summarizes the SRE syntax, with detailed explanations following. - - ;; basic patterns - <string> ; literal string - (seq <sre> ...) ; sequence - (: <sre> ...) - (or <sre> ...) ; alternation - - ;; optional/multiple patterns - (? <sre> ...) ; 0 or 1 matches - (* <sre> ...) ; 0 or more matches - (+ <sre> ...) ; 1 or more matches - (= <n> <sre> ...) ; exactly <n> matches - (>= <n> <sre> ...) ; <n> or more matches - (** <from> <to> <sre> ...) ; <n> to <m> matches - (?? <sre> ...) ; non-greedy (non-greedy) pattern: (0 or 1) - (*? <sre> ...) ; non-greedy kleene star - (**? <from> <to> <sre> ...) ; non-greedy range - - ;; submatch patterns - (submatch <sre> ...) ; numbered submatch - (submatch-named <name> <sre> ...) ; named submatch - (=> <name> <sre> ...) - (backref <n-or-name>) ; match a previous submatch - - ;; toggling case-sensitivity - (w/case <sre> ...) ; enclosed <sre>s are case-sensitive - (w/nocase <sre> ...) ; enclosed <sre>s are case-insensitive - - ;; character sets - <char> ; singleton char set - (<string>) ; set of chars - (or <cset-sre> ...) ; set union - (~ <cset-sre> ...) ; set complement (i.e. [^...]) - (- <cset-sre> ...) ; set difference - (& <cset-sre> ...) ; set intersection - (/ <range-spec> ...) ; pairs of chars as ranges - - ;; named character sets - any - nonl - ascii - lower-case lower - upper-case upper - alphabetic alpha - numeric num - alphanumeric alphanum alnum - punctuation punct - graphic graph - whitespace white space - printing print - control cntrl - hex-digit xdigit - - ;; assertions and conditionals - bos eos ; beginning/end of string - bol eol ; beginning/end of line - bow eow ; beginning/end of word - nwb ; non-word-boundary - (look-ahead <sre> ...) ; zero-width look-ahead assertion - (look-behind <sre> ...) ; zero-width look-behind assertion - (neg-look-ahead <sre> ...) ; zero-width negative look-ahead assertion - (neg-look-behind <sre> ...) ; zero-width negative look-behind assertion - (atomic <sre> ...) ; for (?>...) independent patterns - (if <test> <pass> [<fail>]) ; conditional patterns - commit ; don't backtrack beyond this (i.e. cut) - - ;; backwards compatibility - (posix-string <string>) ; embed a POSIX string literal - -==== Basic SRE Patterns - -The simplest SRE is a literal string, which matches that string exactly. - - (string-search "needle" "hayneedlehay") => <match> - -By default the match is case-sensitive, though you can control this either with the compiler flags or local overrides: - - (string-search "needle" "haynEEdlehay") => #f - - (string-search (irregex "needle" 'i) "haynEEdlehay") => <match> - - (string-search '(w/nocase "needle") "haynEEdlehay") => <match> - -You can use {{w/case}} to switch back to case-sensitivity inside a {{w/nocase}}: - - (string-search '(w/nocase "SMALL" (w/case "BIG")) "smallBIGsmall") => <match> - - (string-search '(w/nocase "small" (w/case "big")) "smallBIGsmall") => #f - -Of course, literal strings by themselves aren't very interesting -regular expressions, so we want to be able to compose them. The most -basic way to do this is with the {{seq}} operator (or its abbreviation {{:}}), -which matches one or more patterns consecutively: - - (string-search '(: "one" space "two" space "three") "one two three") => <match> - -As you may have noticed above, the {{w/case}} and {{w/nocase}} operators -allowed multiple SREs in a sequence - other operators that take any -number of arguments (e.g. the repetition operators below) allow such -implicit sequences. - -To match any one of a set of patterns use the or alternation operator: - - (string-search '(or "eeney" "meeney" "miney") "meeney") => <match> - - (string-search '(or "eeney" "meeney" "miney") "moe") => #f - -==== SRE Repetition Patterns - -There are also several ways to control the number of times a pattern -is matched. The simplest of these is {{?}} which just optionally matches -the pattern: - - (string-search '(: "match" (? "es") "!") "matches!") => <match> - - (string-search '(: "match" (? "es") "!") "match!") => <match> - - (string-search '(: "match" (? "es") "!") "matche!") => #f - -To optionally match any number of times, use {{*}}, the Kleene star: - - (string-search '(: "<" (* (~ #\>)) ">") "<html>") => <match> - - (string-search '(: "<" (* (~ #\>)) ">") "<>") => <match> - - (string-search '(: "<" (* (~ #\>)) ">") "<html") => #f - -Often you want to match any number of times, but at least one time is required, and for that you use {{+}}: - - (string-search '(: "<" (+ (~ #\>)) ">") "<html>") => <match> - - (string-search '(: "<" (+ (~ #\>)) ">") "<a>") => <match> - - (string-search '(: "<" (+ (~ #\>)) ">") "<>") => #f - -More generally, to match at least a given number of times, use {{>=}}: - - (string-search '(: "<" (>= 3 (~ #\>)) ">") "<table>") => <match> - - (string-search '(: "<" (>= 3 (~ #\>)) ">") "<pre>") => <match> - - (string-search '(: "<" (>= 3 (~ #\>)) ">") "<tr>") => #f - -To match a specific number of times exactly, use {=}: - - (string-search '(: "<" (= 4 (~ #\>)) ">") "<html>") => <match> - - (string-search '(: "<" (= 4 (~ #\>)) ">") "<table>") => #f - -And finally, the most general form is {{**}} which specifies a range -of times to match. All of the earlier forms are special cases of this. - - (string-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.168.1.10") => <match> - - (string-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.0168.1.10") => #f - -There are also so-called "non-greedy" variants of these repetition -operators, by convention suffixed with an additional {{?}}. Since the -normal repetition patterns can match any of the allotted repetition -range, these operators will match a string if and only if the normal -versions matched. However, when the endpoints of which submatch -matched where are taken into account (specifically, all matches when -using string-search since the endpoints of the match itself matter), -the use of a non-greedy repetition can change the result. - -So, whereas {{?}} can be thought to mean "match or don't match," {{??}} means -"don't match or match." {{*}} typically consumes as much as possible, but -{{*?}} tries first to match zero times, and only consumes one at a time if -that fails. If you have a greedy operator followed by a non-greedy -operator in the same pattern, they can produce surprisins results as -they compete to make the match longer or shorter. If this seems -confusing, that's because it is. Non-greedy repetitions are defined -only in terms of the specific backtracking algorithm used to implement -them, which for compatibility purposes always means the Perl -algorithm. Thus, when using these patterns you force IrRegex to use a -backtracking engine, and can't rely on efficient execution. - -==== SRE Character Sets - -Perhaps more common than matching specific strings is matching any of -a set of characters. You can use the or alternation pattern on a list -of single-character strings to simulate a character set, but this is -too clumsy for everyday use so SRE syntax allows a number of -shortcuts. - -A single character matches that character literally, a trivial -character class. More conveniently, a list holding a single element -which is a string refers to the character set composed of every -character in the string. - - (string-match '(* #\-) "---") => <match> - - (string-match '(* #\-) "-_-") => #f - - (string-match '(* ("aeiou")) "oui") => <match> - - (string-match '(* ("aeiou")) "ouais") => #f - -Ranges are introduced with the {{/}} operator. Any strings or characters -in the {{/}} are flattened and then taken in pairs to represent the start -and end points, inclusive, of character ranges. - - (string-match '(* (/ "AZ09")) "R2D2") => <match> - - (string-match '(* (/ "AZ09")) "C-3PO") => #f - -In addition, a number of set algebra operations are provided. or, of -course, has the same meaning, but when all the options are character -sets it can be thought of as the set union operator. This is further -extended by the {{&}} set intersection, {{-}} set difference, and {{~}} set -complement operators. - - (string-match '(* (& (/ "az") (~ ("aeiou")))) "xyzzy") => <match> - - (string-match '(* (& (/ "az") (~ ("aeiou")))) "vowels") => #f - - (string-match '(* (- (/ "az") ("aeiou"))) "xyzzy") => <match> - - (string-match '(* (- (/ "az") ("aeiou"))) "vowels") => #f - -==== SRE Assertion Patterns - -There are a number of times it can be useful to assert something about -the area around a pattern without explicitly making it part of the -pattern. The most common cases are specifically anchoring some pattern -to the beginning or end of a word or line or even the whole -string. For example, to match on the end of a word: - - (string-match '(: "foo" eow) "foo") => <match> - - (string-match '(: "foo" eow) "foo!") => <match> - - (string-match '(: "foo" eow) "foof") => #f - -The {{bow}}, {{bol}}, {{eol}}, {{bos}} and {{eos}} work similarly. {{nwb}} asserts that you -are not in a word-boundary - if replaced for {{eow}} in the above examples -it would reverse all the results. - -There is no {{wb}}, since you tend to know from context whether it -would be the beginning or end of a word, but if you need it you can -always use (or bow eow). - -Somewhat more generally, Perl introduced positive and negative -look-ahead and look-behind patterns. Perl look-behind patterns are -limited to a fixed length, however the IrRegex versions have no such -limit. - - (string-match '(: "regular" (look-ahead " expression")) "regular expression") => <match> - -The most general case, of course, would be an and pattern to -complement the or pattern - all the patterns must match or the whole -pattern fails. This may be provided in a future release, although it -(and look-ahead and look-behind assertions) are unlikely to be -compiled efficiently. - - ---- -Previous: [[Unit extras]] - -Next: [[Unit srfi-1]] diff --git a/manual/Unit srfi-1 b/manual/Unit srfi-1 index 72ac271f..59a47160 100644 --- a/manual/Unit srfi-1 +++ b/manual/Unit srfi-1 @@ -1,4 +1,5 @@ [[tags: manual]] +[[toc:]] == Unit srfi-1 @@ -1515,6 +1516,6 @@ arguments. ---- -Previous: [[Unit regex]] +Previous: [[Unit irregex]] Next: [[Unit srfi-4]] diff --git a/manual/Unit utils b/manual/Unit utils index e1af0895..8c174b1d 100644 --- a/manual/Unit utils +++ b/manual/Unit utils @@ -88,11 +88,12 @@ The initial default options are {{-scrutinize -O2 -d2}}. <procedure>(scan-input-lines REGEXP [PORT])</procedure> Reads lines from {{PORT}} (defaults to the result of {{(current-input-port)}}) -using {{read-line}} and returns the result of {{(string-search REGEXP LINE)}}, +using {{read-line}} and returns the result of {{(irregex-search REGEXP LINE)}}, if the match succeeds. If no match could be found, {{#f}} is returned. {{REGEXP}} may also be a procedure of one argument which is called for each -input line and should return a non-false value on success. +input line and should return a non-false value on success, which will then +be the result of the call to {{scan-input-lines}}. === Asking the user for confirmation diff --git a/manual/faq b/manual/faq index 7abf0381..24f054af 100644 --- a/manual/faq +++ b/manual/faq @@ -533,7 +533,7 @@ Compile the program that uses the module: The regular expression engine has recently be replaced by [[/users/alex shinn|alex shinn]]'s excellent {{irregex}} library, which is fully implemented in Scheme. Precompiling regular expressions to internal form is somewhat slower than with the old PCRE-based -regex engine. It is advisable to use {{regexp}} to precompile regular expressions +regex engine. It is advisable to use {{irregex}} to precompile regular expressions outside of time-critical loops and use them where performance matters. diff --git a/posix-common.scm b/posix-common.scm index 55f9f488..e77b05fb 100644 --- a/posix-common.scm +++ b/posix-common.scm @@ -233,64 +233,54 @@ EOF ;;; Filename globbing: (define glob - (let ((regexp regexp) - (string-match string-match) - (glob->regexp glob->regexp) - (directory directory) - (make-pathname make-pathname) - (decompose-pathname decompose-pathname) ) - (lambda paths - (let conc-loop ((paths paths)) - (if (null? paths) - '() - (let ((path (car paths))) - (let-values (((dir fil ext) (decompose-pathname path))) - (let* ((patt (glob->regexp (make-pathname #f (or fil "*") ext))) - (rx (regexp patt))) - (let loop ((fns (directory (or dir ".") #t))) - (cond ((null? fns) (conc-loop (cdr paths))) - ((string-match rx (car fns)) - => (lambda (m) (cons (make-pathname dir (car m)) (loop (cdr fns)))) ) - (else (loop (cdr fns))) ) ) ) ) ) ) ) ) ) ) + (lambda paths + (let conc-loop ((paths paths)) + (if (null? paths) + '() + (let ((path (car paths))) + (let-values (((dir fil ext) (decompose-pathname path))) + (let ((rx (##sys#glob->regexp (make-pathname #f (or fil "*") ext)))) + (let loop ((fns (directory (or dir ".") #t))) + (cond ((null? fns) (conc-loop (cdr paths))) + ((irregex-match rx (car fns)) + => (lambda (m) + (cons + (make-pathname dir (irregex-match-substring m)) + (loop (cdr fns)))) ) + (else (loop (cdr fns))) ) ) ) ) ) ) ) ) ) ;;; Find matching files: (define ##sys#find-files - (let ((glob glob) - (string-match string-match) - (make-pathname make-pathname) - (pathname-file pathname-file) - (symbolic-link? symbolic-link?) - (directory? directory?) ) - (lambda (dir pred action id limit follow dot loc) - (##sys#check-string dir loc) - (let* ((depth 0) - (lproc - (cond ((not limit) (lambda _ #t)) - ((fixnum? limit) (lambda _ (fx< depth limit))) - (else limit) ) ) - (pproc - (if (or (string? pred) (regexp? pred)) - (let ((pred (regexp pred))) ; force compilation - (lambda (x) (string-match pred x))) - pred) ) ) - (let loop ((fs (glob (make-pathname dir (if dot "?*" "*")))) - (r id) ) - (if (null? fs) - r - (let ((f (##sys#slot fs 0)) - (rest (##sys#slot fs 1)) ) - (cond ((directory? f) - (cond ((member (pathname-file f) '("." "..")) (loop rest r)) - ((lproc f) - (loop rest - (fluid-let ((depth (fx+ depth 1))) - (loop (glob (make-pathname f "*")) - (if (pproc f) (action f r) r)) ) ) ) - (else (loop rest (if (pproc f) (action f r) r))) ) ) - ((pproc f) (loop rest (action f r))) - (else (loop rest r)) ) ) ) ) ) ) ) ) + (lambda (dir pred action id limit follow dot loc) + (##sys#check-string dir loc) + (let* ((depth 0) + (lproc + (cond ((not limit) (lambda _ #t)) + ((fixnum? limit) (lambda _ (fx< depth limit))) + (else limit) ) ) + (pproc + (if (or (string? pred) (irregex? pred)) + (let ((pred (irregex pred))) ; force compilation + (lambda (x) (irregex-match pred x))) + pred) ) ) + (let loop ((fs (glob (make-pathname dir (if dot "?*" "*")))) + (r id) ) + (if (null? fs) + r + (let ((f (##sys#slot fs 0)) + (rest (##sys#slot fs 1)) ) + (cond ((directory? f) + (cond ((member (pathname-file f) '("." "..")) (loop rest r)) + ((lproc f) + (loop rest + (fluid-let ((depth (fx+ depth 1))) + (loop (glob (make-pathname f "*")) + (if (pproc f) (action f r) r)) ) ) ) + (else (loop rest (if (pproc f) (action f r) r))) ) ) + ((pproc f) (loop rest (action f r))) + (else (loop rest r)) ) ) ) ) ) ) ) (define (find-files dir . args) (cond ((or (null? args) (not (keyword? (car args)))) diff --git a/posixunix.scm b/posixunix.scm index e17f6b3e..ed83fe1b 100644 --- a/posixunix.scm +++ b/posixunix.scm @@ -27,7 +27,7 @@ (declare (unit posix) - (uses scheduler regex extras utils files ports) + (uses scheduler irregex extras utils files ports) (disable-interrupts) (hide group-member _get-groups _ensure-groups posix-error ##sys#terminal-check) (not inline ##sys#interrupt-hook ##sys#user-interrupt-hook)) diff --git a/posixwin.scm b/posixwin.scm index 9dee8ede..97a0a232 100644 --- a/posixwin.scm +++ b/posixwin.scm @@ -63,7 +63,7 @@ (declare (unit posix) - (uses scheduler regex extras utils files ports) + (uses scheduler irregex extras utils files ports) (disable-interrupts) (hide $quote-args-list $exec-setup $exec-teardown) (not inline ##sys#interrupt-hook ##sys#user-interrupt-hook) diff --git a/regex.import.scm b/regex.import.scm deleted file mode 100644 index 70310a22..00000000 --- a/regex.import.scm +++ /dev/null @@ -1,41 +0,0 @@ -;;;; regex.import.scm - import library for "regex" module -; -; Copyright (c) 2008-2010, The Chicken Team -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following -; conditions are met: -; -; Redistributions of source code must retain the above copyright notice, this list of conditions and the following -; disclaimer. -; Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following -; disclaimer in the documentation and/or other materials provided with the distribution. -; Neither the name of the author nor the names of its contributors may be used to endorse or promote -; products derived from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS -; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY -; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR -; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -; POSSIBILITY OF SUCH DAMAGE. - - -(##sys#register-primitive-module - 'regex - '(glob->regexp - glob? - grep - regexp - regexp-escape - regexp? - string-match - string-match-positions - string-search - string-search-positions - string-split-fields - string-substitute - string-substitute*)) diff --git a/regex.scm b/regex.scm deleted file mode 100644 index 526e65ad..00000000 --- a/regex.scm +++ /dev/null @@ -1,360 +0,0 @@ -;;;; regex.scm -; -; Copyright (c) 2008-2010, The Chicken Team -; Copyright (c) 2000-2007, Felix L. Winkelmann -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following -; conditions are met: -; -; Redistributions of source code must retain the above copyright notice, this list of conditions and the following -; disclaimer. -; Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following -; disclaimer in the documentation and/or other materials provided with the distribution. -; Neither the name of the author nor the names of its contributors may be used to endorse or promote -; products derived from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS -; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY -; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR -; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -; POSSIBILITY OF SUCH DAMAGE. - - -(declare (unit regex)) - -(declare - (disable-interrupts) - (fixnum) - (export - regexp? regexp - string-match string-match-positions string-search string-search-positions - string-split-fields string-substitute string-substitute* - glob->regexp - grep - regexp-escape - - irregex string->irregex sre->irregex string->sre - irregex? irregex-match-data? - irregex-new-matches irregex-reset-matches! - irregex-match-start irregex-match-end irregex-match-substring - irregex-match-num-submatches - irregex-search irregex-search/matches irregex-match irregex-match-string - irregex-fold irregex-replace irregex-replace/all irregex-apply-match - irregex-dfa irregex-dfa/search irregex-dfa/extract - irregex-nfa irregex-flags irregex-submatches irregex-lengths irregex-names - )) - -(include "common-declarations.scm") - -(register-feature! 'regex 'irregex) - -(include "irregex.scm") - - -;;; Record `regexp' - -(define-record regexp x) - -(define-syntax (build-cache x r c) - ;; (build-cache N ARG FAIL) - (let* ((n (cadr x)) - (n2 (* n 2)) - (arg (caddr x)) - (fail (cadddr x)) - (%cache (r 'cache)) - (%index (r 'index)) - (%arg (r 'arg)) - (%let (r 'let)) - (%let* (r 'let*)) - (%if (r 'if)) - (%fx+ (r 'fx+)) - (%fxmod (r 'fxmod)) - (%equal? (r 'equal?)) - (%quote (r 'quote)) - (%tmp (r 'tmp)) - (%begin (r 'begin)) - (cache (make-vector (add1 n2) #f))) - (vector-set! cache n2 0) ; last slot: current index - `(,%let* ((,%cache (,%quote ,cache)) - (,%arg ,arg)) - ,(let fold ((i 0)) - (if (>= i n) - ;; this should be thread-safe: a context-switch can only - ;; happen before this code and in the call to FAIL. - `(,%let ((,%tmp ,fail) - (,%index (##sys#slot ,%cache ,n2))) - (##sys#setslot ,%cache ,%index ,%arg) - (##sys#setslot ,%cache (,%fx+ ,%index 1) ,%tmp) - (##sys#setislot - ,%cache ,n2 - (##core#inline "C_u_fixnum_modulo" (,%fx+ ,%index 2) ,n2)) - ,%tmp) - `(,%if (,%equal? (##sys#slot ,%cache ,(* i 2)) ,%arg) - (##sys#slot ,%cache ,(add1 (* i 2))) - ,(fold (add1 i)))))))) - -(define (regexp pat #!optional caseless extended utf8) - (if (regexp? pat) - pat - (make-regexp - (apply - irregex - pat - (let ((opts '())) - (when caseless (set! opts (cons 'i opts))) - (when extended (set! opts (cons 'x opts))) - (when utf8 (set! opts (cons 'utf8 opts))) - opts))) ) ) - -(define (unregexp x) - (cond ((regexp? x) (regexp-x x)) - ((irregex? x) x) - (else - (build-cache - 5 x - (irregex x))))) - - -;;; Basic `regexp' operations - -(define (string-match rx str) - (let ((rx (unregexp rx))) - (and-let* ((m (irregex-match rx str))) - (let loop ((i (irregex-match-num-submatches m)) - (res '())) - (if (fx<= i 0) - (cons str res) - (loop (fx- i 1) (cons (irregex-match-substring m i) res))))))) - -(define (string-match-positions rx str) - (let ((rx (unregexp rx))) - (and-let* ((m (irregex-match rx str))) - (let loop ((i (irregex-match-num-submatches m)) - (res '())) - (if (fx<= i 0) - (cons (list 0 (string-length str)) res) - (loop (fx- i 1) (cons (list (irregex-match-start-index m i) - (irregex-match-end-index m i)) - res))))))) - -(define (string-search rx str #!optional (start 0) (range (string-length str))) - (let ((rx (unregexp rx))) - (let ((n (string-length str))) - (and-let* ((m (irregex-search rx str start (min n (fx+ start range))))) - (let loop ((i (irregex-match-num-submatches m)) - (res '())) - (if (fx< i 0) - res - (loop (fx- i 1) (cons (irregex-match-substring m i) res)))))))) - -(define (string-search-positions rx str #!optional (start 0) (range (string-length str))) - (let ((rx (unregexp rx))) - (let ((n (string-length str))) - (and-let* ((m (irregex-search rx str start (min n (fx+ start range))))) - (let loop ((i (irregex-match-num-submatches m)) - (res '())) - (if (fx< i 0) - res - (loop (fx- i 1) (cons (list (irregex-match-start-index m i) - (irregex-match-end-index m i)) - res)))))))) - - -;;; Split string into fields: - -(define string-split-fields - (let ([reverse reverse] - [substring substring] - [string-search-positions string-search-positions] ) - (lambda (rx str . mode-and-start) - (##sys#check-string str 'string-split-fields) - (let* ([argc (length mode-and-start)] - [len (##sys#size str)] - [mode (if (fx> argc 0) (car mode-and-start) #t)] - [start (if (fx> argc 1) (cadr mode-and-start) 0)] - [fini (case mode - [(#:suffix) - (lambda (ms start) - (if (fx< start len) - (##sys#error 'string-split-fields - "record does not end with suffix" str rx) - (reverse ms) ) ) ] - [(#:infix) - (lambda (ms start) - (if (fx>= start len) - (reverse (cons "" ms)) - (reverse (cons (substring str start len) ms)) ) ) ] - [else (lambda (ms start) (reverse ms)) ] ) ] - [fetch (case mode - [(#:infix #:suffix) (lambda (start from to) (substring str start from))] - [else (lambda (start from to) (substring str from to))] ) ] ) - (let loop ([ms '()] [start start]) - (let ([m (string-search-positions rx str start)]) - (if m - (let* ([mp (car m)] - [from (car mp)] - [to (cadr mp)] ) - (if (fx= from to) - (if (fx= to len) - (fini ms start) - (loop (cons (fetch start (fx+ from 1) (fx+ to 2)) ms) (fx+ to 1)) ) - (loop (cons (fetch start from to) ms) to) ) ) - (fini ms start) ) ) ) ) ) ) ) - - -;;; Substitute matching strings: - -(define string-substitute - (let ([substring substring] - [reverse reverse] - [make-string make-string] - [string-search-positions string-search-positions] ) - (lambda (rx subst string . flag) - (##sys#check-string subst 'string-substitute) - (##sys#check-string string 'string-substitute) - (let* ([which (if (pair? flag) (car flag) 1)] - [substlen (##sys#size subst)] - (strlen (##sys#size string)) - [substlen-1 (fx- substlen 1)] - [result '()] - [total 0] ) - (define (push x) - (set! result (cons x result)) - (set! total (fx+ total (##sys#size x))) ) - (define (substitute matches) - (let loop ([start 0] [index 0]) - (if (fx>= index substlen-1) - (push (if (fx= start 0) subst (substring subst start substlen))) - (let ([c (##core#inline "C_subchar" subst index)] - [index+1 (fx+ index 1)] ) - (if (char=? c #\\) - (let ([c2 (##core#inline "C_subchar" subst index+1)]) - (if (and (not (char=? #\\ c2)) (char-numeric? c2)) - (let ([mi (list-ref matches (fx- (char->integer c2) 48))]) - (push (substring subst start index)) - (push (substring string (car mi) (cadr mi))) - (loop (fx+ index 2) index+1) ) - (loop start (fx+ index+1 1)) ) ) - (loop start index+1) ) ) ) ) ) - (let loop ([index 0] [count 1]) - (let ((matches (and (fx< index strlen) - (string-search-positions rx string index)))) - (cond [matches - (let* ([range (car matches)] - [upto (cadr range)] ) - (cond ((fx= 0 (fx- (cadr range) (car range))) - (##sys#error - 'string-substitute "empty substitution match" - rx) ) - ((or (not (fixnum? which)) (fx= count which)) - (push (substring string index (car range))) - (substitute matches) - (loop upto #f) ) - (else - (push (substring string index upto)) - (loop upto (fx+ count 1)) ) ) ) ] - [else - (push (substring string index (##sys#size string))) - (##sys#fragments->string total (reverse result)) ] ) ) ) ) ) ) ) - -(define string-substitute* - (let ([string-substitute string-substitute]) - (lambda (str smap . mode) - (##sys#check-string str 'string-substitute*) - (##sys#check-list smap 'string-substitute*) - (let ((mode (and (pair? mode) (car mode)))) - (let loop ((str str) (smap smap)) - (if (null? smap) - str - (let ((sm (car smap))) - (loop (string-substitute (car sm) (cdr sm) str mode) - (cdr smap) ) ) ) ) ) ) ) ) - - -;;; Glob support: - -(define glob->regexp - (let ((list->string list->string) - (string->list string->list) - (regexp regexp)) - (lambda (s #!optional sre?) - (##sys#check-string s 'glob->regexp) - (let ((sre - (cons - ': - (let loop ((cs (string->list s)) (dir #t)) - (if (null? cs) - '() - (let ((c (car cs)) - (rest (cdr cs)) ) - (cond ((char=? c #\*) - (if dir - `((or (: (~ ("./\\")) - (* (~ ("/\\")))) - (* (~ ("./\\")))) - ,@(loop rest #f)) - `((* (~ ("/\\"))) ,@(loop rest #f)))) - ((char=? c #\?) (cons 'any (loop rest #f))) - ((char=? c #\[) - (let loop2 ((rest rest) (s '())) - (cond ((not (pair? rest)) - (error 'glob->regexp "unexpected end of character class" s)) - ((char=? #\] (car rest)) - `((or ,@s) ,@(loop (cdr rest) #f))) - ((and (pair? (cdr rest)) - (pair? (cddr rest)) - (char=? #\- (cadr rest)) ) - (loop2 (cdddr rest) (cons `(/ ,(car rest) ,(caddr rest)) s))) - ((and (pair? (cdr rest)) - (char=? #\- (car rest))) - (loop2 (cddr rest) - (cons `(~ ,(cadr rest)) s))) - (else - (loop2 (cdr rest) (cons (car rest) s)))))) - (else (cons c (loop rest (memq c '(#\\ #\/)))))))))))) - (if sre? sre (regexp sre)))))) - - -;;; Grep-like function on list: - -(define grep - (let ((string-search string-search) - (regexp regexp)) - (lambda (rx lst #!optional (acc (lambda (x) x))) - (##sys#check-list lst 'grep) - (##sys#check-closure acc 'grep) - (let ((rx (regexp rx))) - (let loop ((lst lst)) - (if (null? lst) - '() - (let ((x (##sys#slot lst 0)) - (r (##sys#slot lst 1)) ) - (if (string-search rx (acc x)) - (cons x (loop r)) - (loop r) ) ) ) ) ) ) ) ) - - -;;; Escape regular expression (suggested by Peter Bex): - -(define regexp-escape - (let ([open-output-string open-output-string] - [get-output-string get-output-string] ) - (lambda (str) - (##sys#check-string str 'regexp-escape) - (let ([out (open-output-string)] - [len (##sys#size str)] ) - (let loop ([i 0]) - (cond [(fx>= i len) (get-output-string out)] - [(memq (##core#inline "C_subchar" str i) - '(#\. #\\ #\? #\* #\+ #\^ #\$ #\( #\) #\[ #\] #\| #\{ #\})) - (##sys#write-char-0 #\\ out) - (##sys#write-char-0 (##core#inline "C_subchar" str i) out) - (loop (fx+ i 1)) ] - [else - (##sys#write-char-0 (##core#inline "C_subchar" str i) out) - (loop (fx+ i 1)) ] ) ) ) ) ) ) diff --git a/rules.make b/rules.make index 2e571510..a2888e16 100644 --- a/rules.make +++ b/rules.make @@ -28,16 +28,11 @@ VPATH=$(SRCDIR) # object files -LIBCHICKEN_OBJECTS_1 = \ - library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 \ - srfi-14 srfi-18 srfi-69 $(POSIXFILE) regex scheduler \ - profiler stub expand chicken-syntax chicken-ffi-syntax runtime +LIBCHICKEN_OBJECTS_1 = library eval data-structures ports files extras lolevel utils tcp srfi-1 srfi-4 srfi-13 srfi-14 srfi-18 srfi-69 $(POSIXFILE) irregex scheduler profiler stub expand chicken-syntax chicken-ffi-syntax runtime LIBCHICKEN_SHARED_OBJECTS = $(LIBCHICKEN_OBJECTS_1:=$(O)) LIBCHICKEN_STATIC_OBJECTS = $(LIBCHICKEN_OBJECTS_1:=-static$(O)) -COMPILER_OBJECTS_1 = \ - chicken batch-driver compiler optimizer compiler-syntax scrutinizer unboxing support \ - c-platform c-backend +COMPILER_OBJECTS_1 = chicken batch-driver compiler optimizer compiler-syntax scrutinizer unboxing support c-platform c-backend COMPILER_OBJECTS = $(COMPILER_OBJECTS_1:=$(O)) COMPILER_STATIC_OBJECTS = $(COMPILER_OBJECTS_1:=-static$(O)) @@ -100,7 +95,7 @@ profiler$(O): profiler.c chicken.h $(CHICKEN_CONFIG_H) $(C_COMPILER) $(C_COMPILER_OPTIONS) $(INCLUDES) \ $(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) $(C_COMPILER_SHARED_OPTIONS) \ $(C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(C_COMPILER_OUTPUT) -regex$(O): regex.c chicken.h $(CHICKEN_CONFIG_H) +irregex$(O): irregex.c chicken.h $(CHICKEN_CONFIG_H) $(C_COMPILER) $(C_COMPILER_OPTIONS) \ $(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) $(C_COMPILER_SHARED_OPTIONS) \ $(C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(C_COMPILER_OUTPUT) @@ -215,7 +210,7 @@ profiler-static$(O): profiler.c chicken.h $(CHICKEN_CONFIG_H) $(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) \ $(C_COMPILER_STATIC_OPTIONS) \ $(C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(C_COMPILER_OUTPUT) -regex-static$(O): regex.c chicken.h $(CHICKEN_CONFIG_H) +irregex-static$(O): irregex.c chicken.h $(CHICKEN_CONFIG_H) $(C_COMPILER) $(C_COMPILER_OPTIONS) $(INCLUDES) \ $(C_COMPILER_COMPILE_OPTION) $(C_COMPILER_OPTIMIZATION_OPTIONS) \ $(C_COMPILER_STATIC_OPTIONS) \ @@ -326,10 +321,6 @@ extras.import$(O): extras.import.c chicken.h $(CHICKEN_CONFIG_H) $(HOST_C_COMPILER) $(HOST_C_COMPILER_OPTIONS) $(HOST_C_COMPILER_PTABLES_OPTIONS) $(INCLUDES) -DC_SHARED \ $(HOST_C_COMPILER_COMPILE_OPTION) $(HOST_C_COMPILER_OPTIMIZATION_OPTIONS) $(HOST_C_COMPILER_SHARED_OPTIONS) \ $(HOST_C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(HOST_C_COMPILER_OUTPUT) -regex.import$(O): regex.import.c chicken.h $(CHICKEN_CONFIG_H) - $(HOST_C_COMPILER) $(HOST_C_COMPILER_OPTIONS) $(HOST_C_COMPILER_PTABLES_OPTIONS) $(INCLUDES) -DC_SHARED \ - $(HOST_C_COMPILER_COMPILE_OPTION) $(HOST_C_COMPILER_OPTIMIZATION_OPTIONS) $(HOST_C_COMPILER_SHARED_OPTIONS) \ - $(HOST_C_COMPILER_BUILD_RUNTIME_OPTIONS) $< $(HOST_C_COMPILER_OUTPUT) irregex.import$(O): irregex.import.c chicken.h $(CHICKEN_CONFIG_H) $(HOST_C_COMPILER) $(HOST_C_COMPILER_OPTIONS) $(HOST_C_COMPILER_PTABLES_OPTIONS) $(INCLUDES) -DC_SHARED \ $(HOST_C_COMPILER_COMPILE_OPTION) $(HOST_C_COMPILER_OPTIMIZATION_OPTIONS) $(HOST_C_COMPILER_SHARED_OPTIONS) \ @@ -679,7 +670,6 @@ ifdef STATICBUILD $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) srfi-13.import.scm "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) srfi-69.import.scm "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) extras.import.scm "$(DESTDIR)$(IEGGDIR)" - $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) regex.import.scm "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) srfi-14.import.scm "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) tcp.import.scm "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_FILE_OPTIONS) foreign.import.scm "$(DESTDIR)$(IEGGDIR)" @@ -702,7 +692,6 @@ else $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) srfi-13.import.so "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) srfi-69.import.so "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) extras.import.so "$(DESTDIR)$(IEGGDIR)" - $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) regex.import.so "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) srfi-14.import.so "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) tcp.import.so "$(DESTDIR)$(IEGGDIR)" $(INSTALL_PROGRAM) $(INSTALL_PROGRAM_EXECUTABLE_OPTIONS) foreign.import.so "$(DESTDIR)$(IEGGDIR)" @@ -740,7 +729,6 @@ ifneq ($(POSTINSTALL_PROGRAM),true) $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)srfi-13.import.so" $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)srfi-69.import.so" $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)extras.import.so" - $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)regex.import.so" $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)irregex.import.so" $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)srfi-14.import.so" $(POSTINSTALL_PROGRAM) $(POSTINSTALL_PROGRAM_FLAGS) "$(DESTDIR)$(IEGGDIR)$(SEP)tcp.import.so" @@ -864,8 +852,8 @@ posixunix.c: $(SRCDIR)posixunix.scm $(SRCDIR)posix-common.scm $(SRCDIR)common-de $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@ posixwin.c: $(SRCDIR)posixwin.scm $(SRCDIR)posix-common.scm $(SRCDIR)common-declarations.scm $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@ -regex.c: $(SRCDIR)regex.scm $(SRCDIR)irregex.scm $(SRCDIR)common-declarations.scm - $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@ +irregex.c: $(SRCDIR)irregex.scm $(SRCDIR)irregex-core.scm $(SRCDIR)irregex-utils.scm $(SRCDIR)common-declarations.scm + $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@ scheduler.c: $(SRCDIR)scheduler.scm $(SRCDIR)common-declarations.scm $(CHICKEN) $< $(CHICKEN_LIBRARY_OPTIONS) -output-file $@ profiler.c: $(SRCDIR)profiler.scm $(SRCDIR)common-declarations.scm @@ -895,8 +883,6 @@ srfi-69.import.c: $(SRCDIR)srfi-69.import.scm $(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@ extras.import.c: $(SRCDIR)extras.import.scm $(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@ -regex.import.c: $(SRCDIR)regex.import.scm - $(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@ irregex.import.c: $(SRCDIR)irregex.import.scm $(CHICKEN) $< $(CHICKEN_IMPORT_LIBRARY_OPTIONS) -output-file $@ srfi-14.import.c: $(SRCDIR)srfi-14.import.scm @@ -980,7 +966,7 @@ setup-download.c: $(SRCDIR)setup-download.scm setup-api.c distfiles: library.c eval.c expand.c chicken-syntax.c chicken-ffi-syntax.c \ data-structures.c ports.c files.c extras.c lolevel.c utils.c \ tcp.c srfi-1.c srfi-4.c srfi-13.c srfi-14.c srfi-18.c srfi-69.c \ - posixunix.c posixwin.c regex.c scheduler.c profiler.c stub.c \ + posixunix.c posixwin.c irregex.c scheduler.c profiler.c stub.c \ chicken-profile.c chicken-install.c chicken-uninstall.c chicken-status.c \ csc.c csi.c chicken.c batch-driver.c compiler.c optimizer.c \ compiler-syntax.c scrutinizer.c unboxing.c support.c \ @@ -1023,7 +1009,7 @@ spotless: distclean testclean -$(REMOVE_COMMAND) $(REMOVE_COMMAND_OPTIONS) library.c eval.c data-structures.c \ ports.c files.c extras.c lolevel.c utils.c chicken-syntax.c chicken-ffi-syntax.c \ tcp.c srfi-1.c srfi-4.c srfi-13.c srfi-14.c srfi-18.c srfi-69.c expand.c \ - posixunix.c posixwin.c regex.c scheduler.c profiler.c stub.c \ + posixunix.c posixwin.c irregex.c scheduler.c profiler.c stub.c \ chicken-profile.c chicken-bug.c \ csc.c csi.c chicken-install.c chicken-uninstall.c chicken-status.c \ chicken.c batch-driver.c compiler.c optimizer.c compiler-syntax.c \ @@ -1071,11 +1057,11 @@ buildhead: boot-chicken: $(MAKE) -f Makefile.$(PLATFORM) PLATFORM=$(PLATFORM) PREFIX=/nowhere CONFIG= \ SRCDIR=$(SRCDIR) CHICKEN=$(CHICKEN) PROGRAM_SUFFIX=-boot-stage1 STATICBUILD=1 \ - C_COMPILER_OPTIMIZATION_OPTIONS= HACKED_APPLY= \ + C_COMPILER_OPTIMIZATION_OPTIONS= \ confclean chicken-boot-stage1$(EXE) $(MAKE) -f Makefile.$(PLATFORM) PLATFORM=$(PLATFORM) PREFIX=/nowhere CONFIG= \ - SRCDIR=$(SRCDIR) CHICKEN=$(PWD)/chicken-boot-stage1$(EXE) PROGRAM_SUFFIX=-boot \ - STATICBUILD=1 HACKED_APPLY= C_COMPILER_OPTIMIZATION_OPTIONS= \ + SRCDIR=$(SRCDIR) CHICKEN=$(PWD)/chicken-boot-stage1$(EXE) PROGRAM_SUFFIX=-boot STATICBUILD=1 \ + C_COMPILER_OPTIMIZATION_OPTIONS= \ touchfiles chicken-boot$(EXE) confclean .PHONY: touchfiles diff --git a/scripts/make-egg-index.scm b/scripts/make-egg-index.scm index 5afb9d1a..27df8d36 100644 --- a/scripts/make-egg-index.scm +++ b/scripts/make-egg-index.scm @@ -2,9 +2,8 @@ (load-relative "tools.scm") -(use setup-download matchable sxml-transforms data-structures regex) +(use setup-download matchable sxml-transforms data-structures irregex) -(import irregex) (define *help* #f) (define *major-version* (##sys#fudge 41)) diff --git a/scripts/makedist.scm b/scripts/makedist.scm index e44f234c..a9ca70d7 100644 --- a/scripts/makedist.scm +++ b/scripts/makedist.scm @@ -1,7 +1,7 @@ ;;;; makedist.scm - Make distribution tarballs -(use srfi-69) +(use srfi-69 irregex) (define *release* #f) @@ -15,7 +15,7 @@ (define *platform* (let ((sv (symbol->string (software-version)))) - (cond ((string-match ".*bsd" sv) "bsd") + (cond ((irregex-match ".*bsd" sv) "bsd") (else (case (build-platform) ((mingw32) diff --git a/scripts/setversion b/scripts/setversion index a46d7290..d751ee66 100644 --- a/scripts/setversion +++ b/scripts/setversion @@ -10,10 +10,6 @@ exec csi -s "$0" "$@" (define files '("README" "manual/The User's Manual")) -(define-syntax rx - (syntax-rules () - ((_ r) (force (delay (regexp r)))))) - (define (patch which rx subst) (cond ((and (list? which) (= 2 (length which))) (let ((from (car which)) @@ -26,17 +22,17 @@ exec csi -s "$0" "$@" (let loop () (let ((ln (read-line))) (unless (eof-object? ln) - (write-line (string-substitute rx subst ln #t)) + (write-line (irregex-replace/all rx ln subst)) (loop) ) ) ) ) - binary:) ) - binary:))) + #:binary) ) + #:binary))) (else (let ((tmp (create-temporary-file))) (patch (list which tmp) rx subst) (system* "mv ~S ~S" tmp which) ) ) ) ) (define (parse-version v) - (string-match (rx "(\\d+)\\.(\\d+)\\.(\\d+)(.*)") v) ) + (string-match (irregex "(\\d+)\\.(\\d+)\\.(\\d+)(.*)") v) ) (define (main args) (let ((major (##sys#fudge 41)) @@ -65,14 +61,14 @@ exec csi -s "$0" "$@" binary:) (system* "cat version.scm") (let ([vstr (sprintf "version ~A" buildversion)]) - (for-each (cut patch <> (rx "version [0-9][-.0-9a-zA-Z]+") vstr) files) ) + (for-each (cut patch <> (irregex "version [0-9][-.0-9a-zA-Z]+") vstr) files) ) (patch "chicken.h" - (rx "C_MAJOR_VERSION[ \\t]+[0-9]+") + (irregex "C_MAJOR_VERSION[ \\t]+[0-9]+") (sprintf "C_MAJOR_VERSION ~a" major)) (patch "chicken.h" - (rx "C_MINOR_VERSION[ \\t]+[0-9]+") + (irregex "C_MINOR_VERSION[ \\t]+[0-9]+") (sprintf "C_MINOR_VERSION ~a" minor)) 0)) diff --git a/scripts/tools.scm b/scripts/tools.scm index b4c3d435..fe53ae59 100644 --- a/scripts/tools.scm +++ b/scripts/tools.scm @@ -341,15 +341,19 @@ (set! debug #t) ) (else (usage 1)) ) (loop (cdr args)) ) - ((string-match "([-_A-Za-z0-9]+)=(.*)" x) => + ((irregex-match "([-_A-Za-z0-9]+)=(.*)" x) => (lambda (m) - (let* ((sym (string->symbol (cadr m)))) + (let* ((sym (string->symbol (irregex-match-substring m 1)))) (if (##sys#symbol-has-toplevel-binding? sym) (let ((val (##sys#slot sym 0))) - (if (or (boolean? val) (string? val) (symbol? val) (eq? (void) val)) - (##sys#setslot sym 0 (caddr m)) - (quit "variable `~a' already has a suspicious value" sym) ) ) - (##sys#setslot sym 0 (caddr m)) ) + (if (or (boolean? val) + (string? val) + (symbol? val) + (eq? (void) val)) + (##sys#setslot sym 0 (irregex-match-substring m 2)) + (quit "variable `~a' already has a suspicious value" + sym) ) ) + (##sys#setslot sym 0 (irregex-match-substring m 2)) ) (loop (cdr args)) ) ) ) (else (set! targets (cons x targets)) @@ -424,22 +428,24 @@ val))) (let loop ((args args) (vals '())) (cond ((null? args) (reverse vals)) - ((string-match "(-{1,2})([-_A-Za-z0-9]+)(=)?\\s*(.+)?" (car args)) + ((irregex-match "(-{1,2})([-_A-Za-z0-9]+)(=)?\\s*(.+)?" (car args)) => (lambda (m) (let*-values (((next) (cdr args)) ((var val) - (cond ((equal? "=" (fourth m)) - (let ((opt (third m)) - (val (fifth m))) + (cond ((equal? "=" (irregex-match-substring m 3)) + (let ((opt (irregex-match-substring m 2)) + (val (irregex-match-substring m 4))) (cond (val (values opt val)) (else (when (null? next) - (error "missing argument for option" (car args)) ) + (error "missing argument for option" + (car args)) ) (let ((x (car next))) (set! next (cdr next)) (values opt x))))) ) - ((string? (second m)) (values (third m) #t)) + ((string? (irregex-match-substring m 1)) + (values (irregex-match-substring m 2) #t)) (else (values #f #f)) ) ) ) (cond (var (assign var val) diff --git a/setup-api.scm b/setup-api.scm index 9c14871a..9ef3635c 100644 --- a/setup-api.scm +++ b/setup-api.scm @@ -24,7 +24,7 @@ ; POSSIBILITY OF SUCH DAMAGE. -(require-library srfi-1 regex utils posix srfi-13 extras ports data-structures files) +(require-library srfi-1 irregex utils posix srfi-13 extras ports data-structures files) ; This code is partially quite messy and the API is not overly consistent, ; mainly because it has grown "organically" while the old chicken-setup program @@ -67,7 +67,7 @@ setup-error-handling) (import scheme chicken foreign - regex utils posix ports extras data-structures + irregex utils posix ports extras data-structures srfi-1 srfi-13 files) ;;; Constants, variables and parameters @@ -197,7 +197,7 @@ (let loop () (let ((ln (read-line))) (unless (eof-object? ln) - (write-line (string-substitute rx subst ln #t)) + (write-line (irregex-replace/all rx ln subst)) (loop) ) ) ) ) ) ) ) (let ((tmp (create-temporary-file))) (patch (list tmp tmp) rx subst) @@ -728,7 +728,7 @@ (define (version>=? v1 v2) (define (version->list v) (map (lambda (x) (or (string->number x) x)) - (string-split-fields "[-\\._]" (->string v) #:infix))) + (irregex-split "[-\\._]" (->string v)))) (let loop ((p1 (version->list v1)) (p2 (version->list v2))) (cond ((null? p1) (null? p2)) diff --git a/setup-download.scm b/setup-download.scm index 2973a42d..9cd057c6 100644 --- a/setup-download.scm +++ b/setup-download.scm @@ -24,7 +24,7 @@ ; POSSIBILITY OF SUCH DAMAGE. -(require-library extras regex posix utils setup-api srfi-1 data-structures tcp srfi-13 +(require-library extras irregex posix utils setup-api srfi-1 data-structures tcp srfi-13 files) @@ -38,11 +38,13 @@ temporary-directory) (import scheme chicken) - (import extras regex posix utils srfi-1 data-structures tcp srfi-13 files setup-api) + (import extras irregex posix utils srfi-1 data-structures tcp srfi-13 files setup-api) (define-constant +default-tcp-connect-timeout+ 10000) ; 10 seconds (define-constant +default-tcp-read/write-timeout+ 20000) ; 20 seconds + (define-constant +url-regex+ "(http://)?([^/:]+)(:([^:/]+))?(/.+)") + (tcp-connect-timeout +default-tcp-connect-timeout+) (tcp-read-timeout +default-tcp-read/write-timeout+) (tcp-write-timeout +default-tcp-read/write-timeout+) @@ -157,7 +159,9 @@ [tagver (existing-version egg version (filter-map - (lambda (f) (and-let* ((m (string-search "^tags/([^/]+)/" f))) (cadr m))) + (lambda (f) + (and-let* ((m (irregex-search "^tags/([^/]+)/" f))) + (irregex-match-substring m 1))) files))]) (let-values ([(filedir ver) (if tagver @@ -189,14 +193,15 @@ (conc dir #\/ egg ".meta")) (define (deconstruct-url url) - (let ([m (string-match "(http://)?([^/:]+)(:([^:/]+))?(/.+)" url)]) + (let ([m (irregex-match +url-regex+ url)]) (values - (if m (caddr m) url) - (if (and m (cadddr m)) - (or (string->number (list-ref m 4)) - (error "not a valid port" (list-ref m 4))) + (if m (irregex-match-substring m 2) url) + (if (and m (irregex-match-substring m 3)) + (let ((port (irregex-match-substring m 4))) + (or (string->number port) + (error "not a valid port" port))) 80) - (if m (list-ref m 5) "/")) ) ) + (if m (irregex-match-substring m 5) "/")) ) ) (define (locate-egg/http egg url #!optional version destination tests proxy-host proxy-port) @@ -245,13 +250,13 @@ (define (match-http-response rsp) (and (string? rsp) - (string-match "HTTP/[0-9.]+\\s+([0-9]+)\\s+.*" rsp)) ) + (irregex-match "HTTP/[0-9.]+\\s+([0-9]+)\\s+.*" rsp)) ) (define (response-match-code? mrsp code) - (and mrsp (string=? (number->string code) (cadr mrsp))) ) + (and mrsp (string=? (number->string code) (irregex-match-substring mrsp 1))) ) (define (match-chunked-transfer-encoding ln) - (string-match "[Tt]ransfer-[Ee]ncoding:\\s*chunked.*" ln) ) + (irregex-match "[Tt]ransfer-[Ee]ncoding:\\s*chunked.*" ln) ) (define (http-fetch host port locn dest proxy-host proxy-port) (d "connecting to host ~s, port ~a ~a...~%" host port diff --git a/setup.defaults b/setup.defaults index 4bf7ab32..604980c1 100644 --- a/setup.defaults +++ b/setup.defaults @@ -24,7 +24,7 @@ (map (data-structures extras files foreign irregex lolevel ports tcp utils - posix regex setup-api setup-download + posix irregex setup-api setup-download srfi-1 srfi-4 srfi-13 srfi-14 srfi-18 srfi-69 ->) ) diff --git a/tests/re-tests.txt b/tests/re-tests.txt index a73604c6..3b7bf976 100644 --- a/tests/re-tests.txt +++ b/tests/re-tests.txt @@ -104,6 +104,9 @@ a[bcd]*dcdcde adcdcde y & adcdcde a[bcd]+dcdcde adcdcde n - - (ab|a)b*c abc y &-\1 abc-ab ((a)(b)c)(d) abcd y \1-\2-\3-\4 abc-a-b-d +((a)(b)?c)(d) abcd y \1-\2-\3-\4 abc-a-b-d +((a)(b)?c)(d) acd y \1-\2-\3-\4 ac-a--d +((aa)(bb)?cc)(dd) aaccdd y \1-\2-\3-\4 aacc-aa--dd [ -~]* abc y & abc [ -~ -~]* abc y & abc [ -~ -~ -~]* abc y & abc @@ -118,8 +121,13 @@ a[bcd]+dcdcde adcdcde n - - (bc+d$|ef*g.|h?i(j|k)) effg n - - (bc+d$|ef*g.|h?i(j|k)) bcdd n - - (bc+d$|ef*g.|h?i(j|k)) reffgz y &-\1-\2 effgz-effgz- +((((((((((a))))))))) - c - - +((((((((((a)))))))))) a y &-\10 a-a (((((((((a))))))))) a y & a multiple words of text uh-uh n - - multiple words multiple words, yeah y & multiple words (.*)c(.*) abcde y &-\1-\2 abcde-ab-de \((.*), (.*)\) (a, b) y (\2, \1) (b, a) +(we|wee|week)(knights|night) weeknights y &-\1-\2 weeknights-wee-knights +(a([^a])*)* abcaBC y &-\1-\2 abcaBC-aBC-C +([Aa]b).*\1 abxyzab y &-\1 abxyzab-ab diff --git a/tests/runtests.sh b/tests/runtests.sh index e2c2eab6..ae147777 100644 --- a/tests/runtests.sh +++ b/tests/runtests.sh @@ -20,7 +20,7 @@ for x in setup-api.so setup-api.import.so setup-download.so \ srfi-1.import.so srfi-4.import.so data-structures.import.so \ ports.import.so files.import.so posix.import.so \ srfi-13.import.so srfi-69.import.so extras.import.so \ - regex.import.so srfi-14.import.so tcp.import.so \ + irregex.import.so srfi-14.import.so tcp.import.so \ foreign.import.so scheme.import.so srfi-18.import.so \ utils.import.so csi.import.so irregex.import.so types.db; do cp ../$x test-repository diff --git a/tests/sgrep.scm b/tests/sgrep.scm index 7503256a..555829e4 100644 --- a/tests/sgrep.scm +++ b/tests/sgrep.scm @@ -1,7 +1,7 @@ ;;;; sgrep.scm - grepping benchmark -(use regex extras utils posix srfi-1) +(use irregex extras utils posix srfi-1) (define big-string @@ -21,7 +21,7 @@ (lambda (line) (set! c (fx+ c 1)) ;(when (zero? (fxmod c 500)) (print* ".")) - (when (string-search expr line) + (when (irregex-search expr line) (set! h (fx+ h 1))) #f)) ;(newline) diff --git a/tests/test-glob.scm b/tests/test-glob.scm index a5732384..91fc3d64 100644 --- a/tests/test-glob.scm +++ b/tests/test-glob.scm @@ -1,20 +1,20 @@ ;;;; test-glob.scm - test glob-pattern -> regex translation -(use regex) +(use irregex) -(assert (string-match (glob->regexp "foo.bar") "foo.bar")) -(assert (string-match (glob->regexp "foo*") "foo.bar")) -(assert (string-match (glob->regexp "foo/*") "foo/bar")) -(assert (not (string-match (glob->regexp "foo/*") "foo/bar/baz"))) -(assert (string-match (glob->regexp "foo/*/*") "foo/bar/baz")) -(assert (not (string-match (glob->regexp "foo/*") "foo/.bar"))) -(assert (string-match (glob->regexp "*foo") "xyzfoo")) -(assert (not (string-match (glob->regexp "*foo") ".foo"))) -(assert (not (string-match (glob->regexp "*foo*") "a.fooxxx/yyy"))) -(assert (string-match (glob->regexp "*foo*") "fooxxx")) -(assert (string-match (glob->regexp "main.[ch]") "main.c")) -(assert (string-match (glob->regexp "main.[ch]") "main.h")) -(assert (not (string-match (glob->regexp "main.[ch]") "main.cpp"))) -(assert (string-match (glob->regexp "main.[-c]") "main.h")) -(assert (not (string-match (glob->regexp "main.[-h]") "main.h"))) +(assert (irregex-match (##sys#glob->regexp "foo.bar") "foo.bar")) +(assert (irregex-match (##sys#glob->regexp "foo*") "foo.bar")) +(assert (irregex-match (##sys#glob->regexp "foo/*") "foo/bar")) +(assert (not (irregex-match (##sys#glob->regexp "foo/*") "foo/bar/baz"))) +(assert (irregex-match (##sys#glob->regexp "foo/*/*") "foo/bar/baz")) +(assert (not (irregex-match (##sys#glob->regexp "foo/*") "foo/.bar"))) +(assert (irregex-match (##sys#glob->regexp "*foo") "xyzfoo")) +(assert (not (irregex-match (##sys#glob->regexp "*foo") ".foo"))) +(assert (not (irregex-match (##sys#glob->regexp "*foo*") "a.fooxxx/yyy"))) +(assert (irregex-match (##sys#glob->regexp "*foo*") "fooxxx")) +(assert (irregex-match (##sys#glob->regexp "main.[ch]") "main.c")) +(assert (irregex-match (##sys#glob->regexp "main.[ch]") "main.h")) +(assert (not (irregex-match (##sys#glob->regexp "main.[ch]") "main.cpp"))) +(assert (irregex-match (##sys#glob->regexp "main.[-c]") "main.h")) +(assert (not (irregex-match (##sys#glob->regexp "main.[-h]") "main.h"))) diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm index 12d49adf..917e6e6a 100644 --- a/tests/test-irregex.scm +++ b/tests/test-irregex.scm @@ -1,15 +1,13 @@ ;;;: test-irregex.scm -(use extras regex) +(use extras irregex) (include "test.scm") -(import irregex) - (define (subst-matches matches subst) (define (submatch n) - (if (vector? matches) + (if (irregex-match-data? matches) (irregex-match-substring matches n) (list-ref matches n))) (and @@ -28,35 +26,41 @@ ((#\\) (let ((c (read-char in))) (if (char-numeric? c) - (display - (or (submatch (string->number (string c))) "") - out) + (let lp ((res (list c))) + (if (and (char? (peek-char in)) + (char-numeric? (peek-char in))) + (lp (cons (read-char in) res)) + (display + (or (submatch (string->number + (list->string (reverse res)))) + "") + out))) (write-char c out)))) (else (write-char c out))) (lp))))))))))) (define (test-re matcher line) - (apply - (lambda (pattern input result subst output) - (let ((name (sprintf "~A ~A ~A" pattern input result))) - (cond - ((equal? "c" result) - (test-error name (matcher pattern input))) - ((equal? "n" result) - (test-assert name (not (matcher pattern input)))) - ((equal? "y" result) - (test-assert name (matcher pattern input))) - (else - (test-equal name - (subst-matches (matcher pattern input) subst) - result))))) - (string-split line "\t" #t))) - + (let ((splt (string-split line "\t" #t))) + (if (list? splt) + (apply + (lambda (pattern input result subst output) + (let ((name (sprintf "~A ~A ~A ~A" pattern input result subst))) + (cond + ((equal? "c" result) + (test-error name (matcher pattern input))) + ((equal? "n" result) + (test-assert name (not (matcher pattern input)))) + (else + (test-equal name output + (subst-matches (matcher pattern input) subst)))))) + splt) + (warning "invalid regex test line" line)))) (test-begin) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; basic irregex (for-each (lambda (opts) @@ -69,9 +73,139 @@ (irregex-search (apply irregex pat opts) str)) line)) read-line))))) - '((small) (fast))) + '((backtrack) + (fast) + )) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; chunked irregex + +(define (rope . args) + (map (lambda (x) (if (pair? x) x (list x 0 (string-length x)))) args)) + +(define rope-chunker + (make-irregex-chunker + (lambda (x) (and (pair? (cdr x)) (cdr x))) + caar + cadar + caddar + (lambda (src1 i src2 j) + (if (eq? src1 src2) + (substring (caar src1) i j) + (let lp ((src (cdr src1)) + (res (list (substring (caar src1) i (caddar src1))))) + (if (eq? src src2) + (string-intersperse + (reverse (cons (substring (caar src2) (cadar src2) j) res)) + "") + (lp (cdr src) + (cons (substring (caar src) (cadar src) (caddar src)) + res)))))))) + +(define (make-ropes str) + (let ((len (string-length str))) + (case len + ((0 1) + (list (rope str))) + ((2) + (list (rope str) + (rope (substring str 0 1) (substring str 1 2)))) + ((3) + (list (rope str) + (rope (substring str 0 1) (substring str 1 3)) + (rope (substring str 0 2) (substring str 2 3)) + (rope (substring str 0 1) + (substring str 1 2) + (substring str 2 3)))) + (else + (let ((mid (quotient (+ len 1) 2))) + (list (rope str) + (rope (substring str 0 1) (substring str 1 len)) + (rope (substring str 0 mid) (substring str mid len)) + (rope (substring str 0 (- len 1)) + (substring str (- len 1) len)) + (rope (substring str 0 1) + (substring str 1 mid) + (substring str mid len)) + )))))) + +(define (make-shared-ropes str) + (let ((len (string-length str))) + (case len + ((0 1) + '()) + ((2) + (list (list (list str 0 1) (list str 1 2)))) + ((3) + (list (list (list str 0 1) (list str 1 3)) + (list (list str 0 2) (list str 2 3)) + (list (list str 0 1) (list str 1 2) (list str 2 3)))) + (else + (let ((mid (quotient (+ len 1) 2))) + (list (list (list str 0 1) (list str 1 len)) + (list (list str 0 mid) (list str mid len)) + (list (list str 0 (- len 1)) + (list str (- len 1) len)) + (list (list str 0 1) (list str 1 mid) (list str mid len)) + )))))) + +(for-each + (lambda (opts) + (test-group + (sprintf "irregex/chunked - ~S" opts) + (with-input-from-file "re-tests.txt" + (lambda () + (port-for-each + (lambda (line) + (let ((splt (string-split line "\t" #t))) + (if (list? splt) + (apply + (lambda (pattern input result subst output) + (let ((name + (sprintf "~A ~A ~A ~A" pattern input result subst))) + (cond + ((equal? "c" result)) + ((equal? "n" result) + (for-each + (lambda (rope) + (test-assert name + (not (irregex-search/chunked pattern + rope-chunker + rope)))) + (append (make-ropes input) + (make-shared-ropes input)))) + (else + (for-each + (lambda (rope) + (test-equal + name output + (subst-matches (irregex-search/chunked pattern + rope-chunker + rope) + subst))) + (append (make-ropes input) + (make-shared-ropes input))))))) + splt) + (warning "invalid regex test line" line)))) + read-line))))) + '((backtrack) + (fast) + )) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; pregexp + +'(test-group "pregexp" + (with-input-from-file "re-tests.txt" + (lambda () + (port-for-each + (lambda (line) (test-re pregexp-match line)) + read-line)))) -(test-group "regex" +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; default regex (PCRE) + +'(test-group "regex" (with-input-from-file "re-tests.txt" (lambda () (port-for-each @@ -80,14 +214,238 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(test-group "unmatchable patterns" + (test-assert (not (irregex-search '(or) "abc"))) + (test-assert (not (irregex-search '(: "ab" (or)) "abc"))) + (test-assert (not (irregex-search '(submatch "ab" (or)) "abc"))) + (test-assert (not (irregex-search '(: "ab" (submatch (or))) "abc"))) + (test-assert (not (irregex-search '(/) "abc"))) + (test-assert (not (irregex-search '(: "ab" (/)) "abc"))) + (test-assert (not (irregex-search '(~ any) "abc"))) + (test-assert (not (irregex-search '(: "ab" (~ any)) "abc"))) + (test-assert (not (irregex-search '("") "abc"))) + (test-assert (not (irregex-search '(: "ab" ("")) "abc"))) + ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(test-group "beginning/end of chunks" + (test-assert + (irregex-search/chunked '(: bos "foo") rope-chunker '((" foo" 0 4)) 1)) + (test-assert + (irregex-search/chunked '(: bos "foo") rope-chunker '((" foo" 1 5)) 2)) + (test-assert + (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 1 4)) 1)) + (test-assert + (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 2 5)) 2)) + (test-assert + (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 0 4)) 1)) + (test-assert + (irregex-search/chunked '(: bos "foo" eos) rope-chunker '((" foo" 1 5)) 2)) + ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(test-group "API" + (test-assert (irregex? (irregex "a.*b"))) + (test-assert (irregex? (irregex '(: "a" (* any) "b")))) + (test-assert (not (irregex? (vector '*irregex-tag* #f #f #f #f #f #f #f)))) + (test-assert (not (irregex? (vector #f #f #f #f #f #f #f #f #f)))) + (test-assert (irregex-match-data? (irregex-search "a.*b" "axxxb"))) + (test-assert (irregex-match-data? (irregex-match "a.*b" "axxxb"))) + (test-assert (not (irregex-match-data? (vector '*irregex-match-tag* #f #f #f #f #f #f #f #f #f)))) + (test-assert (not (irregex-match-data? (vector #f #f #f #f #f #f #f #f #f #f #f)))) + (test-equal 0 (irregex-num-submatches (irregex "a.*b"))) + (test-equal 1 (irregex-num-submatches (irregex "a(.*)b"))) + (test-equal 2 (irregex-num-submatches (irregex "(a(.*))b"))) + (test-equal 2 (irregex-num-submatches (irregex "a(.*)(b)"))) + (test-equal 10 (irregex-num-submatches (irregex "((((((((((a))))))))))"))) + (test-equal 0 (irregex-match-num-submatches (irregex-search "a.*b" "axxxb"))) + (test-equal 1 (irregex-match-num-submatches (irregex-search "a(.*)b" "axxxb"))) + (test-equal 2 (irregex-match-num-submatches (irregex-search "(a(.*))b" "axxxb"))) + (test-equal 2 (irregex-match-num-submatches (irregex-search "a(.*)(b)" "axxxb"))) + (test-equal 10 (irregex-match-num-submatches (irregex-search "((((((((((a))))))))))" "a"))) + (test-assert + (irregex-match-valid-index? (irregex-search "a.*b" "axxxb") 0)) + (test-assert + (not (irregex-match-valid-index? (irregex-search "a.*b" "axxxb") 1))) + (test-assert + (not (irregex-match-valid-index? (irregex-search "a.*b" "axxxb") -1))) + (test-assert + (irregex-match-valid-index? (irregex-search "a(.*)(b)" "axxxb") 0)) + (test-assert + (irregex-match-valid-index? (irregex-search "a(.*)(b)" "axxxb") 1)) + (test-assert + (irregex-match-valid-index? (irregex-search "a(.*)(b)" "axxxb") 2)) + (test-assert + (not (irregex-match-valid-index? (irregex-search "a(.*)(b)" "axxxb") 3))) + (test-equal 1 (irregex-match-start-index (irregex-search "a(.*)(b)" "axxxb") 1)) + (test-equal 4 (irregex-match-end-index (irregex-search "a(.*)(b)" "axxxb") 1)) + ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + (test-group "utils" - (test-equal "replace" - (irregex-replace "[aeiou]" "hello world" "*") - "h*llo world") - (test-equal "replace/all" - (irregex-replace/all "[aeiou]" "hello world" "*") - "h*ll* w*rld")) + (test-equal "h*llo world" + (irregex-replace "[aeiou]" "hello world" "*")) + (test-equal "h*ll* w*rld" + (irregex-replace/all "[aeiou]" "hello world" "*")) + (test-equal '("bob@test.com" "fred@example.com") + (irregex-fold 'email + (lambda (i m s) (cons (irregex-match-substring m) s)) + '() + "bob@test.com and fred@example.com" + (lambda (i s) (reverse s)))) + (test-equal '("bob@test.com" "fred@example.com") + (irregex-fold/chunked + 'email + (lambda (src i m s) (cons (irregex-match-substring m) s)) + '() + rope-chunker + (rope "bob@test.com and fred@example.com") + (lambda (src i s) (reverse s)))) + ) + + +(define (extract name irx str) + (irregex-match-substring (irregex-match irx str) name)) +(define (valid? name irx str) + (irregex-match-valid-index? (irregex-match irx str) name)) +(define (start-idx name irx str) + (irregex-match-start-index (irregex-match irx str) name)) +(define (end-idx name irx str) + (irregex-match-end-index (irregex-match irx str) name)) + +(test-group "named submatches" + (test-equal "matching submatch is seen and extracted" + "first" (extract 'first `(or (submatch-named first "first") + (submatch-named second "second")) + "first")) + (test-assert "matching submatch index is valid" + (valid? 'first `(or (submatch-named first "first") + (submatch-named second "second")) + "first")) + (test-equal "nonmatching submatch is known but returns false" + #f + (extract 'second `(or (submatch-named first "first") + (submatch-named second "second")) + "first")) + (test-assert "nonmatching submatch index is valid" + (valid? 'second `(or (submatch-named first "first") + (submatch-named second "second")) + "first")) + (test-error "nonexisting submatch is unknown and raises an error" + (extract 'third `(or (submatch-named first "first") + (submatch-named second "second")) + "first")) + (test-assert "nonexisting submatch index is invalid" + (not (valid? 'third `(or (submatch-named first "first") + (submatch-named second "second")) + "first"))) + (test-equal "matching alternative is used" + "first" (extract 'sub `(or (submatch-named sub "first") + (submatch-named sub "second")) + "first")) + (test-equal "matching alternative is used (second match)" + "second" (extract 'sub `(or (submatch-named sub "first") + (submatch-named sub "second")) + "second")) + (test-equal "last match is used with multiple matches for a name" + "second" (extract 'sub `(seq (submatch-named sub "first") + space + (submatch-named sub "second")) + "first second")) + (test-equal "submatch start" + 1 + (start-idx 'xs `(seq "a" (submatch-named xs (+ "x")) "b") "axxxb")) + (test-error "unknown submatch start" + (start-idx 'xs `(seq "a" (submatch-named ys (+ "x")) "b") "axxxb")) + (test-equal "submatch end" + 4 (end-idx 'xs `(seq "a" (submatch-named xs (+ "x")) "b") "axxxb")) + (test-error "unknown submatch start" + (end-idx 'xs `(seq "a" (submatch-named ys (+ "x")) "b") "axxxb"))) + +;; This is here to help optimized implementations catch segfaults and +;; other such problems. These calls will always return errors in plain +;; Scheme, but only because it will try to use the invalid object in a +;; way that's not supported by the operator. Once Scheme grows a +;; standardized way of signaling and catching exceptions, these tests +;; should be changed and expanded to check for specific condition types, +;; and probably moved to the group where the procedure is being tested. +(test-group "error handling" + (test-error (irregex 'invalid-sre)) + (test-error (string->irregex 'not-a-string)) + (test-error (sre->irregex 'invalid-sre)) + + (test-error (irregex-search 'not-an-irx-or-sre "foo")) + (test-error (irregex-search "foo" 'not-a-string)) + (test-error (irregex-search "foo" "foo" 'not-a-number)) + (test-error (irregex-search "foo" "foo" 0 'not-a-number)) + + ;; TODO: irregex-new-matches, irregex-reset-matches! + ;; irregex-search/matches, make-irregex-chunker? + + (test-error (irregex-match-valid-index? 'not-a-match-object 0)) + (test-error (irregex-match-start-index 'not-a-match-object 0)) + (test-error (irregex-match-start-index (irregex-search "foo" "foo") -1)) + (test-error (irregex-match-end-index 'not-a-match-object 0)) + (test-error (irregex-match-end-index (irregex-search "foo" "foo") -1)) + + (test-error (irregex-match-start-chunk 'not-a-match-object 0)) + (test-error (irregex-match-end-chunk 'not-a-match-object 0)) + (test-error (irregex-match-substring 'not-a-match-object 0)) + (test-error (irregex-match-subchunk 'not-a-match-object 0)) + (test-error (irregex-match-num-submatches 'not-a-match-object)) + (test-error (irregex-match-names 'not-a-match-object)) + (test-error (irregex-num-submatches 'not-an-irx)) + (test-error (irregex-names 'not-an-irx)) + + (test-error (irregex-fold 'not-an-irx (lambda x x) 0 "foo" (lambda x x) 0 3)) + (test-error (irregex-fold "foo" 'not-a-proc 0 "foo" (lambda x x) 0 3)) + (test-error (irregex-fold "foo" (lambda (a b) b) 0 'not-a-string + (lambda x x) 0 3)) + (test-error (irregex-fold "foo" (lambda (a b) b) 0 "foo" 'not-a-proc 0 3)) + (test-error (irregex-fold "foo" (lambda (a b) b) 0 "foo" (lambda x x) + 'not-a-number 3)) + (test-error (irregex-fold "foo" (lambda (a b) b) 0 "foo" (lambda x x) 0 + 'not-a-number)) + + (test-error (irregex-replace 'not-an-irx "str")) + (test-error (irregex-replace "foo" "foo" (lambda (x) 'not-a-string))) + (test-error (irregex-replace/all 'not-an-irx "str")) + (test-error (irregex-replace/all "foo" "foo" (lambda (x) 'not-a-string))) + + ;; Are these supposed to be exported? + ;; irregex-nfa, irregex-dfa, irregex-dfa/search, irregex-dfa/extract + ;; irregex-flags, irregex-lengths + ) + +(test-end) + + +;;; UTF-8 tests + +(test-begin) + +(test-assert (irregex-search "(?u:<..>)" "<漢字>")) +(test-assert (irregex-search "(?u:<.*>)" "<漢字>")) +(test-assert (irregex-search "(?u:<.+>)" "<漢字>")) +(test-assert (not (irregex-search "(?u:<.>)" "<漢字>"))) +(test-assert (not (irregex-search "(?u:<...>)" "<漢>"))) + +(test-assert (irregex-search "(?u:<[^a-z]*>)" "<漢字>")) +(test-assert (not (irregex-search "(?u:<[^a-z]*>)" "<漢m字>"))) +(test-assert (irregex-search "(?u:<[^a-z][^a-z]>)" "<漢字>")) +(test-assert (irregex-search "(?u:<あ*>)" "<あ>")) +(test-assert (irregex-search "(?u:<あ*>)" "<ああ>")) +(test-assert (not (irregex-search "(?u:<あ*>)" "<あxあ>"))) + +(test-assert (irregex-search "(?u:<[あ-ん]*>)" "<あん>")) +(test-assert (irregex-search "(?u:<[あ-ん]*>)" "<ひらがな>")) +(test-assert (not (irregex-search "(?u:<[あ-ん]*>)" "<ひらgがな>"))) (test-end) + + (test-exit) diff --git a/tests/test.scm b/tests/test.scm index e9b43c14..c16de6a5 100644 --- a/tests/test.scm +++ b/tests/test.scm @@ -77,7 +77,8 @@ (define-syntax test-equal (syntax-rules () ((_ name expr value eq) (run-equal name (lambda () expr) value eq)) - ((_ name expr value) (run-equal name (lambda () expr) value equal?)))) + ((_ name expr value) (run-equal name (lambda () expr) value equal?)) + ((_ expr value) (run-equal (->string value) (lambda () expr) value equal?)))) (define-syntax test-error (syntax-rules () @@ -89,7 +90,8 @@ (define-syntax test-assert (syntax-rules () - ((_ name expr) (run-equal name (lambda () (if expr #t #f)) #t eq?)))) + ((_ name expr) (run-equal name (lambda () (if expr #t #f)) #t eq?)) + ((_ expr) (run-equal (->string expr) (lambda () (if expr #t #f)) #t eq?)))) (define-syntax test-group (syntax-rules () diff --git a/types.db b/types.db index f28c72a3..c269ce1a 100644 --- a/types.db +++ b/types.db @@ -538,30 +538,59 @@ ;; irregex (irregex (procedure irregex (#!rest) *)) -(string->irregex (procedure string->irregex (string #!rest) *)) -(sre->irregex (procedure sre->irregex (#!rest) *)) -(string->sre (procedure string->sre (string #!rest) *)) -(irregex? (procedure irregex? (*) boolean)) +;irregex-apply-match +(irregex-dfa (procedure irregex-dfa (*) *)) +(irregex-dfa/extract (procedure irregex-dfa/extract (*) *)) +(irregex-dfa/search (procedure irregex-dfa/search (*) *)) +(irregex-extract (procedure irregex-extract (* string #!optional fixnum fixnum) list)) +(irregex-flags (procedure irregex-flags (*) *)) +(irregex-fold (procedure irregex-fold (* (procedure (fixnum (struct regexp-match)) *) * string #!optional (procedure (fixnum *) *) fixnum fixnum) *)) +(irregex-fold/chunked (procedure irregex-fold/chunked (* (procedure (fixnum (struct regexp-match)) *) * procedure * #!optional (procedure (fixnum *) *) fixnum fixnum) *)) +(irregex-lengths (procedure irregex-lengths (*) *)) +(irregex-match (procedure irregex-match (* string) *)) +;irregex-match? (irregex-match-data? (procedure irregex-match-data? (*) boolean)) -(irregex-new-matches (procedure irregex-new-matches (*) *)) -(irregex-reset-matches! (procedure irregex-reset-matches! (*) *)) -(irregex-match-start (procedure irregex-match-start (* #!optional *) *)) (irregex-match-end (procedure irregex-match-end (* #!optional *) *)) -(irregex-match-substring (procedure irregex-match-substring (* #!optional *) *)) -(irregex-search (procedure irregex-search (* string #!optional fixnum fixnum) *)) -(irregex-search/matches (procedure irregex-search/matches (* string fixnum fixnum *) *)) -(irregex-match (procedure irregex-match (* string) *)) +;irregex-match-end-chunk +(irregex-match-end-index (procedure irregex-match-end-index ((struct regexp-match) #!optional *) fixnum)) +(irregex-match-names (procedure irregex-match-names ((struct regexp-match)) list)) +(irregex-match-num-submatches (procedure irregex-match-num-submatches ((struct regexp-match)) fixnum)) +(irregex-match-start (procedure irregex-match-start (* #!optional *) *)) +;irregex-match-start-chunk +(irregex-match-start-index (procedure irregex-match-start-index ((struct regexp-match) #!optional *) fixnum)) (irregex-match-string (procedure irregex-match-string (*) *)) +(irregex-match-subchunk (procedure irregex-match-subchunk ((struct regexp-match) #!optional *) *)) +(irregex-match-substring (procedure irregex-match-substring (* #!optional *) *)) +(irregex-match/chunked (procedure irregex-match/chunked (* * * #!optional fixnum) *)) +(irregex-names (procedure irregex-names (*) *)) +(irregex-new-matches (procedure irregex-new-matches (*) *)) +(irregex-nfa (procedure irregex-nfa (*) *)) +(irregex-num-submatches (procedure irregex-num-submatches (*) fixnum)) +(irregex-opt (procedure irregex-opt (list) *)) +(irregex-quote (procedure irregex-quote (string) string)) (irregex-replace (procedure irregex-replace (* string #!rest) *)) (irregex-replace/all (procedure irregex-replace/all (* string #!rest) *)) -(irregex-dfa (procedure irregex-dfa (*) *)) -(irregex-dfa/search (procedure irregex-dfa/search (*) *)) -(irregex-dfa/extract (procedure irregex-dfa/extract (*) *)) -(irregex-nfa (procedure irregex-nfa (*) *)) -(irregex-flags (procedure irregex-flags (*) *)) +(irregex-reset-matches! (procedure irregex-reset-matches! (*) *)) +(irregex-search (procedure irregex-search (* string #!optional fixnum fixnum) *)) +(irregex-search/matches (procedure irregex-search/matches (* string fixnum fixnum *) *)) +(irregex-split (procedure irregex-split (* string #!optional fixnum fixnum) list)) (irregex-submatches (procedure irregex-submatches (*) *)) -(irregex-lengths (procedure irregex-lengths (*) *)) -(irregex-names (procedure irregex-names (*) *)) +(irregex-match-valid-index? irregex-match-valid-index? ((struct regexp-match) *) boolean) +(irregex? (procedure irregex? (*) boolean)) +(make-irregex-chunker + (procedure make-irregex-chunker + ((procedure (*) *) + (procedure (*) *) + #!optional + (procedure (*) *) + (procedure (*) *) + (procedure (* fixnum * fixnum) string) + (procedure (* fixnum * fixnum) *)) + *)) +(maybe-string->sre (procedure maybe-string->sre (*) *)) +(sre->irregex (procedure sre->irregex (#!rest) *)) +(string->irregex (procedure string->irregex (string #!rest) *)) +(string->sre (procedure string->sre (string #!rest) *)) ;; lolevel @@ -894,22 +923,6 @@ (with-input-from-pipe (procedure with-input-from-pipe (string (procedure () . *) #!optional symbol) . *)) (with-output-to-pipe (procedure with-output-to-pipe (string (procedure () . *) #!optional symbol) . *)) -;; regex - -(glob->regexp (procedure glob->regexp (string #!optional *) *)) -(glob? deprecated) -(grep (procedure grep (* list #!optional (procedure (*) *)) list)) -(regexp (procedure regexp (* #!optional * * *) (struct regexp))) -(regexp-escape (procedure regexp-escape (string) string)) -(regexp? (procedure regexp? (*) boolean)) -(string-match (procedure string-match (* string) *)) -(string-match-positions (procedure string-match-positions (* string) *)) -(string-search (procedure string-search (* string #!optional fixnum fixnum) *)) -(string-search-positions (procedure string-search-positions (* string #!optional fixnum fixnum) *)) -(string-split-fields (procedure string-split-fields (* string #!optional * fixnum) list)) -(string-substitute (procedure string-substitute (* string string #!optional *) string)) -(string-substitute* (procedure string-substitute* (string list #!optional *) string)) - ;; srfi-1 (alist-cons (procedure alist-cons (* * *) list)) diff --git a/utils.scm b/utils.scm index 03bd4ed6..b934a550 100644 --- a/utils.scm +++ b/utils.scm @@ -27,7 +27,7 @@ (declare (unit utils) - (uses extras srfi-13 posix files regex) + (uses extras srfi-13 posix files irregex) (fixnum) (hide chop-pds) (disable-interrupts) ) @@ -115,18 +115,15 @@ ;;; Scan lines until regex or predicate matches (define scan-input-lines - (let ((regexp regexp) - (read-line read-line) - (string-search string-search)) - (lambda (rx #!optional (port ##sys#standard-input)) - (let ((rx (if (procedure? rx) - rx - (cut string-search (regexp rx) <>)))) - (let loop () - (let ((ln (read-line port))) - (and (not (eof-object? ln)) - (or (rx ln) - (loop))))))))) + (lambda (rx #!optional (port ##sys#standard-input)) + (let ((rx (if (procedure? rx) + rx + (cute irregex-search (irregex rx) <>)))) + (let loop () + (let ((ln (read-line port))) + (and (not (eof-object? ln)) + (or (rx ln) + (loop)))))))) ;; Ask for confirmationTrap