~ chicken-core (master) /irregex.scm


  1;;;; irregex.scm - container for irregex-core.scm
  2;
  3; Copyright (c) 2010-2022, The CHICKEN Team
  4; All rights reserved.
  5;
  6; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following
  7; conditions are met:
  8;
  9;   Redistributions of source code must retain the above copyright notice, this list of conditions and the following
 10;     disclaimer.
 11;   Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
 12;     disclaimer in the documentation and/or other materials provided with the distribution.
 13;   Neither the name of the author nor the names of its contributors may be used to endorse or promote
 14;     products derived from this software without specific prior written permission.
 15;
 16; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
 17; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 18; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
 19; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 20; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 21; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 22; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 23; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 24; POSSIBILITY OF SUCH DAMAGE.
 25
 26
 27
 28(declare
 29  (unit irregex)
 30  (no-procedure-checks)
 31  (fixnum))
 32
 33(module chicken.irregex
 34    (;; Constructors, accessors and predicates
 35     irregex irregex? string->sre maybe-string->sre sre->irregex
 36     irregex-names irregex-num-submatches string->irregex
 37
 38     ;; Chunking constructor
 39     make-irregex-chunker
 40
 41     ;; Main API
 42     irregex-extract irregex-fold irregex-match irregex-match?
 43     irregex-search irregex-split irregex-replace irregex-replace/all
 44
 45     ;; Chunked main API
 46     irregex-fold/chunked irregex-match/chunked irregex-search/chunked
 47
 48     ;; Match extraction API
 49     irregex-match-data? irregex-match-names
 50     irregex-match-start-index irregex-match-end-index
 51     irregex-match-num-submatches irregex-match-substring
 52     irregex-match-valid-index?
 53
 54     ;; Chunked match API
 55     irregex-match-start-chunk irregex-match-end-chunk
 56     irregex-match-subchunk
 57
 58     ;; Utilities
 59     glob->sre sre->string irregex-opt irregex-quote)
 60
 61
 62(import scheme chicken.base chicken.fixnum chicken.syntax chicken.type)
 63(import (only (scheme base) open-output-string get-output-string))
 64
 65(import-for-syntax chicken.fixnum)
 66
 67(include "common-declarations.scm")
 68
 69;; These should probably be taken out of irregex upstream
 70(declare (unused filter integer-log cset-size remove))
 71
 72;; Due to usual-integrations, find is the one from library.scm,
 73;; so find-tail is unused (it's only used in the "find" definition)
 74(declare (unused find-tail))
 75
 76;; This is to silence an "always true" warning that we can't fix
 77;; because we don't want to needlessly change irregex-core.
 78(declare (type (*allow-utf8-mode?* boolean)))
 79
 80(define-syntax build-cache
 81  (er-macro-transformer 
 82   (lambda (x r c)
 83     ;; (build-cache N ARG FAIL) 
 84     (let* ((n (cadr x))
 85	    (n2 (* n 2))
 86	    (arg (caddr x))
 87	    (fail (cadddr x))
 88	    (%cache (r 'cache))
 89	    (%index (r 'index))
 90	    (%arg (r 'arg))
 91	    (%let (r 'let))
 92	    (%let* (r 'let*))
 93	    (%if (r 'if))
 94	    (%fx+ (r 'fx+))
 95	    (%fxmod (r 'fxmod))
 96	    (%equal? (r 'equal?))
 97	    (%quote (r 'quote))
 98	    (%tmp (r 'tmp))
 99	    (%begin (r 'begin))
100	    (cache (make-vector (add1 n2) #f)))
101       (##sys#setslot cache n2 0)	 ; last slot: current index
102       `(,%let* ((,%cache (,%quote ,cache)) ; we mutate a literal vector
103		 (,%arg ,arg))
104		,(let fold ((i 0))
105		   (if (fx>= i n)
106		       ;; this should be thread-safe: a context-switch can only
107		       ;; happen before this code and in the call to FAIL.
108		       `(,%let ((,%tmp ,fail)
109				(,%index (##sys#slot ,%cache ,n2)))
110			       (##sys#setslot ,%cache ,%index ,%arg)
111			       (##sys#setslot ,%cache (,%fx+ ,%index 1) ,%tmp)
112			       (##sys#setislot 
113				,%cache ,n2 (,%fxmod (,%fx+ ,%index 2) ,n2))
114			       ,%tmp)
115		       `(,%if (,%equal? (##sys#slot ,%cache ,(* i 2)) ,%arg)
116			      (##sys#slot ,%cache ,(add1 (* i 2)))
117			      ,(fold (add1 i))))))))))
118
119(declare (unused %substring=?))
120(define-compiler-syntax %substring=?
121  (syntax-rules ()
122    ((_ a b start1 start2 len)
123     (##core#inline "C_u_i_substring_equal_p" a b start1 start2 len))))
124
125(define-compiler-syntax make-irregex 
126  (syntax-rules ()
127    ((_ dfa dfa/search nfa flags submatches lengths names)
128     (##sys#make-structure
129      'regexp dfa dfa/search nfa flags submatches lengths names))))
130
131(define-compiler-syntax make-irregex-match
132  (syntax-rules ()
133    ((_ count names)
134     (##sys#make-structure
135      'regexp-match
136      (make-vector (* 4 (+ 2 count)) #f) ; #1: submatches
137      names                              ; #2: (guess)
138      #f                                 ; #3: chunka
139      #f))))                             ; #4: fail
140
141(declare (unused reverse))
142(define-compiler-syntax reverse
143  (syntax-rules ()
144    ((_ lst) (##sys#fast-reverse lst))))
145
146(declare (unused bit-shl))
147(define-compiler-syntax bit-shl
148  (syntax-rules ()
149    ((_ n i) (fxshl n i))))
150
151(declare (unused bit-shr))
152(define-compiler-syntax bit-shr
153  (syntax-rules ()
154    ((_ n i) (fxshr n i))))
155
156(declare (unused bit-not))
157(define-compiler-syntax bit-not
158  (syntax-rules ()
159    ((_ n) (fxnot n))))
160
161(declare (unused bit-ior))
162(define-compiler-syntax bit-ior
163  (syntax-rules ()
164    ((_ a b) (fxior a b))))
165
166(declare (unused bit-and))
167(define-compiler-syntax bit-and
168  (syntax-rules ()
169    ((_ a b) (fxand a b))))
170
171(define-compiler-syntax match-vector-ref
172  (syntax-rules ()
173    ((_ m i) (##sys#slot (##sys#slot m 1) i))))
174
175(define-compiler-syntax match-vector-set!
176  (syntax-rules ()
177    ((_ m i x) (##sys#setslot (##sys#slot m 1) i x))))
178
179(declare (unused irregex-match-start-chunk-set!))
180(define-compiler-syntax irregex-match-start-chunk-set!
181  (syntax-rules ()
182    ((_ m n start)
183     (vector-set! (##sys#slot m 1) (* n 4) start))))
184
185(declare (unused irregex-match-start-index-set!))
186(define-compiler-syntax irregex-match-start-index-set!
187  (syntax-rules ()
188    ((_ m n start)
189     (vector-set! (##sys#slot m 1) (+ 1 (* n 4)) start))))
190
191(declare (unused irregex-match-end-chunk-set!))
192(define-compiler-syntax irregex-match-end-chunk-set!
193  (syntax-rules ()
194    ((_ m n end)
195     (vector-set! (##sys#slot m 1) (+ 2 (* n 4)) end))))
196
197(declare (unused irregex-match-end-index-set!))
198(define-compiler-syntax irregex-match-end-index-set!
199  (syntax-rules ()
200    ((_ m n end)
201     (vector-set! (##sys#slot m 1) (+ 3 (* n 4)) end))))
202
203(declare (unused irregex-match-chunk&index-from-tag-set!))
204(define-compiler-syntax irregex-match-chunk&index-from-tag-set!
205  (syntax-rules ()
206    ((_ m t chunk index)
207     (begin
208       (vector-set! (##sys#slot m 1) (+ 4 (* t 2)) chunk)
209       (vector-set! (##sys#slot m 1) (+ 5 (* t 2)) index)))))
210
211(include "irregex-core.scm")
212(include "irregex-utils.scm")
213
214(set! *allow-utf8-mode?* #f)
215
216(define glob->sre
217  (let ((list->string list->string)
218        (string->list string->list))
219    (lambda (s)
220      (##sys#check-string s 'glob->sre)
221      (cons
222       ':
223       (let loop ((cs (string->list s)) (dir #t))
224	 (if (null? cs)
225	     '()
226	     (let ((c (car cs))
227		   (rest (cdr cs)) )
228	       (cond ((char=? c #\*)
229		      (if dir
230			  `((or (: (~ ("./\\"))
231				   (* (~ ("/\\"))))
232				(* (~ ("./\\"))))
233			    ,@(loop rest #f))
234			  `((* (~ ("/\\"))) ,@(loop rest #f))))
235		     ((char=? c #\?)  (cons 'any (loop rest #f)))
236		     ((char=? c #\[)
237		      (let loop2 ((rest rest) (s '()))
238			(cond ((not (pair? rest))
239			       (error 'glob->sre
240				      "unexpected end of character class" s))
241			      ((char=? #\] (car rest))
242			       `(,(if (> (length s) 1)
243				      `(or ,@s)
244				      (car s))
245				 ,@(loop (cdr rest) #f)))
246			      ((and (pair? (cdr rest))
247				    (pair? (cddr rest))
248				    (char=? #\- (cadr rest)) )
249			       (loop2 (cdddr rest)
250				      (cons `(/ ,(car rest) ,(caddr rest)) s)))
251			      ((and (pair? (cdr rest))
252				    (char=? #\- (car rest)))
253			       (loop2 (cddr rest)
254				      (cons `(~ ,(cadr rest)) s)))
255			      (else
256			       (loop2 (cdr rest) (cons (car rest) s))))))
257		     (else (cons c (loop rest (memq c '(#\\ #\/)))))))))))))
258
259)
Trap