~ chicken-core (chicken-5) /irregex.scm


  1;;;; irregex.scm - container for irregex-core.scm
  2;
  3; Copyright (c) 2010-2022, The CHICKEN Team
  4; All rights reserved.
  5;
  6; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following
  7; conditions are met:
  8;
  9;   Redistributions of source code must retain the above copyright notice, this list of conditions and the following
 10;     disclaimer.
 11;   Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
 12;     disclaimer in the documentation and/or other materials provided with the distribution.
 13;   Neither the name of the author nor the names of its contributors may be used to endorse or promote
 14;     products derived from this software without specific prior written permission.
 15;
 16; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
 17; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 18; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
 19; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 20; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 21; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 22; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 23; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 24; POSSIBILITY OF SUCH DAMAGE.
 25
 26
 27
 28(declare
 29  (unit irregex)
 30  (no-procedure-checks)
 31  (fixnum))
 32
 33(module chicken.irregex
 34    (;; Constructors, accessors and predicates
 35     irregex irregex? string->sre maybe-string->sre sre->irregex
 36     irregex-names irregex-num-submatches string->irregex
 37
 38     ;; Chunking constructor
 39     make-irregex-chunker
 40
 41     ;; Main API
 42     irregex-extract irregex-fold irregex-match irregex-match?
 43     irregex-search irregex-split irregex-replace irregex-replace/all
 44
 45     ;; Chunked main API
 46     irregex-fold/chunked irregex-match/chunked irregex-search/chunked
 47
 48     ;; Match extraction API
 49     irregex-match-data? irregex-match-names
 50     irregex-match-start-index irregex-match-end-index
 51     irregex-match-num-submatches irregex-match-substring
 52     irregex-match-valid-index?
 53
 54     ;; Chunked match API
 55     irregex-match-start-chunk irregex-match-end-chunk
 56     irregex-match-subchunk
 57
 58     ;; Utilities
 59     glob->sre sre->string irregex-opt irregex-quote)
 60
 61
 62(import scheme chicken.base chicken.fixnum chicken.syntax chicken.type)
 63
 64(import-for-syntax chicken.fixnum)
 65
 66(include "common-declarations.scm")
 67
 68;; These should probably be taken out of irregex upstream
 69(declare (unused filter integer-log cset-size remove))
 70
 71;; Due to usual-integrations, find is the one from library.scm,
 72;; so find-tail is unused (it's only used in the "find" definition)
 73(declare (unused find-tail))
 74
 75;; This is to silence an "always true" warning that we can't fix
 76;; because we don't want to needlessly change irregex-core.
 77(declare (type (*allow-utf8-mode?* boolean)))
 78
 79(define-syntax build-cache
 80  (er-macro-transformer 
 81   (lambda (x r c)
 82     ;; (build-cache N ARG FAIL) 
 83     (let* ((n (cadr x))
 84	    (n2 (* n 2))
 85	    (arg (caddr x))
 86	    (fail (cadddr x))
 87	    (%cache (r 'cache))
 88	    (%index (r 'index))
 89	    (%arg (r 'arg))
 90	    (%let (r 'let))
 91	    (%let* (r 'let*))
 92	    (%if (r 'if))
 93	    (%fx+ (r 'fx+))
 94	    (%fxmod (r 'fxmod))
 95	    (%equal? (r 'equal?))
 96	    (%quote (r 'quote))
 97	    (%tmp (r 'tmp))
 98	    (%begin (r 'begin))
 99	    (cache (make-vector (add1 n2) #f)))
100       (##sys#setslot cache n2 0)	 ; last slot: current index
101       `(,%let* ((,%cache (,%quote ,cache)) ; we mutate a literal vector
102		 (,%arg ,arg))
103		,(let fold ((i 0))
104		   (if (fx>= i n)
105		       ;; this should be thread-safe: a context-switch can only
106		       ;; happen before this code and in the call to FAIL.
107		       `(,%let ((,%tmp ,fail)
108				(,%index (##sys#slot ,%cache ,n2)))
109			       (##sys#setslot ,%cache ,%index ,%arg)
110			       (##sys#setslot ,%cache (,%fx+ ,%index 1) ,%tmp)
111			       (##sys#setislot 
112				,%cache ,n2 (,%fxmod (,%fx+ ,%index 2) ,n2))
113			       ,%tmp)
114		       `(,%if (,%equal? (##sys#slot ,%cache ,(* i 2)) ,%arg)
115			      (##sys#slot ,%cache ,(add1 (* i 2)))
116			      ,(fold (add1 i))))))))))
117
118(declare (unused %%string-copy!))
119(define-compiler-syntax %%string-copy!
120  (syntax-rules ()
121    ((_ to tstart from fstart fend)
122     (let ((x to)
123	   (y tstart)
124	   (z from)
125	   (u fstart)
126	   (v fend))
127       (##core#inline "C_substring_copy" z x u v y)))))
128
129(declare (unused %substring=?))
130(define-compiler-syntax %substring=?
131  (syntax-rules ()
132    ((_ a b start1 start2 len)
133     (##core#inline "C_substring_compare" a b start1 start2 len))))
134
135(define-compiler-syntax make-irregex 
136  (syntax-rules ()
137    ((_ dfa dfa/search nfa flags submatches lengths names)
138     (##sys#make-structure
139      'regexp dfa dfa/search nfa flags submatches lengths names))))
140
141(define-compiler-syntax make-irregex-match
142  (syntax-rules ()
143    ((_ count names)
144     (##sys#make-structure
145      'regexp-match
146      (make-vector (* 4 (+ 2 count)) #f) ; #1: submatches
147      names                              ; #2: (guess)
148      #f                                 ; #3: chunka
149      #f))))                             ; #4: fail
150
151(declare (unused reverse))
152(define-compiler-syntax reverse
153  (syntax-rules ()
154    ((_ lst) (##sys#fast-reverse lst))))
155
156(declare (unused bit-shl))
157(define-compiler-syntax bit-shl
158  (syntax-rules ()
159    ((_ n i) (fxshl n i))))
160
161(declare (unused bit-shr))
162(define-compiler-syntax bit-shr
163  (syntax-rules ()
164    ((_ n i) (fxshr n i))))
165
166(declare (unused bit-not))
167(define-compiler-syntax bit-not
168  (syntax-rules ()
169    ((_ n) (fxnot n))))
170
171(declare (unused bit-ior))
172(define-compiler-syntax bit-ior
173  (syntax-rules ()
174    ((_ a b) (fxior a b))))
175
176(declare (unused bit-and))
177(define-compiler-syntax bit-and
178  (syntax-rules ()
179    ((_ a b) (fxand a b))))
180
181(define-compiler-syntax match-vector-ref
182  (syntax-rules ()
183    ((_ m i) (##sys#slot (##sys#slot m 1) i))))
184
185(define-compiler-syntax match-vector-set!
186  (syntax-rules ()
187    ((_ m i x) (##sys#setslot (##sys#slot m 1) i x))))
188
189(declare (unused irregex-match-start-chunk-set!))
190(define-compiler-syntax irregex-match-start-chunk-set!
191  (syntax-rules ()
192    ((_ m n start)
193     (vector-set! (##sys#slot m 1) (* n 4) start))))
194
195(declare (unused irregex-match-start-index-set!))
196(define-compiler-syntax irregex-match-start-index-set!
197  (syntax-rules ()
198    ((_ m n start)
199     (vector-set! (##sys#slot m 1) (+ 1 (* n 4)) start))))
200
201(declare (unused irregex-match-end-chunk-set!))
202(define-compiler-syntax irregex-match-end-chunk-set!
203  (syntax-rules ()
204    ((_ m n end)
205     (vector-set! (##sys#slot m 1) (+ 2 (* n 4)) end))))
206
207(declare (unused irregex-match-end-index-set!))
208(define-compiler-syntax irregex-match-end-index-set!
209  (syntax-rules ()
210    ((_ m n end)
211     (vector-set! (##sys#slot m 1) (+ 3 (* n 4)) end))))
212
213(declare (unused irregex-match-chunk&index-from-tag-set!))
214(define-compiler-syntax irregex-match-chunk&index-from-tag-set!
215  (syntax-rules ()
216    ((_ m t chunk index)
217     (begin
218       (vector-set! (##sys#slot m 1) (+ 4 (* t 2)) chunk)
219       (vector-set! (##sys#slot m 1) (+ 5 (* t 2)) index)))))
220
221(include "irregex-core.scm")
222(include "irregex-utils.scm")
223
224(define glob->sre
225  (let ((list->string list->string)
226        (string->list string->list))
227    (lambda (s)
228      (##sys#check-string s 'glob->sre)
229      (cons
230       ':
231       (let loop ((cs (string->list s)) (dir #t))
232	 (if (null? cs)
233	     '()
234	     (let ((c (car cs))
235		   (rest (cdr cs)) )
236	       (cond ((char=? c #\*)
237		      (if dir
238			  `((or (: (~ ("./\\"))
239				   (* (~ ("/\\"))))
240				(* (~ ("./\\"))))
241			    ,@(loop rest #f))
242			  `((* (~ ("/\\"))) ,@(loop rest #f))))
243		     ((char=? c #\?)  (cons 'any (loop rest #f)))
244		     ((char=? c #\[)
245		      (let loop2 ((rest rest) (s '()))
246			(cond ((not (pair? rest))
247			       (error 'glob->sre
248				      "unexpected end of character class" s))
249			      ((char=? #\] (car rest))
250			       `(,(if (> (length s) 1)
251				      `(or ,@s)
252				      (car s))
253				 ,@(loop (cdr rest) #f)))
254			      ((and (pair? (cdr rest))
255				    (pair? (cddr rest))
256				    (char=? #\- (cadr rest)) )
257			       (loop2 (cdddr rest)
258				      (cons `(/ ,(car rest) ,(caddr rest)) s)))
259			      ((and (pair? (cdr rest))
260				    (char=? #\- (car rest)))
261			       (loop2 (cddr rest)
262				      (cons `(~ ,(cadr rest)) s)))
263			      (else
264			       (loop2 (cdr rest) (cons (car rest) s))))))
265		     (else (cons c (loop rest (memq c '(#\\ #\/)))))))))))))
266
267)
Trap