~ chicken-core (master) /irregex-core.scm
Trap1;;;; irregex.scm -- IrRegular Expressions
2;;
3;; Copyright (c) 2005-2024 Alex Shinn. All rights reserved.
4;; BSD-style license: http://synthcode.com/license.txt
5
6;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
7;; At this moment there was a loud ring at the bell, and I could
8;; hear Mrs. Hudson, our landlady, raising her voice in a wail of
9;; expostulation and dismay.
10;;
11;; "By heaven, Holmes," I said, half rising, "I believe that
12;; they are really after us."
13;;
14;; "No, it's not quite so bad as that. It is the unofficial
15;; force, -- the Baker Street irregulars."
16
17;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
18;;;; Notes
19;;
20;; This code should not require any porting - it should work out of
21;; the box in any R[457]RS Scheme implementation. Slight modifications
22;; are needed for R6RS (a separate R6RS-compatible version is included
23;; in the distribution as irregex-r6rs.scm).
24;;
25;; The goal of portability makes this code a little clumsy and
26;; inefficient. Future versions will include both cleanup and
27;; performance tuning, but you can only go so far while staying
28;; portable. AND-LET*, SRFI-9 records and custom macros would've been
29;; nice.
30
31;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32;;;; History
33;; 0.9.11: 2024/02/23 - Guile test and packaging support from Tomas Volf.
34;; 0.9.10: 2021/07/06 - fixes for submatches under kleene star, empty seqs
35;; in alternations, and bol in folds for backtracking
36;; matcher (thanks John Clements and snan for reporting
37;; and Peter Bex for fixing)
38;; 0.9.9: 2021/05/14 - more comprehensive fix for repeated empty matches
39;; 0.9.8: 2020/07/13 - fix irregex-replace/all with look-behind patterns
40;; 0.9.7: 2019/12/31 - more intuitive handling of empty matches in -fold,
41;; -replace and -split
42;; 0.9.6: 2016/12/05 - fixed exponential memory use of + in compilation
43;; of backtracking matcher (CVE-2016-9954).
44;; 0.9.5: 2016/09/10 - fixed a bug in irregex-fold handling of bow
45;; 0.9.4: 2015/12/14 - performance improvement for {n,m} matches
46;; 0.9.3: 2014/07/01 - R7RS library
47;; 0.9.2: 2012/11/29 - fixed a bug in -fold on conditional bos patterns
48;; 0.9.1: 2012/11/27 - various accumulated bugfixes
49;; 0.9.0: 2012/06/03 - Using tags for match extraction from Peter Bex.
50;; 0.8.3: 2011/12/18 - various accumulated bugfixes
51;; 0.8.2: 2010/08/28 - (...)? submatch extraction fix and alternate
52;; named submatches from Peter Bex
53;; Added irregex-split, irregex-extract,
54;; irregex-match-names and irregex-match-valid-index?
55;; to Chicken and Guile module export lists and made
56;; the latter accept named submatches. The procedures
57;; irregex-match-{start,end}-{index,chunk} now also
58;; accept named submatches, with the index argument
59;; made optional. Improved argument type checks.
60;; Disallow negative submatch index.
61;; Improve performance of backtracking matcher.
62;; Refactor charset handling into a consistent API
63;; 0.8.1: 2010/03/09 - backtracking irregex-match fix and other small fixes
64;; 0.8.0: 2010/01/20 - optimizing DFA compilation, adding SRE escapes
65;; inside PCREs, adding utility SREs
66;; 0.7.5: 2009/08/31 - adding irregex-extract and irregex-split
67;; *-fold copies match data (use *-fold/fast for speed)
68;; irregex-opt now returns an SRE
69;; 0.7.4: 2009/05/14 - empty alternates (or) and empty csets always fail,
70;; bugfix in default finalizer for irregex-fold/chunked
71;; 0.7.3: 2009/04/14 - adding irregex-fold/chunked, minor doc fixes
72;; 0.7.2: 2009/02/11 - some bugfixes, much improved documentation
73;; 0.7.1: 2008/10/30 - several bugfixes (thanks to Derick Eddington)
74;; 0.7.0: 2008/10/20 - support abstract chunked strings
75;; 0.6.2: 2008/07/26 - minor bugfixes, allow global disabling of utf8 mode,
76;; friendlier error messages in parsing, \Q..\E support
77;; 0.6.1: 2008/07/21 - added utf8 mode, more utils, bugfixes
78;; 0.6: 2008/05/01 - most of PCRE supported
79;; 0.5: 2008/04/24 - fully portable R4RS, many PCRE features implemented
80;; 0.4: 2008/04/17 - rewriting NFA to use efficient closure compilation,
81;; normal strings only, but all of the spencer tests pass
82;; 0.3: 2008/03/10 - adding DFA converter (normal strings only)
83;; 0.2: 2005/09/27 - adding irregex-opt (like elisp's regexp-opt) utility
84;; 0.1: 2005/08/18 - simple NFA interpreter over abstract chunked strings
85
86;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
87;;;; Data Structures
88
89(define %vector-copy scheme#vector-copy)
90(define (%vector-copy! from to) (scheme#vector-copy! to 0 from))
91
92(define (%vector-copy v)
93 (let ((v2 (make-vector (vector-length v))))
94 (%vector-copy! v v2)
95 v2))
96
97(cond-expand
98 (chicken-bootstrap
99 (begin
100 ;; make-irregex defined elsewhere
101 (define (irregex? x)
102 (##sys#structure? x 'regexp))
103 (define (irregex-dfa x)
104 (##sys#check-structure x 'regexp 'irregex-dfa)
105 (##sys#slot x 1))
106 (define (irregex-dfa/search x)
107 (##sys#check-structure x 'regexp 'irregex-dfa/search)
108 (##sys#slot x 2))
109 (define (irregex-nfa x)
110 (##sys#check-structure x 'regexp 'irregex-nfa)
111 (##sys#slot x 3))
112 (define (irregex-flags x)
113 (##sys#check-structure x 'regexp 'irregex-flags)
114 (##sys#slot x 4))
115 (define (irregex-num-submatches x)
116 (##sys#check-structure x 'regexp 'irregex-num-submatches)
117 (##sys#slot x 5))
118 (define (irregex-lengths x)
119 (##sys#check-structure x 'regexp 'irregex-lengths)
120 (##sys#slot x 6))
121 (define (irregex-names x)
122 (##sys#check-structure x 'regexp 'irregex-names)
123 (##sys#slot x 7))
124 ;; make-irregex-match defined elsewhere
125 (define (irregex-new-matches irx)
126 (make-irregex-match (irregex-num-submatches irx) (irregex-names irx)))
127 (define (irregex-reset-matches! m)
128 (let ((v (##sys#slot m 1)))
129 (vector-fill! v #f)
130 m))
131 (define (irregex-copy-matches m)
132 (and (##sys#structure? m 'regexp-match)
133 (##sys#make-structure
134 'regexp-match
135 (%vector-copy (##sys#slot m 1))
136 (##sys#slot m 2)
137 (##sys#slot m 3)
138 (##sys#slot m 4))))
139 (define (irregex-match-data? obj)
140 (##sys#structure? obj 'regexp-match))
141 (define (irregex-match-num-submatches m)
142 (##sys#check-structure m 'regexp-match 'irregex-match-num-submatches)
143 (- (fx/ (##sys#size (##sys#slot m 1)) 4) 2))
144 (define (irregex-match-chunker m)
145 (##sys#slot m 3))
146 (define (irregex-match-names m)
147 (##sys#check-structure m 'regexp-match 'irregex-match-names)
148 (##sys#slot m 2))
149 (define (irregex-match-chunker-set! m str)
150 (##sys#setslot m 3 str))
151 (define-inline (%irregex-match-start-chunk m n)
152 (##sys#slot (##sys#slot m 1) (* n 4)))
153 (define-inline (%irregex-match-start-index m n)
154 (##sys#slot (##sys#slot m 1) (+ 1 (* n 4))))
155 (define-inline (%irregex-match-end-chunk m n)
156 (##sys#slot (##sys#slot m 1) (+ 2 (* n 4))))
157 (define (%irregex-match-end-index m n)
158 (##sys#slot (##sys#slot m 1) (+ 3 (* n 4))))
159 (define (%irregex-match-fail m) (##sys#slot m 4))
160 (define (%irregex-match-fail-set! m x) (##sys#setslot m 4 x))
161 (set-record-printer! 'regexp-match
162 (lambda (m out)
163 (let ((n (irregex-match-num-submatches m)))
164 (display "#<regexp-match (" out)
165 (display n out)
166 (display " submatch" out)
167 (when (or (eq? n 0) (fx> n 1)) (display "es" out))
168 (display ")>" out))))
169 (define-inline (irregex-match-valid-numeric-index? m n)
170 (let ((v (##sys#slot m 1)))
171 (and (>= n 0) (< (* n 4) (- (##sys#size v) 4)))))
172 (define-inline (irregex-match-matched-numeric-index? m n)
173 (let ((v (##sys#slot m 1)))
174 (and (##sys#slot v (+ 1 (* n 4)))
175 #t)))))
176 (else
177 (begin
178 (define irregex-tag '*irregex-tag*)
179 (define (make-irregex dfa dfa/search nfa flags submatches lengths names)
180 (vector irregex-tag dfa dfa/search nfa flags submatches lengths names))
181 (define (irregex? obj)
182 (and (vector? obj)
183 (= 8 (vector-length obj))
184 (eq? irregex-tag (vector-ref obj 0))))
185 (define (irregex-dfa x) (vector-ref x 1))
186 (define (irregex-dfa/search x) (vector-ref x 2))
187 (define (irregex-nfa x) (vector-ref x 3))
188 (define (irregex-flags x) (vector-ref x 4))
189 (define (irregex-num-submatches x) (vector-ref x 5))
190 (define (irregex-lengths x) (vector-ref x 6))
191 (define (irregex-names x) (vector-ref x 7))
192 (define (irregex-new-matches irx)
193 (make-irregex-match (irregex-num-submatches irx) (irregex-names irx)))
194 (define (irregex-reset-matches! m)
195 (do ((i (- (vector-length m) 1) (- i 1)))
196 ((<= i 3) m)
197 (vector-set! m i #f)))
198 (define (irregex-copy-matches m)
199 (and (vector? m) (%vector-copy m)))
200 (define irregex-match-tag '*irregex-match-tag*)
201 (define (irregex-match-data? obj)
202 (and (vector? obj)
203 (>= (vector-length obj) 11)
204 (eq? irregex-match-tag (vector-ref obj 0))))
205 (define (make-irregex-match count names)
206 (let ((res (make-vector (+ (* 4 (+ 2 count)) 3) #f)))
207 (vector-set! res 0 irregex-match-tag)
208 (vector-set! res 2 names)
209 res))
210 (define (irregex-match-num-submatches m)
211 (- (quotient (- (vector-length m) 3) 4) 2))
212 (define (irregex-match-chunker m)
213 (vector-ref m 1))
214 (define (irregex-match-names m)
215 (vector-ref m 2))
216 (define (irregex-match-chunker-set! m str)
217 (vector-set! m 1 str))
218 (define (%irregex-match-start-chunk m n) (vector-ref m (+ 3 (* n 4))))
219 (define (%irregex-match-start-index m n) (vector-ref m (+ 4 (* n 4))))
220 (define (%irregex-match-end-chunk m n) (vector-ref m (+ 5 (* n 4))))
221 (define (%irregex-match-end-index m n) (vector-ref m (+ 6 (* n 4))))
222 (define (%irregex-match-fail m) (vector-ref m (- (vector-length m) 1)))
223 (define (%irregex-match-fail-set! m x) (vector-set! m (- (vector-length m) 1) x))
224 (define (irregex-match-valid-numeric-index? m n)
225 (and (>= n 0) (< (+ 3 (* n 4)) (- (vector-length m) 4))))
226 (define (irregex-match-matched-numeric-index? m n)
227 (and (vector-ref m (+ 4 (* n 4)))
228 #t)))))
229
230(define (irregex-match-valid-named-index? m n)
231 (and (assq n (irregex-match-names m))
232 #t))
233
234;; public interface with error checking
235(define (irregex-match-start-chunk m . opt)
236 (let ((n (irregex-match-numeric-index 'irregex-match-start-chunk m opt)))
237 (and n (%irregex-match-start-chunk m n))))
238(define (irregex-match-start-index m . opt)
239 (let ((n (irregex-match-numeric-index 'irregex-match-start-index m opt)))
240 (and n (%irregex-match-start-index m n))))
241(define (irregex-match-end-chunk m . opt)
242 (let ((n (irregex-match-numeric-index 'irregex-match-end-chunk m opt)))
243 (and n (%irregex-match-end-chunk m n))))
244(define (irregex-match-end-index m . opt)
245 (let ((n (irregex-match-numeric-index 'irregex-match-end-index m opt)))
246 (and n (%irregex-match-end-index m n))))
247
248(define (irregex-match-start-chunk-set! m n start)
249 (vector-set! m (+ 3 (* n 4)) start))
250(define (irregex-match-start-index-set! m n start)
251 (vector-set! m (+ 4 (* n 4)) start))
252(define (irregex-match-end-chunk-set! m n end)
253 (vector-set! m (+ 5 (* n 4)) end))
254(define (irregex-match-end-index-set! m n end)
255 (vector-set! m (+ 6 (* n 4)) end))
256
257;; Tags use indices that are aligned to start/end positions just like the
258;; match vectors. ie, a tag 0 is a start tag, 1 is its corresponding end tag.
259;; They start at 0, which requires us to map them to submatch index 1.
260;; Sorry for the horrible name ;)
261(define (irregex-match-chunk&index-from-tag-set! m t chunk index)
262 (vector-set! m (+ 7 (* t 2)) chunk)
263 (vector-set! m (+ 8 (* t 2)) index))
264
265;; Helper procedure to convert any type of index from a rest args list
266;; to a numeric index. Named submatches are converted to their corresponding
267;; numeric index, and numeric submatches are checked for validity.
268;; An error is raised for invalid numeric or named indices, #f is returned
269;; for defined but nonmatching indices.
270(define (irregex-match-numeric-index location m opt)
271 (cond
272 ((not (irregex-match-data? m))
273 (error location "not match data" m))
274 ((not (pair? opt)) 0)
275 ((pair? (cdr opt))
276 (apply error location "too many arguments" m opt))
277 (else
278 (let ((n (car opt)))
279 (if (number? n)
280 (if (and (integer? n) (exact? n))
281 (if (irregex-match-valid-numeric-index? m n)
282 (and (irregex-match-matched-numeric-index? m n) n)
283 (error location "not a valid index" m n))
284 (error location "not an exact integer" n))
285 (let lp ((ls (irregex-match-names m))
286 (unknown? #t))
287 (cond
288 ((null? ls)
289 (and unknown?
290 (error location "unknown match name" n)))
291 ((eq? n (caar ls))
292 (if (%irregex-match-start-chunk m (cdar ls))
293 (cdar ls)
294 (lp (cdr ls) #f)))
295 (else (lp (cdr ls) unknown?)))))))))
296
297(define (irregex-match-valid-index? m n)
298 (if (not (irregex-match-data? m))
299 (error 'irregex-match-valid-index? "not match data" m))
300 (if (integer? n)
301 (if (not (exact? n))
302 (error 'irregex-match-valid-index? "not an exact integer" n)
303 (irregex-match-valid-numeric-index? m n))
304 (irregex-match-valid-named-index? m n)))
305
306(define (irregex-match-substring m . opt)
307 (let* ((n (irregex-match-numeric-index 'irregex-match-substring m opt))
308 (cnk (irregex-match-chunker m)))
309 (and n
310 ((chunker-get-substring cnk)
311 (%irregex-match-start-chunk m n)
312 (%irregex-match-start-index m n)
313 (%irregex-match-end-chunk m n)
314 (%irregex-match-end-index m n)))))
315
316(define (irregex-match-subchunk m . opt)
317 (let* ((n (irregex-match-numeric-index 'irregex-match-subchunk m opt))
318 (cnk (irregex-match-chunker m))
319 (get-subchunk (chunker-get-subchunk cnk)))
320 (if (not get-subchunk)
321 (error "this chunk type does not support match subchunks" m n)
322 (and n (get-subchunk
323 (%irregex-match-start-chunk m n)
324 (%irregex-match-start-index m n)
325 (%irregex-match-end-chunk m n)
326 (%irregex-match-end-index m n))))))
327
328;; chunkers tell us how to navigate through chained chunks of strings
329
330(define (make-irregex-chunker get-next get-str . o)
331 (let* ((get-start (or (and (pair? o) (car o)) (lambda (cnk) 0)))
332 (o (if (pair? o) (cdr o) o))
333 (get-end (or (and (pair? o) (car o))
334 (lambda (cnk) (string-length (get-str cnk)))))
335 (o (if (pair? o) (cdr o) o))
336 (get-substr
337 (or (and (pair? o) (car o))
338 (lambda (cnk1 start cnk2 end)
339 (if (eq? cnk1 cnk2)
340 (substring (get-str cnk1) start end)
341 (let loop ((cnk (get-next cnk1))
342 (res (list (substring (get-str cnk1)
343 start
344 (get-end cnk1)))))
345 (if (eq? cnk cnk2)
346 (string-cat-reverse
347 (cons (substring (get-str cnk)
348 (get-start cnk)
349 end)
350 res))
351 (loop (get-next cnk)
352 (cons (substring (get-str cnk)
353 (get-start cnk)
354 (get-end cnk))
355 res))))))))
356 (o (if (pair? o) (cdr o) o))
357 (get-subchunk (and (pair? o) (car o))))
358 (if (not (and (procedure? get-next) (procedure? get-str)
359 (procedure? get-start) (procedure? get-substr)))
360 (error 'make-irregex-chunker "expected a procdure"))
361 (vector get-next get-str get-start get-end get-substr get-subchunk)))
362
363(define (chunker-get-next cnk) (vector-ref cnk 0))
364(define (chunker-get-str cnk) (vector-ref cnk 1))
365(define (chunker-get-start cnk) (vector-ref cnk 2))
366(define (chunker-get-end cnk) (vector-ref cnk 3))
367(define (chunker-get-substring cnk) (vector-ref cnk 4))
368(define (chunker-get-subchunk cnk) (vector-ref cnk 5))
369
370(define (chunker-prev-chunk cnk start end)
371 (if (eq? start end)
372 #f
373 (let ((get-next (chunker-get-next cnk)))
374 (let lp ((start start))
375 (let ((next (get-next start)))
376 (if (eq? next end)
377 start
378 (and next (lp next))))))))
379
380(define (chunker-prev-char cnk start end)
381 (let ((prev (chunker-prev-chunk cnk start end)))
382 (and prev
383 (string-ref ((chunker-get-str cnk) prev)
384 (- ((chunker-get-end cnk) prev) 1)))))
385
386(define (chunker-next-char cnk src)
387 (let ((next ((chunker-get-next cnk) src)))
388 (and next
389 (string-ref ((chunker-get-str cnk) next)
390 ((chunker-get-start cnk) next)))))
391
392(define (chunk-before? cnk a b)
393 (and (not (eq? a b))
394 (let ((next ((chunker-get-next cnk) a)))
395 (and next
396 (if (eq? next b)
397 #t
398 (chunk-before? cnk next b))))))
399
400;; For look-behind searches, wrap an existing chunker such that it
401;; returns the same results but ends at a given point.
402(define (wrap-end-chunker cnk src i)
403 (make-irregex-chunker
404 (lambda (x) (and (not (eq? x src)) ((chunker-get-next cnk) x)))
405 (chunker-get-str cnk)
406 (chunker-get-start cnk)
407 (lambda (x)
408 ;; TODO: this is a hack workaround for the fact that we don't
409 ;; have either a notion of chunk equivalence or chunk truncation,
410 ;; until which time (neg-)look-behind in a fold won't work on
411 ;; non-basic chunks.
412 (if (or (eq? x src)
413 (and (not ((chunker-get-next cnk) x))
414 (not ((chunker-get-next cnk) src))))
415 i
416 ((chunker-get-end cnk) x)))
417 (chunker-get-substring cnk)
418 (chunker-get-subchunk cnk)))
419
420;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
421;;;; String Utilities
422
423;; Unicode version (skip surrogates)
424(define *all-chars*
425 `(/ ,(integer->char 0) ,(integer->char #xD7FF)
426 ,(integer->char #xE000) ,(integer->char #x10FFFF)))
427
428;; ASCII version, offset to not assume 0-255
429;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223))))
430
431;; set to #f to ignore even an explicit request for utf8 handling
432;; The utf8-mode is undesired on any implementation with native unicode support.
433;; It is a workaround for those that treat strings as a raw byte sequences, and
434;; does not work well otherwise. So disable it on implementations known to
435;; handle unicode natively.
436(define *allow-utf8-mode?* (cond-expand ((and chicken (not full-unicode)) #t)
437 (else #f)))
438
439;; (define *named-char-properties* '())
440
441(define (string-scan-char str c . o)
442 (let ((end (string-length str)))
443 (let scan ((i (if (pair? o) (car o) 0)))
444 (cond ((= i end) #f)
445 ((eqv? c (string-ref str i)) i)
446 (else (scan (+ i 1)))))))
447
448(define (string-scan-char-escape str c . o)
449 (let ((end (string-length str)))
450 (let scan ((i (if (pair? o) (car o) 0)))
451 (cond ((= i end) #f)
452 ((eqv? c (string-ref str i)) i)
453 ((eqv? c #\\) (scan (+ i 2)))
454 (else (scan (+ i 1)))))))
455
456(define (string-scan-pred str pred . o)
457 (let ((end (string-length str)))
458 (let scan ((i (if (pair? o) (car o) 0)))
459 (cond ((= i end) #f)
460 ((pred (string-ref str i)) i)
461 (else (scan (+ i 1)))))))
462
463(define (string-split-char str c)
464 (let ((end (string-length str)))
465 (let lp ((i 0) (from 0) (res '()))
466 (define (collect) (cons (substring str from i) res))
467 (cond ((>= i end) (reverse (collect)))
468 ((eqv? c (string-ref str i)) (lp (+ i 1) (+ i 1) (collect)))
469 (else (lp (+ i 1) from res))))))
470
471(define (char-alphanumeric? c)
472 (or (char-alphabetic? c) (char-numeric? c)))
473
474(define (%substring=? a b start1 start2 len)
475 (let lp ((i 0))
476 (cond ((>= i len)
477 #t)
478 ((char=? (string-ref a (+ start1 i)) (string-ref b (+ start2 i)))
479 (lp (+ i 1)))
480 (else
481 #f))))
482
483;; SRFI-13 extracts
484
485(define (%%string-copy! to tstart from fstart fend)
486 (do ((i fstart (+ i 1))
487 (j tstart (+ j 1)))
488 ((>= i fend))
489 (string-set! to j (string-ref from i))))
490
491(define (string-cat-reverse string-list)
492 (string-cat-reverse/aux
493 (fold (lambda (s a) (+ (string-length s) a)) 0 string-list)
494 string-list))
495
496(define (string-cat-reverse/aux len string-list)
497 (let ((res (make-string len)))
498 (let lp ((i len) (ls string-list))
499 (if (pair? ls)
500 (let* ((s (car ls))
501 (slen (string-length s))
502 (i (- i slen)))
503 (%%string-copy! res i s 0 slen)
504 (lp i (cdr ls)))))
505 res))
506
507;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
508;;;; List Utilities
509
510;; like the one-arg IOTA case
511(define (zero-to n)
512 (if (<= n 0)
513 '()
514 (let lp ((i (- n 1)) (res '()))
515 (if (zero? i) (cons 0 res) (lp (- i 1) (cons i res))))))
516
517;; SRFI-1 extracts (simplified 1-ary versions)
518
519(define (find pred ls)
520 (let lp ((ls ls))
521 (cond ((null? ls) #f)
522 ((pred (car ls)) (car ls))
523 (else (lp (cdr ls))))))
524
525(define (find-tail pred ls)
526 (let lp ((ls ls))
527 (cond ((null? ls) #f)
528 ((pred (car ls)) ls)
529 (else (lp (cdr ls))))))
530
531(define (last ls)
532 (if (not (pair? ls))
533 (error "can't take last of empty list")
534 (let lp ((ls ls))
535 (if (pair? (cdr ls))
536 (lp (cdr ls))
537 (car ls)))))
538
539(define (any pred ls)
540 (and (pair? ls)
541 (let lp ((head (car ls)) (tail (cdr ls)))
542 (if (null? tail)
543 (pred head)
544 (or (pred head) (lp (car tail) (cdr tail)))))))
545
546(define (every pred ls)
547 (or (null? ls)
548 (let lp ((head (car ls)) (tail (cdr ls)))
549 (if (null? tail)
550 (pred head)
551 (and (pred head) (lp (car tail) (cdr tail)))))))
552
553(define (fold kons knil ls)
554 (let lp ((ls ls) (res knil))
555 (if (null? ls)
556 res
557 (lp (cdr ls) (kons (car ls) res)))))
558
559(define (filter pred ls)
560 (let lp ((ls ls) (res '()))
561 (if (null? ls)
562 (reverse res)
563 (lp (cdr ls) (if (pred (car ls)) (cons (car ls) res) res)))))
564
565(define (remove pred ls)
566 (let lp ((ls ls) (res '()))
567 (if (null? ls)
568 (reverse res)
569 (lp (cdr ls) (if (pred (car ls)) res (cons (car ls) res))))))
570
571;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
572;;;; Flags
573
574(define (bit-shr n i)
575 (quotient n (expt 2 i)))
576
577(define (bit-shl n i)
578 (* n (expt 2 i)))
579
580(define (bit-not n) (- #xFFFF n))
581
582(define (bit-ior a b)
583 (cond
584 ((zero? a) b)
585 ((zero? b) a)
586 (else
587 (+ (if (or (odd? a) (odd? b)) 1 0)
588 (* 2 (bit-ior (quotient a 2) (quotient b 2)))))))
589
590(define (bit-and a b)
591 (cond
592 ((zero? a) 0)
593 ((zero? b) 0)
594 (else
595 (+ (if (and (odd? a) (odd? b)) 1 0)
596 (* 2 (bit-and (quotient a 2) (quotient b 2)))))))
597
598(define (integer-log n)
599 (define (b8 n r)
600 (if (>= n (bit-shl 1 8)) (b4 (bit-shr n 8) (+ r 8)) (b4 n r)))
601 (define (b4 n r)
602 (if (>= n (bit-shl 1 4)) (b2 (bit-shr n 4) (+ r 4)) (b2 n r)))
603 (define (b2 n r)
604 (if (>= n (bit-shl 1 2)) (b1 (bit-shr n 2) (+ r 2)) (b1 n r)))
605 (define (b1 n r) (if (>= n (bit-shl 1 1)) (+ r 1) r))
606 (if (>= n (bit-shl 1 16)) (b8 (bit-shr n 16) 16) (b8 n 0)))
607
608(define (flag-set? flags i)
609 (= i (bit-and flags i)))
610(define (flag-join a b)
611 (if b (bit-ior a b) a))
612(define (flag-clear a b)
613 (bit-and a (bit-not b)))
614
615(define ~none 0)
616(define ~searcher? 1)
617(define ~consumer? 2)
618
619;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
620;;;; Parsing Embedded SREs in PCRE Strings
621
622;; (define (with-read-from-string str i proc)
623;; (define (port-size in)
624;; (let lp ((i 0)) (if (eof-object? (read-char in)) i (lp (+ i 1)))))
625;; (let* ((len (string-length str))
626;; (tail-len (- len i))
627;; (in (open-input-string (substring str i len)))
628;; (sre (read in))
629;; (unused-len (port-size in)))
630;; (close-input-port in)
631;; (proc sre (- tail-len unused-len))))
632
633(define close-token (list 'close))
634(define dot-token (string->symbol "."))
635
636(define (with-read-from-string str i proc)
637 (define end (string-length str))
638 (define (read i k)
639 (cond
640 ((>= i end) (error "unterminated embedded SRE" str))
641 (else
642 (case (string-ref str i)
643 ((#\()
644 (let lp ((i (+ i 1)) (ls '()))
645 (read
646 i
647 (lambda (x j)
648 (cond
649 ((eq? x close-token)
650 (k (reverse ls) j))
651 ((eq? x dot-token)
652 (if (null? ls)
653 (error "bad dotted form" str)
654 (read j (lambda (y j2)
655 (read j2 (lambda (z j3)
656 (if (not (eq? z close-token))
657 (error "bad dotted form" str)
658 (k (append (reverse (cdr ls))
659 (cons (car ls) y))
660 j3))))))))
661 (else
662 (lp j (cons x ls))))))))
663 ((#\))
664 (k close-token (+ i 1)))
665 ((#\;)
666 (let skip ((i (+ i 1)))
667 (if (or (>= i end) (eqv? #\newline (string-ref str i)))
668 (read (+ i 1) k)
669 (skip (+ i 1)))))
670 ((#\' #\`)
671 (read (+ i 1)
672 (lambda (sexp j)
673 (let ((q (if (eqv? #\' (string-ref str i)) 'quote 'quasiquote)))
674 (k (list q sexp) j)))))
675 ((#\,)
676 (let* ((at? (and (< (+ i 1) end) (eqv? #\@ (string-ref str (+ i 1)))))
677 (u (if at? 'uquote-splicing 'unquote))
678 (j (if at? (+ i 2) (+ i 1))))
679 (read j (lambda (sexp j) (k (list u sexp) j)))))
680 ((#\")
681 (let scan ((from (+ i 1)) (i (+ i 1)) (res '()))
682 (define (collect)
683 (if (= from i) res (cons (substring str from i) res)))
684 (if (>= i end)
685 (error "unterminated string in embedded SRE" str)
686 (case (string-ref str i)
687 ((#\") (k (string-cat-reverse (collect)) (+ i 1)))
688 ((#\\) (scan (+ i 1) (+ i 2) (collect)))
689 (else (scan from (+ i 1) res))))))
690 ((#\#)
691 (case (string-ref str (+ i 1))
692 ((#\;)
693 (read (+ i 2) (lambda (sexp j) (read j k))))
694 ((#\\)
695 (read (+ i 2)
696 (lambda (sexp j)
697 (k (case sexp
698 ((space) #\space)
699 ((newline) #\newline)
700 (else (let ((s (if (number? sexp)
701 (number->string sexp)
702 (symbol->string sexp))))
703 (string-ref s 0))))
704 j))))
705 ((#\t #\f)
706 (k (eqv? #\t (string-ref str (+ i 1))) (+ i 2)))
707 (else
708 (error "bad # syntax in simplified SRE" i))))
709 (else
710 (cond
711 ((char-whitespace? (string-ref str i))
712 (read (+ i 1) k))
713 (else ;; symbol/number
714 (let scan ((j (+ i 1)))
715 (cond
716 ((or (>= j end)
717 (let ((c (string-ref str j)))
718 (or (char-whitespace? c)
719 (memv c '(#\; #\( #\) #\" #\# #\\)))))
720 (let ((str2 (substring str i j)))
721 (k (or (string->number str2) (string->symbol str2)) j)))
722 (else (scan (+ j 1))))))))))))
723 (read i (lambda (res j)
724 (if (eq? res 'close-token)
725 (error "unexpected ')' in SRE" str j)
726 (proc res j)))))
727
728;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
729;;;; Parsing PCRE Strings
730
731(define ~save? 1)
732(define ~case-insensitive? 2)
733(define ~multi-line? 4)
734(define ~single-line? 8)
735(define ~ignore-space? 16)
736(define ~utf8? 32)
737
738(define (symbol-list->flags ls)
739 (let lp ((ls ls) (res ~none))
740 (if (not (pair? ls))
741 res
742 (lp (cdr ls)
743 (flag-join
744 res
745 (case (car ls)
746 ((i ci case-insensitive) ~case-insensitive?)
747 ((m multi-line) ~multi-line?)
748 ((s single-line) ~single-line?)
749 ((x ignore-space) ~ignore-space?)
750 ((u utf8) (if *allow-utf8-mode?* ~utf8? ~none))
751 (else #f)))))))
752
753(define (maybe-string->sre obj)
754 (if (string? obj) (string->sre obj) obj))
755
756(define (string->sre str . o)
757 (if (not (string? str)) (error 'string->sre "expected a string" str))
758 (let ((end (string-length str))
759 (flags (symbol-list->flags o)))
760
761 (let lp ((i 0) (from 0) (flags flags) (res '()) (st '()))
762
763 ;; handle case sensitivity at the literal char/string level
764 (define (cased-char ch)
765 (if (and (flag-set? flags ~case-insensitive?)
766 (char-alphabetic? ch))
767 `(or ,ch ,(char-altcase ch))
768 ch))
769 (define (cased-string str)
770 (if (flag-set? flags ~case-insensitive?)
771 (sre-sequence (map cased-char (string->list str)))
772 str))
773 ;; accumulate the substring from..i as literal text
774 (define (collect)
775 (if (= i from) res (cons (cased-string (substring str from i)) res)))
776 ;; like collect but breaks off the last single character when
777 ;; collecting literal data, as the argument to ?/*/+ etc.
778 (define (collect/single)
779 (let* ((utf8? (flag-set? flags ~utf8?))
780 (j (if (and utf8? (> i 1))
781 (utf8-backup-to-initial-char str (- i 1))
782 (- i 1))))
783 (cond
784 ((< j from)
785 res)
786 (else
787 (let ((c (cased-char (if utf8?
788 (utf8-string-ref str j (- i j))
789 (string-ref str j)))))
790 (cond
791 ((= j from)
792 (cons c res))
793 (else
794 (cons c
795 (cons (cased-string (substring str from j))
796 res)))))))))
797 ;; collects for use as a result, reversing and grouping OR
798 ;; terms, and some ugly tweaking of `function-like' groups and
799 ;; conditionals
800 (define (collect/terms)
801 (let* ((ls (collect))
802 (func
803 (and (pair? ls)
804 (memq (last ls)
805 '(atomic if look-ahead neg-look-ahead
806 look-behind neg-look-behind
807 => submatch-named
808 w/utf8 w/noutf8))))
809 (prefix (if (and func (memq (car func) '(=> submatch-named)))
810 (list 'submatch-named (cadr (reverse ls)))
811 (and func (list (car func)))))
812 (ls (if func
813 (if (memq (car func) '(=> submatch-named))
814 (reverse (cddr (reverse ls)))
815 (reverse (cdr (reverse ls))))
816 ls)))
817 (let lp ((ls ls) (term '()) (res '()))
818 (define (shift)
819 (cons (sre-sequence term) res))
820 (cond
821 ((null? ls)
822 (let* ((res (sre-alternate (shift)))
823 (res (if (flag-set? flags ~save?)
824 (list 'submatch res)
825 res)))
826 (if prefix
827 (if (eq? 'if (car prefix))
828 (cond
829 ((not (pair? res))
830 'epsilon)
831 ((memq (car res)
832 '(look-ahead neg-look-ahead
833 look-behind neg-look-behind))
834 res)
835 ((eq? 'seq (car res))
836 `(if ,(cadr res)
837 ,(sre-sequence (cddr res))))
838 (else
839 `(if ,(cadadr res)
840 ,(sre-sequence (cddadr res))
841 ,(sre-alternate (cddr res)))))
842 `(,@prefix ,res))
843 res)))
844 ((eq? 'or (car ls)) (lp (cdr ls) '() (shift)))
845 (else (lp (cdr ls) (cons (car ls) term) res))))))
846 (define (save)
847 (cons (cons flags (collect)) st))
848
849 ;; main parsing
850 (if (>= i end)
851 (if (pair? st)
852 (error "unterminated parenthesis in regexp" str)
853 (collect/terms))
854 (let ((c (string-ref str i)))
855 (case c
856 ((#\.)
857 (lp (+ i 1) (+ i 1) flags
858 (cons (if (flag-set? flags ~single-line?) 'any 'nonl)
859 (collect))
860 st))
861 ((#\?)
862 (let ((res (collect/single)))
863 (if (null? res)
864 (error "? can't follow empty pattern" str res)
865 (let ((x (car res)))
866 (lp (+ i 1)
867 (+ i 1)
868 flags
869 (cons
870 (if (pair? x)
871 (case (car x)
872 ((*) `(*? ,@(cdr x)))
873 ((+) `(**? 1 #f ,@(cdr x)))
874 ((?) `(?? ,@(cdr x)))
875 ((**) `(**? ,@(cdr x)))
876 ((=) `(**? ,(cadr x) ,@(cdr x)))
877 ((>=) `(**? ,(cadr x) #f ,@(cddr x)))
878 (else `(? ,x)))
879 `(? ,x))
880 (cdr res))
881 st)))))
882 ((#\+ #\*)
883 (let* ((res (collect/single))
884 (x (if (pair? res) (car res) 'epsilon))
885 (op (string->symbol (string c))))
886 (cond
887 ((sre-repeater? x)
888 (error "duplicate repetition (e.g. **) in pattern" str res))
889 ((sre-empty? x)
890 (error "can't repeat empty pattern (e.g. ()*)" str res))
891 (else
892 (lp (+ i 1) (+ i 1) flags
893 (cons (list op x) (cdr res))
894 st)))))
895 ((#\()
896 (cond
897 ((>= (+ i 1) end)
898 (error "unterminated parenthesis in regexp" str))
899 ((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case
900 (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save)))
901 ((>= (+ i 2) end)
902 (error "unterminated parenthesis in regexp" str))
903 ((eqv? (string-ref str (+ i 1)) #\*)
904 (if (eqv? #\' (string-ref str (+ i 2)))
905 (with-read-from-string str (+ i 3)
906 (lambda (sre j)
907 (if (or (>= j end) (not (eqv? #\) (string-ref str j))))
908 (error "unterminated (*'...) SRE escape" str)
909 (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))))
910 (error "bad regexp syntax: (*FOO) not supported" str)))
911 (else ;; (?...) case
912 (case (string-ref str (+ i 2))
913 ((#\#)
914 (let ((j (string-scan-char str #\) (+ i 3))))
915 (lp (+ j i) (+ j 1) flags (collect) st)))
916 ((#\:)
917 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save)))
918 ((#\=)
919 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
920 '(look-ahead) (save)))
921 ((#\!)
922 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
923 '(neg-look-ahead) (save)))
924 ((#\<)
925 (cond
926 ((>= (+ i 3) end)
927 (error "unterminated parenthesis in regexp" str))
928 (else
929 (case (string-ref str (+ i 3))
930 ((#\=)
931 (lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
932 '(look-behind) (save)))
933 ((#\!)
934 (lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
935 '(neg-look-behind) (save)))
936 (else
937 (let ((j (and (char-alphabetic?
938 (string-ref str (+ i 3)))
939 (string-scan-char str #\> (+ i 4)))))
940 (if j
941 (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
942 `(,(string->symbol (substring str (+ i 3) j))
943 submatch-named)
944 (save))
945 (error "invalid (?< sequence" str))))))))
946 ((#\>)
947 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
948 '(atomic) (save)))
949 ;;((#\' #\P) ; named subpatterns
950 ;; )
951 ;;((#\R) ; recursion
952 ;; )
953 ((#\()
954 (cond
955 ((>= (+ i 3) end)
956 (error "unterminated parenthesis in regexp" str))
957 ((char-numeric? (string-ref str (+ i 3)))
958 (let* ((j (string-scan-char str #\) (+ i 3)))
959 (n (string->number (substring str (+ i 3) j))))
960 (if (not n)
961 (error "invalid conditional reference" str)
962 (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
963 `(,n if) (save)))))
964 ((char-alphabetic? (string-ref str (+ i 3)))
965 (let* ((j (string-scan-char str #\) (+ i 3)))
966 (s (string->symbol (substring str (+ i 3) j))))
967 (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
968 `(,s if) (save))))
969 (else
970 (lp (+ i 2) (+ i 2) (flag-clear flags ~save?)
971 '(if) (save)))))
972 ((#\{)
973 (error "unsupported Perl-style cluster" str))
974 (else
975 (let ((old-flags flags))
976 (let lp2 ((j (+ i 2)) (flags flags) (invert? #f))
977 (define (join x)
978 ((if invert? flag-clear flag-join) flags x))
979 (define (new-res res)
980 (let ((before (flag-set? old-flags ~utf8?))
981 (after (flag-set? flags ~utf8?)))
982 (if (eq? before after)
983 res
984 (cons (if after 'w/utf8 'w/noutf8) res))))
985 (cond
986 ((>= j end)
987 (error "incomplete cluster" str i))
988 (else
989 (case (string-ref str j)
990 ((#\i)
991 (lp2 (+ j 1) (join ~case-insensitive?) invert?))
992 ((#\m)
993 (lp2 (+ j 1) (join ~multi-line?) invert?))
994 ((#\x)
995 (lp2 (+ j 1) (join ~ignore-space?) invert?))
996 ((#\u)
997 (if *allow-utf8-mode?*
998 (lp2 (+ j 1) (join ~utf8?) invert?)
999 (lp2 (+ j 1) flags invert?)))
1000 ((#\-)
1001 (lp2 (+ j 1) flags (not invert?)))
1002 ((#\))
1003 (lp (+ j 1) (+ j 1) flags (new-res (collect))
1004 st))
1005 ((#\:)
1006 (lp (+ j 1) (+ j 1) flags (new-res '())
1007 (cons (cons old-flags (collect)) st)))
1008 (else
1009 (error "unknown regex cluster modifier" str)
1010 )))))))))))
1011 ((#\))
1012 (if (null? st)
1013 (error "too many )'s in regexp" str)
1014 (lp (+ i 1)
1015 (+ i 1)
1016 (caar st)
1017 (cons (collect/terms) (cdar st))
1018 (cdr st))))
1019 ((#\[)
1020 (apply
1021 (lambda (sre j)
1022 (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))
1023 (string-parse-cset str (+ i 1) flags)))
1024 ((#\{)
1025 (cond
1026 ((or (>= (+ i 1) end)
1027 (not (or (char-numeric? (string-ref str (+ i 1)))
1028 (eqv? #\, (string-ref str (+ i 1))))))
1029 (lp (+ i 1) from flags res st))
1030 (else
1031 (let ((res (collect/single)))
1032 (cond
1033 ((null? res)
1034 (error "{ can't follow empty pattern"))
1035 (else
1036 (let* ((x (car res))
1037 (tail (cdr res))
1038 (j (string-scan-char str #\} (+ i 1)))
1039 (s2 (string-split-char (substring str (+ i 1) j)
1040 #\,))
1041 (n (string->number (car s2)))
1042 (m (and (pair? (cdr s2))
1043 (string->number (cadr s2)))))
1044 (cond
1045 ((or (not n)
1046 (and (pair? (cdr s2))
1047 (not (equal? "" (cadr s2)))
1048 (not m)))
1049 (error "invalid {n} repetition syntax" s2))
1050 ((null? (cdr s2))
1051 (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st))
1052 (m
1053 (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st))
1054 (else
1055 (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st)
1056 )))))))))
1057 ((#\\)
1058 (cond
1059 ((>= (+ i 1) end)
1060 (error "incomplete escape sequence" str))
1061 (else
1062 (let ((c (string-ref str (+ i 1))))
1063 (case c
1064 ((#\d)
1065 (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st))
1066 ((#\D)
1067 (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st))
1068 ((#\s)
1069 (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st))
1070 ((#\S)
1071 (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st))
1072 ((#\w)
1073 (lp (+ i 2) (+ i 2) flags
1074 `((or alphanumeric ("_")) ,@(collect)) st))
1075 ((#\W)
1076 (lp (+ i 2) (+ i 2) flags
1077 `((~ (or alphanumeric ("_"))) ,@(collect)) st))
1078 ((#\b)
1079 (lp (+ i 2) (+ i 2) flags
1080 `((or bow eow) ,@(collect)) st))
1081 ((#\B)
1082 (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st))
1083 ((#\A)
1084 (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st))
1085 ((#\Z)
1086 (lp (+ i 2) (+ i 2) flags
1087 `((? #\newline) eos ,@(collect)) st))
1088 ((#\z)
1089 (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st))
1090 ((#\R)
1091 (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st))
1092 ((#\K)
1093 (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st))
1094 ;; these two are from Emacs and TRE, but not in PCRE
1095 ((#\<)
1096 (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st))
1097 ((#\>)
1098 (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st))
1099 ((#\x)
1100 (apply
1101 (lambda (ch j)
1102 (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st))
1103 (string-parse-hex-escape str (+ i 2) end)))
1104 ((#\k)
1105 (let ((c (string-ref str (+ i 2))))
1106 (if (not (memv c '(#\< #\{ #\')))
1107 (error "bad \\k usage, expected \\k<...>" str)
1108 (let* ((terminal (char-mirror c))
1109 (j (string-scan-char str terminal (+ i 2)))
1110 (s (and j (substring str (+ i 3) j)))
1111 (backref
1112 (if (flag-set? flags ~case-insensitive?)
1113 'backref-ci
1114 'backref)))
1115 (if (not j)
1116 (error "unterminated named backref" str)
1117 (lp (+ j 1) (+ j 1) flags
1118 `((,backref ,(string->symbol s))
1119 ,@(collect))
1120 st))))))
1121 ((#\Q) ;; \Q..\E escapes
1122 (let ((res (collect)))
1123 (let lp2 ((j (+ i 2)))
1124 (cond
1125 ((>= j end)
1126 (lp j (+ i 2) flags res st))
1127 ((eqv? #\\ (string-ref str j))
1128 (cond
1129 ((>= (+ j 1) end)
1130 (lp (+ j 1) (+ i 2) flags res st))
1131 ((eqv? #\E (string-ref str (+ j 1)))
1132 (lp (+ j 2) (+ j 2) flags
1133 (cons (substring str (+ i 2) j) res) st))
1134 (else
1135 (lp2 (+ j 2)))))
1136 (else
1137 (lp2 (+ j 1)))))))
1138 ((#\')
1139 (with-read-from-string str (+ i 2)
1140 (lambda (sre j)
1141 (lp j j flags (cons sre (collect)) st))))
1142 ;;((#\p) ; XXXX unicode properties
1143 ;; )
1144 ;;((#\P)
1145 ;; )
1146 (else
1147 (cond
1148 ((char-numeric? c)
1149 (let* ((j (or (string-scan-pred
1150 str
1151 (lambda (c) (not (char-numeric? c)))
1152 (+ i 2))
1153 end))
1154 (backref
1155 (if (flag-set? flags ~case-insensitive?)
1156 'backref-ci
1157 'backref))
1158 (res `((,backref ,(string->number
1159 (substring str (+ i 1) j)))
1160 ,@(collect))))
1161 (lp j j flags res st)))
1162 ((char-alphabetic? c)
1163 (let ((cell (assv c posix-escape-sequences)))
1164 (if cell
1165 (lp (+ i 2) (+ i 2) flags
1166 (cons (cdr cell) (collect)) st)
1167 (error "unknown escape sequence" str c))))
1168 (else
1169 (lp (+ i 2) (+ i 1) flags (collect) st)))))))))
1170 ((#\|)
1171 (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st))
1172 ((#\^)
1173 (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos)))
1174 (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
1175 ((#\$)
1176 (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos)))
1177 (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
1178 ((#\space)
1179 (if (flag-set? flags ~ignore-space?)
1180 (lp (+ i 1) (+ i 1) flags (collect) st)
1181 (lp (+ i 1) from flags res st)))
1182 ((#\#)
1183 (if (flag-set? flags ~ignore-space?)
1184 (let ((j (or (string-scan-char str #\newline (+ i 1))
1185 (- end 1))))
1186 (lp (+ j 1) (+ j 1) flags (collect) st))
1187 (lp (+ i 1) from flags res st)))
1188 (else
1189 (lp (+ i 1) from flags res st))))))))
1190
1191(define posix-escape-sequences
1192 `((#\n . #\newline)
1193 (#\r . ,(integer->char (+ (char->integer #\newline) 3)))
1194 (#\t . ,(integer->char (- (char->integer #\newline) 1)))
1195 (#\a . ,(integer->char (- (char->integer #\newline) 3)))
1196 (#\e . ,(integer->char (+ (char->integer #\newline) #x11)))
1197 (#\f . ,(integer->char (+ (char->integer #\newline) 2)))
1198 ))
1199
1200(define (char-altcase c)
1201 (if (char-upper-case? c) (char-downcase c) (char-upcase c)))
1202
1203(define (char-mirror c)
1204 (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c)))
1205
1206(define (string-parse-hex-escape str i end)
1207 (cond
1208 ((>= i end)
1209 (error "incomplete hex escape" str i))
1210 ((eqv? #\{ (string-ref str i))
1211 (let ((j (string-scan-char-escape str #\} (+ i 1))))
1212 (if (not j)
1213 (error "incomplete hex brace escape" str i)
1214 (let* ((s (substring str (+ i 1) j))
1215 (n (string->number s 16)))
1216 (if n
1217 (list (integer->char n) j)
1218 (error "bad hex brace escape" s))))))
1219 ((>= (+ i 1) end)
1220 (error "incomplete hex escape" str i))
1221 (else
1222 (let* ((s (substring str i (+ i 2)))
1223 (n (string->number s 16)))
1224 (if n
1225 (list (integer->char n) (+ i 2))
1226 (error "bad hex escape" s))))))
1227
1228(define (string-parse-cset str start flags)
1229 (let* ((end (string-length str))
1230 (invert? (and (< start end) (eqv? #\^ (string-ref str start))))
1231 (utf8? (flag-set? flags ~utf8?)))
1232 (define (go i prev-char cset)
1233 (if (>= i end)
1234 (error "incomplete char set" str i end)
1235 (let ((c (string-ref str i)))
1236 (case c
1237 ((#\])
1238 (if (cset-empty? cset)
1239 (go (+ i 1) #\] (cset-adjoin cset #\]))
1240 (let ((ci? (flag-set? flags ~case-insensitive?)))
1241 (list
1242 (let ((res (if ci? (cset-case-insensitive cset) cset)))
1243 (cset->sre (if invert? (cset-complement res) res)))
1244 i))))
1245 ((#\-)
1246 (cond
1247 ((or (= i start)
1248 (and (= i (+ start 1)) (eqv? #\^ (string-ref str start)))
1249 (eqv? #\] (string-ref str (+ i 1))))
1250 (go (+ i 1) c (cset-adjoin cset c)))
1251 ((not prev-char)
1252 (error "bad char-set"))
1253 (else
1254 (let ((char (string-ref str (+ i 1))))
1255 (apply
1256 (lambda (c j)
1257 (if (char<? c prev-char)
1258 (error "inverted range in char-set" prev-char c)
1259 (go j #f (cset-union cset (range->cset prev-char c)))))
1260 (cond
1261 ((and (eqv? #\\ char) (assv char posix-escape-sequences))
1262 => (lambda (x) (list (cdr x) (+ i 3))))
1263 ((and (eqv? #\\ char)
1264 (eqv? (string-ref str (+ i 2)) #\x))
1265 (string-parse-hex-escape str (+ i 3) end))
1266 ((and utf8? (<= #x80 (char->integer char) #xFF))
1267 (let ((len (utf8-start-char->length char)))
1268 (list (utf8-string-ref str (+ i 1) len) (+ i 1 len))))
1269 (else
1270 (list char (+ i 2)))))))))
1271 ((#\[)
1272 (let* ((inv? (eqv? #\^ (string-ref str (+ i 1))))
1273 (i2 (if inv? (+ i 2) (+ i 1))))
1274 (case (string-ref str i2)
1275 ((#\:)
1276 (let ((j (string-scan-char str #\: (+ i2 1))))
1277 (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1)))))
1278 (error "incomplete character class" str)
1279 (let* ((class (sre->cset
1280 (string->symbol
1281 (substring str (+ i2 1) j))))
1282 (class (if inv? (cset-complement class) class)))
1283 (go (+ j 2) #f (cset-union cset class))))))
1284 ((#\= #\.)
1285 (error "collating sequences not supported" str))
1286 (else
1287 (go (+ i 1) #\[ (cset-adjoin cset #\[))))))
1288 ((#\\)
1289 (let ((c (string-ref str (+ i 1))))
1290 (case c
1291 ((#\d #\D #\s #\S #\w #\W)
1292 (go (+ i 2) #f
1293 (cset-union cset
1294 (sre->cset (string->sre (string #\\ c))))))
1295 ((#\x)
1296 (apply
1297 (lambda (ch j)
1298 (go j ch (cset-adjoin cset ch)))
1299 (string-parse-hex-escape str (+ i 2) end)))
1300 (else
1301 (let ((c (cond ((assv c posix-escape-sequences) => cdr)
1302 (else c))))
1303 (go (+ i 2) c (cset-adjoin cset c)))))))
1304 (else
1305 (if (and utf8? (<= #x80 (char->integer c) #xFF))
1306 (let ((len (utf8-start-char->length c)))
1307 (go (+ i len)
1308 (utf8-string-ref str i len)
1309 (cset-adjoin cset (utf8-string-ref str i len))))
1310 (go (+ i 1) c (cset-adjoin cset c))))))))
1311 (if invert?
1312 (go (+ start 1)
1313 #f
1314 (if (flag-set? flags ~multi-line?)
1315 (char->cset #\newline)
1316 (make-cset)))
1317 (go start #f (make-cset)))))
1318
1319;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1320;;;; UTF-8 Utilities
1321
1322;; Here are some hairy optimizations that need to be documented
1323;; better. Thanks to these, we never do any utf8 processing once the
1324;; regexp is compiled.
1325
1326;; two chars: ab..ef
1327;; a[b..xFF]|[b-d][x80..xFF]|e[x80..xFF]
1328
1329;; three chars: abc..ghi
1330;; ab[c..xFF]|a[d..xFF][x80..xFF]|
1331;; [b..f][x80..xFF][x80..xFF]|
1332;; g[x80..g][x80..xFF]|gh[x80..i]
1333
1334;; four chars: abcd..ghij
1335;; abc[d..xFF]|ab[d..xFF][x80..xFF]|a[c..xFF][x80..xFF][x80..xFF]|
1336;; [b..f][x80..xFF][x80..xFF][x80..xFF]|
1337;; g[x80..g][x80..xFF][x80..xFF]|gh[x80..h][x80..xFF]|ghi[x80..j]
1338
1339(define (high-char? c) (<= #x80 (char->integer c)))
1340
1341;; number of total bytes in a utf8 char given the 1st byte
1342
1343(define utf8-start-char->length
1344 (let ((table '#(
13451 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x
13461 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x
13471 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x
13481 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x
13491 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x
13501 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x
13511 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x
13521 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x
13531 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x
13541 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x
13551 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax
13561 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx
13572 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx
13582 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx
13593 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex
13604 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx
1361)))
1362 (lambda (c) (vector-ref table (char->integer c)))))
1363
1364(define (utf8-string-ref str i len)
1365 (define (byte n) (char->integer (string-ref str n)))
1366 (case len
1367 ((1) ; shouldn't happen in this module
1368 (string-ref str i))
1369 ((2)
1370 (integer->char
1371 (+ (bit-shl (bit-and (byte i) #b00011111) 6)
1372 (bit-and (byte (+ i 1)) #b00111111))))
1373 ((3)
1374 (integer->char
1375 (+ (bit-shl (bit-and (byte i) #b00001111) 12)
1376 (bit-shl (bit-and (byte (+ i 1)) #b00111111) 6)
1377 (bit-and (byte (+ i 2)) #b00111111))))
1378 ((4)
1379 (integer->char
1380 (+ (bit-shl (bit-and (byte i) #b00000111) 18)
1381 (bit-shl (bit-and (byte (+ i 1)) #b00111111) 12)
1382 (bit-shl (bit-and (byte (+ i 2)) #b00111111) 6)
1383 (bit-and (byte (+ i 3)) #b00111111))))
1384 (else
1385 (error "invalid utf8 length" str len i))))
1386
1387(define (utf8-backup-to-initial-char str i)
1388 (let lp ((i i))
1389 (if (= i 0)
1390 0
1391 (let ((c (char->integer (string-ref str i))))
1392 (if (or (< c #x80) (>= c #xC0))
1393 i
1394 (lp (- i 1)))))))
1395
1396(define (utf8-lowest-digit-of-length len)
1397 (case len
1398 ((1) 0) ((2) #xC0) ((3) #xE0) ((4) #xF0)
1399 (else (error "invalid utf8 length" len))))
1400
1401(define (utf8-highest-digit-of-length len)
1402 (case len
1403 ((1) #x7F) ((2) #xDF) ((3) #xEF) ((4) #xF7)
1404 (else (error "invalid utf8 length" len))))
1405
1406(define (char->utf8-list c)
1407 (let ((i (char->integer c)))
1408 (cond
1409 ((<= i #x7F) (list i))
1410 ((<= i #x7FF)
1411 (list (bit-ior #b11000000 (bit-shr i 6))
1412 (bit-ior #b10000000 (bit-and i #b111111))))
1413 ((<= i #xFFFF)
1414 (list (bit-ior #b11100000 (bit-shr i 12))
1415 (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111))
1416 (bit-ior #b10000000 (bit-and i #b111111))))
1417 ((<= i #x1FFFFF)
1418 (list (bit-ior #b11110000 (bit-shr i 18))
1419 (bit-ior #b10000000 (bit-and (bit-shr i 12) #b111111))
1420 (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111))
1421 (bit-ior #b10000000 (bit-and i #b111111))))
1422 (else (error "unicode codepoint out of range:" i)))))
1423
1424(define (unicode-range->utf8-pattern lo hi)
1425 (let ((lo-ls (char->utf8-list lo))
1426 (hi-ls (char->utf8-list hi)))
1427 (if (not (= (length lo-ls) (length hi-ls)))
1428 (sre-alternate (list (unicode-range-climb-digits lo-ls hi-ls)
1429 (unicode-range-up-to hi-ls)))
1430 (let lp ((lo-ls lo-ls) (hi-ls hi-ls))
1431 (cond
1432 ((= (car lo-ls) (car hi-ls))
1433 (sre-sequence
1434 (cons (integer->char (car lo-ls))
1435 (if (null? (cdr lo-ls)) '()
1436 (cons (lp (cdr lo-ls) (cdr hi-ls)) '())))))
1437 ((= (+ (car lo-ls) 1) (car hi-ls))
1438 (sre-alternate (list (unicode-range-up-from lo-ls)
1439 (unicode-range-up-to hi-ls))))
1440 (else
1441 (sre-alternate (list (unicode-range-up-from lo-ls)
1442 (unicode-range-middle lo-ls hi-ls)
1443 (unicode-range-up-to hi-ls)))))))))
1444
1445(define (unicode-range-helper one ls prefix res)
1446 (if (null? ls)
1447 res
1448 (unicode-range-helper
1449 one
1450 (cdr ls)
1451 (cons (car ls) prefix)
1452 (cons (sre-sequence
1453 `(,@(map integer->char prefix)
1454 ,(one (car ls))
1455 ,@(map (lambda (_)
1456 `(/ ,(integer->char #x80)
1457 ,(integer->char #xFF)))
1458 (cdr ls))))
1459 res))))
1460
1461(define (unicode-range-up-from lo-ls)
1462 (sre-sequence
1463 (list (integer->char (car lo-ls))
1464 (sre-alternate
1465 (unicode-range-helper
1466 (lambda (c)
1467 `(/ ,(integer->char (+ (car lo-ls) 1)) ,(integer->char #xFF)))
1468 (cdr (reverse (cdr lo-ls)))
1469 '()
1470 (list
1471 (sre-sequence
1472 (append
1473 (map integer->char (reverse (cdr (reverse (cdr lo-ls)))))
1474 `((/ ,(integer->char (last lo-ls))
1475 ,(integer->char #xFF)))))))))))
1476
1477(define (unicode-range-up-to hi-ls)
1478 (sre-sequence
1479 (list (integer->char (car hi-ls))
1480 (sre-alternate
1481 (unicode-range-helper
1482 (lambda (c)
1483 `(/ ,(integer->char #x80) ,(integer->char (- (car hi-ls) 1))))
1484 (cdr (reverse (cdr hi-ls)))
1485 '()
1486 (list
1487 (sre-sequence
1488 (append
1489 (map integer->char (reverse (cdr (reverse (cdr hi-ls)))))
1490 `((/ ,(integer->char #x80)
1491 ,(integer->char (last hi-ls))))))))))))
1492
1493(define (unicode-range-climb-digits lo-ls hi-ls)
1494 (let ((lo-len (length lo-ls)))
1495 (sre-alternate
1496 (append
1497 (list
1498 (sre-sequence
1499 (cons `(/ ,(integer->char (car lo-ls))
1500 ,(integer->char (if (<= (car lo-ls) #x7F) #x7F #xFF)))
1501 (map (lambda (_)
1502 `(/ ,(integer->char #x80) ,(integer->char #xFF)))
1503 (cdr lo-ls)))))
1504 (map
1505 (lambda (i)
1506 (sre-sequence
1507 (cons
1508 `(/ ,(integer->char (utf8-lowest-digit-of-length (+ i lo-len 1)))
1509 ,(integer->char (utf8-highest-digit-of-length (+ i lo-len 1))))
1510 (map (lambda (_)
1511 `(/ ,(integer->char #x80) ,(integer->char #xFF)))
1512 (zero-to (+ i lo-len))))))
1513 (zero-to (- (length hi-ls) (+ lo-len 1))))
1514 (list
1515 (sre-sequence
1516 (cons `(/ ,(integer->char
1517 (utf8-lowest-digit-of-length
1518 (utf8-start-char->length
1519 (integer->char (- (car hi-ls) 1)))))
1520 ,(integer->char (- (car hi-ls) 1)))
1521 (map (lambda (_)
1522 `(/ ,(integer->char #x80) ,(integer->char #xFF)))
1523 (cdr hi-ls)))))))))
1524
1525(define (unicode-range-middle lo-ls hi-ls)
1526 (let ((lo (integer->char (+ (car lo-ls) 1)))
1527 (hi (integer->char (- (car hi-ls) 1))))
1528 (sre-sequence
1529 (cons (if (char=? lo hi) lo `(/ ,lo ,hi))
1530 (map (lambda (_) `(/ ,(integer->char #x80) ,(integer->char #xFF)))
1531 (cdr lo-ls))))))
1532
1533;; Maybe this should just modify the input?
1534(define (cset->utf8-pattern cset)
1535 (let lp ((ls (cset->plist cset)) (alts '()) (lo-cset '()))
1536 (if (null? ls)
1537 (sre-alternate (append (reverse alts)
1538 (if (null? lo-cset)
1539 '()
1540 (list (cons '/ (reverse lo-cset))))))
1541 (if (or (high-char? (car ls)) (high-char? (cadr ls)))
1542 (lp (cddr ls)
1543 (cons (unicode-range->utf8-pattern (car ls) (cadr ls)) alts)
1544 lo-cset)
1545 (lp (cddr ls) alts (cons (cadr ls) (cons (car ls) lo-cset)))))))
1546
1547(define (sre-adjust-utf8 sre flags)
1548 (let adjust ((sre sre)
1549 (utf8? (flag-set? flags ~utf8?))
1550 (ci? (flag-set? flags ~case-insensitive?)))
1551 (define (rec sre) (adjust sre utf8? ci?))
1552 (cond
1553 ((pair? sre)
1554 (case (car sre)
1555 ((w/utf8) (adjust (sre-sequence (cdr sre)) #t ci?))
1556 ((w/noutf8) (adjust (sre-sequence (cdr sre)) #f ci?))
1557 ((w/case)
1558 (cons (car sre) (map (lambda (s) (adjust s utf8? #f)) (cdr sre))))
1559 ((w/nocase)
1560 (cons (car sre) (map (lambda (s) (adjust s utf8? #t)) (cdr sre))))
1561 ((/ ~ & -)
1562 (if (not utf8?)
1563 sre
1564 (let ((cset (sre->cset sre ci?)))
1565 (if (any high-char? (cset->plist cset))
1566 (if ci?
1567 (list 'w/case (cset->utf8-pattern cset))
1568 (cset->utf8-pattern cset))
1569 sre))))
1570 ((*)
1571 (case (sre-sequence (cdr sre))
1572 ;; special case optimization: .* w/utf8 == .* w/noutf8
1573 ((any) '(* any))
1574 ((nonl) '(* nonl))
1575 (else (cons '* (map rec (cdr sre))))))
1576 (else
1577 (cons (car sre) (map rec (cdr sre))))))
1578 (else
1579 (case sre
1580 ((any) (if utf8? 'utf8-any 'any))
1581 ((nonl) (if utf8? 'utf8-nonl 'nonl))
1582 (else
1583 (if (and utf8? (char? sre) (high-char? sre))
1584 (sre-sequence (map integer->char (char->utf8-list sre)))
1585 sre)))))))
1586
1587;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1588;;;; Compilation
1589
1590(cond-expand
1591 (chicken-bootstrap
1592 (define-syntax cached
1593 (syntax-rules ()
1594 ((_ arg fail) (build-cache 5 arg fail)))))
1595 (else
1596 (define-syntax cached
1597 (syntax-rules ()
1598 ((_ arg fail) fail)))))
1599
1600(define (irregex x . o)
1601 (cond ((irregex? x) x)
1602 ((null? o)
1603 (cached
1604 x
1605 (if (string? x)
1606 (string->irregex x)
1607 (sre->irregex x))))
1608 (else
1609 (if (string? x)
1610 (apply string->irregex x o)
1611 (apply sre->irregex x o)))))
1612
1613(define (string->irregex str . o)
1614 (apply sre->irregex (apply string->sre str o) o))
1615
1616(define (sre->irregex sre . o)
1617 (let* ((pat-flags (symbol-list->flags o))
1618 (sre (if *allow-utf8-mode?*
1619 (sre-adjust-utf8 sre pat-flags)
1620 sre))
1621 (searcher? (sre-searcher? sre))
1622 (sre-dfa (if searcher? (sre-remove-initial-bos sre) sre))
1623 (dfa-limit (cond ((memq 'small o) 1) ((memq 'fast o) 50) (else 10)))
1624 ;; TODO: Maybe make these two promises; if we only want to search,
1625 ;; it's wasteful to compile the matcher, and vice versa
1626 ;; Maybe provide a flag to compile eagerly, to help benchmarking etc.
1627 (dfa/search
1628 (cond ((memq 'backtrack o) #f)
1629 (searcher? #t)
1630 ((sre->nfa `(seq (* any) ,sre-dfa) pat-flags)
1631 => (lambda (nfa)
1632 (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa)))))
1633 (else #f)))
1634 (dfa (cond ((and dfa/search (sre->nfa sre-dfa pat-flags))
1635 => (lambda (nfa)
1636 (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa)))))
1637 (else #f)))
1638 (submatches (sre-count-submatches sre-dfa))
1639 (names (sre-names sre-dfa 1 '()))
1640 (lens (sre-length-ranges sre-dfa names))
1641 (flags (flag-join
1642 (flag-join ~none (and searcher? ~searcher?))
1643 (and (sre-consumer? sre) ~consumer?))))
1644 (cond
1645 (dfa
1646 (make-irregex dfa dfa/search #f flags submatches lens names))
1647 (else
1648 (let ((f (sre->procedure sre pat-flags names)))
1649 (make-irregex #f #f f flags submatches lens names))))))
1650
1651;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1652;;;; SRE Analysis
1653
1654;; returns #t if the sre can ever be empty
1655(define (sre-empty? sre)
1656 (if (pair? sre)
1657 (case (car sre)
1658 ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t)
1659 ((**) (or (not (number? (cadr sre))) (zero? (cadr sre))))
1660 ((or) (any sre-empty? (cdr sre)))
1661 ((: seq $ submatch => submatch-named + atomic)
1662 (every sre-empty? (cdr sre)))
1663 (else #f))
1664 (memq sre '(epsilon bos eos bol eol bow eow commit))))
1665
1666(define (sre-any? sre)
1667 (or (eq? sre 'any)
1668 (and (pair? sre)
1669 (case (car sre)
1670 ((seq : $ submatch => submatch-named)
1671 (and (pair? (cdr sre)) (null? (cddr sre)) (sre-any? (cadr sre))))
1672 ((or) (every sre-any? (cdr sre)))
1673 (else #f)))))
1674
1675(define (sre-repeater? sre)
1676 (and (pair? sre)
1677 (or (memq (car sre) '(* +))
1678 (and (memq (car sre) '($ submatch => submatch-named seq :))
1679 (pair? (cdr sre))
1680 (null? (cddr sre))
1681 (sre-repeater? (cadr sre))))))
1682
1683(define (sre-bos? sre)
1684 (if (pair? sre)
1685 (case (car sre)
1686 ((seq : $ submatch => submatch-named)
1687 (and (pair? (cdr sre)) (sre-bos? (cadr sre))))
1688 ((or) (every sre-bos? (cdr sre)))
1689 (else #f))
1690 (eq? 'bos sre)))
1691
1692;; a searcher doesn't need explicit iteration to find the first match
1693(define (sre-searcher? sre)
1694 (or (sre-bos? sre)
1695 (and (pair? sre)
1696 (case (car sre)
1697 ((* +) (sre-any? (sre-sequence (cdr sre))))
1698 ((seq : $ submatch => submatch-named)
1699 (and (pair? (cdr sre)) (sre-searcher? (cadr sre))))
1700 ((or) (every sre-searcher? (cdr sre)))
1701 (else #f)))))
1702
1703;; a consumer doesn't need to match more than once
1704(define (sre-consumer? sre)
1705 (or (sre-bos? sre)
1706 (and (pair? sre)
1707 (case (car sre)
1708 ((* +) (sre-any? (sre-sequence (cdr sre))))
1709 ((seq : $ submatch => submatch-named)
1710 (and (pair? (cdr sre)) (sre-consumer? (last sre))))
1711 ((or) (every sre-consumer? (cdr sre)))
1712 (else #f)))))
1713
1714(define (sre-has-submatches? sre)
1715 (and (pair? sre)
1716 (or (memq (car sre) '($ submatch => submatch-named))
1717 (if (eq? 'posix-string (car sre))
1718 (sre-has-submatches? (string->sre (cadr sre)))
1719 (any sre-has-submatches? (cdr sre))))))
1720
1721(define (sre-count-submatches sre)
1722 (let count ((sre sre) (sum 0))
1723 (if (pair? sre)
1724 (fold count
1725 (+ sum (case (car sre)
1726 (($ submatch => submatch-named) 1)
1727 ((dsm) (+ (cadr sre) (caddr sre)))
1728 ((posix-string)
1729 (sre-count-submatches (string->sre (cadr sre))))
1730 (else 0)))
1731 (cdr sre))
1732 sum)))
1733
1734(define (sre-length-ranges sre . o)
1735 (let ((names (if (pair? o) (car o) (sre-names sre 1 '())))
1736 (sublens (make-vector (+ 1 (sre-count-submatches sre)) #f)))
1737 (vector-set!
1738 sublens
1739 0
1740 (let lp ((sre sre) (n 1) (lo 0) (hi 0) (return cons))
1741 (define (grow i) (return (+ lo i) (and hi (+ hi i))))
1742 (cond
1743 ((pair? sre)
1744 (if (string? (car sre))
1745 (grow 1)
1746 (case (car sre)
1747 ((/ ~ & -)
1748 (grow 1))
1749 ((posix-string)
1750 (lp (string->sre (cadr sre)) n lo hi return))
1751 ((seq : w/case w/nocase atomic)
1752 (let lp2 ((ls (cdr sre)) (n n) (lo2 0) (hi2 0))
1753 (if (null? ls)
1754 (return (+ lo lo2) (and hi hi2 (+ hi hi2)))
1755 (lp (car ls) n 0 0
1756 (lambda (lo3 hi3)
1757 (lp2 (cdr ls)
1758 (+ n (sre-count-submatches (car ls)))
1759 (+ lo2 lo3)
1760 (and hi2 hi3 (+ hi2 hi3))))))))
1761 ((or)
1762 (let lp2 ((ls (cdr sre)) (n n) (lo2 #f) (hi2 0))
1763 (if (null? ls)
1764 (return (+ lo (or lo2 1)) (and hi hi2 (+ hi hi2)))
1765 (lp (car ls) n 0 0
1766 (lambda (lo3 hi3)
1767 (lp2 (cdr ls)
1768 (+ n (sre-count-submatches (car ls)))
1769 (if lo2 (min lo2 lo3) lo3)
1770 (and hi2 hi3 (max hi2 hi3))))))))
1771 ((if)
1772 (cond
1773 ((or (null? (cdr sre)) (null? (cddr sre)))
1774 (return lo hi))
1775 (else
1776 (let ((n1 (sre-count-submatches (car sre)))
1777 (n2 (sre-count-submatches (cadr sre))))
1778 (lp (if (or (number? (cadr sre)) (symbol? (cadr sre)))
1779 'epsilon
1780 (cadr sre))
1781 n lo hi
1782 (lambda (lo2 hi2)
1783 (lp (caddr sre) (+ n n1) 0 0
1784 (lambda (lo3 hi3)
1785 (lp (if (pair? (cdddr sre))
1786 (cadddr sre)
1787 'epsilon)
1788 (+ n n1 n2) 0 0
1789 (lambda (lo4 hi4)
1790 (return (+ lo2 (min lo3 lo4))
1791 (and hi2 hi3 hi4
1792 (+ hi2 (max hi3 hi4))
1793 ))))))))))))
1794 ((dsm)
1795 (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) lo hi return))
1796 (($ submatch => submatch-named)
1797 (lp (sre-sequence
1798 (if (eq? 'submatch (car sre)) (cdr sre) (cddr sre)))
1799 (+ n 1) lo hi
1800 (lambda (lo2 hi2)
1801 (vector-set! sublens n (cons lo2 hi2))
1802 (return lo2 hi2))))
1803 ((backref backref-ci)
1804 (let ((n (cond
1805 ((number? (cadr sre)) (cadr sre))
1806 ((assq (cadr sre) names) => cdr)
1807 (else (error "unknown backreference" (cadr sre))))))
1808 (cond
1809 ((or (not (integer? n))
1810 (not (< 0 n (vector-length sublens))))
1811 (error 'sre-length "invalid backreference" sre))
1812 ((not (vector-ref sublens n))
1813 (error 'sre-length "invalid forward backreference" sre))
1814 (else
1815 (let ((lo2 (car (vector-ref sublens n)))
1816 (hi2 (cdr (vector-ref sublens n))))
1817 (return (+ lo lo2) (and hi hi2 (+ hi hi2))))))))
1818 ((* *?)
1819 (lp (sre-sequence (cdr sre)) n lo hi (lambda (lo hi) #f))
1820 (return lo #f))
1821 ((** **?)
1822 (cond
1823 ((or (and (number? (cadr sre))
1824 (number? (caddr sre))
1825 (> (cadr sre) (caddr sre)))
1826 (and (not (cadr sre)) (caddr sre)))
1827 (return lo hi))
1828 (else
1829 (if (caddr sre)
1830 (lp (sre-sequence (cdddr sre)) n 0 0
1831 (lambda (lo2 hi2)
1832 (return (+ lo (* (cadr sre) lo2))
1833 (and hi hi2 (+ hi (* (caddr sre) hi2))))))
1834 (lp (sre-sequence (cdddr sre)) n 0 0
1835 (lambda (lo2 hi2)
1836 (return (+ lo (* (cadr sre) lo2)) #f)))))))
1837 ((+)
1838 (lp (sre-sequence (cdr sre)) n lo hi
1839 (lambda (lo2 hi2)
1840 (return (+ lo lo2) #f))))
1841 ((? ??)
1842 (lp (sre-sequence (cdr sre)) n lo hi
1843 (lambda (lo2 hi2)
1844 (return lo (and hi hi2 (+ hi hi2))))))
1845 ((= =? >= >=?)
1846 (lp `(** ,(cadr sre)
1847 ,(if (memq (car sre) '(>= >=?)) #f (cadr sre))
1848 ,@(cddr sre))
1849 n lo hi return))
1850 ((look-ahead neg-look-ahead look-behind neg-look-behind)
1851 (return lo hi))
1852 (else
1853 (cond
1854 ((assq (car sre) sre-named-definitions)
1855 => (lambda (cell)
1856 (lp (apply (cdr cell) (cdr sre)) n lo hi return)))
1857 (else
1858 (error 'sre-length-ranges "unknown sre operator" sre)))))))
1859 ((char? sre)
1860 (grow 1))
1861 ((string? sre)
1862 (grow (string-length sre)))
1863 ((memq sre '(any nonl))
1864 (grow 1))
1865 ((memq sre '(epsilon bos eos bol eol bow eow nwb commit))
1866 (return lo hi))
1867 (else
1868 (let ((cell (assq sre sre-named-definitions)))
1869 (if cell
1870 (lp (if (procedure? (cdr cell)) ((cdr cell)) (cdr cell))
1871 n lo hi return)
1872 (error 'sre-length-ranges "unknown sre" sre)))))))
1873 sublens))
1874
1875;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1876;;;; SRE Manipulation
1877
1878;; build a (seq ls ...) sre from a list
1879(define (sre-sequence ls)
1880 (cond
1881 ((null? ls) 'epsilon)
1882 ((null? (cdr ls)) (car ls))
1883 (else (cons 'seq ls))))
1884
1885;; build a (or ls ...) sre from a list
1886(define (sre-alternate ls)
1887 (cond
1888 ((null? ls) '(or))
1889 ((null? (cdr ls)) (car ls))
1890 (else (cons 'or ls))))
1891
1892;; returns an equivalent SRE without any match information
1893(define (sre-strip-submatches sre)
1894 (if (not (pair? sre))
1895 sre
1896 (case (car sre)
1897 (($ submatch) (sre-strip-submatches (sre-sequence (cdr sre))))
1898 ((=> submatch-named) (sre-strip-submatches (sre-sequence (cddr sre))))
1899 ((dsm) (sre-strip-submatches (sre-sequence (cdddr sre))))
1900 (else (map sre-strip-submatches sre)))))
1901
1902;; given a char-set list of chars and strings, flattens them into
1903;; chars only
1904(define (sre-flatten-ranges ls)
1905 (let lp ((ls ls) (res '()))
1906 (cond
1907 ((null? ls)
1908 (reverse res))
1909 ((string? (car ls))
1910 (lp (append (string->list (car ls)) (cdr ls)) res))
1911 (else
1912 (lp (cdr ls) (cons (car ls) res))))))
1913
1914(define (sre-names sre n names)
1915 (if (not (pair? sre))
1916 names
1917 (case (car sre)
1918 (($ submatch)
1919 (sre-names (sre-sequence (cdr sre)) (+ n 1) names))
1920 ((=> submatch-named)
1921 (sre-names (sre-sequence (cddr sre))
1922 (+ n 1)
1923 (cons (cons (cadr sre) n) names)))
1924 ((dsm)
1925 (sre-names (sre-sequence (cdddr sre)) (+ n (cadr sre)) names))
1926 ((seq : or * + ? *? ?? w/case w/nocase atomic
1927 look-ahead look-behind neg-look-ahead neg-look-behind)
1928 (sre-sequence-names (cdr sre) n names))
1929 ((= >=)
1930 (sre-sequence-names (cddr sre) n names))
1931 ((** **?)
1932 (sre-sequence-names (cdddr sre) n names))
1933 (else
1934 names))))
1935
1936(define (sre-sequence-names ls n names)
1937 (if (null? ls)
1938 names
1939 (sre-sequence-names (cdr ls)
1940 (+ n (sre-count-submatches (car ls)))
1941 (sre-names (car ls) n names))))
1942
1943(define (sre-remove-initial-bos sre)
1944 (cond
1945 ((pair? sre)
1946 (case (car sre)
1947 ((seq : $ submatch => submatch-named * +)
1948 (cond
1949 ((not (pair? (cdr sre)))
1950 sre)
1951 ((eq? 'bos (cadr sre))
1952 (cons (car sre) (cddr sre)))
1953 (else
1954 (cons (car sre)
1955 (cons (sre-remove-initial-bos (cadr sre)) (cddr sre))))))
1956 ((or)
1957 (sre-alternate (map sre-remove-initial-bos (cdr sre))))
1958 (else
1959 sre)))
1960 (else
1961 sre)))
1962
1963;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1964;;;; Basic Matching
1965
1966(define irregex-basic-string-chunker
1967 (make-irregex-chunker (lambda (x) #f)
1968 car
1969 cadr
1970 caddr
1971 (lambda (src1 i src2 j)
1972 (substring (car src1) i j))))
1973
1974(define (irregex-search x str . o)
1975 (if (not (string? str)) (error 'irregex-search "not a string" str))
1976 (let ((start (or (and (pair? o) (car o)) 0))
1977 (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str))))
1978 (if (not (and (integer? start) (exact? start)))
1979 (error 'irregex-search "not an exact integer" start))
1980 (if (not (and (integer? end) (exact? end)))
1981 (error 'irregex-search "not an exact integer" end))
1982 (irregex-search/chunked x
1983 irregex-basic-string-chunker
1984 (list str start end)
1985 start)))
1986
1987(define (irregex-search/chunked x cnk src . o)
1988 (let* ((irx (irregex x))
1989 (matches (irregex-new-matches irx))
1990 (i (if (pair? o) (car o) ((chunker-get-start cnk) src))))
1991 (if (not (integer? i)) (error 'irregex-search "not an integer" i))
1992 (irregex-match-chunker-set! matches cnk)
1993 (irregex-search/matches irx cnk (cons src i) src i matches)))
1994
1995;; internal routine, can be used in loops to avoid reallocating the
1996;; match vector
1997(define (irregex-search/matches irx cnk init src i matches)
1998 (cond
1999 ((irregex-dfa irx)
2000 (cond
2001 ((flag-set? (irregex-flags irx) ~searcher?)
2002 (cond
2003 ((dfa-match/longest (irregex-dfa irx) cnk src i #f #f matches 0)
2004 (irregex-match-start-chunk-set! matches 0 src)
2005 (irregex-match-start-index-set! matches 0 i)
2006 matches)
2007 (else
2008 #f)))
2009 ((dfa-match/shortest
2010 (irregex-dfa/search irx) cnk src i matches 0)
2011 (let ((dfa (irregex-dfa irx))
2012 (get-start (chunker-get-start cnk))
2013 (get-end (chunker-get-end cnk))
2014 (get-next (chunker-get-next cnk)))
2015 (let lp1 ((src src) (i i))
2016 (let ((end (get-end src)))
2017 (let lp2 ((i i))
2018 (cond
2019 ((dfa-match/longest dfa cnk src i #f #f matches 0)
2020 (irregex-match-start-chunk-set! matches 0 src)
2021 (irregex-match-start-index-set! matches 0 i)
2022 matches)
2023 ((>= i end)
2024 (let ((next (get-next src)))
2025 (and next (lp1 next (get-start next)))))
2026 (else
2027 (lp2 (+ i 1)))))))))
2028 (else
2029 #f)))
2030 (else
2031 (let ((res (irregex-search/backtrack irx cnk init src i matches)))
2032 (if res (%irregex-match-fail-set! res #f))
2033 res))))
2034
2035(define (irregex-search/backtrack irx cnk init src i matches)
2036 (let ((matcher (irregex-nfa irx))
2037 (str ((chunker-get-str cnk) src))
2038 (end ((chunker-get-end cnk) src))
2039 (get-next (chunker-get-next cnk)))
2040 (if (flag-set? (irregex-flags irx) ~searcher?)
2041 (matcher cnk init src str i end matches (lambda () #f))
2042 (let lp ((src2 src)
2043 (str str)
2044 (i i)
2045 (end end))
2046 (cond
2047 ((matcher cnk init src2 str i end matches (lambda () #f))
2048 (irregex-match-start-chunk-set! matches 0 src2)
2049 (irregex-match-start-index-set! matches 0 i)
2050 matches)
2051 ((< i end)
2052 (lp src2 str (+ i 1) end))
2053 (else
2054 (let ((src2 (get-next src2)))
2055 (if src2
2056 (lp src2
2057 ((chunker-get-str cnk) src2)
2058 ((chunker-get-start cnk) src2)
2059 ((chunker-get-end cnk) src2))
2060 #f))))))))
2061
2062(define (irregex-match irx str . o)
2063 (if (not (string? str)) (error 'irregex-match "not a string" str))
2064 (let ((start (or (and (pair? o) (car o)) 0))
2065 (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str))))
2066 (if (not (and (integer? start) (exact? start)))
2067 (error 'irregex-match "not an exact integer" start))
2068 (if (not (and (integer? end) (exact? end)))
2069 (error 'irregex-match "not an exact integer" end))
2070 (irregex-match/chunked irx
2071 irregex-basic-string-chunker
2072 (list str start end))))
2073
2074(define (irregex-match/chunked irx cnk src)
2075 (let* ((irx (irregex irx))
2076 (matches (irregex-new-matches irx)))
2077 (irregex-match-chunker-set! matches cnk)
2078 (cond
2079 ((irregex-dfa irx)
2080 (and
2081 (dfa-match/longest
2082 (irregex-dfa irx) cnk src ((chunker-get-start cnk) src) #f #f matches 0)
2083 (= ((chunker-get-end cnk) (%irregex-match-end-chunk matches 0))
2084 (%irregex-match-end-index matches 0))
2085 (begin
2086 (irregex-match-start-chunk-set! matches 0 src)
2087 (irregex-match-start-index-set! matches
2088 0
2089 ((chunker-get-start cnk) src))
2090 matches)))
2091 (else
2092 (let* ((matcher (irregex-nfa irx))
2093 (str ((chunker-get-str cnk) src))
2094 (i ((chunker-get-start cnk) src))
2095 (end ((chunker-get-end cnk) src))
2096 (init (cons src i)))
2097 (let lp ((m (matcher cnk init src str i end matches (lambda () #f))))
2098 (and m
2099 (cond
2100 ((and (not ((chunker-get-next cnk)
2101 (%irregex-match-end-chunk m 0)))
2102 (= ((chunker-get-end cnk)
2103 (%irregex-match-end-chunk m 0))
2104 (%irregex-match-end-index m 0)))
2105 (%irregex-match-fail-set! m #f)
2106 m)
2107 ((%irregex-match-fail m)
2108 (lp ((%irregex-match-fail m))))
2109 (else
2110 #f)))))))))
2111
2112(define (irregex-match? . args)
2113 (and (apply irregex-match args) #t))
2114
2115;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2116;;;; DFA Matching
2117
2118;; inline these
2119(define (dfa-init-state dfa)
2120 (vector-ref dfa 0))
2121(define (dfa-next-state dfa node)
2122 (vector-ref dfa (cadr node)))
2123(define (dfa-cell-commands dfa node)
2124 (cddr node))
2125(define (dfa-finalizer dfa state)
2126 (car state))
2127
2128;; this searches for the first end index for which a match is possible
2129(define (dfa-match/shortest dfa cnk src start matches index)
2130 (let ((get-str (chunker-get-str cnk))
2131 (get-start (chunker-get-start cnk))
2132 (get-end (chunker-get-end cnk))
2133 (get-next (chunker-get-next cnk))
2134 ;; Skip the "set-up" state, we don't need to set tags.
2135 (start-state (dfa-next-state dfa (cadr (dfa-init-state dfa)))))
2136 (let lp1 ((src src) (start start) (state start-state))
2137 (and
2138 src
2139 (let ((str (get-str src))
2140 (end (get-end src)))
2141 (let lp2 ((i start) (state state))
2142 (cond
2143 ((dfa-finalizer dfa state)
2144 (cond
2145 (index
2146 (irregex-match-end-chunk-set! matches index src)
2147 (irregex-match-end-index-set! matches index i)))
2148 #t)
2149 ((< i end)
2150 (let* ((ch (string-ref str i))
2151 (next (find (lambda (x)
2152 (or (eqv? ch (car x))
2153 (and (not (char? (car x)))
2154 (cset-contains? (car x) ch))))
2155 (cdr state))))
2156 (and next (lp2 (+ i 1) (dfa-next-state dfa next)))))
2157 (else
2158 (let ((next (get-next src)))
2159 (and next (lp1 next (get-start next) state)))))))))))
2160
2161(define (finalize! finalizer memory matches)
2162 (for-each
2163 (lambda (tag&slot)
2164 (let* ((tag (car tag&slot))
2165 (slot (vector-ref memory (cdr tag&slot)))
2166 (chunk&pos (vector-ref slot tag)))
2167 (irregex-match-chunk&index-from-tag-set!
2168 matches tag
2169 (and chunk&pos (car chunk&pos))
2170 (and chunk&pos (cdr chunk&pos)))))
2171 finalizer))
2172(define (make-initial-memory slots matches)
2173 (let ((size (* (irregex-match-num-submatches matches) 2))
2174 (memory (make-vector slots)))
2175 (do ((i 0 (+ i 1)))
2176 ((= i slots) memory)
2177 (vector-set! memory i (make-vector size #f)))))
2178
2179;; this finds the longest match starting at a given index
2180(define (dfa-match/longest dfa cnk src start end-src end matches index)
2181 (let* ((get-str (chunker-get-str cnk))
2182 (get-start (chunker-get-start cnk))
2183 (get-end (chunker-get-end cnk))
2184 (get-next (chunker-get-next cnk))
2185 (initial-state (dfa-init-state dfa))
2186 (memory-size (car initial-state))
2187 (submatches? (not (zero? memory-size)))
2188 ;; A vector of vectors, each of size <number of start/end submatches>
2189 (memory (make-initial-memory memory-size matches))
2190 (init-cell (cadr initial-state))
2191 (start-state (dfa-next-state dfa init-cell))
2192 (start-finalizer (dfa-finalizer dfa start-state)))
2193 (cond
2194 (index
2195 (irregex-match-end-chunk-set! matches index #f)
2196 (irregex-match-end-index-set! matches index #f)))
2197 (cond (submatches?
2198 (for-each (lambda (s)
2199 (let ((slot (vector-ref memory (cdr s))))
2200 (vector-set! slot (car s) (cons src start))))
2201 (cdr (dfa-cell-commands dfa init-cell)))))
2202 (let lp1 ((src src)
2203 (start start)
2204 (state start-state)
2205 (res-src (and start-finalizer src))
2206 (res-index (and start-finalizer start))
2207 (finalizer start-finalizer))
2208 (let ((str (get-str src))
2209 (end (if (eq? src end-src) end (get-end src))))
2210 (let lp2 ((i start)
2211 (state state)
2212 (res-src res-src)
2213 (res-index res-index)
2214 (finalizer finalizer))
2215 (cond
2216 ((>= i end)
2217 (cond
2218 ((and index res-src)
2219 (irregex-match-end-chunk-set! matches index res-src)
2220 (irregex-match-end-index-set! matches index res-index)))
2221 (let ((next (and (not (eq? src end-src)) (get-next src))))
2222 (if next
2223 (lp1 next (get-start next) state res-src res-index finalizer)
2224 (and index
2225 (%irregex-match-end-chunk matches index)
2226 (or (not finalizer) (finalize! finalizer memory matches))
2227 #t))))
2228 (else
2229 (let* ((ch (string-ref str i))
2230 (cell (find (lambda (x)
2231 (or (eqv? ch (car x))
2232 (and (not (char? (car x)))
2233 (cset-contains? (car x) ch))))
2234 (cdr state))))
2235 (cond
2236 (cell
2237 (let* ((next (dfa-next-state dfa cell))
2238 (new-finalizer (dfa-finalizer dfa next)))
2239 (cond
2240 (submatches?
2241 (let ((cmds (dfa-cell-commands dfa cell)))
2242 ;; Save match when we're moving from accepting state to
2243 ;; rejecting state; this could be the last accepting one.
2244 (cond ((and finalizer (not new-finalizer))
2245 (finalize! finalizer memory matches)))
2246 (for-each (lambda (s)
2247 (let ((slot (vector-ref memory (cdr s)))
2248 (chunk&position (cons src (+ i 1))))
2249 (vector-set! slot (car s) chunk&position)))
2250 (cdr cmds))
2251 ;; Reassigning commands may be in an order which
2252 ;; causes memory cells to be clobbered before
2253 ;; they're read out. Make 2 passes to maintain
2254 ;; old values by copying them into a closure.
2255 (for-each (lambda (execute!) (execute!))
2256 (map (lambda (c)
2257 (let* ((tag (vector-ref c 0))
2258 (ss (vector-ref memory (vector-ref c 1)))
2259 (ds (vector-ref memory (vector-ref c 2)))
2260 (value-from (vector-ref ss tag)))
2261 (lambda () (vector-set! ds tag value-from))))
2262 (car cmds))))))
2263 (if new-finalizer
2264 (lp2 (+ i 1) next src (+ i 1) new-finalizer)
2265 (lp2 (+ i 1) next res-src res-index #f))))
2266 (res-src
2267 (cond
2268 (index
2269 (irregex-match-end-chunk-set! matches index res-src)
2270 (irregex-match-end-index-set! matches index res-index)))
2271 (cond (finalizer (finalize! finalizer memory matches)))
2272 #t)
2273 ((and index (%irregex-match-end-chunk matches index))
2274 (cond (finalizer (finalize! finalizer memory matches)))
2275 #t)
2276 (else
2277 #f))))))))))
2278
2279;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2280;;;; Named Definitions
2281
2282(define sre-named-definitions
2283 `((any . ,*all-chars*)
2284 (nonl . (- ,*all-chars* (,(string #\newline))))
2285 (alphabetic . (/ #\a #\z #\A #\Z))
2286 (alpha . alphabetic)
2287 (alphanumeric . (/ #\a #\z #\A #\Z #\0 #\9))
2288 (alphanum . alphanumeric)
2289 (alnum . alphanumeric)
2290 (lower-case . (/ #\a #\z))
2291 (lower . lower-case)
2292 (upper-case . (/ #\A #\Z))
2293 (upper . upper-case)
2294 (numeric . (/ #\0 #\9))
2295 (num . numeric)
2296 (digit . numeric)
2297 (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\.
2298 #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\}))
2299 (punct . punctuation)
2300 (graphic
2301 . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~))
2302 (graph . graphic)
2303 (blank . (or #\space ,(integer->char (- (char->integer #\space) 23))))
2304 ;; 0B - vertical tab, 0C - form feed
2305 (whitespace . (or blank #\newline #\x0C #\return #\x0B))
2306 (space . whitespace)
2307 (white . whitespace)
2308 (printing . (or graphic whitespace))
2309 (print . printing)
2310
2311 ;; XXXX we assume a (possibly shifted) ASCII-based ordering
2312 (control . (/ ,(integer->char (- (char->integer #\space) 32))
2313 ,(integer->char (- (char->integer #\space) 1))))
2314 (cntrl . control)
2315 (hex-digit . (or numeric (/ #\a #\f #\A #\F)))
2316 (xdigit . hex-digit)
2317 (ascii . (/ ,(integer->char (- (char->integer #\space) 32))
2318 ,(integer->char (+ (char->integer #\space) 95))))
2319 (ascii-nonl . (/ ,(integer->char (- (char->integer #\space) 32))
2320 ,(integer->char (- (char->integer #\newline) 1))
2321 ,(integer->char (+ (char->integer #\newline) 1))
2322 ,(integer->char (+ (char->integer #\space) 95))))
2323 (newline . (or (seq ,(integer->char (+ (char->integer #\newline) 3))
2324 #\newline)
2325 (/ #\newline
2326 ,(integer->char (+ (char->integer #\newline) 3)))))
2327
2328 ;; ... it's really annoying to support old Scheme48
2329 (word . (seq bow (+ (or alphanumeric #\_)) eow))
2330 (utf8-tail-char . (/ ,(integer->char (+ (char->integer #\space) #x60))
2331 ,(integer->char (+ (char->integer #\space) #xA1))))
2332 (utf8-2-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xA2))
2333 ,(integer->char (+ (char->integer #\space) #xBF)))
2334 utf8-tail-char))
2335 (utf8-3-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xC0))
2336 ,(integer->char (+ (char->integer #\space) #xCF)))
2337 utf8-tail-char
2338 utf8-tail-char))
2339 (utf8-4-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xD0))
2340 ,(integer->char (+ (char->integer #\space) #xD7)))
2341 utf8-tail-char
2342 utf8-tail-char
2343 utf8-tail-char))
2344 (utf8-any . (or ascii utf8-2-char utf8-3-char utf8-4-char))
2345 (utf8-nonl . (or ascii-nonl utf8-2-char utf8-3-char utf8-4-char))
2346
2347 ;; extended library patterns
2348 (integer . (seq (? (or #\+ #\-)) (+ numeric)))
2349 (real . (seq (? (or #\+ #\-))
2350 (+ numeric) (? #\. (+ numeric))
2351 (? (or #\e #\E) integer)))
2352 ;; slightly more lax than R5RS, allow ->foo, etc.
2353 (symbol-initial . (or alpha ("!$%&*/:<=>?^_~")))
2354 (symbol-subsequent . (or symbol-initial digit ("+-.@")))
2355 (symbol . (or (seq symbol-initial (* symbol-subsequent))
2356 (seq ("+-") (? symbol-initial (* symbol-subsequent)))
2357 (seq ".." (* "."))))
2358 (sexp-space . (seq (* (* space) ";" (* nonl) newline) (+ space)))
2359 (string . (seq #\" (escape #\\ #\") #\"))
2360 (escape . ,(lambda (esc . o) `(* (or (~ ,esc ,@o) (seq ,esc any)))))
2361
2362 (ipv4-digit . (seq (? (/ "12")) (? numeric) numeric))
2363 (ipv4-address . (seq ipv4-digit (= 3 #\. ipv4-digit)))
2364 ;; XXXX lax, allows multiple double-colons or < 8 terms w/o a ::
2365 (ipv6-address . (seq (** 0 4 hex-digit)
2366 (** 1 7 #\: (? #\:) (** 0 4 hex-digit))))
2367 (ip-address . (or ipv4-address ipv6-address))
2368 (domain-atom . (+ (or alphanumeric #\_ #\-)))
2369 (domain . (seq domain-atom (+ #\. domain-atom)))
2370 ;; XXXX now anything can be a top-level domain, but this is still handy
2371 (top-level-domain . (w/nocase (or "arpa" "com" "gov" "mil" "net" "org"
2372 "edu" "aero" "biz" "coop" "info"
2373 "museum" "name" "pro" (= 2 alpha))))
2374 (domain/common . (seq (+ domain-atom #\.) top-level-domain))
2375 ;;(email-local-part . (seq (+ (or (~ #\") string))))
2376 (email-local-part . (+ (or alphanumeric #\_ #\- #\. #\+)))
2377 (email . (seq email-local-part #\@ domain))
2378 (url-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\. #\, #\& #\;
2379 (seq "%" hex-digit hex-digit)))
2380 (url-final-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\&
2381 (seq "%" hex-digit hex-digit)))
2382 (http-url . (w/nocase
2383 "http" (? "s") "://"
2384 (or domain ipv4-address) ;; (seq "[" ipv6-address "]")
2385 (? ":" (+ numeric)) ;; port
2386 ;; path
2387 (? "/" (* (or url-char "/"))
2388 (? "?" (* url-char)) ;; query
2389 (? "#" (? (* url-char) url-final-char)) ;; fragment
2390 )))
2391
2392 ))
2393
2394
2395;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2396;;;; SRE->tNFA compilation
2397;;
2398;; A tagged NFA (tNFA) state is a numbered node with a list of
2399;; pattern->number transitions, where pattern is character set range,
2400;; or epsilon (indicating an empty transition).
2401;;
2402;; (Only) epsilon transitions may be *tagged*. Each tag represents
2403;; either the start or the end of a submatch.
2404;;
2405;; There may be overlapping ranges - since it's an NFA we process it
2406;; by considering all possible transitions.
2407
2408(define *nfa-presize* 128) ;; constant
2409(define *nfa-num-fields* 4) ;; constant
2410
2411(define (nfa-num-states nfa) (quotient (vector-length nfa) *nfa-num-fields*))
2412(define (nfa-start-state nfa) (- (nfa-num-states nfa) 1))
2413
2414(define (nfa-num-tags nfa)
2415 (vector-ref nfa 0))
2416(define (nfa-highest-map-index nfa)
2417 (vector-ref nfa 1))
2418(define (nfa-set-highest-map-index! nfa idx)
2419 (vector-set! nfa 1 idx))
2420
2421(define (nfa-get-state-trans nfa i)
2422 (if (= i 0) '() (vector-ref nfa (* i *nfa-num-fields*))))
2423(define (nfa-set-state-trans! nfa i x)
2424 (vector-set! nfa (* i *nfa-num-fields*) x))
2425
2426(define (nfa-get-epsilons nfa i)
2427 (if (= i 0) '() (vector-ref nfa (+ (* i *nfa-num-fields*) 1))))
2428(define (nfa-set-epsilons! nfa i x)
2429 (vector-set! nfa (+ (* i *nfa-num-fields*) 1) x))
2430(define (nfa-add-epsilon! nfa i x t)
2431 (let ((eps (nfa-get-epsilons nfa i)))
2432 (if (not (assv x eps))
2433 (nfa-set-epsilons! nfa i (cons (cons x t) eps)))))
2434
2435(define (nfa-get-reorder-commands nfa mst)
2436 (cond ((assoc mst (vector-ref nfa (+ (* (mst-hash mst) *nfa-num-fields*) 2)))
2437 => cdr)
2438 (else #f)))
2439(define (nfa-set-reorder-commands! nfa mst x)
2440 (let ((i (+ (* (mst-hash mst) *nfa-num-fields*) 2)))
2441 (vector-set! nfa i (cons (cons mst x) (vector-ref nfa i)))))
2442
2443(define (nfa-get-closure nfa mst)
2444 (cond ((assoc mst (vector-ref nfa (+ (* (mst-hash mst) *nfa-num-fields*) 3)))
2445 => cdr)
2446 (else #f)))
2447(define (nfa-add-closure! nfa mst x)
2448 (let ((i (+ (* (mst-hash mst) *nfa-num-fields*) 3)))
2449 (vector-set! nfa i (cons (cons mst x) (vector-ref nfa i)))))
2450
2451;; Compile and return the vector of NFA states (in groups of
2452;; *nfa-num-fields* packed elements). The start state will be the
2453;; last element(s) of the vector, and all remaining states will be in
2454;; descending numeric order, with state 0 being the unique accepting
2455;; state.
2456(define (sre->nfa sre init-flags)
2457 (let* ((buf (make-vector (* *nfa-presize* *nfa-num-fields*) '()))
2458 ;; Get cons cells and map them to numeric submatch indexes.
2459 ;; Doing it here is slightly easier than integrating into the loop below
2460 (match-index
2461 (let lp ((sre (list sre)) (max 0) (res '()))
2462 (cond
2463 ((not (pair? sre))
2464 ;; We abuse the transitions slot for state 0 (the final state,
2465 ;; which can have no transitions) to store the number of tags.
2466 (vector-set! buf 0 (* max 2))
2467 ;; We abuse the epsilons slot for state 0 to store the highest
2468 ;; encountered memory slot mapping index. Initialize to -1.
2469 (vector-set! buf 1 -1)
2470 res)
2471 ((pair? (car sre))
2472 ;; The appends here should be safe (are they?)
2473 (case (caar sre)
2474 (($ submatch => submatch-named)
2475 (lp (append (cdar sre) (cdr sre)) (+ max 1)
2476 (cons (cons (car sre) max) res)))
2477 (else (lp (append (car sre) (cdr sre)) max res))))
2478 (else (lp (cdr sre) max res))))))
2479 ;; we loop over an implicit sequence list
2480 (define (lp ls n flags next)
2481 (define (new-state-number state)
2482 (max n (+ 1 state)))
2483 (define (add-state! n2 trans-ls)
2484 (if (>= (* n2 *nfa-num-fields*) (vector-length buf))
2485 (let ((tmp (make-vector (* 2 (vector-length buf)) '())))
2486 (do ((i (- (vector-length buf) 1) (- i 1)))
2487 ((< i 0))
2488 (vector-set! tmp i (vector-ref buf i)))
2489 (set! buf tmp)))
2490 (nfa-set-state-trans! buf n2 trans-ls)
2491 n2)
2492 (define (extend-state! next trans-cs)
2493 (and next
2494 (add-state! (new-state-number next) (cons trans-cs next))))
2495 (define (add-char-state! next ch)
2496 (let ((alt (char-altcase ch)))
2497 (if (flag-set? flags ~case-insensitive?)
2498 (extend-state! next (cset-union (char->cset ch) (char->cset alt)))
2499 (extend-state! next (char->cset ch)))))
2500 (if (null? ls)
2501 next
2502 (cond
2503 ((or (eq? 'epsilon (car ls)) (equal? "" (car ls)))
2504 ;; chars and epsilons go directly into the transition table
2505 (let ((next (lp (cdr ls) n flags next)))
2506 (and next
2507 (let ((new (add-state! (new-state-number next) '())))
2508 (nfa-add-epsilon! buf new next #f)
2509 new))))
2510 ((string? (car ls))
2511 ;; process literal strings a char at a time
2512 (let ((next (lp (cdr ls) n flags next)))
2513 (and next
2514 (let lp2 ((i (- (string-length (car ls)) 1))
2515 (next next))
2516 (if (< i 0)
2517 next
2518 (lp2 (- i 1)
2519 (add-char-state! next (string-ref (car ls) i))))
2520 ))))
2521 ((char? (car ls))
2522 (add-char-state! (lp (cdr ls) n flags next) (car ls)))
2523 ((symbol? (car ls))
2524 (let ((cell (assq (car ls) sre-named-definitions)))
2525 (and cell
2526 (lp (cons (if (procedure? (cdr cell))
2527 ((cdr cell))
2528 (cdr cell))
2529 (cdr ls))
2530 n
2531 flags
2532 next))))
2533 ((pair? (car ls))
2534 (cond
2535 ((string? (caar ls)) ; Enumerated character set
2536 (let ((set (if (flag-set? flags ~case-insensitive?)
2537 (cset-case-insensitive (string->cset (caar ls)))
2538 (string->cset (caar ls)))))
2539 (extend-state! (lp (cdr ls) n flags next) set)))
2540 (else
2541 (case (caar ls)
2542 ((seq :)
2543 ;; for an explicit sequence, just append to the list
2544 (lp (append (cdar ls) (cdr ls)) n flags next))
2545 ((w/case w/nocase w/utf8 w/noutf8)
2546 (let* ((next (lp (cdr ls) n flags next))
2547 (flags ((if (memq (caar ls) '(w/case w/utf8))
2548 flag-clear
2549 flag-join)
2550 flags
2551 (if (memq (caar ls) '(w/case w/nocase))
2552 ~case-insensitive?
2553 ~utf8?))))
2554 (and next
2555 (lp (cdar ls) (new-state-number next) flags next))))
2556 ((/ - & ~)
2557 (let ((range (sre->cset (car ls)
2558 (flag-set? flags ~case-insensitive?))))
2559 (extend-state! (lp (cdr ls) n flags next)
2560 range)))
2561 ((or)
2562 (let ((next (lp (cdr ls) n flags next)))
2563 (and
2564 next
2565 (if (null? (cdar ls))
2566 ;; empty (or) always fails
2567 (add-state! (new-state-number next) '())
2568 ;; compile both branches and insert epsilon
2569 ;; transitions to either
2570 (let* ((b (lp (list (sre-alternate (cddar ls)))
2571 (new-state-number next)
2572 flags
2573 next))
2574 (a (and b
2575 (lp (list (cadar ls))
2576 (new-state-number (max b next))
2577 flags
2578 next))))
2579 (and a
2580 (let ((c (add-state! (new-state-number (max a b))
2581 '())))
2582 (nfa-add-epsilon! buf c a #f)
2583 (nfa-add-epsilon! buf c b #f)
2584 c)))))))
2585 ((?)
2586 (let ((next (lp (cdr ls) n flags next)))
2587 ;; insert an epsilon transition directly to next
2588 (and
2589 next
2590 (let ((a (lp (cdar ls) (new-state-number next) flags next)))
2591 (if a
2592 (nfa-add-epsilon! buf a next #f))
2593 a))))
2594 ((+ *)
2595 (let ((next (lp (cdr ls) n flags next)))
2596 (and
2597 next
2598 (let* ((new (lp '(epsilon)
2599 (new-state-number next)
2600 flags
2601 next))
2602 (a (lp (cdar ls) (new-state-number new) flags new)))
2603 (cond
2604 (a
2605 ;; for *, insert an epsilon transition as in ? above
2606 (if (eq? '* (caar ls))
2607 (nfa-add-epsilon! buf a new #f))
2608 ;; for both, insert a loop back to self
2609 (nfa-add-epsilon! buf new a #f)))
2610 a))))
2611 ;; need to add these to the match extractor first,
2612 ;; but they tend to generate large DFAs
2613 ;;((=)
2614 ;; (lp (append (vector->list
2615 ;; (make-vector (cadar ls)
2616 ;; (sre-sequence (cddar ls))))
2617 ;; (cdr ls))
2618 ;; n flags next))
2619 ;;((>=)
2620 ;; (lp (append (vector->list
2621 ;; (make-vector (- (cadar ls) 1)
2622 ;; (sre-sequence (cddar ls))))
2623 ;; (cons `(+ ,@(cddar ls)) (cdr ls)))
2624 ;; n flags next))
2625 ;;((**)
2626 ;; (lp (append (vector->list
2627 ;; (make-vector (cadar ls)
2628 ;; (sre-sequence (cdddar ls))))
2629 ;; (map
2630 ;; (lambda (x) `(? ,x))
2631 ;; (vector->list
2632 ;; (make-vector (- (caddar ls) (cadar ls))
2633 ;; (sre-sequence (cdddar ls)))))
2634 ;; (cdr ls))
2635 ;; n flags next))
2636 ;; ignore submatches altogether
2637 (($ submatch)
2638 (let* ((pre-tag (* (cdr (assq (car ls) match-index)) 2))
2639 (post-tag (+ pre-tag 1))
2640 (next (lp (cdr ls) n flags next)))
2641 (and next
2642 (let* ((after (add-state! (new-state-number next) '()))
2643 (sub (lp (list (sre-sequence (cdar ls)))
2644 (new-state-number after) flags after))
2645 (before (and sub (add-state! (new-state-number sub) '()))))
2646 (cond (before
2647 (nfa-add-epsilon! buf before sub pre-tag)
2648 (nfa-add-epsilon! buf after next post-tag)))
2649 before))))
2650 ((=> submatch-named)
2651 (let* ((pre-tag (* (cdr (assq (car ls) match-index)) 2))
2652 (post-tag (+ pre-tag 1))
2653 (next (lp (cdr ls) n flags next)))
2654 (and next
2655 (let* ((after (add-state! (new-state-number next) '()))
2656 (sub (lp (list (sre-sequence (cddar ls)))
2657 (new-state-number after) flags after))
2658 (before (and sub (add-state! (new-state-number sub) '()))))
2659 (cond (before
2660 (nfa-add-epsilon! buf before sub pre-tag)
2661 (nfa-add-epsilon! buf after next post-tag)))
2662 before))))
2663 (else
2664 (cond
2665 ((assq (caar ls) sre-named-definitions)
2666 => (lambda (cell)
2667 (if (procedure? (cdr cell))
2668 (lp (cons (apply (cdr cell) (cdar ls)) (cdr ls))
2669 n flags next)
2670 (error "non-procedure in op position" (caar ls)))))
2671 (else #f)))))))
2672 (else
2673 #f))))
2674 (let ((len (lp (list sre) 1 init-flags 0)))
2675 (and len
2676 (let ((nfa (make-vector (* *nfa-num-fields* (+ len 1)))))
2677 (do ((i (- (vector-length nfa) 1) (- i 1)))
2678 ((< i 0))
2679 (vector-set! nfa i (vector-ref buf i)))
2680 nfa)))))
2681
2682;; We don't really want to use this, we use the closure compilation
2683;; below instead, but this is included for reference and testing the
2684;; sre->nfa conversion.
2685
2686;; (define (nfa-match nfa str)
2687;; (let ((matches (make-vector (nfa-num-tags nfa) #f)))
2688;; (let lp ((pos 0) (ls (string->list str)) (state (nfa-start-state nfa)) (epsilons '()))
2689;; (and (or (and (null? ls) (zero? state))
2690;; (let ((t (nfa-get-state-trans nfa state)))
2691;; (and (not (null? t)) (not (null? ls))
2692;; (cset-contains? (car t) (car ls))
2693;; (lp (+ pos 1) (cdr ls) (cdr t) '())))
2694;; (any (lambda (e)
2695;; (let ((old-matches (%vector-copy matches)))
2696;; (cond ((cdr e)
2697;; (vector-set! matches (cdr e) pos)))
2698;; (or (and (not (memv (car e) epsilons))
2699;; (lp pos ls (car e) (cons (car e) epsilons)))
2700;; ;; reset match, apparently this branch failed
2701;; (begin (set! matches old-matches) #f))))
2702;; (nfa-get-epsilons nfa state)))
2703;; matches))))
2704
2705;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2706;;;; NFA multi-state representation
2707
2708(define *mst-first-state-index* 3)
2709
2710(define (mst-mappings-summary mst)
2711 (vector-ref mst 0))
2712
2713(define (mst-num-states mst)
2714 (vector-ref mst 1))
2715
2716(define (mst-num-states-set! mst num)
2717 (vector-set! mst 1 num))
2718
2719(define (mst-hash mst)
2720 ;; We could do (modulo X (nfa-num-states nfa)) here which would be faster,
2721 ;; but we can't assume a full numerical tower (and updating *could*
2722 ;; produce a bignum), so we do it each time when updating the hash.
2723 (vector-ref mst 2))
2724
2725(define (mst-hash-set! mst hash)
2726 (vector-set! mst 2 hash))
2727
2728;; Returns #f if NFA state does not occur in multi-state
2729(define (mst-state-mappings mst state)
2730 (vector-ref mst (+ state *mst-first-state-index*)))
2731
2732(define (mst-state-mappings-set! mst state mappings)
2733 (vector-set! mst (+ state *mst-first-state-index*) mappings))
2734
2735;; A multi-state holds a set of states with their tag-to-slot mappings.
2736;; Slot 0 contains a summary of all mappings for all states in the multi-state.
2737;; Slot 1 contains the total number of states in the multi-state.
2738;; Slot 2 contains a hash value, which is used for quick lookup of cached
2739;; reorder-commands or epsilon-closure in the NFA. This is the sum of all
2740;; state numbers plus each tag value (once per occurrence). This is a silly
2741;; hashing calculation, but it seems to produce a well-spread out hash table and
2742;; it has the added advantage that we can use the value as a quick check if the
2743;; state is definitely NOT equivalent to another in mst-same-states?
2744;; The other slots contain mappings for each corresponding state.
2745
2746(define (make-mst nfa)
2747 (let ((mst (make-vector (+ (nfa-num-states nfa) *mst-first-state-index*) #f)))
2748 (vector-set! mst 0 (make-vector (nfa-num-tags nfa) '())) ; tag summary
2749 (vector-set! mst 1 0) ; total number of states
2750 (vector-set! mst 2 0) ; states and tags hash
2751 mst))
2752
2753;; NOTE: This doesn't do a deep copy of the mappings. Don't mutate them!
2754(define (mst-copy mst)
2755 (let ((v (%vector-copy mst)))
2756 (vector-set! v 0 (%vector-copy (vector-ref mst 0)))
2757 v))
2758
2759(define (nfa-state->mst nfa state mappings)
2760 (let ((mst (make-mst nfa)))
2761 (mst-add! nfa mst state mappings)
2762 mst))
2763
2764;; Extend multi-state with a state and add its tag->slot mappings.
2765(define (mst-add! nfa mst state mappings)
2766 (let ((hash-value (mst-hash mst)))
2767 (cond ((not (mst-state-mappings mst state)) ; Update state hash & count?
2768 (set! hash-value (+ hash-value state))
2769 (mst-num-states-set! mst (+ (mst-num-states mst) 1))))
2770 (mst-state-mappings-set! mst state mappings)
2771 (let ((all-mappings (mst-mappings-summary mst)))
2772 (for-each
2773 (lambda (tag&slot)
2774 (let* ((t (car tag&slot))
2775 (s (cdr tag&slot))
2776 (m (vector-ref all-mappings t)))
2777 (cond ((not (memv s m))
2778 (set! hash-value (+ hash-value t))
2779 (vector-set! all-mappings t (cons s m))))))
2780 mappings))
2781 (mst-hash-set! mst (modulo hash-value (nfa-num-states nfa)))))
2782
2783;; Same as above, but skip updating mappings summary.
2784;; Called when we know all the tag->slot mappings are already in the summary.
2785(define (mst-add/fast! nfa mst state mappings)
2786 (cond ((not (mst-state-mappings mst state)) ; Update state hash & count?
2787 (mst-hash-set!
2788 mst (modulo (+ (mst-hash mst) state)
2789 (nfa-num-states nfa)))
2790 (mst-num-states-set! mst (+ (mst-num-states mst) 1))))
2791 (mst-state-mappings-set! mst state mappings))
2792
2793;; Same as above, assigning a new slot for a tag. This slot is then
2794;; added to the summary, if it isn't in there yet. This is more efficient
2795;; than looping through all the mappings.
2796(define (mst-add-tagged! nfa mst state mappings tag slot)
2797 (let* ((mappings-summary (mst-mappings-summary mst))
2798 (summary-tag-slots (vector-ref mappings-summary tag))
2799 (new-mappings (let lp ((m mappings)
2800 (res '()))
2801 (cond ((null? m) (cons (cons tag slot) res))
2802 ((= (caar m) tag)
2803 (append res (cons (cons tag slot) (cdr m))))
2804 (else (lp (cdr m) (cons (car m) res))))))
2805 (hash-value (mst-hash mst)))
2806 (cond ((not (mst-state-mappings mst state)) ; Update state hash & count?
2807 (set! hash-value (+ hash-value state))
2808 (mst-num-states-set! mst (+ (mst-num-states mst) 1))))
2809 (mst-state-mappings-set! mst state new-mappings)
2810 (cond ((not (memv slot summary-tag-slots)) ; Update tag/slot summary
2811 (set! hash-value (+ hash-value tag))
2812 (vector-set! mappings-summary tag (cons slot summary-tag-slots))))
2813 (mst-hash-set! mst (modulo hash-value (nfa-num-states nfa)))
2814 new-mappings))
2815
2816(define (mst-same-states? a b)
2817 ;; First check if hash and state counts match, then check each state
2818 (and (= (mst-hash a) (mst-hash b))
2819 (= (mst-num-states a) (mst-num-states b))
2820 (let ((len (vector-length a)))
2821 (let lp ((i *mst-first-state-index*))
2822 (or (= i len)
2823 (and (equal? (not (vector-ref a i))
2824 (not (vector-ref b i)))
2825 (lp (+ i 1))))))))
2826
2827(define (mst-fold mst kons knil)
2828 (let ((limit (vector-length mst)))
2829 (let lp ((i *mst-first-state-index*)
2830 (acc knil))
2831 (if (= i limit)
2832 acc
2833 (let ((m (vector-ref mst i)))
2834 (lp (+ i 1) (if m (kons (- i *mst-first-state-index*) m acc) acc)))))))
2835
2836;; Find the lowest fresh index for this tag that's unused
2837;; in the multi-state. This also updates the nfa's highest
2838;; tag counter if a completely new slot number was assigned.
2839(define (next-index-for-tag! nfa tag mst)
2840 (let* ((highest (nfa-highest-map-index nfa))
2841 (tag-slots (vector-ref (mst-mappings-summary mst) tag))
2842 (new-index (do ((slot 0 (+ slot 1)))
2843 ((not (memv slot tag-slots)) slot))))
2844 (cond ((> new-index highest)
2845 (nfa-set-highest-map-index! nfa new-index)))
2846 new-index))
2847
2848;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2849;;;; tNFA->DFA compilation
2850;; During processing, the DFA is a list of the form:
2851;;
2852;; ((annotated-tNFA-states ...) finalizer transitions ...)
2853;;
2854;; where the transitions are as in the NFA, except there are no
2855;; epsilons, duplicate characters or overlapping char-set ranges, and
2856;; the states moved to are closures (sets of NFA states). Multiple
2857;; DFA states may be accepting states. If the state is an accepting state,
2858;; the finalizer is a list of (tag . memory-slot) retrieval commands.
2859;; tNFA-states are annotated with mappings which store the tag values of
2860;; memory slots, if any. There is always at most one slot for a tag.
2861;;
2862;; The DFA itself simulates a NFA by representing all the simultaneous
2863;; states the NFA can be in at any given point in time as one DFA state.
2864;; The tag values are ambiguous since each NFA transition can set a tag.
2865;; To solve this we keep a bank of memory slots around which tracks tag
2866;; values for each distinct path through the NFA.
2867;;
2868;; Once we get to a final state we can pluck the tag values from the
2869;; memory slots corresponding to the path through which the NFA could have
2870;; reached the final state. To resolve ambiguities, states are assigned
2871;; priorities, and the path to the final state is chosen correspondingly.
2872;;
2873;; For a more detailed explanation about this process, see
2874;; Ville Laurikari; ``NFAs with Tagged Transitions, their Conversion to
2875;; Deterministic Automata and Application to Regular Expressions'' (2000).
2876;; Laurikari also wrote a master's thesis about this approach which is
2877;; less terse but the algorithms are not exactly the same.
2878;; ``Efficient submatch addressing for regular expressions'' (2001).
2879;; This implementation follows the 2000 paper where they differ.
2880
2881(define (nfa->dfa nfa . o)
2882 (let* ((max-states (and (pair? o) (car o)))
2883 (start (nfa-state->mst nfa (nfa-start-state nfa) '()))
2884 (start-closure (nfa-epsilon-closure nfa start))
2885 ;; Set up a special "initializer" state from which we reach the
2886 ;; start-closure to ensure that leading tags are set properly.
2887 (init-set (tag-set-commands-for-closure nfa start start-closure '()))
2888 (dummy (make-mst nfa))
2889 (init-state (list dummy #f `((,start-closure #f () . ,init-set)))))
2890 ;; Unmarked states are just sets of NFA states with tag-maps, marked states
2891 ;; are sets of NFA states with transitions to sets of NFA states
2892 (let lp ((unmarked-states (list start-closure))
2893 (marked-states (list init-state))
2894 (dfa-size 0))
2895 (cond
2896 ((null? unmarked-states)
2897 ;; Abuse finalizer slot for storing the number of memory slots we need
2898 (set-car! (cdr init-state) (+ (nfa-highest-map-index nfa) 1))
2899 (dfa-renumber (reverse marked-states)))
2900 ((and max-states (> dfa-size max-states)) ; Too many DFA states
2901 #f)
2902 ((assoc (car unmarked-states) marked-states) ; Seen set of NFA-states?
2903 (lp (cdr unmarked-states) marked-states dfa-size))
2904 (else
2905 (let ((dfa-state (car unmarked-states)))
2906 (let lp2 ((trans (get-distinct-transitions nfa dfa-state))
2907 (unmarked-states (cdr unmarked-states))
2908 (dfa-trans '()))
2909 (if (null? trans)
2910 (let ((finalizer (mst-state-mappings dfa-state 0)))
2911 (lp unmarked-states
2912 (cons (list dfa-state finalizer dfa-trans) marked-states)
2913 (+ dfa-size 1)))
2914 (let* ((closure (nfa-epsilon-closure nfa (cdar trans)))
2915 (reordered
2916 (find-reorder-commands nfa closure marked-states))
2917 (copy-cmds (if reordered (cdr reordered) '()))
2918 ;; Laurikari doesn't mention what "k" is, but it seems it
2919 ;; must be the mappings of the state's reach
2920 (set-cmds (tag-set-commands-for-closure
2921 nfa (cdar trans) closure copy-cmds))
2922 (trans-closure (if reordered (car reordered) closure)))
2923 (lp2 (cdr trans)
2924 (if reordered
2925 unmarked-states
2926 (cons trans-closure unmarked-states))
2927 (cons `(,trans-closure
2928 ,(caar trans) ,copy-cmds . ,set-cmds)
2929 dfa-trans)))))))))))
2930
2931;; When the conversion is complete we renumber the DFA sets-of-states
2932;; in order and convert the result to a vector for fast lookup.
2933;; Charsets containing single characters are converted to those characters
2934;; for quick matching of the literal parts in a regex.
2935(define (dfa-renumber states)
2936 (let ((indexes (let lp ((i 0) (states states) (indexes '()))
2937 (if (null? states)
2938 indexes
2939 (lp (+ i 1) (cdr states)
2940 (cons (cons (caar states) i) indexes)))))
2941 (dfa (make-vector (length states))))
2942 (do ((i 0 (+ i 1))
2943 (states states (cdr states)))
2944 ((null? states) dfa)
2945 (let ((maybe-finalizer (cadar states))
2946 (transitions (caddar states)))
2947 (vector-set!
2948 dfa i
2949 (cons maybe-finalizer
2950 (map (lambda (tr)
2951 `(,(and (cadr tr) (maybe-cset->char (cadr tr)))
2952 ,(cdr (assoc (car tr) indexes)) . ,(cddr tr)))
2953 transitions)))))))
2954
2955;; Extract all distinct ranges and the potential states they can transition
2956;; to from a given set of states. Any ranges that would overlap with
2957;; distinct characters are split accordingly.
2958;; This function is like "reach" in Laurikari's papers, but for each
2959;; possible distinct range of characters rather than per character.
2960(define (get-distinct-transitions nfa annotated-states)
2961 (define (csets-intersect? a b)
2962 (let ((i (cset-intersection a b)))
2963 (and (not (cset-empty? i)) i)))
2964 (mst-fold
2965 annotated-states
2966 (lambda (st mappings res)
2967 (let ((trans (nfa-get-state-trans nfa st))) ; Always one state per trans
2968 (if (null? trans)
2969 res
2970 (let lp ((ls res) (cs (car trans)) (state (cdr trans)) (res '()))
2971 (cond
2972 ;; State not seen yet? Add a new state transition
2973 ((null? ls)
2974 ;; TODO: We should try to find an existing DFA state
2975 ;; with only this NFA state in it, and extend the cset
2976 ;; with the current one. This produces smaller DFAs,
2977 ;; but takes longer to compile.
2978 (cons (cons cs (nfa-state->mst nfa state mappings))
2979 res))
2980 ((cset=? cs (caar ls))
2981 ;; Add state to existing set for this charset
2982 (mst-add! nfa (cdar ls) state mappings)
2983 (append ls res))
2984 ((csets-intersect? cs (caar ls)) =>
2985 (lambda (intersection)
2986 (let* ((only-in-new (cset-difference cs (caar ls)))
2987 (only-in-old (cset-difference (caar ls) cs))
2988 (states-in-both (cdar ls))
2989 (states-for-old
2990 (and (not (cset-empty? only-in-old))
2991 (mst-copy states-in-both)))
2992 (res (if states-for-old
2993 (cons (cons only-in-old states-for-old) res)
2994 res)))
2995 (mst-add! nfa states-in-both state mappings)
2996 ;; Add this state to the states already here and
2997 ;; restrict to the overlapping charset and continue
2998 ;; with the remaining subset of the new cset (if
2999 ;; nonempty)
3000 (if (cset-empty? only-in-new)
3001 (cons (cons intersection states-in-both)
3002 (append (cdr ls) res))
3003 (lp (cdr ls) only-in-new state
3004 (cons (cons intersection states-in-both) res))))))
3005 (else
3006 (lp (cdr ls) cs state (cons (car ls) res))))))))
3007 '()))
3008
3009;; The epsilon-closure of a set of states is all the states reachable
3010;; through epsilon transitions, with the tags encountered on the way.
3011(define (nfa-epsilon-closure-internal nfa annotated-states)
3012 ;; The stack _MUST_ be in this order for some reason I don't fully understand
3013 (let lp ((stack (mst-fold annotated-states
3014 (lambda (st m res)
3015 (cons (cons st m) res))
3016 '()))
3017 (priorities (make-vector (nfa-num-states nfa) 0))
3018 (closure (mst-copy annotated-states)))
3019 (if (null? stack)
3020 closure
3021 (let ((prio/orig-state (caar stack)) ; priority is just the state nr.
3022 (mappings (cdar stack)))
3023 (let lp2 ((trans (nfa-get-epsilons nfa prio/orig-state))
3024 (stack (cdr stack)))
3025 (if (null? trans)
3026 (lp stack priorities closure)
3027 (let ((state (caar trans)))
3028 (cond
3029 ;; Our priorities are inverted because we start at
3030 ;; the highest state number and go downwards to 0.
3031 ((> prio/orig-state (vector-ref priorities state))
3032 (vector-set! priorities state prio/orig-state)
3033 (cond
3034 ((cdar trans) => ; tagged transition?
3035 (lambda (tag)
3036 (let* ((index (next-index-for-tag! nfa tag closure))
3037 (new-mappings
3038 (mst-add-tagged!
3039 nfa closure state mappings tag index)))
3040 (lp2 (cdr trans)
3041 (cons (cons state new-mappings) stack)))))
3042 (else
3043 (mst-add/fast! nfa closure state mappings)
3044 (lp2 (cdr trans) (cons (cons state mappings) stack)))))
3045 (else (lp2 (cdr trans) stack))))))))))
3046
3047(define (nfa-epsilon-closure nfa states)
3048 (or (nfa-get-closure nfa states)
3049 (let ((res (nfa-epsilon-closure-internal nfa states)))
3050 (nfa-add-closure! nfa states res)
3051 res)))
3052
3053;; Generate "set" commands for all tags in the closure that are
3054;; not present in the original state.
3055(define (tag-set-commands-for-closure nfa orig-state closure copy-cmds)
3056 (let ((num-tags (nfa-num-tags nfa))
3057 (closure-summary (mst-mappings-summary closure))
3058 (state-summary (mst-mappings-summary orig-state)))
3059 (let lp ((t 0) (cmds '()))
3060 (if (= t num-tags)
3061 cmds
3062 (let lp2 ((s1 (vector-ref closure-summary t))
3063 (s2 (vector-ref state-summary t))
3064 (cmds cmds))
3065 (cond ((null? s1) (lp (+ t 1) cmds))
3066 ((or (memv (car s1) s2) ; Tag in original state?
3067 ;; Try to avoid generating set-commands for any slots
3068 ;; that will be overwritten by copy commands, but only
3069 ;; if that slot isn't copied to another slot.
3070 (and (not (null? copy-cmds)) ; null check for performance
3071 ;; Look for copy command overwriting this tag-slot
3072 (any (lambda (c)
3073 (and (= (vector-ref c 0) t)
3074 (= (vector-ref c 2) (car s1))))
3075 copy-cmds)
3076 ;; Ensure it's not copied to another slot before
3077 ;; discarding the set-command.
3078 (not (any (lambda (c)
3079 (and (= (vector-ref c 0) t)
3080 (= (vector-ref c 1) (car s1))))
3081 copy-cmds))))
3082 (lp2 (cdr s1) s2 cmds))
3083 (else (lp2 (cdr s1) s2
3084 (cons (cons t (car s1)) cmds)))))))))
3085
3086;; Look in dfa-states for an already existing state which matches
3087;; closure, but has different tag value mappings.
3088;; If found, calculate reordering commands so we can map the closure
3089;; to that state instead of adding a new DFA state.
3090;; This is completely handwaved away in Laurikari's paper (it basically
3091;; says "insert reordering algorithm here"), so this code was constructed
3092;; after some experimentation. In other words, bugs be here.
3093(define (find-reorder-commands-internal nfa closure dfa-states)
3094 (let ((num-tags (nfa-num-tags nfa))
3095 (closure-summary (mst-mappings-summary closure)))
3096 (let lp ((dfa-states dfa-states))
3097 (if (null? dfa-states)
3098 #f
3099 (if (not (mst-same-states? (caar dfa-states) closure))
3100 (lp (cdr dfa-states))
3101 (let lp2 ((state-summary (mst-mappings-summary (caar dfa-states)))
3102 (t 0) (cmds '()))
3103 (if (= t num-tags)
3104 (cons (caar dfa-states) cmds)
3105 (let lp3 ((closure-slots (vector-ref closure-summary t))
3106 (state-slots (vector-ref state-summary t))
3107 (cmds cmds))
3108 (cond ((null? closure-slots)
3109 (if (null? state-slots)
3110 (lp2 state-summary (+ t 1) cmds)
3111 (lp (cdr dfa-states))))
3112 ((null? state-slots) (lp (cdr dfa-states)))
3113 (else (lp3 (cdr closure-slots)
3114 (cdr state-slots)
3115 (if (= (car closure-slots) (car state-slots))
3116 cmds
3117 (cons (vector t (car closure-slots) (car state-slots))
3118 cmds)))))))))))))
3119
3120(define (find-reorder-commands nfa closure dfa-states)
3121 (or (nfa-get-reorder-commands nfa closure)
3122 (let ((res (find-reorder-commands-internal nfa closure dfa-states)))
3123 (nfa-set-reorder-commands! nfa closure res)
3124 res)))
3125
3126;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3127;;;; Closure Compilation
3128;;
3129;; We use this for non-regular expressions instead of an interpreted
3130;; NFA matcher. We use backtracking anyway, but this gives us more
3131;; freedom of implementation, allowing us to support patterns that
3132;; can't be represented in the above NFA representation.
3133
3134(define (sre->procedure sre . o)
3135 (define names
3136 (if (and (pair? o) (pair? (cdr o))) (cadr o) (sre-names sre 1 '())))
3137 (let lp ((sre sre)
3138 (n 1)
3139 (flags (if (pair? o) (car o) ~none))
3140 (next (lambda (cnk init src str i end matches fail)
3141 (irregex-match-start-chunk-set! matches 0 (car init))
3142 (irregex-match-start-index-set! matches 0 (cdr init))
3143 (irregex-match-end-chunk-set! matches 0 src)
3144 (irregex-match-end-index-set! matches 0 i)
3145 (%irregex-match-fail-set! matches fail)
3146 matches)))
3147 ;; XXXX this should be inlined
3148 (define (rec sre) (lp sre n flags next))
3149 (cond
3150 ((pair? sre)
3151 (if (string? (car sre))
3152 (sre-cset->procedure
3153 (sre->cset (car sre) (flag-set? flags ~case-insensitive?))
3154 next)
3155 (case (car sre)
3156 ((~ - & /)
3157 (sre-cset->procedure
3158 (sre->cset sre (flag-set? flags ~case-insensitive?))
3159 next))
3160 ((or)
3161 (case (length (cdr sre))
3162 ((0) (lambda (cnk init src str i end matches fail) (fail)))
3163 ((1) (rec (cadr sre)))
3164 (else
3165 (let* ((first (rec (cadr sre)))
3166 (rest (lp (sre-alternate (cddr sre))
3167 (+ n (sre-count-submatches (cadr sre)))
3168 flags
3169 next)))
3170 (lambda (cnk init src str i end matches fail)
3171 (first cnk init src str i end matches
3172 (lambda ()
3173 (rest cnk init src str i end matches fail))))))))
3174 ((w/case)
3175 (lp (sre-sequence (cdr sre))
3176 n
3177 (flag-clear flags ~case-insensitive?)
3178 next))
3179 ((w/nocase)
3180 (lp (sre-sequence (cdr sre))
3181 n
3182 (flag-join flags ~case-insensitive?)
3183 next))
3184 ((w/utf8)
3185 (lp (sre-sequence (cdr sre)) n (flag-join flags ~utf8?) next))
3186 ((w/noutf8)
3187 (lp (sre-sequence (cdr sre)) n (flag-clear flags ~utf8?) next))
3188 ((seq :)
3189 (case (length (cdr sre))
3190 ((0) next)
3191 ((1) (rec (cadr sre)))
3192 (else
3193 (let ((rest (lp (sre-sequence (cddr sre))
3194 (+ n (sre-count-submatches (cadr sre)))
3195 flags
3196 next)))
3197 (lp (cadr sre) n flags rest)))))
3198 ((?)
3199 (let ((body (rec (sre-sequence (cdr sre)))))
3200 (lambda (cnk init src str i end matches fail)
3201 (body cnk init src str i end matches
3202 (lambda () (next cnk init src str i end matches fail))))))
3203 ((??)
3204 (let ((body (rec (sre-sequence (cdr sre)))))
3205 (lambda (cnk init src str i end matches fail)
3206 (next cnk init src str i end matches
3207 (lambda () (body cnk init src str i end matches fail))))))
3208 ((*)
3209 (cond
3210 ((sre-empty? (sre-sequence (cdr sre)))
3211 (error "invalid sre: empty *" sre))
3212 (else
3213 (let ((body (rec (list '+ (sre-sequence (cdr sre))))))
3214 (lambda (cnk init src str i end matches fail)
3215 (body cnk init src str i end matches
3216 (lambda ()
3217 (next cnk init src str i end matches fail))))))))
3218 ((*?)
3219 (cond
3220 ((sre-empty? (sre-sequence (cdr sre)))
3221 (error "invalid sre: empty *?" sre))
3222 (else
3223 (letrec
3224 ((body
3225 (lp (sre-sequence (cdr sre))
3226 n
3227 flags
3228 (lambda (cnk init src str i end matches fail)
3229 (next cnk init src str i end matches
3230 (lambda ()
3231 (body cnk init src str i end matches fail)
3232 ))))))
3233 (lambda (cnk init src str i end matches fail)
3234 (next cnk init src str i end matches
3235 (lambda ()
3236 (body cnk init src str i end matches fail))))))))
3237 ((+)
3238 (cond
3239 ((sre-empty? (sre-sequence (cdr sre)))
3240 (error "invalid sre: empty +" sre))
3241 (else
3242 (letrec
3243 ((body
3244 (lp (sre-sequence (cdr sre))
3245 n
3246 flags
3247 (lambda (cnk init src str i end matches fail)
3248 (body cnk init src str i end matches
3249 (lambda ()
3250 (next cnk init src str i end matches fail)
3251 ))))))
3252 body))))
3253 ((=)
3254 (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre))))
3255 ((>=)
3256 (rec `(** ,(cadr sre) #f ,@(cddr sre))))
3257 ((**)
3258 (cond
3259 ((or (and (number? (cadr sre))
3260 (number? (caddr sre))
3261 (> (cadr sre) (caddr sre)))
3262 (and (not (cadr sre)) (caddr sre)))
3263 (lambda (cnk init src str i end matches fail) (fail)))
3264 (else
3265 (letrec
3266 ((from (cadr sre))
3267 (to (caddr sre))
3268 (body-contents (sre-sequence (cdddr sre)))
3269 (body
3270 (lambda (count)
3271 (lp body-contents
3272 n
3273 flags
3274 (lambda (cnk init src str i end matches fail)
3275 (if (and to (= count to))
3276 (next cnk init src str i end matches fail)
3277 ((body (+ 1 count))
3278 cnk init src str i end matches
3279 (lambda ()
3280 (if (>= count from)
3281 (next cnk init src str i end matches fail)
3282 (fail))))))))))
3283 (if (and (zero? from) to (zero? to))
3284 next
3285 (lambda (cnk init src str i end matches fail)
3286 ((body 1) cnk init src str i end matches
3287 (lambda ()
3288 (if (zero? from)
3289 (next cnk init src str i end matches fail)
3290 (fail))))))))))
3291 ((**?)
3292 (cond
3293 ((or (and (number? (cadr sre))
3294 (number? (caddr sre))
3295 (> (cadr sre) (caddr sre)))
3296 (and (not (cadr sre)) (caddr sre)))
3297 (lambda (cnk init src str i end matches fail) (fail)))
3298 (else
3299 (letrec
3300 ((from (cadr sre))
3301 (to (caddr sre))
3302 (body-contents (sre-sequence (cdddr sre)))
3303 (body
3304 (lambda (count)
3305 (lp body-contents
3306 n
3307 flags
3308 (lambda (cnk init src str i end matches fail)
3309 (if (< count from)
3310 ((body (+ 1 count)) cnk init
3311 src str i end matches fail)
3312 (next cnk init src str i end matches
3313 (lambda ()
3314 (if (and to (= count to))
3315 (fail)
3316 ((body (+ 1 count)) cnk init
3317 src str i end matches fail))))))))))
3318 (if (and (zero? from) to (zero? to))
3319 next
3320 (lambda (cnk init src str i end matches fail)
3321 (if (zero? from)
3322 (next cnk init src str i end matches
3323 (lambda ()
3324 ((body 1) cnk init src str i end matches fail)))
3325 ((body 1) cnk init src str i end matches fail))))))))
3326 ((word)
3327 (rec `(seq bow ,@(cdr sre) eow)))
3328 ((word+)
3329 (rec `(seq bow (+ (& (or alphanumeric "_")
3330 (or ,@(cdr sre)))) eow)))
3331 ((posix-string)
3332 (rec (string->sre (cadr sre))))
3333 ((look-ahead)
3334 (let ((check
3335 (lp (sre-sequence (cdr sre))
3336 n
3337 flags
3338 (lambda (cnk init src str i end matches fail) i))))
3339 (lambda (cnk init src str i end matches fail)
3340 (if (check cnk init src str i end matches (lambda () #f))
3341 (next cnk init src str i end matches fail)
3342 (fail)))))
3343 ((neg-look-ahead)
3344 (let ((check
3345 (lp (sre-sequence (cdr sre))
3346 n
3347 flags
3348 (lambda (cnk init src str i end matches fail) i))))
3349 (lambda (cnk init src str i end matches fail)
3350 (if (check cnk init src str i end matches (lambda () #f))
3351 (fail)
3352 (next cnk init src str i end matches fail)))))
3353 ((look-behind neg-look-behind)
3354 (let ((check
3355 (lp (sre-sequence
3356 (cons '(* any) (append (cdr sre) '(eos))))
3357 n
3358 flags
3359 (lambda (cnk init src str i end matches fail) i))))
3360 (lambda (cnk init src str i end matches fail)
3361 (let* ((cnk* (wrap-end-chunker cnk src i))
3362 (str* ((chunker-get-str cnk*) (car init)))
3363 (i* (cdr init))
3364 (end* ((chunker-get-end cnk*) (car init))))
3365 (if ((if (eq? (car sre) 'look-behind) (lambda (x) x) not)
3366 (check cnk* init (car init) str* i* end* matches
3367 (lambda () #f)))
3368 (next cnk init src str i end matches fail)
3369 (fail))))))
3370 ((atomic)
3371 (let ((once
3372 (lp (sre-sequence (cdr sre))
3373 n
3374 flags
3375 (lambda (cnk init src str i end matches fail) i))))
3376 (lambda (cnk init src str i end matches fail)
3377 (let ((j (once cnk init src str i end matches (lambda () #f))))
3378 (if j
3379 (next cnk init src str j end matches fail)
3380 (fail))))))
3381 ((if)
3382 (let* ((test-submatches (sre-count-submatches (cadr sre)))
3383 (pass (lp (caddr sre) flags (+ n test-submatches) next))
3384 (fail (if (pair? (cdddr sre))
3385 (lp (cadddr sre)
3386 (+ n test-submatches
3387 (sre-count-submatches (caddr sre)))
3388 flags
3389 next)
3390 (lambda (cnk init src str i end matches fail)
3391 (fail)))))
3392 (cond
3393 ((or (number? (cadr sre)) (symbol? (cadr sre)))
3394 (let ((index
3395 (if (symbol? (cadr sre))
3396 (cond
3397 ((assq (cadr sre) names) => cdr)
3398 (else
3399 (error "unknown named backref in SRE IF" sre)))
3400 (cadr sre))))
3401 (lambda (cnk init src str i end matches fail2)
3402 (if (%irregex-match-end-chunk matches index)
3403 (pass cnk init src str i end matches fail2)
3404 (fail cnk init src str i end matches fail2)))))
3405 (else
3406 (let ((test (lp (cadr sre) n flags pass)))
3407 (lambda (cnk init src str i end matches fail2)
3408 (test cnk init src str i end matches
3409 (lambda () (fail cnk init src str i end matches fail2)))
3410 ))))))
3411 ((backref backref-ci)
3412 (let ((n (cond ((number? (cadr sre)) (cadr sre))
3413 ((assq (cadr sre) names) => cdr)
3414 (else (error "unknown backreference" (cadr sre)))))
3415 (compare (if (or (eq? (car sre) 'backref-ci)
3416 (flag-set? flags ~case-insensitive?))
3417 string-ci=?
3418 string=?)))
3419 (lambda (cnk init src str i end matches fail)
3420 (let ((s (irregex-match-substring matches n)))
3421 (if (not s)
3422 (fail)
3423 ;; XXXX create an abstract subchunk-compare
3424 (let lp ((src src)
3425 (str str)
3426 (i i)
3427 (end end)
3428 (j 0)
3429 (len (string-length s)))
3430 (cond
3431 ((<= len (- end i))
3432 (cond
3433 ((compare (substring s j (string-length s))
3434 (substring str i (+ i len)))
3435 (next cnk init src str (+ i len) end matches fail))
3436 (else
3437 (fail))))
3438 (else
3439 (cond
3440 ((compare (substring s j (+ j (- end i)))
3441 (substring str i end))
3442 (let ((src2 ((chunker-get-next cnk) src)))
3443 (if src2
3444 (lp src2
3445 ((chunker-get-str cnk) src2)
3446 ((chunker-get-start cnk) src2)
3447 ((chunker-get-end cnk) src2)
3448 (+ j (- end i))
3449 (- len (- end i)))
3450 (fail))))
3451 (else
3452 (fail)))))))))))
3453 ((dsm)
3454 (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) flags next))
3455 (($ submatch)
3456 (let ((body
3457 (lp (sre-sequence (cdr sre))
3458 (+ n 1)
3459 flags
3460 (lambda (cnk init src str i end matches fail)
3461 (let ((old-source
3462 (%irregex-match-end-chunk matches n))
3463 (old-index
3464 (%irregex-match-end-index matches n)))
3465 (irregex-match-end-chunk-set! matches n src)
3466 (irregex-match-end-index-set! matches n i)
3467 (next cnk init src str i end matches
3468 (lambda ()
3469 (irregex-match-end-chunk-set!
3470 matches n old-source)
3471 (irregex-match-end-index-set!
3472 matches n old-index)
3473 (fail))))))))
3474 (lambda (cnk init src str i end matches fail)
3475 (let ((old-source (%irregex-match-start-chunk matches n))
3476 (old-index (%irregex-match-start-index matches n)))
3477 (irregex-match-start-chunk-set! matches n src)
3478 (irregex-match-start-index-set! matches n i)
3479 (body cnk init src str i end matches
3480 (lambda ()
3481 (irregex-match-start-chunk-set!
3482 matches n old-source)
3483 (irregex-match-start-index-set!
3484 matches n old-index)
3485 (fail)))))))
3486 ((=> submatch-named)
3487 (rec `(submatch ,@(cddr sre))))
3488 (else
3489 (error "unknown regexp operator" sre)))))
3490 ((symbol? sre)
3491 (case sre
3492 ((any)
3493 (lambda (cnk init src str i end matches fail)
3494 (if (< i end)
3495 (next cnk init src str (+ i 1) end matches fail)
3496 (let ((src2 ((chunker-get-next cnk) src)))
3497 (if src2
3498 (let ((str2 ((chunker-get-str cnk) src2))
3499 (i2 ((chunker-get-start cnk) src2))
3500 (end2 ((chunker-get-end cnk) src2)))
3501 (next cnk init src2 str2 (+ i2 1) end2 matches fail))
3502 (fail))))))
3503 ((nonl)
3504 (lambda (cnk init src str i end matches fail)
3505 (if (< i end)
3506 (if (not (eqv? #\newline (string-ref str i)))
3507 (next cnk init src str (+ i 1) end matches fail)
3508 (fail))
3509 (let ((src2 ((chunker-get-next cnk) src)))
3510 (if src2
3511 (let ((str2 ((chunker-get-str cnk) src2))
3512 (i2 ((chunker-get-start cnk) src2))
3513 (end2 ((chunker-get-end cnk) src2)))
3514 (if (not (eqv? #\newline (string-ref str2 i2)))
3515 (next cnk init src2 str2 (+ i2 1) end2 matches fail)
3516 (fail)))
3517 (fail))))))
3518 ((bos)
3519 (lambda (cnk init src str i end matches fail)
3520 (if (and (eq? src (car init)) (eqv? i (cdr init)))
3521 (next cnk init src str i end matches fail)
3522 (fail))))
3523 ((bol)
3524 (lambda (cnk init src str i end matches fail)
3525 (if (let ((ch (if (> i ((chunker-get-start cnk) src))
3526 (string-ref str (- i 1))
3527 (chunker-prev-char cnk init src))))
3528 (or (not ch) (eqv? #\newline ch)))
3529 (next cnk init src str i end matches fail)
3530 (fail))))
3531 ((bow)
3532 (lambda (cnk init src str i end matches fail)
3533 (if (and (if (> i ((chunker-get-start cnk) src))
3534 (not (char-alphanumeric? (string-ref str (- i 1))))
3535 (let ((ch (chunker-prev-char cnk init src)))
3536 (or (not ch) (not (char-alphanumeric? ch)))))
3537 (if (< i end)
3538 (char-alphanumeric? (string-ref str i))
3539 (let ((next ((chunker-get-next cnk) src)))
3540 (and next
3541 (char-alphanumeric?
3542 (string-ref ((chunker-get-str cnk) next)
3543 ((chunker-get-start cnk) next)))))))
3544 (next cnk init src str i end matches fail)
3545 (fail))))
3546 ((eos)
3547 (lambda (cnk init src str i end matches fail)
3548 (if (and (>= i end) (not ((chunker-get-next cnk) src)))
3549 (next cnk init src str i end matches fail)
3550 (fail))))
3551 ((eol)
3552 (lambda (cnk init src str i end matches fail)
3553 (if (if (< i end)
3554 (eqv? #\newline (string-ref str i))
3555 (let ((src2 ((chunker-get-next cnk) src)))
3556 (if (not src2)
3557 #t
3558 (eqv? #\newline
3559 (string-ref ((chunker-get-str cnk) src2)
3560 ((chunker-get-start cnk) src2))))))
3561 (next cnk init src str i end matches fail)
3562 (fail))))
3563 ((eow)
3564 (lambda (cnk init src str i end matches fail)
3565 (if (and (if (< i end)
3566 (not (char-alphanumeric? (string-ref str i)))
3567 (let ((ch (chunker-next-char cnk src)))
3568 (or (not ch) (not (char-alphanumeric? ch)))))
3569 (if (> i ((chunker-get-start cnk) src))
3570 (char-alphanumeric? (string-ref str (- i 1)))
3571 (let ((prev (chunker-prev-char cnk init src)))
3572 (or (not prev) (char-alphanumeric? prev)))))
3573 (next cnk init src str i end matches fail)
3574 (fail))))
3575 ((nwb) ;; non-word-boundary
3576 (lambda (cnk init src str i end matches fail)
3577 (let ((c1 (if (< i end)
3578 (string-ref str i)
3579 (chunker-next-char cnk src)))
3580 (c2 (if (> i ((chunker-get-start cnk) src))
3581 (string-ref str (- i 1))
3582 (chunker-prev-char cnk init src))))
3583 (if (and c1 c2
3584 (if (char-alphanumeric? c1)
3585 (char-alphanumeric? c2)
3586 (not (char-alphanumeric? c2))))
3587 (next cnk init src str i end matches fail)
3588 (fail)))))
3589 ((epsilon)
3590 next)
3591 (else
3592 (let ((cell (assq sre sre-named-definitions)))
3593 (if cell
3594 (rec (cdr cell))
3595 (error "unknown regexp" sre))))))
3596 ((char? sre)
3597 (if (flag-set? flags ~case-insensitive?)
3598 ;; case-insensitive
3599 (lambda (cnk init src str i end matches fail)
3600 (if (>= i end)
3601 (let lp ((src2 ((chunker-get-next cnk) src)))
3602 (if src2
3603 (let ((str2 ((chunker-get-str cnk) src2))
3604 (i2 ((chunker-get-start cnk) src2))
3605 (end2 ((chunker-get-end cnk) src2)))
3606 (if (>= i2 end2)
3607 (lp ((chunker-get-next cnk) src2))
3608 (if (char-ci=? sre (string-ref str2 i2))
3609 (next cnk init src2 str2 (+ i2 1) end2
3610 matches fail)
3611 (fail))))
3612 (fail)))
3613 (if (char-ci=? sre (string-ref str i))
3614 (next cnk init src str (+ i 1) end matches fail)
3615 (fail))))
3616 ;; case-sensitive
3617 (lambda (cnk init src str i end matches fail)
3618 (if (>= i end)
3619 (let lp ((src2 ((chunker-get-next cnk) src)))
3620 (if src2
3621 (let ((str2 ((chunker-get-str cnk) src2))
3622 (i2 ((chunker-get-start cnk) src2))
3623 (end2 ((chunker-get-end cnk) src2)))
3624 (if (>= i2 end2)
3625 (lp ((chunker-get-next cnk) src2))
3626 (if (char=? sre (string-ref str2 i2))
3627 (next cnk init src2 str2 (+ i2 1) end2
3628 matches fail)
3629 (fail))))
3630 (fail)))
3631 (if (char=? sre (string-ref str i))
3632 (next cnk init src str (+ i 1) end matches fail)
3633 (fail))))
3634 ))
3635 ((string? sre)
3636 (rec (sre-sequence (string->list sre)))
3637;; XXXX reintroduce faster string matching on chunks
3638;; (if (flag-set? flags ~case-insensitive?)
3639;; (rec (sre-sequence (string->list sre)))
3640;; (let ((len (string-length sre)))
3641;; (lambda (cnk init src str i end matches fail)
3642;; (if (and (<= (+ i len) end)
3643;; (%substring=? sre str 0 i len))
3644;; (next str (+ i len) matches fail)
3645;; (fail)))))
3646 )
3647 (else
3648 (error "unknown regexp" sre)))))
3649
3650;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3651;;;; Character Sets
3652;;
3653;; Simple character sets as lists of ranges, as used in the NFA/DFA
3654;; compilation. This is not especially efficient, but is portable and
3655;; scalable for any range of character sets.
3656
3657(define (sre-cset->procedure cset next)
3658 (lambda (cnk init src str i end matches fail)
3659 (if (< i end)
3660 (if (cset-contains? cset (string-ref str i))
3661 (next cnk init src str (+ i 1) end matches fail)
3662 (fail))
3663 (let ((src2 ((chunker-get-next cnk) src)))
3664 (if src2
3665 (let ((str2 ((chunker-get-str cnk) src2))
3666 (i2 ((chunker-get-start cnk) src2))
3667 (end2 ((chunker-get-end cnk) src2)))
3668 (if (cset-contains? cset (string-ref str2 i2))
3669 (next cnk init src2 str2 (+ i2 1) end2 matches fail)
3670 (fail)))
3671 (fail))))))
3672
3673(define (make-cset) (vector))
3674(define (range->cset from to) (vector (cons from to)))
3675(define (char->cset ch) (vector (cons ch ch)))
3676(define (cset-empty? cs) (zero? (vector-length cs)))
3677(define (maybe-cset->char cs)
3678 (if (and (= (vector-length cs) 1)
3679 (char=? (car (vector-ref cs 0)) (cdr (vector-ref cs 0))))
3680 (car (vector-ref cs 0))
3681 cs))
3682
3683;; Since csets are sorted, there's only one possible representation of any cset
3684(define cset=? equal?)
3685
3686(define (cset-size cs)
3687 (let ((len (vector-length cs)))
3688 (let lp ((i 0) (size 0))
3689 (if (= i len)
3690 size
3691 (lp (+ i 1) (+ size 1
3692 (- (char->integer (cdr (vector-ref cs i)))
3693 (char->integer (car (vector-ref cs i))))))))))
3694
3695(define (cset->plist cs)
3696 (let lp ((i (- (vector-length cs) 1))
3697 (res '()))
3698 (if (= i -1)
3699 res
3700 (lp (- i 1) (cons (car (vector-ref cs i))
3701 (cons (cdr (vector-ref cs i)) res))))))
3702
3703(define (plist->cset ls)
3704 (let lp ((ls ls) (res (make-cset)))
3705 (if (null? ls)
3706 res
3707 (lp (cddr ls) (cset-union (range->cset (car ls) (cadr ls)) res)))))
3708
3709(define (string->cset s)
3710 (fold (lambda (ch cs)
3711 (cset-adjoin cs ch))
3712 (make-cset)
3713 (string->list s)))
3714
3715(define (sre->cset sre . o)
3716 (let lp ((sre sre) (ci? (and (pair? o) (car o))))
3717 (define (rec sre) (lp sre ci?))
3718 (cond
3719 ((pair? sre)
3720 (if (string? (car sre))
3721 (if ci?
3722 (cset-case-insensitive (string->cset (car sre)))
3723 (string->cset (car sre)))
3724 (case (car sre)
3725 ((~)
3726 (cset-complement
3727 (fold cset-union (rec (cadr sre)) (map rec (cddr sre)))))
3728 ((&)
3729 (fold cset-intersection (rec (cadr sre)) (map rec (cddr sre))))
3730 ((-)
3731 (fold (lambda (x res) (cset-difference res x))
3732 (rec (cadr sre))
3733 (map rec (cddr sre))))
3734 ((/)
3735 (let ((res (plist->cset (sre-flatten-ranges (cdr sre)))))
3736 (if ci?
3737 (cset-case-insensitive res)
3738 res)))
3739 ((or)
3740 (fold cset-union (rec (cadr sre)) (map rec (cddr sre))))
3741 ((w/case)
3742 (lp (sre-alternate (cdr sre)) #f))
3743 ((w/nocase)
3744 (lp (sre-alternate (cdr sre)) #t))
3745 (else
3746 (error "not a valid sre char-set operator" sre)))))
3747 ((char? sre) (if ci?
3748 (cset-case-insensitive (range->cset sre sre))
3749 (range->cset sre sre)))
3750 ((string? sre) (rec (list sre)))
3751 (else
3752 (let ((cell (assq sre sre-named-definitions)))
3753 (if cell
3754 (rec (cdr cell))
3755 (error "not a valid sre char-set" sre)))))))
3756
3757(define (cset->sre cset)
3758 (cons '/
3759 (fold (lambda (x res) (cons (car x) (cons (cdr x) res)))
3760 '()
3761 (vector->list cset))))
3762
3763(define (cset-contains? cset ch)
3764 ;; CHICKEN: Type assumption added for performance. This is a very
3765 ;; hot code path, so every type improvement matters.
3766 (assume ((cset (vector-of (pair char char)))
3767 (ch char))
3768 (let ((len (vector-length cset)))
3769 (case len
3770 ((0) #f)
3771 ((1) (let ((range (vector-ref cset 0)))
3772 (and (char<=? ch (cdr range)) (char<=? (car range) ch))))
3773 (else (let lp ((lower 0) (upper len))
3774 (let* ((middle (quotient (+ upper lower) 2))
3775 (range (vector-ref cset middle)))
3776 (cond ((char<? (cdr range) ch)
3777 (let ((next (+ middle 1)))
3778 (and (< next upper) (lp next upper))))
3779 ((char<? ch (car range))
3780 (and (< lower middle) (lp lower middle)))
3781 (else #t)))))))))
3782
3783(define (char-ranges-union a b)
3784 (cons (if (char<=? (car a) (car b)) (car a) (car b))
3785 (if (char>=? (cdr a) (cdr b)) (cdr a) (cdr b))))
3786
3787(define (cset-union a b)
3788 (let union-range ((a (vector->list a))
3789 (b (vector->list b))
3790 (res '()))
3791 (cond
3792 ((null? a) (list->vector (reverse (append (reverse b) res))))
3793 ((null? b) (list->vector (reverse (append (reverse a) res))))
3794 (else
3795 (let ((a-range (car a))
3796 (b-range (car b)))
3797 (cond
3798 ;; Can't use next-char here since it will cause an error if we are
3799 ;; comparing a cset with the maximum character as high char.
3800 ((< (+ (char->integer (cdr a-range)) 1) (char->integer (car b-range)))
3801 (union-range (cdr a) b (cons a-range res)))
3802 ((> (char->integer (car a-range)) (+ (char->integer (cdr b-range)) 1))
3803 (union-range (cdr b) a (cons b-range res)))
3804 ((char>=? (cdr a-range) (car b-range))
3805 (union-range (cons (char-ranges-union a-range b-range) (cdr a))
3806 (cdr b)
3807 res))
3808 (else (union-range (cdr a)
3809 (cons (char-ranges-union a-range b-range) (cdr b))
3810 res))))))))
3811
3812(define (cset-adjoin cs ch) (cset-union cs (char->cset ch)))
3813
3814(define (next-char c)
3815 (integer->char (+ (char->integer c) 1)))
3816
3817(define (prev-char c)
3818 (integer->char (- (char->integer c) 1)))
3819
3820(define (cset-difference a b)
3821 (let diff ((a (vector->list a))
3822 (b (vector->list b))
3823 (res '()))
3824 (cond ((null? a) (list->vector (reverse res)))
3825 ((null? b) (list->vector (append (reverse res) a)))
3826 (else
3827 (let ((a-range (car a))
3828 (b-range (car b)))
3829 (cond
3830 ((char<? (cdr a-range) (car b-range))
3831 (diff (cdr a) b (cons a-range res)))
3832 ((char>? (car a-range) (cdr b-range))
3833 (diff a (cdr b) res))
3834 ((and (char<=? (car b-range) (car a-range))
3835 (char>=? (cdr b-range) (cdr a-range)))
3836 (diff (cdr a) b res))
3837 (else (let ((left (and (char<? (car a-range) (car b-range))
3838 (cons (car a-range)
3839 (prev-char (car b-range)))))
3840 (right (and (char>? (cdr a-range) (cdr b-range))
3841 (cons (next-char (cdr b-range))
3842 (cdr a-range)))))
3843 (diff (if right (cons right (cdr a)) (cdr a))
3844 b
3845 (if left (cons left res) res))))))))))
3846
3847(define (min-char a b)
3848 (if (char<? a b) a b))
3849
3850(define (max-char a b)
3851 (if (char<? a b) b a))
3852
3853(define (cset-intersection a b)
3854 (let intersect ((a (vector->list a))
3855 (b (vector->list b))
3856 (res '()))
3857 (if (or (null? a) (null? b))
3858 (list->vector (reverse res))
3859 (let ((a-range (car a))
3860 (b-range (car b)))
3861 (cond
3862 ((char<? (cdr a-range) (car b-range))
3863 (intersect (cdr a) b res))
3864 ((char>? (car a-range) (cdr b-range))
3865 (intersect a (cdr b) res))
3866 (else
3867 (let ((result (cons (max-char (car b-range) (car a-range))
3868 (min-char (cdr a-range) (cdr b-range)))))
3869 (intersect (if (char>? (cdr a-range) (cdr result))
3870 a (cdr a))
3871 (if (char>? (cdr b-range) (cdr result))
3872 b (cdr b))
3873 (cons result res)))))))))
3874
3875(define (cset-complement a)
3876 (cset-difference (sre->cset *all-chars*) a))
3877
3878;; This could use some optimization :)
3879(define (cset-case-insensitive a)
3880 (let lp ((ls (vector->list a)) (res '()))
3881 (cond ((null? ls) (list->vector (reverse res)))
3882 ((and (char-alphabetic? (caar ls))
3883 (char-alphabetic? (cdar ls)))
3884 (lp (cdr ls)
3885 (reverse
3886 (vector->list
3887 (cset-union (cset-union (list->vector (reverse res))
3888 (vector (car ls)))
3889 (range->cset (char-altcase (caar ls))
3890 (char-altcase (cdar ls))))))))
3891 (else (lp (cdr ls) (reverse (vector->list
3892 (cset-union (list->vector (reverse res))
3893 (vector (car ls))))))))))
3894
3895;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3896;;;; Match and Replace Utilities
3897
3898(define (irregex-fold/fast irx kons knil str . o)
3899 (if (not (string? str)) (error 'irregex-fold "not a string" str))
3900 (let* ((irx (irregex irx))
3901 (matches (irregex-new-matches irx))
3902 (finish (or (and (pair? o) (car o)) (lambda (i acc) acc)))
3903 (start (if (and (pair? o) (pair? (cdr o))) (cadr o) 0))
3904 (end (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o)))
3905 (caddr o)
3906 (string-length str)))
3907 (init-src (list str start end))
3908 (init (cons init-src start)))
3909 (if (not (and (integer? start) (exact? start)))
3910 (error 'irregex-fold "not an exact integer" start))
3911 (if (not (and (integer? end) (exact? end)))
3912 (error 'irregex-fold "not an exact integer" end))
3913 (irregex-match-chunker-set! matches irregex-basic-string-chunker)
3914 (let lp ((src init-src) (from start) (i start) (acc knil))
3915 (if (>= i end)
3916 (finish from acc)
3917 (let ((m (irregex-search/matches
3918 irx
3919 irregex-basic-string-chunker
3920 init
3921 src
3922 i
3923 matches)))
3924 (if (not m)
3925 (finish from acc)
3926 (let ((j-start (%irregex-match-start-index m 0))
3927 (j (%irregex-match-end-index m 0))
3928 (acc (kons from m acc)))
3929 (irregex-reset-matches! matches)
3930 (cond
3931 ((flag-set? (irregex-flags irx) ~consumer?)
3932 (finish j acc))
3933 ((= j j-start)
3934 ;; skip one char forward if we match the empty string
3935 (lp (list str j end) j (+ j 1) acc))
3936 (else
3937 (lp (list str j end) j j acc))))))))))
3938
3939(define (irregex-fold irx kons . args)
3940 (if (not (procedure? kons)) (error 'irregex-fold "not a procedure" kons))
3941 (let ((kons2 (lambda (i m acc) (kons i (irregex-copy-matches m) acc))))
3942 (apply irregex-fold/fast irx kons2 args)))
3943
3944(define (irregex-fold/chunked/fast irx kons knil cnk start . o)
3945 (let* ((irx (irregex irx))
3946 (matches (irregex-new-matches irx))
3947 (finish (or (and (pair? o) (car o)) (lambda (src i acc) acc)))
3948 (i (if (and (pair? o) (pair? (cdr o)))
3949 (cadr o)
3950 ((chunker-get-start cnk) start)))
3951 (init (cons start i)))
3952 (if (not (integer? i)) (error 'irregex-fold/chunked "not an integer" i))
3953 (irregex-match-chunker-set! matches cnk)
3954 (let lp ((start start) (i i) (acc knil))
3955 (if (not start)
3956 (finish start i acc)
3957 (let ((m (irregex-search/matches irx cnk init start i matches)))
3958 (if (not m)
3959 (finish start i acc)
3960 (let ((end-src (%irregex-match-end-chunk m 0))
3961 (end-index (%irregex-match-end-index m 0)))
3962 (if (and (eq? end-src start) (= end-index i))
3963 (if (>= end-index ((chunker-get-end cnk) end-src ))
3964 (let ((next ((chunker-get-next cnk) end-src)))
3965 (lp next ((chunker-get-start cnk) next) acc))
3966 (lp end-src (+ end-index 1) acc))
3967 (let ((acc (kons start i m acc)))
3968 (irregex-reset-matches! matches)
3969 (if (flag-set? (irregex-flags irx) ~consumer?)
3970 (finish end-src end-index acc)
3971 (lp end-src end-index acc)))))))))))
3972
3973(define (irregex-fold/chunked irx kons . args)
3974 (if (not (procedure? kons)) (error 'irregex-fold/chunked "not a procedure" kons))
3975 (let ((kons2 (lambda (s i m acc) (kons s i (irregex-copy-matches m) acc))))
3976 (apply irregex-fold/chunked/fast irx kons2 args)))
3977
3978(define (irregex-replace irx str . o)
3979 (if (not (string? str)) (error 'irregex-replace "not a string" str))
3980 (let ((m (irregex-search irx str)))
3981 (if m
3982 (string-cat-reverse
3983 (cons (substring str (%irregex-match-end-index m 0) (string-length str))
3984 (append (irregex-apply-match m o)
3985 (list (substring str 0 (%irregex-match-start-index m 0)))
3986 )))
3987 str)))
3988
3989(define (irregex-replace/all irx str . o)
3990 (if (not (string? str)) (error 'irregex-replace/all "not a string" str))
3991 (irregex-fold/fast
3992 irx
3993 (lambda (i m acc)
3994 (let ((m-start (%irregex-match-start-index m 0)))
3995 (if (>= i m-start)
3996 (append (irregex-apply-match m o) acc)
3997 (append (irregex-apply-match m o)
3998 (cons (substring str i m-start) acc)))))
3999 '()
4000 str
4001 (lambda (i acc)
4002 (let ((end (string-length str)))
4003 (string-cat-reverse (if (>= i end)
4004 acc
4005 (cons (substring str i end) acc)))))))
4006
4007(define (irregex-apply-match m ls)
4008 (let lp ((ls ls) (res '()))
4009 (if (null? ls)
4010 res
4011 (cond
4012 ((integer? (car ls))
4013 (lp (cdr ls)
4014 (cons (or (irregex-match-substring m (car ls)) "") res)))
4015 ((procedure? (car ls))
4016 (lp (cdr ls) (cons ((car ls) m) res)))
4017 ((symbol? (car ls))
4018 (case (car ls)
4019 ((pre)
4020 (lp (cdr ls)
4021 (cons (substring (car (%irregex-match-start-chunk m 0))
4022 0
4023 (%irregex-match-start-index m 0))
4024 res)))
4025 ((post)
4026 (let ((str (car (%irregex-match-start-chunk m 0))))
4027 (lp (cdr ls)
4028 (cons (substring str
4029 (%irregex-match-end-index m 0)
4030 (string-length str))
4031 res))))
4032 (else
4033 (cond
4034 ((assq (car ls) (irregex-match-names m))
4035 => (lambda (x) (lp (cons (cdr x) (cdr ls)) res)))
4036 (else
4037 (error "unknown match replacement" (car ls)))))))
4038 (else
4039 (lp (cdr ls) (cons (car ls) res)))))))
4040
4041(define (irregex-extract irx str . o)
4042 (if (not (string? str)) (error 'irregex-extract "not a string" str))
4043 (apply irregex-fold/fast
4044 irx
4045 (lambda (i m a) (cons (irregex-match-substring m) a))
4046 '()
4047 str
4048 (lambda (i a) (reverse a))
4049 o))
4050
4051(define (irregex-split irx str . o)
4052 (if (not (string? str)) (error 'irregex-split "not a string" str))
4053 (let ((start (if (pair? o) (car o) 0))
4054 (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str))))
4055 (irregex-fold/fast
4056 irx
4057 (lambda (i m a)
4058 (cond
4059 ((= i (%irregex-match-start-index m 0))
4060 a)
4061 (else
4062 (cons (substring str i (%irregex-match-start-index m 0)) a))))
4063 '()
4064 str
4065 (lambda (i a)
4066 (let lp ((ls (if (= i end) a (cons (substring str i end) a)))
4067 (res '())
4068 (was-char? #f))
4069 (cond
4070 ((null? ls) res)
4071 ((char? (car ls))
4072 (lp (cdr ls)
4073 (if (or was-char? (null? res))
4074 (cons (string (car ls)) res)
4075 (cons (string-append (string (car ls)) (car res))
4076 (cdr res)))
4077 #t))
4078 (else (lp (cdr ls) (cons (car ls) res) #f)))))
4079 start
4080 end)))