~ chicken-core (chicken-5) /irregex-core.scm
Trap1;;;; irregex.scm -- IrRegular Expressions
2;;
3;; Copyright (c) 2005-2024 Alex Shinn. All rights reserved.
4;; BSD-style license: http://synthcode.com/license.txt
5
6;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
7;; At this moment there was a loud ring at the bell, and I could
8;; hear Mrs. Hudson, our landlady, raising her voice in a wail of
9;; expostulation and dismay.
10;;
11;; "By heaven, Holmes," I said, half rising, "I believe that
12;; they are really after us."
13;;
14;; "No, it's not quite so bad as that. It is the unofficial
15;; force, -- the Baker Street irregulars."
16
17;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
18;;;; Notes
19;;
20;; This code should not require any porting - it should work out of
21;; the box in any R[457]RS Scheme implementation. Slight modifications
22;; are needed for R6RS (a separate R6RS-compatible version is included
23;; in the distribution as irregex-r6rs.scm).
24;;
25;; The goal of portability makes this code a little clumsy and
26;; inefficient. Future versions will include both cleanup and
27;; performance tuning, but you can only go so far while staying
28;; portable. AND-LET*, SRFI-9 records and custom macros would've been
29;; nice.
30
31;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32;;;; History
33;; 0.9.11: 2024/02/23 - Guile test and packaging support from Tomas Volf.
34;; 0.9.10: 2021/07/06 - fixes for submatches under kleene star, empty seqs
35;; in alternations, and bol in folds for backtracking
36;; matcher (thanks John Clements and snan for reporting
37;; and Peter Bex for fixing)
38;; 0.9.9: 2021/05/14 - more comprehensive fix for repeated empty matches
39;; 0.9.8: 2020/07/13 - fix irregex-replace/all with look-behind patterns
40;; 0.9.7: 2019/12/31 - more intuitive handling of empty matches in -fold,
41;; -replace and -split
42;; 0.9.6: 2016/12/05 - fixed exponential memory use of + in compilation
43;; of backtracking matcher (CVE-2016-9954).
44;; 0.9.5: 2016/09/10 - fixed a bug in irregex-fold handling of bow
45;; 0.9.4: 2015/12/14 - performance improvement for {n,m} matches
46;; 0.9.3: 2014/07/01 - R7RS library
47;; 0.9.2: 2012/11/29 - fixed a bug in -fold on conditional bos patterns
48;; 0.9.1: 2012/11/27 - various accumulated bugfixes
49;; 0.9.0: 2012/06/03 - Using tags for match extraction from Peter Bex.
50;; 0.8.3: 2011/12/18 - various accumulated bugfixes
51;; 0.8.2: 2010/08/28 - (...)? submatch extraction fix and alternate
52;; named submatches from Peter Bex
53;; Added irregex-split, irregex-extract,
54;; irregex-match-names and irregex-match-valid-index?
55;; to Chicken and Guile module export lists and made
56;; the latter accept named submatches. The procedures
57;; irregex-match-{start,end}-{index,chunk} now also
58;; accept named submatches, with the index argument
59;; made optional. Improved argument type checks.
60;; Disallow negative submatch index.
61;; Improve performance of backtracking matcher.
62;; Refactor charset handling into a consistent API
63;; 0.8.1: 2010/03/09 - backtracking irregex-match fix and other small fixes
64;; 0.8.0: 2010/01/20 - optimizing DFA compilation, adding SRE escapes
65;; inside PCREs, adding utility SREs
66;; 0.7.5: 2009/08/31 - adding irregex-extract and irregex-split
67;; *-fold copies match data (use *-fold/fast for speed)
68;; irregex-opt now returns an SRE
69;; 0.7.4: 2009/05/14 - empty alternates (or) and empty csets always fail,
70;; bugfix in default finalizer for irregex-fold/chunked
71;; 0.7.3: 2009/04/14 - adding irregex-fold/chunked, minor doc fixes
72;; 0.7.2: 2009/02/11 - some bugfixes, much improved documentation
73;; 0.7.1: 2008/10/30 - several bugfixes (thanks to Derick Eddington)
74;; 0.7.0: 2008/10/20 - support abstract chunked strings
75;; 0.6.2: 2008/07/26 - minor bugfixes, allow global disabling of utf8 mode,
76;; friendlier error messages in parsing, \Q..\E support
77;; 0.6.1: 2008/07/21 - added utf8 mode, more utils, bugfixes
78;; 0.6: 2008/05/01 - most of PCRE supported
79;; 0.5: 2008/04/24 - fully portable R4RS, many PCRE features implemented
80;; 0.4: 2008/04/17 - rewriting NFA to use efficient closure compilation,
81;; normal strings only, but all of the spencer tests pass
82;; 0.3: 2008/03/10 - adding DFA converter (normal strings only)
83;; 0.2: 2005/09/27 - adding irregex-opt (like elisp's regexp-opt) utility
84;; 0.1: 2005/08/18 - simple NFA interpreter over abstract chunked strings
85
86;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
87;;;; Data Structures
88
89(define (vector-copy v)
90 (let ((v2 (make-vector (vector-length v))))
91 (vector-copy! v v2)
92 v2))
93
94(cond-expand
95 (chicken-bootstrap
96 (begin
97 ;; make-irregex defined elsewhere
98 (define (irregex? x)
99 (##sys#structure? x 'regexp))
100 (define (irregex-dfa x)
101 (##sys#check-structure x 'regexp 'irregex-dfa)
102 (##sys#slot x 1))
103 (define (irregex-dfa/search x)
104 (##sys#check-structure x 'regexp 'irregex-dfa/search)
105 (##sys#slot x 2))
106 (define (irregex-nfa x)
107 (##sys#check-structure x 'regexp 'irregex-nfa)
108 (##sys#slot x 3))
109 (define (irregex-flags x)
110 (##sys#check-structure x 'regexp 'irregex-flags)
111 (##sys#slot x 4))
112 (define (irregex-num-submatches x)
113 (##sys#check-structure x 'regexp 'irregex-num-submatches)
114 (##sys#slot x 5))
115 (define (irregex-lengths x)
116 (##sys#check-structure x 'regexp 'irregex-lengths)
117 (##sys#slot x 6))
118 (define (irregex-names x)
119 (##sys#check-structure x 'regexp 'irregex-names)
120 (##sys#slot x 7))
121 ;; make-irregex-match defined elsewhere
122 (define (irregex-new-matches irx)
123 (make-irregex-match (irregex-num-submatches irx) (irregex-names irx)))
124 (define (irregex-reset-matches! m)
125 (let ((v (##sys#slot m 1)))
126 (vector-fill! v #f)
127 m))
128 (define (irregex-copy-matches m)
129 (and (##sys#structure? m 'regexp-match)
130 (##sys#make-structure
131 'regexp-match
132 (vector-copy (##sys#slot m 1))
133 (##sys#slot m 2)
134 (##sys#slot m 3)
135 (##sys#slot m 4))))
136 (define (irregex-match-data? obj)
137 (##sys#structure? obj 'regexp-match))
138 (define (irregex-match-num-submatches m)
139 (##sys#check-structure m 'regexp-match 'irregex-match-num-submatches)
140 (- (fx/ (##sys#size (##sys#slot m 1)) 4) 2))
141 (define (irregex-match-chunker m)
142 (##sys#slot m 3))
143 (define (irregex-match-names m)
144 (##sys#check-structure m 'regexp-match 'irregex-match-names)
145 (##sys#slot m 2))
146 (define (irregex-match-chunker-set! m str)
147 (##sys#setslot m 3 str))
148 (define-inline (%irregex-match-start-chunk m n)
149 (##sys#slot (##sys#slot m 1) (* n 4)))
150 (define-inline (%irregex-match-start-index m n)
151 (##sys#slot (##sys#slot m 1) (+ 1 (* n 4))))
152 (define-inline (%irregex-match-end-chunk m n)
153 (##sys#slot (##sys#slot m 1) (+ 2 (* n 4))))
154 (define (%irregex-match-end-index m n)
155 (##sys#slot (##sys#slot m 1) (+ 3 (* n 4))))
156 (define (%irregex-match-fail m) (##sys#slot m 4))
157 (define (%irregex-match-fail-set! m x) (##sys#setslot m 4 x))
158 (set-record-printer! 'regexp-match
159 (lambda (m out)
160 (let ((n (irregex-match-num-submatches m)))
161 (display "#<regexp-match (" out)
162 (display n out)
163 (display " submatch" out)
164 (when (or (eq? n 0) (fx> n 1)) (display "es" out))
165 (display ")>" out))))
166 (define-inline (irregex-match-valid-numeric-index? m n)
167 (let ((v (##sys#slot m 1)))
168 (and (>= n 0) (< (* n 4) (- (##sys#size v) 4)))))
169 (define-inline (irregex-match-matched-numeric-index? m n)
170 (let ((v (##sys#slot m 1)))
171 (and (##sys#slot v (+ 1 (* n 4)))
172 #t)))))
173 (else
174 (begin
175 (define irregex-tag '*irregex-tag*)
176 (define (make-irregex dfa dfa/search nfa flags submatches lengths names)
177 (vector irregex-tag dfa dfa/search nfa flags submatches lengths names))
178 (define (irregex? obj)
179 (and (vector? obj)
180 (= 8 (vector-length obj))
181 (eq? irregex-tag (vector-ref obj 0))))
182 (define (irregex-dfa x) (vector-ref x 1))
183 (define (irregex-dfa/search x) (vector-ref x 2))
184 (define (irregex-nfa x) (vector-ref x 3))
185 (define (irregex-flags x) (vector-ref x 4))
186 (define (irregex-num-submatches x) (vector-ref x 5))
187 (define (irregex-lengths x) (vector-ref x 6))
188 (define (irregex-names x) (vector-ref x 7))
189 (define (irregex-new-matches irx)
190 (make-irregex-match (irregex-num-submatches irx) (irregex-names irx)))
191 (define (irregex-reset-matches! m)
192 (do ((i (- (vector-length m) 1) (- i 1)))
193 ((<= i 3) m)
194 (vector-set! m i #f)))
195 (define (irregex-copy-matches m)
196 (and (vector? m) (vector-copy m)))
197 (define irregex-match-tag '*irregex-match-tag*)
198 (define (irregex-match-data? obj)
199 (and (vector? obj)
200 (>= (vector-length obj) 11)
201 (eq? irregex-match-tag (vector-ref obj 0))))
202 (define (make-irregex-match count names)
203 (let ((res (make-vector (+ (* 4 (+ 2 count)) 3) #f)))
204 (vector-set! res 0 irregex-match-tag)
205 (vector-set! res 2 names)
206 res))
207 (define (irregex-match-num-submatches m)
208 (- (quotient (- (vector-length m) 3) 4) 2))
209 (define (irregex-match-chunker m)
210 (vector-ref m 1))
211 (define (irregex-match-names m)
212 (vector-ref m 2))
213 (define (irregex-match-chunker-set! m str)
214 (vector-set! m 1 str))
215 (define (%irregex-match-start-chunk m n) (vector-ref m (+ 3 (* n 4))))
216 (define (%irregex-match-start-index m n) (vector-ref m (+ 4 (* n 4))))
217 (define (%irregex-match-end-chunk m n) (vector-ref m (+ 5 (* n 4))))
218 (define (%irregex-match-end-index m n) (vector-ref m (+ 6 (* n 4))))
219 (define (%irregex-match-fail m) (vector-ref m (- (vector-length m) 1)))
220 (define (%irregex-match-fail-set! m x) (vector-set! m (- (vector-length m) 1) x))
221 (define (irregex-match-valid-numeric-index? m n)
222 (and (>= n 0) (< (+ 3 (* n 4)) (- (vector-length m) 4))))
223 (define (irregex-match-matched-numeric-index? m n)
224 (and (vector-ref m (+ 4 (* n 4)))
225 #t)))))
226
227(define (irregex-match-valid-named-index? m n)
228 (and (assq n (irregex-match-names m))
229 #t))
230
231;; public interface with error checking
232(define (irregex-match-start-chunk m . opt)
233 (let ((n (irregex-match-numeric-index 'irregex-match-start-chunk m opt)))
234 (and n (%irregex-match-start-chunk m n))))
235(define (irregex-match-start-index m . opt)
236 (let ((n (irregex-match-numeric-index 'irregex-match-start-index m opt)))
237 (and n (%irregex-match-start-index m n))))
238(define (irregex-match-end-chunk m . opt)
239 (let ((n (irregex-match-numeric-index 'irregex-match-end-chunk m opt)))
240 (and n (%irregex-match-end-chunk m n))))
241(define (irregex-match-end-index m . opt)
242 (let ((n (irregex-match-numeric-index 'irregex-match-end-index m opt)))
243 (and n (%irregex-match-end-index m n))))
244
245(define (irregex-match-start-chunk-set! m n start)
246 (vector-set! m (+ 3 (* n 4)) start))
247(define (irregex-match-start-index-set! m n start)
248 (vector-set! m (+ 4 (* n 4)) start))
249(define (irregex-match-end-chunk-set! m n end)
250 (vector-set! m (+ 5 (* n 4)) end))
251(define (irregex-match-end-index-set! m n end)
252 (vector-set! m (+ 6 (* n 4)) end))
253
254;; Tags use indices that are aligned to start/end positions just like the
255;; match vectors. ie, a tag 0 is a start tag, 1 is its corresponding end tag.
256;; They start at 0, which requires us to map them to submatch index 1.
257;; Sorry for the horrible name ;)
258(define (irregex-match-chunk&index-from-tag-set! m t chunk index)
259 (vector-set! m (+ 7 (* t 2)) chunk)
260 (vector-set! m (+ 8 (* t 2)) index))
261
262;; Helper procedure to convert any type of index from a rest args list
263;; to a numeric index. Named submatches are converted to their corresponding
264;; numeric index, and numeric submatches are checked for validity.
265;; An error is raised for invalid numeric or named indices, #f is returned
266;; for defined but nonmatching indices.
267(define (irregex-match-numeric-index location m opt)
268 (cond
269 ((not (irregex-match-data? m))
270 (error location "not match data" m))
271 ((not (pair? opt)) 0)
272 ((pair? (cdr opt))
273 (apply error location "too many arguments" m opt))
274 (else
275 (let ((n (car opt)))
276 (if (number? n)
277 (if (and (integer? n) (exact? n))
278 (if (irregex-match-valid-numeric-index? m n)
279 (and (irregex-match-matched-numeric-index? m n) n)
280 (error location "not a valid index" m n))
281 (error location "not an exact integer" n))
282 (let lp ((ls (irregex-match-names m))
283 (unknown? #t))
284 (cond
285 ((null? ls)
286 (and unknown?
287 (error location "unknown match name" n)))
288 ((eq? n (caar ls))
289 (if (%irregex-match-start-chunk m (cdar ls))
290 (cdar ls)
291 (lp (cdr ls) #f)))
292 (else (lp (cdr ls) unknown?)))))))))
293
294(define (irregex-match-valid-index? m n)
295 (if (not (irregex-match-data? m))
296 (error 'irregex-match-valid-index? "not match data" m))
297 (if (integer? n)
298 (if (not (exact? n))
299 (error 'irregex-match-valid-index? "not an exact integer" n)
300 (irregex-match-valid-numeric-index? m n))
301 (irregex-match-valid-named-index? m n)))
302
303(define (irregex-match-substring m . opt)
304 (let* ((n (irregex-match-numeric-index 'irregex-match-substring m opt))
305 (cnk (irregex-match-chunker m)))
306 (and n
307 ((chunker-get-substring cnk)
308 (%irregex-match-start-chunk m n)
309 (%irregex-match-start-index m n)
310 (%irregex-match-end-chunk m n)
311 (%irregex-match-end-index m n)))))
312
313(define (irregex-match-subchunk m . opt)
314 (let* ((n (irregex-match-numeric-index 'irregex-match-subchunk m opt))
315 (cnk (irregex-match-chunker m))
316 (get-subchunk (chunker-get-subchunk cnk)))
317 (if (not get-subchunk)
318 (error "this chunk type does not support match subchunks" m n)
319 (and n (get-subchunk
320 (%irregex-match-start-chunk m n)
321 (%irregex-match-start-index m n)
322 (%irregex-match-end-chunk m n)
323 (%irregex-match-end-index m n))))))
324
325;; chunkers tell us how to navigate through chained chunks of strings
326
327(define (make-irregex-chunker get-next get-str . o)
328 (let* ((get-start (or (and (pair? o) (car o)) (lambda (cnk) 0)))
329 (o (if (pair? o) (cdr o) o))
330 (get-end (or (and (pair? o) (car o))
331 (lambda (cnk) (string-length (get-str cnk)))))
332 (o (if (pair? o) (cdr o) o))
333 (get-substr
334 (or (and (pair? o) (car o))
335 (lambda (cnk1 start cnk2 end)
336 (if (eq? cnk1 cnk2)
337 (substring (get-str cnk1) start end)
338 (let loop ((cnk (get-next cnk1))
339 (res (list (substring (get-str cnk1)
340 start
341 (get-end cnk1)))))
342 (if (eq? cnk cnk2)
343 (string-cat-reverse
344 (cons (substring (get-str cnk)
345 (get-start cnk)
346 end)
347 res))
348 (loop (get-next cnk)
349 (cons (substring (get-str cnk)
350 (get-start cnk)
351 (get-end cnk))
352 res))))))))
353 (o (if (pair? o) (cdr o) o))
354 (get-subchunk (and (pair? o) (car o))))
355 (if (not (and (procedure? get-next) (procedure? get-str)
356 (procedure? get-start) (procedure? get-substr)))
357 (error 'make-irregex-chunker "expected a procdure"))
358 (vector get-next get-str get-start get-end get-substr get-subchunk)))
359
360(define (chunker-get-next cnk) (vector-ref cnk 0))
361(define (chunker-get-str cnk) (vector-ref cnk 1))
362(define (chunker-get-start cnk) (vector-ref cnk 2))
363(define (chunker-get-end cnk) (vector-ref cnk 3))
364(define (chunker-get-substring cnk) (vector-ref cnk 4))
365(define (chunker-get-subchunk cnk) (vector-ref cnk 5))
366
367(define (chunker-prev-chunk cnk start end)
368 (if (eq? start end)
369 #f
370 (let ((get-next (chunker-get-next cnk)))
371 (let lp ((start start))
372 (let ((next (get-next start)))
373 (if (eq? next end)
374 start
375 (and next (lp next))))))))
376
377(define (chunker-prev-char cnk start end)
378 (let ((prev (chunker-prev-chunk cnk start end)))
379 (and prev
380 (string-ref ((chunker-get-str cnk) prev)
381 (- ((chunker-get-end cnk) prev) 1)))))
382
383(define (chunker-next-char cnk src)
384 (let ((next ((chunker-get-next cnk) src)))
385 (and next
386 (string-ref ((chunker-get-str cnk) next)
387 ((chunker-get-start cnk) next)))))
388
389(define (chunk-before? cnk a b)
390 (and (not (eq? a b))
391 (let ((next ((chunker-get-next cnk) a)))
392 (and next
393 (if (eq? next b)
394 #t
395 (chunk-before? cnk next b))))))
396
397;; For look-behind searches, wrap an existing chunker such that it
398;; returns the same results but ends at a given point.
399(define (wrap-end-chunker cnk src i)
400 (make-irregex-chunker
401 (lambda (x) (and (not (eq? x src)) ((chunker-get-next cnk) x)))
402 (chunker-get-str cnk)
403 (chunker-get-start cnk)
404 (lambda (x)
405 ;; TODO: this is a hack workaround for the fact that we don't
406 ;; have either a notion of chunk equivalence or chunk truncation,
407 ;; until which time (neg-)look-behind in a fold won't work on
408 ;; non-basic chunks.
409 (if (or (eq? x src)
410 (and (not ((chunker-get-next cnk) x))
411 (not ((chunker-get-next cnk) src))))
412 i
413 ((chunker-get-end cnk) x)))
414 (chunker-get-substring cnk)
415 (chunker-get-subchunk cnk)))
416
417;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
418;;;; String Utilities
419
420;; Unicode version (skip surrogates)
421(define *all-chars*
422 `(/ ,(integer->char 0) ,(integer->char #xD7FF)
423 ,(integer->char #xE000) ,(integer->char #x10FFFF)))
424
425;; ASCII version, offset to not assume 0-255
426;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223))))
427
428;; set to #f to ignore even an explicit request for utf8 handling
429;; The utf8-mode is undesired on any implementation with native unicode support.
430;; It is a workaround for those that treat strings as a raw byte sequences, and
431;; does not work well otherwise. So disable it on implementations known to
432;; handle unicode natively.
433(define *allow-utf8-mode?* (cond-expand ((and chicken (not full-unicode)) #t)
434 (else #f)))
435
436;; (define *named-char-properties* '())
437
438(define (string-scan-char str c . o)
439 (let ((end (string-length str)))
440 (let scan ((i (if (pair? o) (car o) 0)))
441 (cond ((= i end) #f)
442 ((eqv? c (string-ref str i)) i)
443 (else (scan (+ i 1)))))))
444
445(define (string-scan-char-escape str c . o)
446 (let ((end (string-length str)))
447 (let scan ((i (if (pair? o) (car o) 0)))
448 (cond ((= i end) #f)
449 ((eqv? c (string-ref str i)) i)
450 ((eqv? c #\\) (scan (+ i 2)))
451 (else (scan (+ i 1)))))))
452
453(define (string-scan-pred str pred . o)
454 (let ((end (string-length str)))
455 (let scan ((i (if (pair? o) (car o) 0)))
456 (cond ((= i end) #f)
457 ((pred (string-ref str i)) i)
458 (else (scan (+ i 1)))))))
459
460(define (string-split-char str c)
461 (let ((end (string-length str)))
462 (let lp ((i 0) (from 0) (res '()))
463 (define (collect) (cons (substring str from i) res))
464 (cond ((>= i end) (reverse (collect)))
465 ((eqv? c (string-ref str i)) (lp (+ i 1) (+ i 1) (collect)))
466 (else (lp (+ i 1) from res))))))
467
468(define (char-alphanumeric? c)
469 (or (char-alphabetic? c) (char-numeric? c)))
470
471(define (%substring=? a b start1 start2 len)
472 (let lp ((i 0))
473 (cond ((>= i len)
474 #t)
475 ((char=? (string-ref a (+ start1 i)) (string-ref b (+ start2 i)))
476 (lp (+ i 1)))
477 (else
478 #f))))
479
480;; SRFI-13 extracts
481
482(define (%%string-copy! to tstart from fstart fend)
483 (do ((i fstart (+ i 1))
484 (j tstart (+ j 1)))
485 ((>= i fend))
486 (string-set! to j (string-ref from i))))
487
488(define (string-cat-reverse string-list)
489 (string-cat-reverse/aux
490 (fold (lambda (s a) (+ (string-length s) a)) 0 string-list)
491 string-list))
492
493(define (string-cat-reverse/aux len string-list)
494 (let ((res (make-string len)))
495 (let lp ((i len) (ls string-list))
496 (if (pair? ls)
497 (let* ((s (car ls))
498 (slen (string-length s))
499 (i (- i slen)))
500 (%%string-copy! res i s 0 slen)
501 (lp i (cdr ls)))))
502 res))
503
504;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
505;;;; List Utilities
506
507;; like the one-arg IOTA case
508(define (zero-to n)
509 (if (<= n 0)
510 '()
511 (let lp ((i (- n 1)) (res '()))
512 (if (zero? i) (cons 0 res) (lp (- i 1) (cons i res))))))
513
514;; SRFI-1 extracts (simplified 1-ary versions)
515
516(define (find pred ls)
517 (let lp ((ls ls))
518 (cond ((null? ls) #f)
519 ((pred (car ls)) (car ls))
520 (else (lp (cdr ls))))))
521
522(define (find-tail pred ls)
523 (let lp ((ls ls))
524 (cond ((null? ls) #f)
525 ((pred (car ls)) ls)
526 (else (lp (cdr ls))))))
527
528(define (last ls)
529 (if (not (pair? ls))
530 (error "can't take last of empty list")
531 (let lp ((ls ls))
532 (if (pair? (cdr ls))
533 (lp (cdr ls))
534 (car ls)))))
535
536(define (any pred ls)
537 (and (pair? ls)
538 (let lp ((head (car ls)) (tail (cdr ls)))
539 (if (null? tail)
540 (pred head)
541 (or (pred head) (lp (car tail) (cdr tail)))))))
542
543(define (every pred ls)
544 (or (null? ls)
545 (let lp ((head (car ls)) (tail (cdr ls)))
546 (if (null? tail)
547 (pred head)
548 (and (pred head) (lp (car tail) (cdr tail)))))))
549
550(define (fold kons knil ls)
551 (let lp ((ls ls) (res knil))
552 (if (null? ls)
553 res
554 (lp (cdr ls) (kons (car ls) res)))))
555
556(define (filter pred ls)
557 (let lp ((ls ls) (res '()))
558 (if (null? ls)
559 (reverse res)
560 (lp (cdr ls) (if (pred (car ls)) (cons (car ls) res) res)))))
561
562(define (remove pred ls)
563 (let lp ((ls ls) (res '()))
564 (if (null? ls)
565 (reverse res)
566 (lp (cdr ls) (if (pred (car ls)) res (cons (car ls) res))))))
567
568;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
569;;;; Flags
570
571(define (bit-shr n i)
572 (quotient n (expt 2 i)))
573
574(define (bit-shl n i)
575 (* n (expt 2 i)))
576
577(define (bit-not n) (- #xFFFF n))
578
579(define (bit-ior a b)
580 (cond
581 ((zero? a) b)
582 ((zero? b) a)
583 (else
584 (+ (if (or (odd? a) (odd? b)) 1 0)
585 (* 2 (bit-ior (quotient a 2) (quotient b 2)))))))
586
587(define (bit-and a b)
588 (cond
589 ((zero? a) 0)
590 ((zero? b) 0)
591 (else
592 (+ (if (and (odd? a) (odd? b)) 1 0)
593 (* 2 (bit-and (quotient a 2) (quotient b 2)))))))
594
595(define (integer-log n)
596 (define (b8 n r)
597 (if (>= n (bit-shl 1 8)) (b4 (bit-shr n 8) (+ r 8)) (b4 n r)))
598 (define (b4 n r)
599 (if (>= n (bit-shl 1 4)) (b2 (bit-shr n 4) (+ r 4)) (b2 n r)))
600 (define (b2 n r)
601 (if (>= n (bit-shl 1 2)) (b1 (bit-shr n 2) (+ r 2)) (b1 n r)))
602 (define (b1 n r) (if (>= n (bit-shl 1 1)) (+ r 1) r))
603 (if (>= n (bit-shl 1 16)) (b8 (bit-shr n 16) 16) (b8 n 0)))
604
605(define (flag-set? flags i)
606 (= i (bit-and flags i)))
607(define (flag-join a b)
608 (if b (bit-ior a b) a))
609(define (flag-clear a b)
610 (bit-and a (bit-not b)))
611
612(define ~none 0)
613(define ~searcher? 1)
614(define ~consumer? 2)
615
616;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
617;;;; Parsing Embedded SREs in PCRE Strings
618
619;; (define (with-read-from-string str i proc)
620;; (define (port-size in)
621;; (let lp ((i 0)) (if (eof-object? (read-char in)) i (lp (+ i 1)))))
622;; (let* ((len (string-length str))
623;; (tail-len (- len i))
624;; (in (open-input-string (substring str i len)))
625;; (sre (read in))
626;; (unused-len (port-size in)))
627;; (close-input-port in)
628;; (proc sre (- tail-len unused-len))))
629
630(define close-token (list 'close))
631(define dot-token (string->symbol "."))
632
633(define (with-read-from-string str i proc)
634 (define end (string-length str))
635 (define (read i k)
636 (cond
637 ((>= i end) (error "unterminated embedded SRE" str))
638 (else
639 (case (string-ref str i)
640 ((#\()
641 (let lp ((i (+ i 1)) (ls '()))
642 (read
643 i
644 (lambda (x j)
645 (cond
646 ((eq? x close-token)
647 (k (reverse ls) j))
648 ((eq? x dot-token)
649 (if (null? ls)
650 (error "bad dotted form" str)
651 (read j (lambda (y j2)
652 (read j2 (lambda (z j3)
653 (if (not (eq? z close-token))
654 (error "bad dotted form" str)
655 (k (append (reverse (cdr ls))
656 (cons (car ls) y))
657 j3))))))))
658 (else
659 (lp j (cons x ls))))))))
660 ((#\))
661 (k close-token (+ i 1)))
662 ((#\;)
663 (let skip ((i (+ i 1)))
664 (if (or (>= i end) (eqv? #\newline (string-ref str i)))
665 (read (+ i 1) k)
666 (skip (+ i 1)))))
667 ((#\' #\`)
668 (read (+ i 1)
669 (lambda (sexp j)
670 (let ((q (if (eqv? #\' (string-ref str i)) 'quote 'quasiquote)))
671 (k (list q sexp) j)))))
672 ((#\,)
673 (let* ((at? (and (< (+ i 1) end) (eqv? #\@ (string-ref str (+ i 1)))))
674 (u (if at? 'uquote-splicing 'unquote))
675 (j (if at? (+ i 2) (+ i 1))))
676 (read j (lambda (sexp j) (k (list u sexp) j)))))
677 ((#\")
678 (let scan ((from (+ i 1)) (i (+ i 1)) (res '()))
679 (define (collect)
680 (if (= from i) res (cons (substring str from i) res)))
681 (if (>= i end)
682 (error "unterminated string in embedded SRE" str)
683 (case (string-ref str i)
684 ((#\") (k (string-cat-reverse (collect)) (+ i 1)))
685 ((#\\) (scan (+ i 1) (+ i 2) (collect)))
686 (else (scan from (+ i 1) res))))))
687 ((#\#)
688 (case (string-ref str (+ i 1))
689 ((#\;)
690 (read (+ i 2) (lambda (sexp j) (read j k))))
691 ((#\\)
692 (read (+ i 2)
693 (lambda (sexp j)
694 (k (case sexp
695 ((space) #\space)
696 ((newline) #\newline)
697 (else (let ((s (if (number? sexp)
698 (number->string sexp)
699 (symbol->string sexp))))
700 (string-ref s 0))))
701 j))))
702 ((#\t #\f)
703 (k (eqv? #\t (string-ref str (+ i 1))) (+ i 2)))
704 (else
705 (error "bad # syntax in simplified SRE" i))))
706 (else
707 (cond
708 ((char-whitespace? (string-ref str i))
709 (read (+ i 1) k))
710 (else ;; symbol/number
711 (let scan ((j (+ i 1)))
712 (cond
713 ((or (>= j end)
714 (let ((c (string-ref str j)))
715 (or (char-whitespace? c)
716 (memv c '(#\; #\( #\) #\" #\# #\\)))))
717 (let ((str2 (substring str i j)))
718 (k (or (string->number str2) (string->symbol str2)) j)))
719 (else (scan (+ j 1))))))))))))
720 (read i (lambda (res j)
721 (if (eq? res 'close-token)
722 (error "unexpected ')' in SRE" str j)
723 (proc res j)))))
724
725;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
726;;;; Parsing PCRE Strings
727
728(define ~save? 1)
729(define ~case-insensitive? 2)
730(define ~multi-line? 4)
731(define ~single-line? 8)
732(define ~ignore-space? 16)
733(define ~utf8? 32)
734
735(define (symbol-list->flags ls)
736 (let lp ((ls ls) (res ~none))
737 (if (not (pair? ls))
738 res
739 (lp (cdr ls)
740 (flag-join
741 res
742 (case (car ls)
743 ((i ci case-insensitive) ~case-insensitive?)
744 ((m multi-line) ~multi-line?)
745 ((s single-line) ~single-line?)
746 ((x ignore-space) ~ignore-space?)
747 ((u utf8) (if *allow-utf8-mode?* ~utf8? ~none))
748 (else #f)))))))
749
750(define (maybe-string->sre obj)
751 (if (string? obj) (string->sre obj) obj))
752
753(define (string->sre str . o)
754 (if (not (string? str)) (error 'string->sre "expected a string" str))
755 (let ((end (string-length str))
756 (flags (symbol-list->flags o)))
757
758 (let lp ((i 0) (from 0) (flags flags) (res '()) (st '()))
759
760 ;; handle case sensitivity at the literal char/string level
761 (define (cased-char ch)
762 (if (and (flag-set? flags ~case-insensitive?)
763 (char-alphabetic? ch))
764 `(or ,ch ,(char-altcase ch))
765 ch))
766 (define (cased-string str)
767 (if (flag-set? flags ~case-insensitive?)
768 (sre-sequence (map cased-char (string->list str)))
769 str))
770 ;; accumulate the substring from..i as literal text
771 (define (collect)
772 (if (= i from) res (cons (cased-string (substring str from i)) res)))
773 ;; like collect but breaks off the last single character when
774 ;; collecting literal data, as the argument to ?/*/+ etc.
775 (define (collect/single)
776 (let* ((utf8? (flag-set? flags ~utf8?))
777 (j (if (and utf8? (> i 1))
778 (utf8-backup-to-initial-char str (- i 1))
779 (- i 1))))
780 (cond
781 ((< j from)
782 res)
783 (else
784 (let ((c (cased-char (if utf8?
785 (utf8-string-ref str j (- i j))
786 (string-ref str j)))))
787 (cond
788 ((= j from)
789 (cons c res))
790 (else
791 (cons c
792 (cons (cased-string (substring str from j))
793 res)))))))))
794 ;; collects for use as a result, reversing and grouping OR
795 ;; terms, and some ugly tweaking of `function-like' groups and
796 ;; conditionals
797 (define (collect/terms)
798 (let* ((ls (collect))
799 (func
800 (and (pair? ls)
801 (memq (last ls)
802 '(atomic if look-ahead neg-look-ahead
803 look-behind neg-look-behind
804 => submatch-named
805 w/utf8 w/noutf8))))
806 (prefix (if (and func (memq (car func) '(=> submatch-named)))
807 (list 'submatch-named (cadr (reverse ls)))
808 (and func (list (car func)))))
809 (ls (if func
810 (if (memq (car func) '(=> submatch-named))
811 (reverse (cddr (reverse ls)))
812 (reverse (cdr (reverse ls))))
813 ls)))
814 (let lp ((ls ls) (term '()) (res '()))
815 (define (shift)
816 (cons (sre-sequence term) res))
817 (cond
818 ((null? ls)
819 (let* ((res (sre-alternate (shift)))
820 (res (if (flag-set? flags ~save?)
821 (list 'submatch res)
822 res)))
823 (if prefix
824 (if (eq? 'if (car prefix))
825 (cond
826 ((not (pair? res))
827 'epsilon)
828 ((memq (car res)
829 '(look-ahead neg-look-ahead
830 look-behind neg-look-behind))
831 res)
832 ((eq? 'seq (car res))
833 `(if ,(cadr res)
834 ,(sre-sequence (cddr res))))
835 (else
836 `(if ,(cadadr res)
837 ,(sre-sequence (cddadr res))
838 ,(sre-alternate (cddr res)))))
839 `(,@prefix ,res))
840 res)))
841 ((eq? 'or (car ls)) (lp (cdr ls) '() (shift)))
842 (else (lp (cdr ls) (cons (car ls) term) res))))))
843 (define (save)
844 (cons (cons flags (collect)) st))
845
846 ;; main parsing
847 (if (>= i end)
848 (if (pair? st)
849 (error "unterminated parenthesis in regexp" str)
850 (collect/terms))
851 (let ((c (string-ref str i)))
852 (case c
853 ((#\.)
854 (lp (+ i 1) (+ i 1) flags
855 (cons (if (flag-set? flags ~single-line?) 'any 'nonl)
856 (collect))
857 st))
858 ((#\?)
859 (let ((res (collect/single)))
860 (if (null? res)
861 (error "? can't follow empty pattern" str res)
862 (let ((x (car res)))
863 (lp (+ i 1)
864 (+ i 1)
865 flags
866 (cons
867 (if (pair? x)
868 (case (car x)
869 ((*) `(*? ,@(cdr x)))
870 ((+) `(**? 1 #f ,@(cdr x)))
871 ((?) `(?? ,@(cdr x)))
872 ((**) `(**? ,@(cdr x)))
873 ((=) `(**? ,(cadr x) ,@(cdr x)))
874 ((>=) `(**? ,(cadr x) #f ,@(cddr x)))
875 (else `(? ,x)))
876 `(? ,x))
877 (cdr res))
878 st)))))
879 ((#\+ #\*)
880 (let* ((res (collect/single))
881 (x (if (pair? res) (car res) 'epsilon))
882 (op (string->symbol (string c))))
883 (cond
884 ((sre-repeater? x)
885 (error "duplicate repetition (e.g. **) in pattern" str res))
886 ((sre-empty? x)
887 (error "can't repeat empty pattern (e.g. ()*)" str res))
888 (else
889 (lp (+ i 1) (+ i 1) flags
890 (cons (list op x) (cdr res))
891 st)))))
892 ((#\()
893 (cond
894 ((>= (+ i 1) end)
895 (error "unterminated parenthesis in regexp" str))
896 ((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case
897 (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save)))
898 ((>= (+ i 2) end)
899 (error "unterminated parenthesis in regexp" str))
900 ((eqv? (string-ref str (+ i 1)) #\*)
901 (if (eqv? #\' (string-ref str (+ i 2)))
902 (with-read-from-string str (+ i 3)
903 (lambda (sre j)
904 (if (or (>= j end) (not (eqv? #\) (string-ref str j))))
905 (error "unterminated (*'...) SRE escape" str)
906 (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))))
907 (error "bad regexp syntax: (*FOO) not supported" str)))
908 (else ;; (?...) case
909 (case (string-ref str (+ i 2))
910 ((#\#)
911 (let ((j (string-scan-char str #\) (+ i 3))))
912 (lp (+ j i) (+ j 1) flags (collect) st)))
913 ((#\:)
914 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save)))
915 ((#\=)
916 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
917 '(look-ahead) (save)))
918 ((#\!)
919 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
920 '(neg-look-ahead) (save)))
921 ((#\<)
922 (cond
923 ((>= (+ i 3) end)
924 (error "unterminated parenthesis in regexp" str))
925 (else
926 (case (string-ref str (+ i 3))
927 ((#\=)
928 (lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
929 '(look-behind) (save)))
930 ((#\!)
931 (lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
932 '(neg-look-behind) (save)))
933 (else
934 (let ((j (and (char-alphabetic?
935 (string-ref str (+ i 3)))
936 (string-scan-char str #\> (+ i 4)))))
937 (if j
938 (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
939 `(,(string->symbol (substring str (+ i 3) j))
940 submatch-named)
941 (save))
942 (error "invalid (?< sequence" str))))))))
943 ((#\>)
944 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
945 '(atomic) (save)))
946 ;;((#\' #\P) ; named subpatterns
947 ;; )
948 ;;((#\R) ; recursion
949 ;; )
950 ((#\()
951 (cond
952 ((>= (+ i 3) end)
953 (error "unterminated parenthesis in regexp" str))
954 ((char-numeric? (string-ref str (+ i 3)))
955 (let* ((j (string-scan-char str #\) (+ i 3)))
956 (n (string->number (substring str (+ i 3) j))))
957 (if (not n)
958 (error "invalid conditional reference" str)
959 (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
960 `(,n if) (save)))))
961 ((char-alphabetic? (string-ref str (+ i 3)))
962 (let* ((j (string-scan-char str #\) (+ i 3)))
963 (s (string->symbol (substring str (+ i 3) j))))
964 (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
965 `(,s if) (save))))
966 (else
967 (lp (+ i 2) (+ i 2) (flag-clear flags ~save?)
968 '(if) (save)))))
969 ((#\{)
970 (error "unsupported Perl-style cluster" str))
971 (else
972 (let ((old-flags flags))
973 (let lp2 ((j (+ i 2)) (flags flags) (invert? #f))
974 (define (join x)
975 ((if invert? flag-clear flag-join) flags x))
976 (define (new-res res)
977 (let ((before (flag-set? old-flags ~utf8?))
978 (after (flag-set? flags ~utf8?)))
979 (if (eq? before after)
980 res
981 (cons (if after 'w/utf8 'w/noutf8) res))))
982 (cond
983 ((>= j end)
984 (error "incomplete cluster" str i))
985 (else
986 (case (string-ref str j)
987 ((#\i)
988 (lp2 (+ j 1) (join ~case-insensitive?) invert?))
989 ((#\m)
990 (lp2 (+ j 1) (join ~multi-line?) invert?))
991 ((#\x)
992 (lp2 (+ j 1) (join ~ignore-space?) invert?))
993 ((#\u)
994 (if *allow-utf8-mode?*
995 (lp2 (+ j 1) (join ~utf8?) invert?)
996 (lp2 (+ j 1) flags invert?)))
997 ((#\-)
998 (lp2 (+ j 1) flags (not invert?)))
999 ((#\))
1000 (lp (+ j 1) (+ j 1) flags (new-res (collect))
1001 st))
1002 ((#\:)
1003 (lp (+ j 1) (+ j 1) flags (new-res '())
1004 (cons (cons old-flags (collect)) st)))
1005 (else
1006 (error "unknown regex cluster modifier" str)
1007 )))))))))))
1008 ((#\))
1009 (if (null? st)
1010 (error "too many )'s in regexp" str)
1011 (lp (+ i 1)
1012 (+ i 1)
1013 (caar st)
1014 (cons (collect/terms) (cdar st))
1015 (cdr st))))
1016 ((#\[)
1017 (apply
1018 (lambda (sre j)
1019 (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))
1020 (string-parse-cset str (+ i 1) flags)))
1021 ((#\{)
1022 (cond
1023 ((or (>= (+ i 1) end)
1024 (not (or (char-numeric? (string-ref str (+ i 1)))
1025 (eqv? #\, (string-ref str (+ i 1))))))
1026 (lp (+ i 1) from flags res st))
1027 (else
1028 (let ((res (collect/single)))
1029 (cond
1030 ((null? res)
1031 (error "{ can't follow empty pattern"))
1032 (else
1033 (let* ((x (car res))
1034 (tail (cdr res))
1035 (j (string-scan-char str #\} (+ i 1)))
1036 (s2 (string-split-char (substring str (+ i 1) j)
1037 #\,))
1038 (n (string->number (car s2)))
1039 (m (and (pair? (cdr s2))
1040 (string->number (cadr s2)))))
1041 (cond
1042 ((or (not n)
1043 (and (pair? (cdr s2))
1044 (not (equal? "" (cadr s2)))
1045 (not m)))
1046 (error "invalid {n} repetition syntax" s2))
1047 ((null? (cdr s2))
1048 (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st))
1049 (m
1050 (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st))
1051 (else
1052 (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st)
1053 )))))))))
1054 ((#\\)
1055 (cond
1056 ((>= (+ i 1) end)
1057 (error "incomplete escape sequence" str))
1058 (else
1059 (let ((c (string-ref str (+ i 1))))
1060 (case c
1061 ((#\d)
1062 (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st))
1063 ((#\D)
1064 (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st))
1065 ((#\s)
1066 (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st))
1067 ((#\S)
1068 (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st))
1069 ((#\w)
1070 (lp (+ i 2) (+ i 2) flags
1071 `((or alphanumeric ("_")) ,@(collect)) st))
1072 ((#\W)
1073 (lp (+ i 2) (+ i 2) flags
1074 `((~ (or alphanumeric ("_"))) ,@(collect)) st))
1075 ((#\b)
1076 (lp (+ i 2) (+ i 2) flags
1077 `((or bow eow) ,@(collect)) st))
1078 ((#\B)
1079 (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st))
1080 ((#\A)
1081 (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st))
1082 ((#\Z)
1083 (lp (+ i 2) (+ i 2) flags
1084 `((? #\newline) eos ,@(collect)) st))
1085 ((#\z)
1086 (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st))
1087 ((#\R)
1088 (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st))
1089 ((#\K)
1090 (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st))
1091 ;; these two are from Emacs and TRE, but not in PCRE
1092 ((#\<)
1093 (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st))
1094 ((#\>)
1095 (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st))
1096 ((#\x)
1097 (apply
1098 (lambda (ch j)
1099 (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st))
1100 (string-parse-hex-escape str (+ i 2) end)))
1101 ((#\k)
1102 (let ((c (string-ref str (+ i 2))))
1103 (if (not (memv c '(#\< #\{ #\')))
1104 (error "bad \\k usage, expected \\k<...>" str)
1105 (let* ((terminal (char-mirror c))
1106 (j (string-scan-char str terminal (+ i 2)))
1107 (s (and j (substring str (+ i 3) j)))
1108 (backref
1109 (if (flag-set? flags ~case-insensitive?)
1110 'backref-ci
1111 'backref)))
1112 (if (not j)
1113 (error "unterminated named backref" str)
1114 (lp (+ j 1) (+ j 1) flags
1115 `((,backref ,(string->symbol s))
1116 ,@(collect))
1117 st))))))
1118 ((#\Q) ;; \Q..\E escapes
1119 (let ((res (collect)))
1120 (let lp2 ((j (+ i 2)))
1121 (cond
1122 ((>= j end)
1123 (lp j (+ i 2) flags res st))
1124 ((eqv? #\\ (string-ref str j))
1125 (cond
1126 ((>= (+ j 1) end)
1127 (lp (+ j 1) (+ i 2) flags res st))
1128 ((eqv? #\E (string-ref str (+ j 1)))
1129 (lp (+ j 2) (+ j 2) flags
1130 (cons (substring str (+ i 2) j) res) st))
1131 (else
1132 (lp2 (+ j 2)))))
1133 (else
1134 (lp2 (+ j 1)))))))
1135 ((#\')
1136 (with-read-from-string str (+ i 2)
1137 (lambda (sre j)
1138 (lp j j flags (cons sre (collect)) st))))
1139 ;;((#\p) ; XXXX unicode properties
1140 ;; )
1141 ;;((#\P)
1142 ;; )
1143 (else
1144 (cond
1145 ((char-numeric? c)
1146 (let* ((j (or (string-scan-pred
1147 str
1148 (lambda (c) (not (char-numeric? c)))
1149 (+ i 2))
1150 end))
1151 (backref
1152 (if (flag-set? flags ~case-insensitive?)
1153 'backref-ci
1154 'backref))
1155 (res `((,backref ,(string->number
1156 (substring str (+ i 1) j)))
1157 ,@(collect))))
1158 (lp j j flags res st)))
1159 ((char-alphabetic? c)
1160 (let ((cell (assv c posix-escape-sequences)))
1161 (if cell
1162 (lp (+ i 2) (+ i 2) flags
1163 (cons (cdr cell) (collect)) st)
1164 (error "unknown escape sequence" str c))))
1165 (else
1166 (lp (+ i 2) (+ i 1) flags (collect) st)))))))))
1167 ((#\|)
1168 (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st))
1169 ((#\^)
1170 (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos)))
1171 (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
1172 ((#\$)
1173 (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos)))
1174 (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
1175 ((#\space)
1176 (if (flag-set? flags ~ignore-space?)
1177 (lp (+ i 1) (+ i 1) flags (collect) st)
1178 (lp (+ i 1) from flags res st)))
1179 ((#\#)
1180 (if (flag-set? flags ~ignore-space?)
1181 (let ((j (or (string-scan-char str #\newline (+ i 1))
1182 (- end 1))))
1183 (lp (+ j 1) (+ j 1) flags (collect) st))
1184 (lp (+ i 1) from flags res st)))
1185 (else
1186 (lp (+ i 1) from flags res st))))))))
1187
1188(define posix-escape-sequences
1189 `((#\n . #\newline)
1190 (#\r . ,(integer->char (+ (char->integer #\newline) 3)))
1191 (#\t . ,(integer->char (- (char->integer #\newline) 1)))
1192 (#\a . ,(integer->char (- (char->integer #\newline) 3)))
1193 (#\e . ,(integer->char (+ (char->integer #\newline) #x11)))
1194 (#\f . ,(integer->char (+ (char->integer #\newline) 2)))
1195 ))
1196
1197(define (char-altcase c)
1198 (if (char-upper-case? c) (char-downcase c) (char-upcase c)))
1199
1200(define (char-mirror c)
1201 (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c)))
1202
1203(define (string-parse-hex-escape str i end)
1204 (cond
1205 ((>= i end)
1206 (error "incomplete hex escape" str i))
1207 ((eqv? #\{ (string-ref str i))
1208 (let ((j (string-scan-char-escape str #\} (+ i 1))))
1209 (if (not j)
1210 (error "incomplete hex brace escape" str i)
1211 (let* ((s (substring str (+ i 1) j))
1212 (n (string->number s 16)))
1213 (if n
1214 (list (integer->char n) j)
1215 (error "bad hex brace escape" s))))))
1216 ((>= (+ i 1) end)
1217 (error "incomplete hex escape" str i))
1218 (else
1219 (let* ((s (substring str i (+ i 2)))
1220 (n (string->number s 16)))
1221 (if n
1222 (list (integer->char n) (+ i 2))
1223 (error "bad hex escape" s))))))
1224
1225(define (string-parse-cset str start flags)
1226 (let* ((end (string-length str))
1227 (invert? (and (< start end) (eqv? #\^ (string-ref str start))))
1228 (utf8? (flag-set? flags ~utf8?)))
1229 (define (go i prev-char cset)
1230 (if (>= i end)
1231 (error "incomplete char set" str i end)
1232 (let ((c (string-ref str i)))
1233 (case c
1234 ((#\])
1235 (if (cset-empty? cset)
1236 (go (+ i 1) #\] (cset-adjoin cset #\]))
1237 (let ((ci? (flag-set? flags ~case-insensitive?)))
1238 (list
1239 (let ((res (if ci? (cset-case-insensitive cset) cset)))
1240 (cset->sre (if invert? (cset-complement res) res)))
1241 i))))
1242 ((#\-)
1243 (cond
1244 ((or (= i start)
1245 (and (= i (+ start 1)) (eqv? #\^ (string-ref str start)))
1246 (eqv? #\] (string-ref str (+ i 1))))
1247 (go (+ i 1) c (cset-adjoin cset c)))
1248 ((not prev-char)
1249 (error "bad char-set"))
1250 (else
1251 (let ((char (string-ref str (+ i 1))))
1252 (apply
1253 (lambda (c j)
1254 (if (char<? c prev-char)
1255 (error "inverted range in char-set" prev-char c)
1256 (go j #f (cset-union cset (range->cset prev-char c)))))
1257 (cond
1258 ((and (eqv? #\\ char) (assv char posix-escape-sequences))
1259 => (lambda (x) (list (cdr x) (+ i 3))))
1260 ((and (eqv? #\\ char)
1261 (eqv? (string-ref str (+ i 2)) #\x))
1262 (string-parse-hex-escape str (+ i 3) end))
1263 ((and utf8? (<= #x80 (char->integer char) #xFF))
1264 (let ((len (utf8-start-char->length char)))
1265 (list (utf8-string-ref str (+ i 1) len) (+ i 1 len))))
1266 (else
1267 (list char (+ i 2)))))))))
1268 ((#\[)
1269 (let* ((inv? (eqv? #\^ (string-ref str (+ i 1))))
1270 (i2 (if inv? (+ i 2) (+ i 1))))
1271 (case (string-ref str i2)
1272 ((#\:)
1273 (let ((j (string-scan-char str #\: (+ i2 1))))
1274 (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1)))))
1275 (error "incomplete character class" str)
1276 (let* ((class (sre->cset
1277 (string->symbol
1278 (substring str (+ i2 1) j))))
1279 (class (if inv? (cset-complement class) class)))
1280 (go (+ j 2) #f (cset-union cset class))))))
1281 ((#\= #\.)
1282 (error "collating sequences not supported" str))
1283 (else
1284 (go (+ i 1) #\[ (cset-adjoin cset #\[))))))
1285 ((#\\)
1286 (let ((c (string-ref str (+ i 1))))
1287 (case c
1288 ((#\d #\D #\s #\S #\w #\W)
1289 (go (+ i 2) #f
1290 (cset-union cset
1291 (sre->cset (string->sre (string #\\ c))))))
1292 ((#\x)
1293 (apply
1294 (lambda (ch j)
1295 (go j ch (cset-adjoin cset ch)))
1296 (string-parse-hex-escape str (+ i 2) end)))
1297 (else
1298 (let ((c (cond ((assv c posix-escape-sequences) => cdr)
1299 (else c))))
1300 (go (+ i 2) c (cset-adjoin cset c)))))))
1301 (else
1302 (if (and utf8? (<= #x80 (char->integer c) #xFF))
1303 (let ((len (utf8-start-char->length c)))
1304 (go (+ i len)
1305 (utf8-string-ref str i len)
1306 (cset-adjoin cset (utf8-string-ref str i len))))
1307 (go (+ i 1) c (cset-adjoin cset c))))))))
1308 (if invert?
1309 (go (+ start 1)
1310 #f
1311 (if (flag-set? flags ~multi-line?)
1312 (char->cset #\newline)
1313 (make-cset)))
1314 (go start #f (make-cset)))))
1315
1316;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1317;;;; UTF-8 Utilities
1318
1319;; Here are some hairy optimizations that need to be documented
1320;; better. Thanks to these, we never do any utf8 processing once the
1321;; regexp is compiled.
1322
1323;; two chars: ab..ef
1324;; a[b..xFF]|[b-d][x80..xFF]|e[x80..xFF]
1325
1326;; three chars: abc..ghi
1327;; ab[c..xFF]|a[d..xFF][x80..xFF]|
1328;; [b..f][x80..xFF][x80..xFF]|
1329;; g[x80..g][x80..xFF]|gh[x80..i]
1330
1331;; four chars: abcd..ghij
1332;; abc[d..xFF]|ab[d..xFF][x80..xFF]|a[c..xFF][x80..xFF][x80..xFF]|
1333;; [b..f][x80..xFF][x80..xFF][x80..xFF]|
1334;; g[x80..g][x80..xFF][x80..xFF]|gh[x80..h][x80..xFF]|ghi[x80..j]
1335
1336(define (high-char? c) (<= #x80 (char->integer c)))
1337
1338;; number of total bytes in a utf8 char given the 1st byte
1339
1340(define utf8-start-char->length
1341 (let ((table '#(
13421 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x
13431 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x
13441 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x
13451 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x
13461 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x
13471 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x
13481 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x
13491 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x
13501 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x
13511 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x
13521 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax
13531 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx
13542 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx
13552 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx
13563 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex
13574 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx
1358)))
1359 (lambda (c) (vector-ref table (char->integer c)))))
1360
1361(define (utf8-string-ref str i len)
1362 (define (byte n) (char->integer (string-ref str n)))
1363 (case len
1364 ((1) ; shouldn't happen in this module
1365 (string-ref str i))
1366 ((2)
1367 (integer->char
1368 (+ (bit-shl (bit-and (byte i) #b00011111) 6)
1369 (bit-and (byte (+ i 1)) #b00111111))))
1370 ((3)
1371 (integer->char
1372 (+ (bit-shl (bit-and (byte i) #b00001111) 12)
1373 (bit-shl (bit-and (byte (+ i 1)) #b00111111) 6)
1374 (bit-and (byte (+ i 2)) #b00111111))))
1375 ((4)
1376 (integer->char
1377 (+ (bit-shl (bit-and (byte i) #b00000111) 18)
1378 (bit-shl (bit-and (byte (+ i 1)) #b00111111) 12)
1379 (bit-shl (bit-and (byte (+ i 2)) #b00111111) 6)
1380 (bit-and (byte (+ i 3)) #b00111111))))
1381 (else
1382 (error "invalid utf8 length" str len i))))
1383
1384(define (utf8-backup-to-initial-char str i)
1385 (let lp ((i i))
1386 (if (= i 0)
1387 0
1388 (let ((c (char->integer (string-ref str i))))
1389 (if (or (< c #x80) (>= c #xC0))
1390 i
1391 (lp (- i 1)))))))
1392
1393(define (utf8-lowest-digit-of-length len)
1394 (case len
1395 ((1) 0) ((2) #xC0) ((3) #xE0) ((4) #xF0)
1396 (else (error "invalid utf8 length" len))))
1397
1398(define (utf8-highest-digit-of-length len)
1399 (case len
1400 ((1) #x7F) ((2) #xDF) ((3) #xEF) ((4) #xF7)
1401 (else (error "invalid utf8 length" len))))
1402
1403(define (char->utf8-list c)
1404 (let ((i (char->integer c)))
1405 (cond
1406 ((<= i #x7F) (list i))
1407 ((<= i #x7FF)
1408 (list (bit-ior #b11000000 (bit-shr i 6))
1409 (bit-ior #b10000000 (bit-and i #b111111))))
1410 ((<= i #xFFFF)
1411 (list (bit-ior #b11100000 (bit-shr i 12))
1412 (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111))
1413 (bit-ior #b10000000 (bit-and i #b111111))))
1414 ((<= i #x1FFFFF)
1415 (list (bit-ior #b11110000 (bit-shr i 18))
1416 (bit-ior #b10000000 (bit-and (bit-shr i 12) #b111111))
1417 (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111))
1418 (bit-ior #b10000000 (bit-and i #b111111))))
1419 (else (error "unicode codepoint out of range:" i)))))
1420
1421(define (unicode-range->utf8-pattern lo hi)
1422 (let ((lo-ls (char->utf8-list lo))
1423 (hi-ls (char->utf8-list hi)))
1424 (if (not (= (length lo-ls) (length hi-ls)))
1425 (sre-alternate (list (unicode-range-climb-digits lo-ls hi-ls)
1426 (unicode-range-up-to hi-ls)))
1427 (let lp ((lo-ls lo-ls) (hi-ls hi-ls))
1428 (cond
1429 ((= (car lo-ls) (car hi-ls))
1430 (sre-sequence
1431 (cons (integer->char (car lo-ls))
1432 (if (null? (cdr lo-ls)) '()
1433 (cons (lp (cdr lo-ls) (cdr hi-ls)) '())))))
1434 ((= (+ (car lo-ls) 1) (car hi-ls))
1435 (sre-alternate (list (unicode-range-up-from lo-ls)
1436 (unicode-range-up-to hi-ls))))
1437 (else
1438 (sre-alternate (list (unicode-range-up-from lo-ls)
1439 (unicode-range-middle lo-ls hi-ls)
1440 (unicode-range-up-to hi-ls)))))))))
1441
1442(define (unicode-range-helper one ls prefix res)
1443 (if (null? ls)
1444 res
1445 (unicode-range-helper
1446 one
1447 (cdr ls)
1448 (cons (car ls) prefix)
1449 (cons (sre-sequence
1450 `(,@(map integer->char prefix)
1451 ,(one (car ls))
1452 ,@(map (lambda (_)
1453 `(/ ,(integer->char #x80)
1454 ,(integer->char #xFF)))
1455 (cdr ls))))
1456 res))))
1457
1458(define (unicode-range-up-from lo-ls)
1459 (sre-sequence
1460 (list (integer->char (car lo-ls))
1461 (sre-alternate
1462 (unicode-range-helper
1463 (lambda (c)
1464 `(/ ,(integer->char (+ (car lo-ls) 1)) ,(integer->char #xFF)))
1465 (cdr (reverse (cdr lo-ls)))
1466 '()
1467 (list
1468 (sre-sequence
1469 (append
1470 (map integer->char (reverse (cdr (reverse (cdr lo-ls)))))
1471 `((/ ,(integer->char (last lo-ls))
1472 ,(integer->char #xFF)))))))))))
1473
1474(define (unicode-range-up-to hi-ls)
1475 (sre-sequence
1476 (list (integer->char (car hi-ls))
1477 (sre-alternate
1478 (unicode-range-helper
1479 (lambda (c)
1480 `(/ ,(integer->char #x80) ,(integer->char (- (car hi-ls) 1))))
1481 (cdr (reverse (cdr hi-ls)))
1482 '()
1483 (list
1484 (sre-sequence
1485 (append
1486 (map integer->char (reverse (cdr (reverse (cdr hi-ls)))))
1487 `((/ ,(integer->char #x80)
1488 ,(integer->char (last hi-ls))))))))))))
1489
1490(define (unicode-range-climb-digits lo-ls hi-ls)
1491 (let ((lo-len (length lo-ls)))
1492 (sre-alternate
1493 (append
1494 (list
1495 (sre-sequence
1496 (cons `(/ ,(integer->char (car lo-ls))
1497 ,(integer->char (if (<= (car lo-ls) #x7F) #x7F #xFF)))
1498 (map (lambda (_)
1499 `(/ ,(integer->char #x80) ,(integer->char #xFF)))
1500 (cdr lo-ls)))))
1501 (map
1502 (lambda (i)
1503 (sre-sequence
1504 (cons
1505 `(/ ,(integer->char (utf8-lowest-digit-of-length (+ i lo-len 1)))
1506 ,(integer->char (utf8-highest-digit-of-length (+ i lo-len 1))))
1507 (map (lambda (_)
1508 `(/ ,(integer->char #x80) ,(integer->char #xFF)))
1509 (zero-to (+ i lo-len))))))
1510 (zero-to (- (length hi-ls) (+ lo-len 1))))
1511 (list
1512 (sre-sequence
1513 (cons `(/ ,(integer->char
1514 (utf8-lowest-digit-of-length
1515 (utf8-start-char->length
1516 (integer->char (- (car hi-ls) 1)))))
1517 ,(integer->char (- (car hi-ls) 1)))
1518 (map (lambda (_)
1519 `(/ ,(integer->char #x80) ,(integer->char #xFF)))
1520 (cdr hi-ls)))))))))
1521
1522(define (unicode-range-middle lo-ls hi-ls)
1523 (let ((lo (integer->char (+ (car lo-ls) 1)))
1524 (hi (integer->char (- (car hi-ls) 1))))
1525 (sre-sequence
1526 (cons (if (char=? lo hi) lo `(/ ,lo ,hi))
1527 (map (lambda (_) `(/ ,(integer->char #x80) ,(integer->char #xFF)))
1528 (cdr lo-ls))))))
1529
1530;; Maybe this should just modify the input?
1531(define (cset->utf8-pattern cset)
1532 (let lp ((ls (cset->plist cset)) (alts '()) (lo-cset '()))
1533 (if (null? ls)
1534 (sre-alternate (append (reverse alts)
1535 (if (null? lo-cset)
1536 '()
1537 (list (cons '/ (reverse lo-cset))))))
1538 (if (or (high-char? (car ls)) (high-char? (cadr ls)))
1539 (lp (cddr ls)
1540 (cons (unicode-range->utf8-pattern (car ls) (cadr ls)) alts)
1541 lo-cset)
1542 (lp (cddr ls) alts (cons (cadr ls) (cons (car ls) lo-cset)))))))
1543
1544(define (sre-adjust-utf8 sre flags)
1545 (let adjust ((sre sre)
1546 (utf8? (flag-set? flags ~utf8?))
1547 (ci? (flag-set? flags ~case-insensitive?)))
1548 (define (rec sre) (adjust sre utf8? ci?))
1549 (cond
1550 ((pair? sre)
1551 (case (car sre)
1552 ((w/utf8) (adjust (sre-sequence (cdr sre)) #t ci?))
1553 ((w/noutf8) (adjust (sre-sequence (cdr sre)) #f ci?))
1554 ((w/case)
1555 (cons (car sre) (map (lambda (s) (adjust s utf8? #f)) (cdr sre))))
1556 ((w/nocase)
1557 (cons (car sre) (map (lambda (s) (adjust s utf8? #t)) (cdr sre))))
1558 ((/ ~ & -)
1559 (if (not utf8?)
1560 sre
1561 (let ((cset (sre->cset sre ci?)))
1562 (if (any high-char? (cset->plist cset))
1563 (if ci?
1564 (list 'w/case (cset->utf8-pattern cset))
1565 (cset->utf8-pattern cset))
1566 sre))))
1567 ((*)
1568 (case (sre-sequence (cdr sre))
1569 ;; special case optimization: .* w/utf8 == .* w/noutf8
1570 ((any) '(* any))
1571 ((nonl) '(* nonl))
1572 (else (cons '* (map rec (cdr sre))))))
1573 (else
1574 (cons (car sre) (map rec (cdr sre))))))
1575 (else
1576 (case sre
1577 ((any) (if utf8? 'utf8-any 'any))
1578 ((nonl) (if utf8? 'utf8-nonl 'nonl))
1579 (else
1580 (if (and utf8? (char? sre) (high-char? sre))
1581 (sre-sequence (map integer->char (char->utf8-list sre)))
1582 sre)))))))
1583
1584;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1585;;;; Compilation
1586
1587(cond-expand
1588 (chicken-bootstrap
1589 (define-syntax cached
1590 (syntax-rules ()
1591 ((_ arg fail) (build-cache 5 arg fail)))))
1592 (else
1593 (define-syntax cached
1594 (syntax-rules ()
1595 ((_ arg fail) fail)))))
1596
1597(define (irregex x . o)
1598 (cond ((irregex? x) x)
1599 ((null? o)
1600 (cached
1601 x
1602 (if (string? x)
1603 (string->irregex x)
1604 (sre->irregex x))))
1605 (else
1606 (if (string? x)
1607 (apply string->irregex x o)
1608 (apply sre->irregex x o)))))
1609
1610(define (string->irregex str . o)
1611 (apply sre->irregex (apply string->sre str o) o))
1612
1613(define (sre->irregex sre . o)
1614 (let* ((pat-flags (symbol-list->flags o))
1615 (sre (if *allow-utf8-mode?*
1616 (sre-adjust-utf8 sre pat-flags)
1617 sre))
1618 (searcher? (sre-searcher? sre))
1619 (sre-dfa (if searcher? (sre-remove-initial-bos sre) sre))
1620 (dfa-limit (cond ((memq 'small o) 1) ((memq 'fast o) 50) (else 10)))
1621 ;; TODO: Maybe make these two promises; if we only want to search,
1622 ;; it's wasteful to compile the matcher, and vice versa
1623 ;; Maybe provide a flag to compile eagerly, to help benchmarking etc.
1624 (dfa/search
1625 (cond ((memq 'backtrack o) #f)
1626 (searcher? #t)
1627 ((sre->nfa `(seq (* any) ,sre-dfa) pat-flags)
1628 => (lambda (nfa)
1629 (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa)))))
1630 (else #f)))
1631 (dfa (cond ((and dfa/search (sre->nfa sre-dfa pat-flags))
1632 => (lambda (nfa)
1633 (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa)))))
1634 (else #f)))
1635 (submatches (sre-count-submatches sre-dfa))
1636 (names (sre-names sre-dfa 1 '()))
1637 (lens (sre-length-ranges sre-dfa names))
1638 (flags (flag-join
1639 (flag-join ~none (and searcher? ~searcher?))
1640 (and (sre-consumer? sre) ~consumer?))))
1641 (cond
1642 (dfa
1643 (make-irregex dfa dfa/search #f flags submatches lens names))
1644 (else
1645 (let ((f (sre->procedure sre pat-flags names)))
1646 (make-irregex #f #f f flags submatches lens names))))))
1647
1648;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1649;;;; SRE Analysis
1650
1651;; returns #t if the sre can ever be empty
1652(define (sre-empty? sre)
1653 (if (pair? sre)
1654 (case (car sre)
1655 ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t)
1656 ((**) (or (not (number? (cadr sre))) (zero? (cadr sre))))
1657 ((or) (any sre-empty? (cdr sre)))
1658 ((: seq $ submatch => submatch-named + atomic)
1659 (every sre-empty? (cdr sre)))
1660 (else #f))
1661 (memq sre '(epsilon bos eos bol eol bow eow commit))))
1662
1663(define (sre-any? sre)
1664 (or (eq? sre 'any)
1665 (and (pair? sre)
1666 (case (car sre)
1667 ((seq : $ submatch => submatch-named)
1668 (and (pair? (cdr sre)) (null? (cddr sre)) (sre-any? (cadr sre))))
1669 ((or) (every sre-any? (cdr sre)))
1670 (else #f)))))
1671
1672(define (sre-repeater? sre)
1673 (and (pair? sre)
1674 (or (memq (car sre) '(* +))
1675 (and (memq (car sre) '($ submatch => submatch-named seq :))
1676 (pair? (cdr sre))
1677 (null? (cddr sre))
1678 (sre-repeater? (cadr sre))))))
1679
1680(define (sre-bos? sre)
1681 (if (pair? sre)
1682 (case (car sre)
1683 ((seq : $ submatch => submatch-named)
1684 (and (pair? (cdr sre)) (sre-bos? (cadr sre))))
1685 ((or) (every sre-bos? (cdr sre)))
1686 (else #f))
1687 (eq? 'bos sre)))
1688
1689;; a searcher doesn't need explicit iteration to find the first match
1690(define (sre-searcher? sre)
1691 (or (sre-bos? sre)
1692 (and (pair? sre)
1693 (case (car sre)
1694 ((* +) (sre-any? (sre-sequence (cdr sre))))
1695 ((seq : $ submatch => submatch-named)
1696 (and (pair? (cdr sre)) (sre-searcher? (cadr sre))))
1697 ((or) (every sre-searcher? (cdr sre)))
1698 (else #f)))))
1699
1700;; a consumer doesn't need to match more than once
1701(define (sre-consumer? sre)
1702 (or (sre-bos? sre)
1703 (and (pair? sre)
1704 (case (car sre)
1705 ((* +) (sre-any? (sre-sequence (cdr sre))))
1706 ((seq : $ submatch => submatch-named)
1707 (and (pair? (cdr sre)) (sre-consumer? (last sre))))
1708 ((or) (every sre-consumer? (cdr sre)))
1709 (else #f)))))
1710
1711(define (sre-has-submatches? sre)
1712 (and (pair? sre)
1713 (or (memq (car sre) '($ submatch => submatch-named))
1714 (if (eq? 'posix-string (car sre))
1715 (sre-has-submatches? (string->sre (cadr sre)))
1716 (any sre-has-submatches? (cdr sre))))))
1717
1718(define (sre-count-submatches sre)
1719 (let count ((sre sre) (sum 0))
1720 (if (pair? sre)
1721 (fold count
1722 (+ sum (case (car sre)
1723 (($ submatch => submatch-named) 1)
1724 ((dsm) (+ (cadr sre) (caddr sre)))
1725 ((posix-string)
1726 (sre-count-submatches (string->sre (cadr sre))))
1727 (else 0)))
1728 (cdr sre))
1729 sum)))
1730
1731(define (sre-length-ranges sre . o)
1732 (let ((names (if (pair? o) (car o) (sre-names sre 1 '())))
1733 (sublens (make-vector (+ 1 (sre-count-submatches sre)) #f)))
1734 (vector-set!
1735 sublens
1736 0
1737 (let lp ((sre sre) (n 1) (lo 0) (hi 0) (return cons))
1738 (define (grow i) (return (+ lo i) (and hi (+ hi i))))
1739 (cond
1740 ((pair? sre)
1741 (if (string? (car sre))
1742 (grow 1)
1743 (case (car sre)
1744 ((/ ~ & -)
1745 (grow 1))
1746 ((posix-string)
1747 (lp (string->sre (cadr sre)) n lo hi return))
1748 ((seq : w/case w/nocase atomic)
1749 (let lp2 ((ls (cdr sre)) (n n) (lo2 0) (hi2 0))
1750 (if (null? ls)
1751 (return (+ lo lo2) (and hi hi2 (+ hi hi2)))
1752 (lp (car ls) n 0 0
1753 (lambda (lo3 hi3)
1754 (lp2 (cdr ls)
1755 (+ n (sre-count-submatches (car ls)))
1756 (+ lo2 lo3)
1757 (and hi2 hi3 (+ hi2 hi3))))))))
1758 ((or)
1759 (let lp2 ((ls (cdr sre)) (n n) (lo2 #f) (hi2 0))
1760 (if (null? ls)
1761 (return (+ lo (or lo2 1)) (and hi hi2 (+ hi hi2)))
1762 (lp (car ls) n 0 0
1763 (lambda (lo3 hi3)
1764 (lp2 (cdr ls)
1765 (+ n (sre-count-submatches (car ls)))
1766 (if lo2 (min lo2 lo3) lo3)
1767 (and hi2 hi3 (max hi2 hi3))))))))
1768 ((if)
1769 (cond
1770 ((or (null? (cdr sre)) (null? (cddr sre)))
1771 (return lo hi))
1772 (else
1773 (let ((n1 (sre-count-submatches (car sre)))
1774 (n2 (sre-count-submatches (cadr sre))))
1775 (lp (if (or (number? (cadr sre)) (symbol? (cadr sre)))
1776 'epsilon
1777 (cadr sre))
1778 n lo hi
1779 (lambda (lo2 hi2)
1780 (lp (caddr sre) (+ n n1) 0 0
1781 (lambda (lo3 hi3)
1782 (lp (if (pair? (cdddr sre))
1783 (cadddr sre)
1784 'epsilon)
1785 (+ n n1 n2) 0 0
1786 (lambda (lo4 hi4)
1787 (return (+ lo2 (min lo3 lo4))
1788 (and hi2 hi3 hi4
1789 (+ hi2 (max hi3 hi4))
1790 ))))))))))))
1791 ((dsm)
1792 (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) lo hi return))
1793 (($ submatch => submatch-named)
1794 (lp (sre-sequence
1795 (if (eq? 'submatch (car sre)) (cdr sre) (cddr sre)))
1796 (+ n 1) lo hi
1797 (lambda (lo2 hi2)
1798 (vector-set! sublens n (cons lo2 hi2))
1799 (return lo2 hi2))))
1800 ((backref backref-ci)
1801 (let ((n (cond
1802 ((number? (cadr sre)) (cadr sre))
1803 ((assq (cadr sre) names) => cdr)
1804 (else (error "unknown backreference" (cadr sre))))))
1805 (cond
1806 ((or (not (integer? n))
1807 (not (< 0 n (vector-length sublens))))
1808 (error 'sre-length "invalid backreference" sre))
1809 ((not (vector-ref sublens n))
1810 (error 'sre-length "invalid forward backreference" sre))
1811 (else
1812 (let ((lo2 (car (vector-ref sublens n)))
1813 (hi2 (cdr (vector-ref sublens n))))
1814 (return (+ lo lo2) (and hi hi2 (+ hi hi2))))))))
1815 ((* *?)
1816 (lp (sre-sequence (cdr sre)) n lo hi (lambda (lo hi) #f))
1817 (return lo #f))
1818 ((** **?)
1819 (cond
1820 ((or (and (number? (cadr sre))
1821 (number? (caddr sre))
1822 (> (cadr sre) (caddr sre)))
1823 (and (not (cadr sre)) (caddr sre)))
1824 (return lo hi))
1825 (else
1826 (if (caddr sre)
1827 (lp (sre-sequence (cdddr sre)) n 0 0
1828 (lambda (lo2 hi2)
1829 (return (+ lo (* (cadr sre) lo2))
1830 (and hi hi2 (+ hi (* (caddr sre) hi2))))))
1831 (lp (sre-sequence (cdddr sre)) n 0 0
1832 (lambda (lo2 hi2)
1833 (return (+ lo (* (cadr sre) lo2)) #f)))))))
1834 ((+)
1835 (lp (sre-sequence (cdr sre)) n lo hi
1836 (lambda (lo2 hi2)
1837 (return (+ lo lo2) #f))))
1838 ((? ??)
1839 (lp (sre-sequence (cdr sre)) n lo hi
1840 (lambda (lo2 hi2)
1841 (return lo (and hi hi2 (+ hi hi2))))))
1842 ((= =? >= >=?)
1843 (lp `(** ,(cadr sre)
1844 ,(if (memq (car sre) '(>= >=?)) #f (cadr sre))
1845 ,@(cddr sre))
1846 n lo hi return))
1847 ((look-ahead neg-look-ahead look-behind neg-look-behind)
1848 (return lo hi))
1849 (else
1850 (cond
1851 ((assq (car sre) sre-named-definitions)
1852 => (lambda (cell)
1853 (lp (apply (cdr cell) (cdr sre)) n lo hi return)))
1854 (else
1855 (error 'sre-length-ranges "unknown sre operator" sre)))))))
1856 ((char? sre)
1857 (grow 1))
1858 ((string? sre)
1859 (grow (string-length sre)))
1860 ((memq sre '(any nonl))
1861 (grow 1))
1862 ((memq sre '(epsilon bos eos bol eol bow eow nwb commit))
1863 (return lo hi))
1864 (else
1865 (let ((cell (assq sre sre-named-definitions)))
1866 (if cell
1867 (lp (if (procedure? (cdr cell)) ((cdr cell)) (cdr cell))
1868 n lo hi return)
1869 (error 'sre-length-ranges "unknown sre" sre)))))))
1870 sublens))
1871
1872;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1873;;;; SRE Manipulation
1874
1875;; build a (seq ls ...) sre from a list
1876(define (sre-sequence ls)
1877 (cond
1878 ((null? ls) 'epsilon)
1879 ((null? (cdr ls)) (car ls))
1880 (else (cons 'seq ls))))
1881
1882;; build a (or ls ...) sre from a list
1883(define (sre-alternate ls)
1884 (cond
1885 ((null? ls) '(or))
1886 ((null? (cdr ls)) (car ls))
1887 (else (cons 'or ls))))
1888
1889;; returns an equivalent SRE without any match information
1890(define (sre-strip-submatches sre)
1891 (if (not (pair? sre))
1892 sre
1893 (case (car sre)
1894 (($ submatch) (sre-strip-submatches (sre-sequence (cdr sre))))
1895 ((=> submatch-named) (sre-strip-submatches (sre-sequence (cddr sre))))
1896 ((dsm) (sre-strip-submatches (sre-sequence (cdddr sre))))
1897 (else (map sre-strip-submatches sre)))))
1898
1899;; given a char-set list of chars and strings, flattens them into
1900;; chars only
1901(define (sre-flatten-ranges ls)
1902 (let lp ((ls ls) (res '()))
1903 (cond
1904 ((null? ls)
1905 (reverse res))
1906 ((string? (car ls))
1907 (lp (append (string->list (car ls)) (cdr ls)) res))
1908 (else
1909 (lp (cdr ls) (cons (car ls) res))))))
1910
1911(define (sre-names sre n names)
1912 (if (not (pair? sre))
1913 names
1914 (case (car sre)
1915 (($ submatch)
1916 (sre-names (sre-sequence (cdr sre)) (+ n 1) names))
1917 ((=> submatch-named)
1918 (sre-names (sre-sequence (cddr sre))
1919 (+ n 1)
1920 (cons (cons (cadr sre) n) names)))
1921 ((dsm)
1922 (sre-names (sre-sequence (cdddr sre)) (+ n (cadr sre)) names))
1923 ((seq : or * + ? *? ?? w/case w/nocase atomic
1924 look-ahead look-behind neg-look-ahead neg-look-behind)
1925 (sre-sequence-names (cdr sre) n names))
1926 ((= >=)
1927 (sre-sequence-names (cddr sre) n names))
1928 ((** **?)
1929 (sre-sequence-names (cdddr sre) n names))
1930 (else
1931 names))))
1932
1933(define (sre-sequence-names ls n names)
1934 (if (null? ls)
1935 names
1936 (sre-sequence-names (cdr ls)
1937 (+ n (sre-count-submatches (car ls)))
1938 (sre-names (car ls) n names))))
1939
1940(define (sre-remove-initial-bos sre)
1941 (cond
1942 ((pair? sre)
1943 (case (car sre)
1944 ((seq : $ submatch => submatch-named * +)
1945 (cond
1946 ((not (pair? (cdr sre)))
1947 sre)
1948 ((eq? 'bos (cadr sre))
1949 (cons (car sre) (cddr sre)))
1950 (else
1951 (cons (car sre)
1952 (cons (sre-remove-initial-bos (cadr sre)) (cddr sre))))))
1953 ((or)
1954 (sre-alternate (map sre-remove-initial-bos (cdr sre))))
1955 (else
1956 sre)))
1957 (else
1958 sre)))
1959
1960;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1961;;;; Basic Matching
1962
1963(define irregex-basic-string-chunker
1964 (make-irregex-chunker (lambda (x) #f)
1965 car
1966 cadr
1967 caddr
1968 (lambda (src1 i src2 j)
1969 (substring (car src1) i j))))
1970
1971(define (irregex-search x str . o)
1972 (if (not (string? str)) (error 'irregex-search "not a string" str))
1973 (let ((start (or (and (pair? o) (car o)) 0))
1974 (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str))))
1975 (if (not (and (integer? start) (exact? start)))
1976 (error 'irregex-search "not an exact integer" start))
1977 (if (not (and (integer? end) (exact? end)))
1978 (error 'irregex-search "not an exact integer" end))
1979 (irregex-search/chunked x
1980 irregex-basic-string-chunker
1981 (list str start end)
1982 start)))
1983
1984(define (irregex-search/chunked x cnk src . o)
1985 (let* ((irx (irregex x))
1986 (matches (irregex-new-matches irx))
1987 (i (if (pair? o) (car o) ((chunker-get-start cnk) src))))
1988 (if (not (integer? i)) (error 'irregex-search "not an integer" i))
1989 (irregex-match-chunker-set! matches cnk)
1990 (irregex-search/matches irx cnk (cons src i) src i matches)))
1991
1992;; internal routine, can be used in loops to avoid reallocating the
1993;; match vector
1994(define (irregex-search/matches irx cnk init src i matches)
1995 (cond
1996 ((irregex-dfa irx)
1997 (cond
1998 ((flag-set? (irregex-flags irx) ~searcher?)
1999 (cond
2000 ((dfa-match/longest (irregex-dfa irx) cnk src i #f #f matches 0)
2001 (irregex-match-start-chunk-set! matches 0 src)
2002 (irregex-match-start-index-set! matches 0 i)
2003 matches)
2004 (else
2005 #f)))
2006 ((dfa-match/shortest
2007 (irregex-dfa/search irx) cnk src i matches 0)
2008 (let ((dfa (irregex-dfa irx))
2009 (get-start (chunker-get-start cnk))
2010 (get-end (chunker-get-end cnk))
2011 (get-next (chunker-get-next cnk)))
2012 (let lp1 ((src src) (i i))
2013 (let ((end (get-end src)))
2014 (let lp2 ((i i))
2015 (cond
2016 ((dfa-match/longest dfa cnk src i #f #f matches 0)
2017 (irregex-match-start-chunk-set! matches 0 src)
2018 (irregex-match-start-index-set! matches 0 i)
2019 matches)
2020 ((>= i end)
2021 (let ((next (get-next src)))
2022 (and next (lp1 next (get-start next)))))
2023 (else
2024 (lp2 (+ i 1)))))))))
2025 (else
2026 #f)))
2027 (else
2028 (let ((res (irregex-search/backtrack irx cnk init src i matches)))
2029 (if res (%irregex-match-fail-set! res #f))
2030 res))))
2031
2032(define (irregex-search/backtrack irx cnk init src i matches)
2033 (let ((matcher (irregex-nfa irx))
2034 (str ((chunker-get-str cnk) src))
2035 (end ((chunker-get-end cnk) src))
2036 (get-next (chunker-get-next cnk)))
2037 (if (flag-set? (irregex-flags irx) ~searcher?)
2038 (matcher cnk init src str i end matches (lambda () #f))
2039 (let lp ((src2 src)
2040 (str str)
2041 (i i)
2042 (end end))
2043 (cond
2044 ((matcher cnk init src2 str i end matches (lambda () #f))
2045 (irregex-match-start-chunk-set! matches 0 src2)
2046 (irregex-match-start-index-set! matches 0 i)
2047 matches)
2048 ((< i end)
2049 (lp src2 str (+ i 1) end))
2050 (else
2051 (let ((src2 (get-next src2)))
2052 (if src2
2053 (lp src2
2054 ((chunker-get-str cnk) src2)
2055 ((chunker-get-start cnk) src2)
2056 ((chunker-get-end cnk) src2))
2057 #f))))))))
2058
2059(define (irregex-match irx str . o)
2060 (if (not (string? str)) (error 'irregex-match "not a string" str))
2061 (let ((start (or (and (pair? o) (car o)) 0))
2062 (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str))))
2063 (if (not (and (integer? start) (exact? start)))
2064 (error 'irregex-match "not an exact integer" start))
2065 (if (not (and (integer? end) (exact? end)))
2066 (error 'irregex-match "not an exact integer" end))
2067 (irregex-match/chunked irx
2068 irregex-basic-string-chunker
2069 (list str start end))))
2070
2071(define (irregex-match/chunked irx cnk src)
2072 (let* ((irx (irregex irx))
2073 (matches (irregex-new-matches irx)))
2074 (irregex-match-chunker-set! matches cnk)
2075 (cond
2076 ((irregex-dfa irx)
2077 (and
2078 (dfa-match/longest
2079 (irregex-dfa irx) cnk src ((chunker-get-start cnk) src) #f #f matches 0)
2080 (= ((chunker-get-end cnk) (%irregex-match-end-chunk matches 0))
2081 (%irregex-match-end-index matches 0))
2082 (begin
2083 (irregex-match-start-chunk-set! matches 0 src)
2084 (irregex-match-start-index-set! matches
2085 0
2086 ((chunker-get-start cnk) src))
2087 matches)))
2088 (else
2089 (let* ((matcher (irregex-nfa irx))
2090 (str ((chunker-get-str cnk) src))
2091 (i ((chunker-get-start cnk) src))
2092 (end ((chunker-get-end cnk) src))
2093 (init (cons src i)))
2094 (let lp ((m (matcher cnk init src str i end matches (lambda () #f))))
2095 (and m
2096 (cond
2097 ((and (not ((chunker-get-next cnk)
2098 (%irregex-match-end-chunk m 0)))
2099 (= ((chunker-get-end cnk)
2100 (%irregex-match-end-chunk m 0))
2101 (%irregex-match-end-index m 0)))
2102 (%irregex-match-fail-set! m #f)
2103 m)
2104 ((%irregex-match-fail m)
2105 (lp ((%irregex-match-fail m))))
2106 (else
2107 #f)))))))))
2108
2109(define (irregex-match? . args)
2110 (and (apply irregex-match args) #t))
2111
2112;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2113;;;; DFA Matching
2114
2115;; inline these
2116(define (dfa-init-state dfa)
2117 (vector-ref dfa 0))
2118(define (dfa-next-state dfa node)
2119 (vector-ref dfa (cadr node)))
2120(define (dfa-cell-commands dfa node)
2121 (cddr node))
2122(define (dfa-finalizer dfa state)
2123 (car state))
2124
2125;; this searches for the first end index for which a match is possible
2126(define (dfa-match/shortest dfa cnk src start matches index)
2127 (let ((get-str (chunker-get-str cnk))
2128 (get-start (chunker-get-start cnk))
2129 (get-end (chunker-get-end cnk))
2130 (get-next (chunker-get-next cnk))
2131 ;; Skip the "set-up" state, we don't need to set tags.
2132 (start-state (dfa-next-state dfa (cadr (dfa-init-state dfa)))))
2133 (let lp1 ((src src) (start start) (state start-state))
2134 (and
2135 src
2136 (let ((str (get-str src))
2137 (end (get-end src)))
2138 (let lp2 ((i start) (state state))
2139 (cond
2140 ((dfa-finalizer dfa state)
2141 (cond
2142 (index
2143 (irregex-match-end-chunk-set! matches index src)
2144 (irregex-match-end-index-set! matches index i)))
2145 #t)
2146 ((< i end)
2147 (let* ((ch (string-ref str i))
2148 (next (find (lambda (x)
2149 (or (eqv? ch (car x))
2150 (and (not (char? (car x)))
2151 (cset-contains? (car x) ch))))
2152 (cdr state))))
2153 (and next (lp2 (+ i 1) (dfa-next-state dfa next)))))
2154 (else
2155 (let ((next (get-next src)))
2156 (and next (lp1 next (get-start next) state)))))))))))
2157
2158(define (finalize! finalizer memory matches)
2159 (for-each
2160 (lambda (tag&slot)
2161 (let* ((tag (car tag&slot))
2162 (slot (vector-ref memory (cdr tag&slot)))
2163 (chunk&pos (vector-ref slot tag)))
2164 (irregex-match-chunk&index-from-tag-set!
2165 matches tag
2166 (and chunk&pos (car chunk&pos))
2167 (and chunk&pos (cdr chunk&pos)))))
2168 finalizer))
2169(define (make-initial-memory slots matches)
2170 (let ((size (* (irregex-match-num-submatches matches) 2))
2171 (memory (make-vector slots)))
2172 (do ((i 0 (+ i 1)))
2173 ((= i slots) memory)
2174 (vector-set! memory i (make-vector size #f)))))
2175
2176;; this finds the longest match starting at a given index
2177(define (dfa-match/longest dfa cnk src start end-src end matches index)
2178 (let* ((get-str (chunker-get-str cnk))
2179 (get-start (chunker-get-start cnk))
2180 (get-end (chunker-get-end cnk))
2181 (get-next (chunker-get-next cnk))
2182 (initial-state (dfa-init-state dfa))
2183 (memory-size (car initial-state))
2184 (submatches? (not (zero? memory-size)))
2185 ;; A vector of vectors, each of size <number of start/end submatches>
2186 (memory (make-initial-memory memory-size matches))
2187 (init-cell (cadr initial-state))
2188 (start-state (dfa-next-state dfa init-cell))
2189 (start-finalizer (dfa-finalizer dfa start-state)))
2190 (cond
2191 (index
2192 (irregex-match-end-chunk-set! matches index #f)
2193 (irregex-match-end-index-set! matches index #f)))
2194 (cond (submatches?
2195 (for-each (lambda (s)
2196 (let ((slot (vector-ref memory (cdr s))))
2197 (vector-set! slot (car s) (cons src start))))
2198 (cdr (dfa-cell-commands dfa init-cell)))))
2199 (let lp1 ((src src)
2200 (start start)
2201 (state start-state)
2202 (res-src (and start-finalizer src))
2203 (res-index (and start-finalizer start))
2204 (finalizer start-finalizer))
2205 (let ((str (get-str src))
2206 (end (if (eq? src end-src) end (get-end src))))
2207 (let lp2 ((i start)
2208 (state state)
2209 (res-src res-src)
2210 (res-index res-index)
2211 (finalizer finalizer))
2212 (cond
2213 ((>= i end)
2214 (cond
2215 ((and index res-src)
2216 (irregex-match-end-chunk-set! matches index res-src)
2217 (irregex-match-end-index-set! matches index res-index)))
2218 (let ((next (and (not (eq? src end-src)) (get-next src))))
2219 (if next
2220 (lp1 next (get-start next) state res-src res-index finalizer)
2221 (and index
2222 (%irregex-match-end-chunk matches index)
2223 (or (not finalizer) (finalize! finalizer memory matches))
2224 #t))))
2225 (else
2226 (let* ((ch (string-ref str i))
2227 (cell (find (lambda (x)
2228 (or (eqv? ch (car x))
2229 (and (not (char? (car x)))
2230 (cset-contains? (car x) ch))))
2231 (cdr state))))
2232 (cond
2233 (cell
2234 (let* ((next (dfa-next-state dfa cell))
2235 (new-finalizer (dfa-finalizer dfa next)))
2236 (cond
2237 (submatches?
2238 (let ((cmds (dfa-cell-commands dfa cell)))
2239 ;; Save match when we're moving from accepting state to
2240 ;; rejecting state; this could be the last accepting one.
2241 (cond ((and finalizer (not new-finalizer))
2242 (finalize! finalizer memory matches)))
2243 (for-each (lambda (s)
2244 (let ((slot (vector-ref memory (cdr s)))
2245 (chunk&position (cons src (+ i 1))))
2246 (vector-set! slot (car s) chunk&position)))
2247 (cdr cmds))
2248 ;; Reassigning commands may be in an order which
2249 ;; causes memory cells to be clobbered before
2250 ;; they're read out. Make 2 passes to maintain
2251 ;; old values by copying them into a closure.
2252 (for-each (lambda (execute!) (execute!))
2253 (map (lambda (c)
2254 (let* ((tag (vector-ref c 0))
2255 (ss (vector-ref memory (vector-ref c 1)))
2256 (ds (vector-ref memory (vector-ref c 2)))
2257 (value-from (vector-ref ss tag)))
2258 (lambda () (vector-set! ds tag value-from))))
2259 (car cmds))))))
2260 (if new-finalizer
2261 (lp2 (+ i 1) next src (+ i 1) new-finalizer)
2262 (lp2 (+ i 1) next res-src res-index #f))))
2263 (res-src
2264 (cond
2265 (index
2266 (irregex-match-end-chunk-set! matches index res-src)
2267 (irregex-match-end-index-set! matches index res-index)))
2268 (cond (finalizer (finalize! finalizer memory matches)))
2269 #t)
2270 ((and index (%irregex-match-end-chunk matches index))
2271 (cond (finalizer (finalize! finalizer memory matches)))
2272 #t)
2273 (else
2274 #f))))))))))
2275
2276;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2277;;;; Named Definitions
2278
2279(define sre-named-definitions
2280 `((any . ,*all-chars*)
2281 (nonl . (- ,*all-chars* (,(string #\newline))))
2282 (alphabetic . (/ #\a #\z #\A #\Z))
2283 (alpha . alphabetic)
2284 (alphanumeric . (/ #\a #\z #\A #\Z #\0 #\9))
2285 (alphanum . alphanumeric)
2286 (alnum . alphanumeric)
2287 (lower-case . (/ #\a #\z))
2288 (lower . lower-case)
2289 (upper-case . (/ #\A #\Z))
2290 (upper . upper-case)
2291 (numeric . (/ #\0 #\9))
2292 (num . numeric)
2293 (digit . numeric)
2294 (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\.
2295 #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\}))
2296 (punct . punctuation)
2297 (graphic
2298 . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~))
2299 (graph . graphic)
2300 (blank . (or #\space ,(integer->char (- (char->integer #\space) 23))))
2301 ;; 0B - vertical tab, 0C - form feed
2302 (whitespace . (or blank #\newline #\x0C #\return #\x0B))
2303 (space . whitespace)
2304 (white . whitespace)
2305 (printing . (or graphic whitespace))
2306 (print . printing)
2307
2308 ;; XXXX we assume a (possibly shifted) ASCII-based ordering
2309 (control . (/ ,(integer->char (- (char->integer #\space) 32))
2310 ,(integer->char (- (char->integer #\space) 1))))
2311 (cntrl . control)
2312 (hex-digit . (or numeric (/ #\a #\f #\A #\F)))
2313 (xdigit . hex-digit)
2314 (ascii . (/ ,(integer->char (- (char->integer #\space) 32))
2315 ,(integer->char (+ (char->integer #\space) 95))))
2316 (ascii-nonl . (/ ,(integer->char (- (char->integer #\space) 32))
2317 ,(integer->char (- (char->integer #\newline) 1))
2318 ,(integer->char (+ (char->integer #\newline) 1))
2319 ,(integer->char (+ (char->integer #\space) 95))))
2320 (newline . (or (seq ,(integer->char (+ (char->integer #\newline) 3))
2321 #\newline)
2322 (/ #\newline
2323 ,(integer->char (+ (char->integer #\newline) 3)))))
2324
2325 ;; ... it's really annoying to support old Scheme48
2326 (word . (seq bow (+ (or alphanumeric #\_)) eow))
2327 (utf8-tail-char . (/ ,(integer->char (+ (char->integer #\space) #x60))
2328 ,(integer->char (+ (char->integer #\space) #xA1))))
2329 (utf8-2-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xA2))
2330 ,(integer->char (+ (char->integer #\space) #xBF)))
2331 utf8-tail-char))
2332 (utf8-3-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xC0))
2333 ,(integer->char (+ (char->integer #\space) #xCF)))
2334 utf8-tail-char
2335 utf8-tail-char))
2336 (utf8-4-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xD0))
2337 ,(integer->char (+ (char->integer #\space) #xD7)))
2338 utf8-tail-char
2339 utf8-tail-char
2340 utf8-tail-char))
2341 (utf8-any . (or ascii utf8-2-char utf8-3-char utf8-4-char))
2342 (utf8-nonl . (or ascii-nonl utf8-2-char utf8-3-char utf8-4-char))
2343
2344 ;; extended library patterns
2345 (integer . (seq (? (or #\+ #\-)) (+ numeric)))
2346 (real . (seq (? (or #\+ #\-))
2347 (+ numeric) (? #\. (+ numeric))
2348 (? (or #\e #\E) integer)))
2349 ;; slightly more lax than R5RS, allow ->foo, etc.
2350 (symbol-initial . (or alpha ("!$%&*/:<=>?^_~")))
2351 (symbol-subsequent . (or symbol-initial digit ("+-.@")))
2352 (symbol . (or (seq symbol-initial (* symbol-subsequent))
2353 (seq ("+-") (? symbol-initial (* symbol-subsequent)))
2354 (seq ".." (* "."))))
2355 (sexp-space . (seq (* (* space) ";" (* nonl) newline) (+ space)))
2356 (string . (seq #\" (escape #\\ #\") #\"))
2357 (escape . ,(lambda (esc . o) `(* (or (~ ,esc ,@o) (seq ,esc any)))))
2358
2359 (ipv4-digit . (seq (? (/ "12")) (? numeric) numeric))
2360 (ipv4-address . (seq ipv4-digit (= 3 #\. ipv4-digit)))
2361 ;; XXXX lax, allows multiple double-colons or < 8 terms w/o a ::
2362 (ipv6-address . (seq (** 0 4 hex-digit)
2363 (** 1 7 #\: (? #\:) (** 0 4 hex-digit))))
2364 (ip-address . (or ipv4-address ipv6-address))
2365 (domain-atom . (+ (or alphanumeric #\_ #\-)))
2366 (domain . (seq domain-atom (+ #\. domain-atom)))
2367 ;; XXXX now anything can be a top-level domain, but this is still handy
2368 (top-level-domain . (w/nocase (or "arpa" "com" "gov" "mil" "net" "org"
2369 "edu" "aero" "biz" "coop" "info"
2370 "museum" "name" "pro" (= 2 alpha))))
2371 (domain/common . (seq (+ domain-atom #\.) top-level-domain))
2372 ;;(email-local-part . (seq (+ (or (~ #\") string))))
2373 (email-local-part . (+ (or alphanumeric #\_ #\- #\. #\+)))
2374 (email . (seq email-local-part #\@ domain))
2375 (url-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\. #\, #\& #\;
2376 (seq "%" hex-digit hex-digit)))
2377 (url-final-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\&
2378 (seq "%" hex-digit hex-digit)))
2379 (http-url . (w/nocase
2380 "http" (? "s") "://"
2381 (or domain ipv4-address) ;; (seq "[" ipv6-address "]")
2382 (? ":" (+ numeric)) ;; port
2383 ;; path
2384 (? "/" (* (or url-char "/"))
2385 (? "?" (* url-char)) ;; query
2386 (? "#" (? (* url-char) url-final-char)) ;; fragment
2387 )))
2388
2389 ))
2390
2391
2392;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2393;;;; SRE->tNFA compilation
2394;;
2395;; A tagged NFA (tNFA) state is a numbered node with a list of
2396;; pattern->number transitions, where pattern is character set range,
2397;; or epsilon (indicating an empty transition).
2398;;
2399;; (Only) epsilon transitions may be *tagged*. Each tag represents
2400;; either the start or the end of a submatch.
2401;;
2402;; There may be overlapping ranges - since it's an NFA we process it
2403;; by considering all possible transitions.
2404
2405(define *nfa-presize* 128) ;; constant
2406(define *nfa-num-fields* 4) ;; constant
2407
2408(define (nfa-num-states nfa) (quotient (vector-length nfa) *nfa-num-fields*))
2409(define (nfa-start-state nfa) (- (nfa-num-states nfa) 1))
2410
2411(define (nfa-num-tags nfa)
2412 (vector-ref nfa 0))
2413(define (nfa-highest-map-index nfa)
2414 (vector-ref nfa 1))
2415(define (nfa-set-highest-map-index! nfa idx)
2416 (vector-set! nfa 1 idx))
2417
2418(define (nfa-get-state-trans nfa i)
2419 (if (= i 0) '() (vector-ref nfa (* i *nfa-num-fields*))))
2420(define (nfa-set-state-trans! nfa i x)
2421 (vector-set! nfa (* i *nfa-num-fields*) x))
2422
2423(define (nfa-get-epsilons nfa i)
2424 (if (= i 0) '() (vector-ref nfa (+ (* i *nfa-num-fields*) 1))))
2425(define (nfa-set-epsilons! nfa i x)
2426 (vector-set! nfa (+ (* i *nfa-num-fields*) 1) x))
2427(define (nfa-add-epsilon! nfa i x t)
2428 (let ((eps (nfa-get-epsilons nfa i)))
2429 (if (not (assv x eps))
2430 (nfa-set-epsilons! nfa i (cons (cons x t) eps)))))
2431
2432(define (nfa-get-reorder-commands nfa mst)
2433 (cond ((assoc mst (vector-ref nfa (+ (* (mst-hash mst) *nfa-num-fields*) 2)))
2434 => cdr)
2435 (else #f)))
2436(define (nfa-set-reorder-commands! nfa mst x)
2437 (let ((i (+ (* (mst-hash mst) *nfa-num-fields*) 2)))
2438 (vector-set! nfa i (cons (cons mst x) (vector-ref nfa i)))))
2439
2440(define (nfa-get-closure nfa mst)
2441 (cond ((assoc mst (vector-ref nfa (+ (* (mst-hash mst) *nfa-num-fields*) 3)))
2442 => cdr)
2443 (else #f)))
2444(define (nfa-add-closure! nfa mst x)
2445 (let ((i (+ (* (mst-hash mst) *nfa-num-fields*) 3)))
2446 (vector-set! nfa i (cons (cons mst x) (vector-ref nfa i)))))
2447
2448;; Compile and return the vector of NFA states (in groups of
2449;; *nfa-num-fields* packed elements). The start state will be the
2450;; last element(s) of the vector, and all remaining states will be in
2451;; descending numeric order, with state 0 being the unique accepting
2452;; state.
2453(define (sre->nfa sre init-flags)
2454 (let* ((buf (make-vector (* *nfa-presize* *nfa-num-fields*) '()))
2455 ;; Get cons cells and map them to numeric submatch indexes.
2456 ;; Doing it here is slightly easier than integrating into the loop below
2457 (match-index
2458 (let lp ((sre (list sre)) (max 0) (res '()))
2459 (cond
2460 ((not (pair? sre))
2461 ;; We abuse the transitions slot for state 0 (the final state,
2462 ;; which can have no transitions) to store the number of tags.
2463 (vector-set! buf 0 (* max 2))
2464 ;; We abuse the epsilons slot for state 0 to store the highest
2465 ;; encountered memory slot mapping index. Initialize to -1.
2466 (vector-set! buf 1 -1)
2467 res)
2468 ((pair? (car sre))
2469 ;; The appends here should be safe (are they?)
2470 (case (caar sre)
2471 (($ submatch => submatch-named)
2472 (lp (append (cdar sre) (cdr sre)) (+ max 1)
2473 (cons (cons (car sre) max) res)))
2474 (else (lp (append (car sre) (cdr sre)) max res))))
2475 (else (lp (cdr sre) max res))))))
2476 ;; we loop over an implicit sequence list
2477 (define (lp ls n flags next)
2478 (define (new-state-number state)
2479 (max n (+ 1 state)))
2480 (define (add-state! n2 trans-ls)
2481 (if (>= (* n2 *nfa-num-fields*) (vector-length buf))
2482 (let ((tmp (make-vector (* 2 (vector-length buf)) '())))
2483 (do ((i (- (vector-length buf) 1) (- i 1)))
2484 ((< i 0))
2485 (vector-set! tmp i (vector-ref buf i)))
2486 (set! buf tmp)))
2487 (nfa-set-state-trans! buf n2 trans-ls)
2488 n2)
2489 (define (extend-state! next trans-cs)
2490 (and next
2491 (add-state! (new-state-number next) (cons trans-cs next))))
2492 (define (add-char-state! next ch)
2493 (let ((alt (char-altcase ch)))
2494 (if (flag-set? flags ~case-insensitive?)
2495 (extend-state! next (cset-union (char->cset ch) (char->cset alt)))
2496 (extend-state! next (char->cset ch)))))
2497 (if (null? ls)
2498 next
2499 (cond
2500 ((or (eq? 'epsilon (car ls)) (equal? "" (car ls)))
2501 ;; chars and epsilons go directly into the transition table
2502 (let ((next (lp (cdr ls) n flags next)))
2503 (and next
2504 (let ((new (add-state! (new-state-number next) '())))
2505 (nfa-add-epsilon! buf new next #f)
2506 new))))
2507 ((string? (car ls))
2508 ;; process literal strings a char at a time
2509 (let ((next (lp (cdr ls) n flags next)))
2510 (and next
2511 (let lp2 ((i (- (string-length (car ls)) 1))
2512 (next next))
2513 (if (< i 0)
2514 next
2515 (lp2 (- i 1)
2516 (add-char-state! next (string-ref (car ls) i))))
2517 ))))
2518 ((char? (car ls))
2519 (add-char-state! (lp (cdr ls) n flags next) (car ls)))
2520 ((symbol? (car ls))
2521 (let ((cell (assq (car ls) sre-named-definitions)))
2522 (and cell
2523 (lp (cons (if (procedure? (cdr cell))
2524 ((cdr cell))
2525 (cdr cell))
2526 (cdr ls))
2527 n
2528 flags
2529 next))))
2530 ((pair? (car ls))
2531 (cond
2532 ((string? (caar ls)) ; Enumerated character set
2533 (let ((set (if (flag-set? flags ~case-insensitive?)
2534 (cset-case-insensitive (string->cset (caar ls)))
2535 (string->cset (caar ls)))))
2536 (extend-state! (lp (cdr ls) n flags next) set)))
2537 (else
2538 (case (caar ls)
2539 ((seq :)
2540 ;; for an explicit sequence, just append to the list
2541 (lp (append (cdar ls) (cdr ls)) n flags next))
2542 ((w/case w/nocase w/utf8 w/noutf8)
2543 (let* ((next (lp (cdr ls) n flags next))
2544 (flags ((if (memq (caar ls) '(w/case w/utf8))
2545 flag-clear
2546 flag-join)
2547 flags
2548 (if (memq (caar ls) '(w/case w/nocase))
2549 ~case-insensitive?
2550 ~utf8?))))
2551 (and next
2552 (lp (cdar ls) (new-state-number next) flags next))))
2553 ((/ - & ~)
2554 (let ((range (sre->cset (car ls)
2555 (flag-set? flags ~case-insensitive?))))
2556 (extend-state! (lp (cdr ls) n flags next)
2557 range)))
2558 ((or)
2559 (let ((next (lp (cdr ls) n flags next)))
2560 (and
2561 next
2562 (if (null? (cdar ls))
2563 ;; empty (or) always fails
2564 (add-state! (new-state-number next) '())
2565 ;; compile both branches and insert epsilon
2566 ;; transitions to either
2567 (let* ((b (lp (list (sre-alternate (cddar ls)))
2568 (new-state-number next)
2569 flags
2570 next))
2571 (a (and b
2572 (lp (list (cadar ls))
2573 (new-state-number (max b next))
2574 flags
2575 next))))
2576 (and a
2577 (let ((c (add-state! (new-state-number (max a b))
2578 '())))
2579 (nfa-add-epsilon! buf c a #f)
2580 (nfa-add-epsilon! buf c b #f)
2581 c)))))))
2582 ((?)
2583 (let ((next (lp (cdr ls) n flags next)))
2584 ;; insert an epsilon transition directly to next
2585 (and
2586 next
2587 (let ((a (lp (cdar ls) (new-state-number next) flags next)))
2588 (if a
2589 (nfa-add-epsilon! buf a next #f))
2590 a))))
2591 ((+ *)
2592 (let ((next (lp (cdr ls) n flags next)))
2593 (and
2594 next
2595 (let* ((new (lp '(epsilon)
2596 (new-state-number next)
2597 flags
2598 next))
2599 (a (lp (cdar ls) (new-state-number new) flags new)))
2600 (cond
2601 (a
2602 ;; for *, insert an epsilon transition as in ? above
2603 (if (eq? '* (caar ls))
2604 (nfa-add-epsilon! buf a new #f))
2605 ;; for both, insert a loop back to self
2606 (nfa-add-epsilon! buf new a #f)))
2607 a))))
2608 ;; need to add these to the match extractor first,
2609 ;; but they tend to generate large DFAs
2610 ;;((=)
2611 ;; (lp (append (vector->list
2612 ;; (make-vector (cadar ls)
2613 ;; (sre-sequence (cddar ls))))
2614 ;; (cdr ls))
2615 ;; n flags next))
2616 ;;((>=)
2617 ;; (lp (append (vector->list
2618 ;; (make-vector (- (cadar ls) 1)
2619 ;; (sre-sequence (cddar ls))))
2620 ;; (cons `(+ ,@(cddar ls)) (cdr ls)))
2621 ;; n flags next))
2622 ;;((**)
2623 ;; (lp (append (vector->list
2624 ;; (make-vector (cadar ls)
2625 ;; (sre-sequence (cdddar ls))))
2626 ;; (map
2627 ;; (lambda (x) `(? ,x))
2628 ;; (vector->list
2629 ;; (make-vector (- (caddar ls) (cadar ls))
2630 ;; (sre-sequence (cdddar ls)))))
2631 ;; (cdr ls))
2632 ;; n flags next))
2633 ;; ignore submatches altogether
2634 (($ submatch)
2635 (let* ((pre-tag (* (cdr (assq (car ls) match-index)) 2))
2636 (post-tag (+ pre-tag 1))
2637 (next (lp (cdr ls) n flags next)))
2638 (and next
2639 (let* ((after (add-state! (new-state-number next) '()))
2640 (sub (lp (list (sre-sequence (cdar ls)))
2641 (new-state-number after) flags after))
2642 (before (and sub (add-state! (new-state-number sub) '()))))
2643 (cond (before
2644 (nfa-add-epsilon! buf before sub pre-tag)
2645 (nfa-add-epsilon! buf after next post-tag)))
2646 before))))
2647 ((=> submatch-named)
2648 (let* ((pre-tag (* (cdr (assq (car ls) match-index)) 2))
2649 (post-tag (+ pre-tag 1))
2650 (next (lp (cdr ls) n flags next)))
2651 (and next
2652 (let* ((after (add-state! (new-state-number next) '()))
2653 (sub (lp (list (sre-sequence (cddar ls)))
2654 (new-state-number after) flags after))
2655 (before (and sub (add-state! (new-state-number sub) '()))))
2656 (cond (before
2657 (nfa-add-epsilon! buf before sub pre-tag)
2658 (nfa-add-epsilon! buf after next post-tag)))
2659 before))))
2660 (else
2661 (cond
2662 ((assq (caar ls) sre-named-definitions)
2663 => (lambda (cell)
2664 (if (procedure? (cdr cell))
2665 (lp (cons (apply (cdr cell) (cdar ls)) (cdr ls))
2666 n flags next)
2667 (error "non-procedure in op position" (caar ls)))))
2668 (else #f)))))))
2669 (else
2670 #f))))
2671 (let ((len (lp (list sre) 1 init-flags 0)))
2672 (and len
2673 (let ((nfa (make-vector (* *nfa-num-fields* (+ len 1)))))
2674 (do ((i (- (vector-length nfa) 1) (- i 1)))
2675 ((< i 0))
2676 (vector-set! nfa i (vector-ref buf i)))
2677 nfa)))))
2678
2679;; We don't really want to use this, we use the closure compilation
2680;; below instead, but this is included for reference and testing the
2681;; sre->nfa conversion.
2682
2683;; (define (nfa-match nfa str)
2684;; (let ((matches (make-vector (nfa-num-tags nfa) #f)))
2685;; (let lp ((pos 0) (ls (string->list str)) (state (nfa-start-state nfa)) (epsilons '()))
2686;; (and (or (and (null? ls) (zero? state))
2687;; (let ((t (nfa-get-state-trans nfa state)))
2688;; (and (not (null? t)) (not (null? ls))
2689;; (cset-contains? (car t) (car ls))
2690;; (lp (+ pos 1) (cdr ls) (cdr t) '())))
2691;; (any (lambda (e)
2692;; (let ((old-matches (vector-copy matches)))
2693;; (cond ((cdr e)
2694;; (vector-set! matches (cdr e) pos)))
2695;; (or (and (not (memv (car e) epsilons))
2696;; (lp pos ls (car e) (cons (car e) epsilons)))
2697;; ;; reset match, apparently this branch failed
2698;; (begin (set! matches old-matches) #f))))
2699;; (nfa-get-epsilons nfa state)))
2700;; matches))))
2701
2702;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2703;;;; NFA multi-state representation
2704
2705(define *mst-first-state-index* 3)
2706
2707(define (mst-mappings-summary mst)
2708 (vector-ref mst 0))
2709
2710(define (mst-num-states mst)
2711 (vector-ref mst 1))
2712
2713(define (mst-num-states-set! mst num)
2714 (vector-set! mst 1 num))
2715
2716(define (mst-hash mst)
2717 ;; We could do (modulo X (nfa-num-states nfa)) here which would be faster,
2718 ;; but we can't assume a full numerical tower (and updating *could*
2719 ;; produce a bignum), so we do it each time when updating the hash.
2720 (vector-ref mst 2))
2721
2722(define (mst-hash-set! mst hash)
2723 (vector-set! mst 2 hash))
2724
2725;; Returns #f if NFA state does not occur in multi-state
2726(define (mst-state-mappings mst state)
2727 (vector-ref mst (+ state *mst-first-state-index*)))
2728
2729(define (mst-state-mappings-set! mst state mappings)
2730 (vector-set! mst (+ state *mst-first-state-index*) mappings))
2731
2732;; A multi-state holds a set of states with their tag-to-slot mappings.
2733;; Slot 0 contains a summary of all mappings for all states in the multi-state.
2734;; Slot 1 contains the total number of states in the multi-state.
2735;; Slot 2 contains a hash value, which is used for quick lookup of cached
2736;; reorder-commands or epsilon-closure in the NFA. This is the sum of all
2737;; state numbers plus each tag value (once per occurrence). This is a silly
2738;; hashing calculation, but it seems to produce a well-spread out hash table and
2739;; it has the added advantage that we can use the value as a quick check if the
2740;; state is definitely NOT equivalent to another in mst-same-states?
2741;; The other slots contain mappings for each corresponding state.
2742
2743(define (make-mst nfa)
2744 (let ((mst (make-vector (+ (nfa-num-states nfa) *mst-first-state-index*) #f)))
2745 (vector-set! mst 0 (make-vector (nfa-num-tags nfa) '())) ; tag summary
2746 (vector-set! mst 1 0) ; total number of states
2747 (vector-set! mst 2 0) ; states and tags hash
2748 mst))
2749
2750;; NOTE: This doesn't do a deep copy of the mappings. Don't mutate them!
2751(define (mst-copy mst)
2752 (let ((v (vector-copy mst)))
2753 (vector-set! v 0 (vector-copy (vector-ref mst 0)))
2754 v))
2755
2756(define (nfa-state->mst nfa state mappings)
2757 (let ((mst (make-mst nfa)))
2758 (mst-add! nfa mst state mappings)
2759 mst))
2760
2761;; Extend multi-state with a state and add its tag->slot mappings.
2762(define (mst-add! nfa mst state mappings)
2763 (let ((hash-value (mst-hash mst)))
2764 (cond ((not (mst-state-mappings mst state)) ; Update state hash & count?
2765 (set! hash-value (+ hash-value state))
2766 (mst-num-states-set! mst (+ (mst-num-states mst) 1))))
2767 (mst-state-mappings-set! mst state mappings)
2768 (let ((all-mappings (mst-mappings-summary mst)))
2769 (for-each
2770 (lambda (tag&slot)
2771 (let* ((t (car tag&slot))
2772 (s (cdr tag&slot))
2773 (m (vector-ref all-mappings t)))
2774 (cond ((not (memv s m))
2775 (set! hash-value (+ hash-value t))
2776 (vector-set! all-mappings t (cons s m))))))
2777 mappings))
2778 (mst-hash-set! mst (modulo hash-value (nfa-num-states nfa)))))
2779
2780;; Same as above, but skip updating mappings summary.
2781;; Called when we know all the tag->slot mappings are already in the summary.
2782(define (mst-add/fast! nfa mst state mappings)
2783 (cond ((not (mst-state-mappings mst state)) ; Update state hash & count?
2784 (mst-hash-set!
2785 mst (modulo (+ (mst-hash mst) state)
2786 (nfa-num-states nfa)))
2787 (mst-num-states-set! mst (+ (mst-num-states mst) 1))))
2788 (mst-state-mappings-set! mst state mappings))
2789
2790;; Same as above, assigning a new slot for a tag. This slot is then
2791;; added to the summary, if it isn't in there yet. This is more efficient
2792;; than looping through all the mappings.
2793(define (mst-add-tagged! nfa mst state mappings tag slot)
2794 (let* ((mappings-summary (mst-mappings-summary mst))
2795 (summary-tag-slots (vector-ref mappings-summary tag))
2796 (new-mappings (let lp ((m mappings)
2797 (res '()))
2798 (cond ((null? m) (cons (cons tag slot) res))
2799 ((= (caar m) tag)
2800 (append res (cons (cons tag slot) (cdr m))))
2801 (else (lp (cdr m) (cons (car m) res))))))
2802 (hash-value (mst-hash mst)))
2803 (cond ((not (mst-state-mappings mst state)) ; Update state hash & count?
2804 (set! hash-value (+ hash-value state))
2805 (mst-num-states-set! mst (+ (mst-num-states mst) 1))))
2806 (mst-state-mappings-set! mst state new-mappings)
2807 (cond ((not (memv slot summary-tag-slots)) ; Update tag/slot summary
2808 (set! hash-value (+ hash-value tag))
2809 (vector-set! mappings-summary tag (cons slot summary-tag-slots))))
2810 (mst-hash-set! mst (modulo hash-value (nfa-num-states nfa)))
2811 new-mappings))
2812
2813(define (mst-same-states? a b)
2814 ;; First check if hash and state counts match, then check each state
2815 (and (= (mst-hash a) (mst-hash b))
2816 (= (mst-num-states a) (mst-num-states b))
2817 (let ((len (vector-length a)))
2818 (let lp ((i *mst-first-state-index*))
2819 (or (= i len)
2820 (and (equal? (not (vector-ref a i))
2821 (not (vector-ref b i)))
2822 (lp (+ i 1))))))))
2823
2824(define (mst-fold mst kons knil)
2825 (let ((limit (vector-length mst)))
2826 (let lp ((i *mst-first-state-index*)
2827 (acc knil))
2828 (if (= i limit)
2829 acc
2830 (let ((m (vector-ref mst i)))
2831 (lp (+ i 1) (if m (kons (- i *mst-first-state-index*) m acc) acc)))))))
2832
2833;; Find the lowest fresh index for this tag that's unused
2834;; in the multi-state. This also updates the nfa's highest
2835;; tag counter if a completely new slot number was assigned.
2836(define (next-index-for-tag! nfa tag mst)
2837 (let* ((highest (nfa-highest-map-index nfa))
2838 (tag-slots (vector-ref (mst-mappings-summary mst) tag))
2839 (new-index (do ((slot 0 (+ slot 1)))
2840 ((not (memv slot tag-slots)) slot))))
2841 (cond ((> new-index highest)
2842 (nfa-set-highest-map-index! nfa new-index)))
2843 new-index))
2844
2845;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2846;;;; tNFA->DFA compilation
2847;; During processing, the DFA is a list of the form:
2848;;
2849;; ((annotated-tNFA-states ...) finalizer transitions ...)
2850;;
2851;; where the transitions are as in the NFA, except there are no
2852;; epsilons, duplicate characters or overlapping char-set ranges, and
2853;; the states moved to are closures (sets of NFA states). Multiple
2854;; DFA states may be accepting states. If the state is an accepting state,
2855;; the finalizer is a list of (tag . memory-slot) retrieval commands.
2856;; tNFA-states are annotated with mappings which store the tag values of
2857;; memory slots, if any. There is always at most one slot for a tag.
2858;;
2859;; The DFA itself simulates a NFA by representing all the simultaneous
2860;; states the NFA can be in at any given point in time as one DFA state.
2861;; The tag values are ambiguous since each NFA transition can set a tag.
2862;; To solve this we keep a bank of memory slots around which tracks tag
2863;; values for each distinct path through the NFA.
2864;;
2865;; Once we get to a final state we can pluck the tag values from the
2866;; memory slots corresponding to the path through which the NFA could have
2867;; reached the final state. To resolve ambiguities, states are assigned
2868;; priorities, and the path to the final state is chosen correspondingly.
2869;;
2870;; For a more detailed explanation about this process, see
2871;; Ville Laurikari; ``NFAs with Tagged Transitions, their Conversion to
2872;; Deterministic Automata and Application to Regular Expressions'' (2000).
2873;; Laurikari also wrote a master's thesis about this approach which is
2874;; less terse but the algorithms are not exactly the same.
2875;; ``Efficient submatch addressing for regular expressions'' (2001).
2876;; This implementation follows the 2000 paper where they differ.
2877
2878(define (nfa->dfa nfa . o)
2879 (let* ((max-states (and (pair? o) (car o)))
2880 (start (nfa-state->mst nfa (nfa-start-state nfa) '()))
2881 (start-closure (nfa-epsilon-closure nfa start))
2882 ;; Set up a special "initializer" state from which we reach the
2883 ;; start-closure to ensure that leading tags are set properly.
2884 (init-set (tag-set-commands-for-closure nfa start start-closure '()))
2885 (dummy (make-mst nfa))
2886 (init-state (list dummy #f `((,start-closure #f () . ,init-set)))))
2887 ;; Unmarked states are just sets of NFA states with tag-maps, marked states
2888 ;; are sets of NFA states with transitions to sets of NFA states
2889 (let lp ((unmarked-states (list start-closure))
2890 (marked-states (list init-state))
2891 (dfa-size 0))
2892 (cond
2893 ((null? unmarked-states)
2894 ;; Abuse finalizer slot for storing the number of memory slots we need
2895 (set-car! (cdr init-state) (+ (nfa-highest-map-index nfa) 1))
2896 (dfa-renumber (reverse marked-states)))
2897 ((and max-states (> dfa-size max-states)) ; Too many DFA states
2898 #f)
2899 ((assoc (car unmarked-states) marked-states) ; Seen set of NFA-states?
2900 (lp (cdr unmarked-states) marked-states dfa-size))
2901 (else
2902 (let ((dfa-state (car unmarked-states)))
2903 (let lp2 ((trans (get-distinct-transitions nfa dfa-state))
2904 (unmarked-states (cdr unmarked-states))
2905 (dfa-trans '()))
2906 (if (null? trans)
2907 (let ((finalizer (mst-state-mappings dfa-state 0)))
2908 (lp unmarked-states
2909 (cons (list dfa-state finalizer dfa-trans) marked-states)
2910 (+ dfa-size 1)))
2911 (let* ((closure (nfa-epsilon-closure nfa (cdar trans)))
2912 (reordered
2913 (find-reorder-commands nfa closure marked-states))
2914 (copy-cmds (if reordered (cdr reordered) '()))
2915 ;; Laurikari doesn't mention what "k" is, but it seems it
2916 ;; must be the mappings of the state's reach
2917 (set-cmds (tag-set-commands-for-closure
2918 nfa (cdar trans) closure copy-cmds))
2919 (trans-closure (if reordered (car reordered) closure)))
2920 (lp2 (cdr trans)
2921 (if reordered
2922 unmarked-states
2923 (cons trans-closure unmarked-states))
2924 (cons `(,trans-closure
2925 ,(caar trans) ,copy-cmds . ,set-cmds)
2926 dfa-trans)))))))))))
2927
2928;; When the conversion is complete we renumber the DFA sets-of-states
2929;; in order and convert the result to a vector for fast lookup.
2930;; Charsets containing single characters are converted to those characters
2931;; for quick matching of the literal parts in a regex.
2932(define (dfa-renumber states)
2933 (let ((indexes (let lp ((i 0) (states states) (indexes '()))
2934 (if (null? states)
2935 indexes
2936 (lp (+ i 1) (cdr states)
2937 (cons (cons (caar states) i) indexes)))))
2938 (dfa (make-vector (length states))))
2939 (do ((i 0 (+ i 1))
2940 (states states (cdr states)))
2941 ((null? states) dfa)
2942 (let ((maybe-finalizer (cadar states))
2943 (transitions (caddar states)))
2944 (vector-set!
2945 dfa i
2946 (cons maybe-finalizer
2947 (map (lambda (tr)
2948 `(,(and (cadr tr) (maybe-cset->char (cadr tr)))
2949 ,(cdr (assoc (car tr) indexes)) . ,(cddr tr)))
2950 transitions)))))))
2951
2952;; Extract all distinct ranges and the potential states they can transition
2953;; to from a given set of states. Any ranges that would overlap with
2954;; distinct characters are split accordingly.
2955;; This function is like "reach" in Laurikari's papers, but for each
2956;; possible distinct range of characters rather than per character.
2957(define (get-distinct-transitions nfa annotated-states)
2958 (define (csets-intersect? a b)
2959 (let ((i (cset-intersection a b)))
2960 (and (not (cset-empty? i)) i)))
2961 (mst-fold
2962 annotated-states
2963 (lambda (st mappings res)
2964 (let ((trans (nfa-get-state-trans nfa st))) ; Always one state per trans
2965 (if (null? trans)
2966 res
2967 (let lp ((ls res) (cs (car trans)) (state (cdr trans)) (res '()))
2968 (cond
2969 ;; State not seen yet? Add a new state transition
2970 ((null? ls)
2971 ;; TODO: We should try to find an existing DFA state
2972 ;; with only this NFA state in it, and extend the cset
2973 ;; with the current one. This produces smaller DFAs,
2974 ;; but takes longer to compile.
2975 (cons (cons cs (nfa-state->mst nfa state mappings))
2976 res))
2977 ((cset=? cs (caar ls))
2978 ;; Add state to existing set for this charset
2979 (mst-add! nfa (cdar ls) state mappings)
2980 (append ls res))
2981 ((csets-intersect? cs (caar ls)) =>
2982 (lambda (intersection)
2983 (let* ((only-in-new (cset-difference cs (caar ls)))
2984 (only-in-old (cset-difference (caar ls) cs))
2985 (states-in-both (cdar ls))
2986 (states-for-old
2987 (and (not (cset-empty? only-in-old))
2988 (mst-copy states-in-both)))
2989 (res (if states-for-old
2990 (cons (cons only-in-old states-for-old) res)
2991 res)))
2992 (mst-add! nfa states-in-both state mappings)
2993 ;; Add this state to the states already here and
2994 ;; restrict to the overlapping charset and continue
2995 ;; with the remaining subset of the new cset (if
2996 ;; nonempty)
2997 (if (cset-empty? only-in-new)
2998 (cons (cons intersection states-in-both)
2999 (append (cdr ls) res))
3000 (lp (cdr ls) only-in-new state
3001 (cons (cons intersection states-in-both) res))))))
3002 (else
3003 (lp (cdr ls) cs state (cons (car ls) res))))))))
3004 '()))
3005
3006;; The epsilon-closure of a set of states is all the states reachable
3007;; through epsilon transitions, with the tags encountered on the way.
3008(define (nfa-epsilon-closure-internal nfa annotated-states)
3009 ;; The stack _MUST_ be in this order for some reason I don't fully understand
3010 (let lp ((stack (mst-fold annotated-states
3011 (lambda (st m res)
3012 (cons (cons st m) res))
3013 '()))
3014 (priorities (make-vector (nfa-num-states nfa) 0))
3015 (closure (mst-copy annotated-states)))
3016 (if (null? stack)
3017 closure
3018 (let ((prio/orig-state (caar stack)) ; priority is just the state nr.
3019 (mappings (cdar stack)))
3020 (let lp2 ((trans (nfa-get-epsilons nfa prio/orig-state))
3021 (stack (cdr stack)))
3022 (if (null? trans)
3023 (lp stack priorities closure)
3024 (let ((state (caar trans)))
3025 (cond
3026 ;; Our priorities are inverted because we start at
3027 ;; the highest state number and go downwards to 0.
3028 ((> prio/orig-state (vector-ref priorities state))
3029 (vector-set! priorities state prio/orig-state)
3030 (cond
3031 ((cdar trans) => ; tagged transition?
3032 (lambda (tag)
3033 (let* ((index (next-index-for-tag! nfa tag closure))
3034 (new-mappings
3035 (mst-add-tagged!
3036 nfa closure state mappings tag index)))
3037 (lp2 (cdr trans)
3038 (cons (cons state new-mappings) stack)))))
3039 (else
3040 (mst-add/fast! nfa closure state mappings)
3041 (lp2 (cdr trans) (cons (cons state mappings) stack)))))
3042 (else (lp2 (cdr trans) stack))))))))))
3043
3044(define (nfa-epsilon-closure nfa states)
3045 (or (nfa-get-closure nfa states)
3046 (let ((res (nfa-epsilon-closure-internal nfa states)))
3047 (nfa-add-closure! nfa states res)
3048 res)))
3049
3050;; Generate "set" commands for all tags in the closure that are
3051;; not present in the original state.
3052(define (tag-set-commands-for-closure nfa orig-state closure copy-cmds)
3053 (let ((num-tags (nfa-num-tags nfa))
3054 (closure-summary (mst-mappings-summary closure))
3055 (state-summary (mst-mappings-summary orig-state)))
3056 (let lp ((t 0) (cmds '()))
3057 (if (= t num-tags)
3058 cmds
3059 (let lp2 ((s1 (vector-ref closure-summary t))
3060 (s2 (vector-ref state-summary t))
3061 (cmds cmds))
3062 (cond ((null? s1) (lp (+ t 1) cmds))
3063 ((or (memv (car s1) s2) ; Tag in original state?
3064 ;; Try to avoid generating set-commands for any slots
3065 ;; that will be overwritten by copy commands, but only
3066 ;; if that slot isn't copied to another slot.
3067 (and (not (null? copy-cmds)) ; null check for performance
3068 ;; Look for copy command overwriting this tag-slot
3069 (any (lambda (c)
3070 (and (= (vector-ref c 0) t)
3071 (= (vector-ref c 2) (car s1))))
3072 copy-cmds)
3073 ;; Ensure it's not copied to another slot before
3074 ;; discarding the set-command.
3075 (not (any (lambda (c)
3076 (and (= (vector-ref c 0) t)
3077 (= (vector-ref c 1) (car s1))))
3078 copy-cmds))))
3079 (lp2 (cdr s1) s2 cmds))
3080 (else (lp2 (cdr s1) s2
3081 (cons (cons t (car s1)) cmds)))))))))
3082
3083;; Look in dfa-states for an already existing state which matches
3084;; closure, but has different tag value mappings.
3085;; If found, calculate reordering commands so we can map the closure
3086;; to that state instead of adding a new DFA state.
3087;; This is completely handwaved away in Laurikari's paper (it basically
3088;; says "insert reordering algorithm here"), so this code was constructed
3089;; after some experimentation. In other words, bugs be here.
3090(define (find-reorder-commands-internal nfa closure dfa-states)
3091 (let ((num-tags (nfa-num-tags nfa))
3092 (closure-summary (mst-mappings-summary closure)))
3093 (let lp ((dfa-states dfa-states))
3094 (if (null? dfa-states)
3095 #f
3096 (if (not (mst-same-states? (caar dfa-states) closure))
3097 (lp (cdr dfa-states))
3098 (let lp2 ((state-summary (mst-mappings-summary (caar dfa-states)))
3099 (t 0) (cmds '()))
3100 (if (= t num-tags)
3101 (cons (caar dfa-states) cmds)
3102 (let lp3 ((closure-slots (vector-ref closure-summary t))
3103 (state-slots (vector-ref state-summary t))
3104 (cmds cmds))
3105 (cond ((null? closure-slots)
3106 (if (null? state-slots)
3107 (lp2 state-summary (+ t 1) cmds)
3108 (lp (cdr dfa-states))))
3109 ((null? state-slots) (lp (cdr dfa-states)))
3110 (else (lp3 (cdr closure-slots)
3111 (cdr state-slots)
3112 (if (= (car closure-slots) (car state-slots))
3113 cmds
3114 (cons (vector t (car closure-slots) (car state-slots))
3115 cmds)))))))))))))
3116
3117(define (find-reorder-commands nfa closure dfa-states)
3118 (or (nfa-get-reorder-commands nfa closure)
3119 (let ((res (find-reorder-commands-internal nfa closure dfa-states)))
3120 (nfa-set-reorder-commands! nfa closure res)
3121 res)))
3122
3123;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3124;;;; Closure Compilation
3125;;
3126;; We use this for non-regular expressions instead of an interpreted
3127;; NFA matcher. We use backtracking anyway, but this gives us more
3128;; freedom of implementation, allowing us to support patterns that
3129;; can't be represented in the above NFA representation.
3130
3131(define (sre->procedure sre . o)
3132 (define names
3133 (if (and (pair? o) (pair? (cdr o))) (cadr o) (sre-names sre 1 '())))
3134 (let lp ((sre sre)
3135 (n 1)
3136 (flags (if (pair? o) (car o) ~none))
3137 (next (lambda (cnk init src str i end matches fail)
3138 (irregex-match-start-chunk-set! matches 0 (car init))
3139 (irregex-match-start-index-set! matches 0 (cdr init))
3140 (irregex-match-end-chunk-set! matches 0 src)
3141 (irregex-match-end-index-set! matches 0 i)
3142 (%irregex-match-fail-set! matches fail)
3143 matches)))
3144 ;; XXXX this should be inlined
3145 (define (rec sre) (lp sre n flags next))
3146 (cond
3147 ((pair? sre)
3148 (if (string? (car sre))
3149 (sre-cset->procedure
3150 (sre->cset (car sre) (flag-set? flags ~case-insensitive?))
3151 next)
3152 (case (car sre)
3153 ((~ - & /)
3154 (sre-cset->procedure
3155 (sre->cset sre (flag-set? flags ~case-insensitive?))
3156 next))
3157 ((or)
3158 (case (length (cdr sre))
3159 ((0) (lambda (cnk init src str i end matches fail) (fail)))
3160 ((1) (rec (cadr sre)))
3161 (else
3162 (let* ((first (rec (cadr sre)))
3163 (rest (lp (sre-alternate (cddr sre))
3164 (+ n (sre-count-submatches (cadr sre)))
3165 flags
3166 next)))
3167 (lambda (cnk init src str i end matches fail)
3168 (first cnk init src str i end matches
3169 (lambda ()
3170 (rest cnk init src str i end matches fail))))))))
3171 ((w/case)
3172 (lp (sre-sequence (cdr sre))
3173 n
3174 (flag-clear flags ~case-insensitive?)
3175 next))
3176 ((w/nocase)
3177 (lp (sre-sequence (cdr sre))
3178 n
3179 (flag-join flags ~case-insensitive?)
3180 next))
3181 ((w/utf8)
3182 (lp (sre-sequence (cdr sre)) n (flag-join flags ~utf8?) next))
3183 ((w/noutf8)
3184 (lp (sre-sequence (cdr sre)) n (flag-clear flags ~utf8?) next))
3185 ((seq :)
3186 (case (length (cdr sre))
3187 ((0) next)
3188 ((1) (rec (cadr sre)))
3189 (else
3190 (let ((rest (lp (sre-sequence (cddr sre))
3191 (+ n (sre-count-submatches (cadr sre)))
3192 flags
3193 next)))
3194 (lp (cadr sre) n flags rest)))))
3195 ((?)
3196 (let ((body (rec (sre-sequence (cdr sre)))))
3197 (lambda (cnk init src str i end matches fail)
3198 (body cnk init src str i end matches
3199 (lambda () (next cnk init src str i end matches fail))))))
3200 ((??)
3201 (let ((body (rec (sre-sequence (cdr sre)))))
3202 (lambda (cnk init src str i end matches fail)
3203 (next cnk init src str i end matches
3204 (lambda () (body cnk init src str i end matches fail))))))
3205 ((*)
3206 (cond
3207 ((sre-empty? (sre-sequence (cdr sre)))
3208 (error "invalid sre: empty *" sre))
3209 (else
3210 (let ((body (rec (list '+ (sre-sequence (cdr sre))))))
3211 (lambda (cnk init src str i end matches fail)
3212 (body cnk init src str i end matches
3213 (lambda ()
3214 (next cnk init src str i end matches fail))))))))
3215 ((*?)
3216 (cond
3217 ((sre-empty? (sre-sequence (cdr sre)))
3218 (error "invalid sre: empty *?" sre))
3219 (else
3220 (letrec
3221 ((body
3222 (lp (sre-sequence (cdr sre))
3223 n
3224 flags
3225 (lambda (cnk init src str i end matches fail)
3226 (next cnk init src str i end matches
3227 (lambda ()
3228 (body cnk init src str i end matches fail)
3229 ))))))
3230 (lambda (cnk init src str i end matches fail)
3231 (next cnk init src str i end matches
3232 (lambda ()
3233 (body cnk init src str i end matches fail))))))))
3234 ((+)
3235 (cond
3236 ((sre-empty? (sre-sequence (cdr sre)))
3237 (error "invalid sre: empty +" sre))
3238 (else
3239 (letrec
3240 ((body
3241 (lp (sre-sequence (cdr sre))
3242 n
3243 flags
3244 (lambda (cnk init src str i end matches fail)
3245 (body cnk init src str i end matches
3246 (lambda ()
3247 (next cnk init src str i end matches fail)
3248 ))))))
3249 body))))
3250 ((=)
3251 (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre))))
3252 ((>=)
3253 (rec `(** ,(cadr sre) #f ,@(cddr sre))))
3254 ((**)
3255 (cond
3256 ((or (and (number? (cadr sre))
3257 (number? (caddr sre))
3258 (> (cadr sre) (caddr sre)))
3259 (and (not (cadr sre)) (caddr sre)))
3260 (lambda (cnk init src str i end matches fail) (fail)))
3261 (else
3262 (letrec
3263 ((from (cadr sre))
3264 (to (caddr sre))
3265 (body-contents (sre-sequence (cdddr sre)))
3266 (body
3267 (lambda (count)
3268 (lp body-contents
3269 n
3270 flags
3271 (lambda (cnk init src str i end matches fail)
3272 (if (and to (= count to))
3273 (next cnk init src str i end matches fail)
3274 ((body (+ 1 count))
3275 cnk init src str i end matches
3276 (lambda ()
3277 (if (>= count from)
3278 (next cnk init src str i end matches fail)
3279 (fail))))))))))
3280 (if (and (zero? from) to (zero? to))
3281 next
3282 (lambda (cnk init src str i end matches fail)
3283 ((body 1) cnk init src str i end matches
3284 (lambda ()
3285 (if (zero? from)
3286 (next cnk init src str i end matches fail)
3287 (fail))))))))))
3288 ((**?)
3289 (cond
3290 ((or (and (number? (cadr sre))
3291 (number? (caddr sre))
3292 (> (cadr sre) (caddr sre)))
3293 (and (not (cadr sre)) (caddr sre)))
3294 (lambda (cnk init src str i end matches fail) (fail)))
3295 (else
3296 (letrec
3297 ((from (cadr sre))
3298 (to (caddr sre))
3299 (body-contents (sre-sequence (cdddr sre)))
3300 (body
3301 (lambda (count)
3302 (lp body-contents
3303 n
3304 flags
3305 (lambda (cnk init src str i end matches fail)
3306 (if (< count from)
3307 ((body (+ 1 count)) cnk init
3308 src str i end matches fail)
3309 (next cnk init src str i end matches
3310 (lambda ()
3311 (if (and to (= count to))
3312 (fail)
3313 ((body (+ 1 count)) cnk init
3314 src str i end matches fail))))))))))
3315 (if (and (zero? from) to (zero? to))
3316 next
3317 (lambda (cnk init src str i end matches fail)
3318 (if (zero? from)
3319 (next cnk init src str i end matches
3320 (lambda ()
3321 ((body 1) cnk init src str i end matches fail)))
3322 ((body 1) cnk init src str i end matches fail))))))))
3323 ((word)
3324 (rec `(seq bow ,@(cdr sre) eow)))
3325 ((word+)
3326 (rec `(seq bow (+ (& (or alphanumeric "_")
3327 (or ,@(cdr sre)))) eow)))
3328 ((posix-string)
3329 (rec (string->sre (cadr sre))))
3330 ((look-ahead)
3331 (let ((check
3332 (lp (sre-sequence (cdr sre))
3333 n
3334 flags
3335 (lambda (cnk init src str i end matches fail) i))))
3336 (lambda (cnk init src str i end matches fail)
3337 (if (check cnk init src str i end matches (lambda () #f))
3338 (next cnk init src str i end matches fail)
3339 (fail)))))
3340 ((neg-look-ahead)
3341 (let ((check
3342 (lp (sre-sequence (cdr sre))
3343 n
3344 flags
3345 (lambda (cnk init src str i end matches fail) i))))
3346 (lambda (cnk init src str i end matches fail)
3347 (if (check cnk init src str i end matches (lambda () #f))
3348 (fail)
3349 (next cnk init src str i end matches fail)))))
3350 ((look-behind neg-look-behind)
3351 (let ((check
3352 (lp (sre-sequence
3353 (cons '(* any) (append (cdr sre) '(eos))))
3354 n
3355 flags
3356 (lambda (cnk init src str i end matches fail) i))))
3357 (lambda (cnk init src str i end matches fail)
3358 (let* ((cnk* (wrap-end-chunker cnk src i))
3359 (str* ((chunker-get-str cnk*) (car init)))
3360 (i* (cdr init))
3361 (end* ((chunker-get-end cnk*) (car init))))
3362 (if ((if (eq? (car sre) 'look-behind) (lambda (x) x) not)
3363 (check cnk* init (car init) str* i* end* matches
3364 (lambda () #f)))
3365 (next cnk init src str i end matches fail)
3366 (fail))))))
3367 ((atomic)
3368 (let ((once
3369 (lp (sre-sequence (cdr sre))
3370 n
3371 flags
3372 (lambda (cnk init src str i end matches fail) i))))
3373 (lambda (cnk init src str i end matches fail)
3374 (let ((j (once cnk init src str i end matches (lambda () #f))))
3375 (if j
3376 (next cnk init src str j end matches fail)
3377 (fail))))))
3378 ((if)
3379 (let* ((test-submatches (sre-count-submatches (cadr sre)))
3380 (pass (lp (caddr sre) flags (+ n test-submatches) next))
3381 (fail (if (pair? (cdddr sre))
3382 (lp (cadddr sre)
3383 (+ n test-submatches
3384 (sre-count-submatches (caddr sre)))
3385 flags
3386 next)
3387 (lambda (cnk init src str i end matches fail)
3388 (fail)))))
3389 (cond
3390 ((or (number? (cadr sre)) (symbol? (cadr sre)))
3391 (let ((index
3392 (if (symbol? (cadr sre))
3393 (cond
3394 ((assq (cadr sre) names) => cdr)
3395 (else
3396 (error "unknown named backref in SRE IF" sre)))
3397 (cadr sre))))
3398 (lambda (cnk init src str i end matches fail2)
3399 (if (%irregex-match-end-chunk matches index)
3400 (pass cnk init src str i end matches fail2)
3401 (fail cnk init src str i end matches fail2)))))
3402 (else
3403 (let ((test (lp (cadr sre) n flags pass)))
3404 (lambda (cnk init src str i end matches fail2)
3405 (test cnk init src str i end matches
3406 (lambda () (fail cnk init src str i end matches fail2)))
3407 ))))))
3408 ((backref backref-ci)
3409 (let ((n (cond ((number? (cadr sre)) (cadr sre))
3410 ((assq (cadr sre) names) => cdr)
3411 (else (error "unknown backreference" (cadr sre)))))
3412 (compare (if (or (eq? (car sre) 'backref-ci)
3413 (flag-set? flags ~case-insensitive?))
3414 string-ci=?
3415 string=?)))
3416 (lambda (cnk init src str i end matches fail)
3417 (let ((s (irregex-match-substring matches n)))
3418 (if (not s)
3419 (fail)
3420 ;; XXXX create an abstract subchunk-compare
3421 (let lp ((src src)
3422 (str str)
3423 (i i)
3424 (end end)
3425 (j 0)
3426 (len (string-length s)))
3427 (cond
3428 ((<= len (- end i))
3429 (cond
3430 ((compare (substring s j (string-length s))
3431 (substring str i (+ i len)))
3432 (next cnk init src str (+ i len) end matches fail))
3433 (else
3434 (fail))))
3435 (else
3436 (cond
3437 ((compare (substring s j (+ j (- end i)))
3438 (substring str i end))
3439 (let ((src2 ((chunker-get-next cnk) src)))
3440 (if src2
3441 (lp src2
3442 ((chunker-get-str cnk) src2)
3443 ((chunker-get-start cnk) src2)
3444 ((chunker-get-end cnk) src2)
3445 (+ j (- end i))
3446 (- len (- end i)))
3447 (fail))))
3448 (else
3449 (fail)))))))))))
3450 ((dsm)
3451 (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) flags next))
3452 (($ submatch)
3453 (let ((body
3454 (lp (sre-sequence (cdr sre))
3455 (+ n 1)
3456 flags
3457 (lambda (cnk init src str i end matches fail)
3458 (let ((old-source
3459 (%irregex-match-end-chunk matches n))
3460 (old-index
3461 (%irregex-match-end-index matches n)))
3462 (irregex-match-end-chunk-set! matches n src)
3463 (irregex-match-end-index-set! matches n i)
3464 (next cnk init src str i end matches
3465 (lambda ()
3466 (irregex-match-end-chunk-set!
3467 matches n old-source)
3468 (irregex-match-end-index-set!
3469 matches n old-index)
3470 (fail))))))))
3471 (lambda (cnk init src str i end matches fail)
3472 (let ((old-source (%irregex-match-start-chunk matches n))
3473 (old-index (%irregex-match-start-index matches n)))
3474 (irregex-match-start-chunk-set! matches n src)
3475 (irregex-match-start-index-set! matches n i)
3476 (body cnk init src str i end matches
3477 (lambda ()
3478 (irregex-match-start-chunk-set!
3479 matches n old-source)
3480 (irregex-match-start-index-set!
3481 matches n old-index)
3482 (fail)))))))
3483 ((=> submatch-named)
3484 (rec `(submatch ,@(cddr sre))))
3485 (else
3486 (error "unknown regexp operator" sre)))))
3487 ((symbol? sre)
3488 (case sre
3489 ((any)
3490 (lambda (cnk init src str i end matches fail)
3491 (if (< i end)
3492 (next cnk init src str (+ i 1) end matches fail)
3493 (let ((src2 ((chunker-get-next cnk) src)))
3494 (if src2
3495 (let ((str2 ((chunker-get-str cnk) src2))
3496 (i2 ((chunker-get-start cnk) src2))
3497 (end2 ((chunker-get-end cnk) src2)))
3498 (next cnk init src2 str2 (+ i2 1) end2 matches fail))
3499 (fail))))))
3500 ((nonl)
3501 (lambda (cnk init src str i end matches fail)
3502 (if (< i end)
3503 (if (not (eqv? #\newline (string-ref str i)))
3504 (next cnk init src str (+ i 1) end matches fail)
3505 (fail))
3506 (let ((src2 ((chunker-get-next cnk) src)))
3507 (if src2
3508 (let ((str2 ((chunker-get-str cnk) src2))
3509 (i2 ((chunker-get-start cnk) src2))
3510 (end2 ((chunker-get-end cnk) src2)))
3511 (if (not (eqv? #\newline (string-ref str2 i2)))
3512 (next cnk init src2 str2 (+ i2 1) end2 matches fail)
3513 (fail)))
3514 (fail))))))
3515 ((bos)
3516 (lambda (cnk init src str i end matches fail)
3517 (if (and (eq? src (car init)) (eqv? i (cdr init)))
3518 (next cnk init src str i end matches fail)
3519 (fail))))
3520 ((bol)
3521 (lambda (cnk init src str i end matches fail)
3522 (if (let ((ch (if (> i ((chunker-get-start cnk) src))
3523 (string-ref str (- i 1))
3524 (chunker-prev-char cnk init src))))
3525 (or (not ch) (eqv? #\newline ch)))
3526 (next cnk init src str i end matches fail)
3527 (fail))))
3528 ((bow)
3529 (lambda (cnk init src str i end matches fail)
3530 (if (and (if (> i ((chunker-get-start cnk) src))
3531 (not (char-alphanumeric? (string-ref str (- i 1))))
3532 (let ((ch (chunker-prev-char cnk init src)))
3533 (or (not ch) (not (char-alphanumeric? ch)))))
3534 (if (< i end)
3535 (char-alphanumeric? (string-ref str i))
3536 (let ((next ((chunker-get-next cnk) src)))
3537 (and next
3538 (char-alphanumeric?
3539 (string-ref ((chunker-get-str cnk) next)
3540 ((chunker-get-start cnk) next)))))))
3541 (next cnk init src str i end matches fail)
3542 (fail))))
3543 ((eos)
3544 (lambda (cnk init src str i end matches fail)
3545 (if (and (>= i end) (not ((chunker-get-next cnk) src)))
3546 (next cnk init src str i end matches fail)
3547 (fail))))
3548 ((eol)
3549 (lambda (cnk init src str i end matches fail)
3550 (if (if (< i end)
3551 (eqv? #\newline (string-ref str i))
3552 (let ((src2 ((chunker-get-next cnk) src)))
3553 (if (not src2)
3554 #t
3555 (eqv? #\newline
3556 (string-ref ((chunker-get-str cnk) src2)
3557 ((chunker-get-start cnk) src2))))))
3558 (next cnk init src str i end matches fail)
3559 (fail))))
3560 ((eow)
3561 (lambda (cnk init src str i end matches fail)
3562 (if (and (if (< i end)
3563 (not (char-alphanumeric? (string-ref str i)))
3564 (let ((ch (chunker-next-char cnk src)))
3565 (or (not ch) (not (char-alphanumeric? ch)))))
3566 (if (> i ((chunker-get-start cnk) src))
3567 (char-alphanumeric? (string-ref str (- i 1)))
3568 (let ((prev (chunker-prev-char cnk init src)))
3569 (or (not prev) (char-alphanumeric? prev)))))
3570 (next cnk init src str i end matches fail)
3571 (fail))))
3572 ((nwb) ;; non-word-boundary
3573 (lambda (cnk init src str i end matches fail)
3574 (let ((c1 (if (< i end)
3575 (string-ref str i)
3576 (chunker-next-char cnk src)))
3577 (c2 (if (> i ((chunker-get-start cnk) src))
3578 (string-ref str (- i 1))
3579 (chunker-prev-char cnk init src))))
3580 (if (and c1 c2
3581 (if (char-alphanumeric? c1)
3582 (char-alphanumeric? c2)
3583 (not (char-alphanumeric? c2))))
3584 (next cnk init src str i end matches fail)
3585 (fail)))))
3586 ((epsilon)
3587 next)
3588 (else
3589 (let ((cell (assq sre sre-named-definitions)))
3590 (if cell
3591 (rec (cdr cell))
3592 (error "unknown regexp" sre))))))
3593 ((char? sre)
3594 (if (flag-set? flags ~case-insensitive?)
3595 ;; case-insensitive
3596 (lambda (cnk init src str i end matches fail)
3597 (if (>= i end)
3598 (let lp ((src2 ((chunker-get-next cnk) src)))
3599 (if src2
3600 (let ((str2 ((chunker-get-str cnk) src2))
3601 (i2 ((chunker-get-start cnk) src2))
3602 (end2 ((chunker-get-end cnk) src2)))
3603 (if (>= i2 end2)
3604 (lp ((chunker-get-next cnk) src2))
3605 (if (char-ci=? sre (string-ref str2 i2))
3606 (next cnk init src2 str2 (+ i2 1) end2
3607 matches fail)
3608 (fail))))
3609 (fail)))
3610 (if (char-ci=? sre (string-ref str i))
3611 (next cnk init src str (+ i 1) end matches fail)
3612 (fail))))
3613 ;; case-sensitive
3614 (lambda (cnk init src str i end matches fail)
3615 (if (>= i end)
3616 (let lp ((src2 ((chunker-get-next cnk) src)))
3617 (if src2
3618 (let ((str2 ((chunker-get-str cnk) src2))
3619 (i2 ((chunker-get-start cnk) src2))
3620 (end2 ((chunker-get-end cnk) src2)))
3621 (if (>= i2 end2)
3622 (lp ((chunker-get-next cnk) src2))
3623 (if (char=? sre (string-ref str2 i2))
3624 (next cnk init src2 str2 (+ i2 1) end2
3625 matches fail)
3626 (fail))))
3627 (fail)))
3628 (if (char=? sre (string-ref str i))
3629 (next cnk init src str (+ i 1) end matches fail)
3630 (fail))))
3631 ))
3632 ((string? sre)
3633 (rec (sre-sequence (string->list sre)))
3634;; XXXX reintroduce faster string matching on chunks
3635;; (if (flag-set? flags ~case-insensitive?)
3636;; (rec (sre-sequence (string->list sre)))
3637;; (let ((len (string-length sre)))
3638;; (lambda (cnk init src str i end matches fail)
3639;; (if (and (<= (+ i len) end)
3640;; (%substring=? sre str 0 i len))
3641;; (next str (+ i len) matches fail)
3642;; (fail)))))
3643 )
3644 (else
3645 (error "unknown regexp" sre)))))
3646
3647;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3648;;;; Character Sets
3649;;
3650;; Simple character sets as lists of ranges, as used in the NFA/DFA
3651;; compilation. This is not especially efficient, but is portable and
3652;; scalable for any range of character sets.
3653
3654(define (sre-cset->procedure cset next)
3655 (lambda (cnk init src str i end matches fail)
3656 (if (< i end)
3657 (if (cset-contains? cset (string-ref str i))
3658 (next cnk init src str (+ i 1) end matches fail)
3659 (fail))
3660 (let ((src2 ((chunker-get-next cnk) src)))
3661 (if src2
3662 (let ((str2 ((chunker-get-str cnk) src2))
3663 (i2 ((chunker-get-start cnk) src2))
3664 (end2 ((chunker-get-end cnk) src2)))
3665 (if (cset-contains? cset (string-ref str2 i2))
3666 (next cnk init src2 str2 (+ i2 1) end2 matches fail)
3667 (fail)))
3668 (fail))))))
3669
3670(define (make-cset) (vector))
3671(define (range->cset from to) (vector (cons from to)))
3672(define (char->cset ch) (vector (cons ch ch)))
3673(define (cset-empty? cs) (zero? (vector-length cs)))
3674(define (maybe-cset->char cs)
3675 (if (and (= (vector-length cs) 1)
3676 (char=? (car (vector-ref cs 0)) (cdr (vector-ref cs 0))))
3677 (car (vector-ref cs 0))
3678 cs))
3679
3680;; Since csets are sorted, there's only one possible representation of any cset
3681(define cset=? equal?)
3682
3683(define (cset-size cs)
3684 (let ((len (vector-length cs)))
3685 (let lp ((i 0) (size 0))
3686 (if (= i len)
3687 size
3688 (lp (+ i 1) (+ size 1
3689 (- (char->integer (cdr (vector-ref cs i)))
3690 (char->integer (car (vector-ref cs i))))))))))
3691
3692(define (cset->plist cs)
3693 (let lp ((i (- (vector-length cs) 1))
3694 (res '()))
3695 (if (= i -1)
3696 res
3697 (lp (- i 1) (cons (car (vector-ref cs i))
3698 (cons (cdr (vector-ref cs i)) res))))))
3699
3700(define (plist->cset ls)
3701 (let lp ((ls ls) (res (make-cset)))
3702 (if (null? ls)
3703 res
3704 (lp (cddr ls) (cset-union (range->cset (car ls) (cadr ls)) res)))))
3705
3706(define (string->cset s)
3707 (fold (lambda (ch cs)
3708 (cset-adjoin cs ch))
3709 (make-cset)
3710 (string->list s)))
3711
3712(define (sre->cset sre . o)
3713 (let lp ((sre sre) (ci? (and (pair? o) (car o))))
3714 (define (rec sre) (lp sre ci?))
3715 (cond
3716 ((pair? sre)
3717 (if (string? (car sre))
3718 (if ci?
3719 (cset-case-insensitive (string->cset (car sre)))
3720 (string->cset (car sre)))
3721 (case (car sre)
3722 ((~)
3723 (cset-complement
3724 (fold cset-union (rec (cadr sre)) (map rec (cddr sre)))))
3725 ((&)
3726 (fold cset-intersection (rec (cadr sre)) (map rec (cddr sre))))
3727 ((-)
3728 (fold (lambda (x res) (cset-difference res x))
3729 (rec (cadr sre))
3730 (map rec (cddr sre))))
3731 ((/)
3732 (let ((res (plist->cset (sre-flatten-ranges (cdr sre)))))
3733 (if ci?
3734 (cset-case-insensitive res)
3735 res)))
3736 ((or)
3737 (fold cset-union (rec (cadr sre)) (map rec (cddr sre))))
3738 ((w/case)
3739 (lp (sre-alternate (cdr sre)) #f))
3740 ((w/nocase)
3741 (lp (sre-alternate (cdr sre)) #t))
3742 (else
3743 (error "not a valid sre char-set operator" sre)))))
3744 ((char? sre) (if ci?
3745 (cset-case-insensitive (range->cset sre sre))
3746 (range->cset sre sre)))
3747 ((string? sre) (rec (list sre)))
3748 (else
3749 (let ((cell (assq sre sre-named-definitions)))
3750 (if cell
3751 (rec (cdr cell))
3752 (error "not a valid sre char-set" sre)))))))
3753
3754(define (cset->sre cset)
3755 (cons '/
3756 (fold (lambda (x res) (cons (car x) (cons (cdr x) res)))
3757 '()
3758 (vector->list cset))))
3759
3760(define (cset-contains? cset ch)
3761 ;; CHICKEN: Type assumption added for performance. This is a very
3762 ;; hot code path, so every type improvement matters.
3763 (assume ((cset (vector-of (pair char char)))
3764 (ch char))
3765 (let ((len (vector-length cset)))
3766 (case len
3767 ((0) #f)
3768 ((1) (let ((range (vector-ref cset 0)))
3769 (and (char<=? ch (cdr range)) (char<=? (car range) ch))))
3770 (else (let lp ((lower 0) (upper len))
3771 (let* ((middle (quotient (+ upper lower) 2))
3772 (range (vector-ref cset middle)))
3773 (cond ((char<? (cdr range) ch)
3774 (let ((next (+ middle 1)))
3775 (and (< next upper) (lp next upper))))
3776 ((char<? ch (car range))
3777 (and (< lower middle) (lp lower middle)))
3778 (else #t)))))))))
3779
3780(define (char-ranges-union a b)
3781 (cons (if (char<=? (car a) (car b)) (car a) (car b))
3782 (if (char>=? (cdr a) (cdr b)) (cdr a) (cdr b))))
3783
3784(define (cset-union a b)
3785 (let union-range ((a (vector->list a))
3786 (b (vector->list b))
3787 (res '()))
3788 (cond
3789 ((null? a) (list->vector (reverse (append (reverse b) res))))
3790 ((null? b) (list->vector (reverse (append (reverse a) res))))
3791 (else
3792 (let ((a-range (car a))
3793 (b-range (car b)))
3794 (cond
3795 ;; Can't use next-char here since it will cause an error if we are
3796 ;; comparing a cset with the maximum character as high char.
3797 ((< (+ (char->integer (cdr a-range)) 1) (char->integer (car b-range)))
3798 (union-range (cdr a) b (cons a-range res)))
3799 ((> (char->integer (car a-range)) (+ (char->integer (cdr b-range)) 1))
3800 (union-range (cdr b) a (cons b-range res)))
3801 ((char>=? (cdr a-range) (car b-range))
3802 (union-range (cons (char-ranges-union a-range b-range) (cdr a))
3803 (cdr b)
3804 res))
3805 (else (union-range (cdr a)
3806 (cons (char-ranges-union a-range b-range) (cdr b))
3807 res))))))))
3808
3809(define (cset-adjoin cs ch) (cset-union cs (char->cset ch)))
3810
3811(define (next-char c)
3812 (integer->char (+ (char->integer c) 1)))
3813
3814(define (prev-char c)
3815 (integer->char (- (char->integer c) 1)))
3816
3817(define (cset-difference a b)
3818 (let diff ((a (vector->list a))
3819 (b (vector->list b))
3820 (res '()))
3821 (cond ((null? a) (list->vector (reverse res)))
3822 ((null? b) (list->vector (append (reverse res) a)))
3823 (else
3824 (let ((a-range (car a))
3825 (b-range (car b)))
3826 (cond
3827 ((char<? (cdr a-range) (car b-range))
3828 (diff (cdr a) b (cons a-range res)))
3829 ((char>? (car a-range) (cdr b-range))
3830 (diff a (cdr b) res))
3831 ((and (char<=? (car b-range) (car a-range))
3832 (char>=? (cdr b-range) (cdr a-range)))
3833 (diff (cdr a) b res))
3834 (else (let ((left (and (char<? (car a-range) (car b-range))
3835 (cons (car a-range)
3836 (prev-char (car b-range)))))
3837 (right (and (char>? (cdr a-range) (cdr b-range))
3838 (cons (next-char (cdr b-range))
3839 (cdr a-range)))))
3840 (diff (if right (cons right (cdr a)) (cdr a))
3841 b
3842 (if left (cons left res) res))))))))))
3843
3844(define (min-char a b)
3845 (if (char<? a b) a b))
3846
3847(define (max-char a b)
3848 (if (char<? a b) b a))
3849
3850(define (cset-intersection a b)
3851 (let intersect ((a (vector->list a))
3852 (b (vector->list b))
3853 (res '()))
3854 (if (or (null? a) (null? b))
3855 (list->vector (reverse res))
3856 (let ((a-range (car a))
3857 (b-range (car b)))
3858 (cond
3859 ((char<? (cdr a-range) (car b-range))
3860 (intersect (cdr a) b res))
3861 ((char>? (car a-range) (cdr b-range))
3862 (intersect a (cdr b) res))
3863 (else
3864 (let ((result (cons (max-char (car b-range) (car a-range))
3865 (min-char (cdr a-range) (cdr b-range)))))
3866 (intersect (if (char>? (cdr a-range) (cdr result))
3867 a (cdr a))
3868 (if (char>? (cdr b-range) (cdr result))
3869 b (cdr b))
3870 (cons result res)))))))))
3871
3872(define (cset-complement a)
3873 (cset-difference (sre->cset *all-chars*) a))
3874
3875;; This could use some optimization :)
3876(define (cset-case-insensitive a)
3877 (let lp ((ls (vector->list a)) (res '()))
3878 (cond ((null? ls) (list->vector (reverse res)))
3879 ((and (char-alphabetic? (caar ls))
3880 (char-alphabetic? (cdar ls)))
3881 (lp (cdr ls)
3882 (reverse
3883 (vector->list
3884 (cset-union (cset-union (list->vector (reverse res))
3885 (vector (car ls)))
3886 (range->cset (char-altcase (caar ls))
3887 (char-altcase (cdar ls))))))))
3888 (else (lp (cdr ls) (reverse (vector->list
3889 (cset-union (list->vector (reverse res))
3890 (vector (car ls))))))))))
3891
3892;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3893;;;; Match and Replace Utilities
3894
3895(define (irregex-fold/fast irx kons knil str . o)
3896 (if (not (string? str)) (error 'irregex-fold "not a string" str))
3897 (let* ((irx (irregex irx))
3898 (matches (irregex-new-matches irx))
3899 (finish (or (and (pair? o) (car o)) (lambda (i acc) acc)))
3900 (start (if (and (pair? o) (pair? (cdr o))) (cadr o) 0))
3901 (end (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o)))
3902 (caddr o)
3903 (string-length str)))
3904 (init-src (list str start end))
3905 (init (cons init-src start)))
3906 (if (not (and (integer? start) (exact? start)))
3907 (error 'irregex-fold "not an exact integer" start))
3908 (if (not (and (integer? end) (exact? end)))
3909 (error 'irregex-fold "not an exact integer" end))
3910 (irregex-match-chunker-set! matches irregex-basic-string-chunker)
3911 (let lp ((src init-src) (from start) (i start) (acc knil))
3912 (if (>= i end)
3913 (finish from acc)
3914 (let ((m (irregex-search/matches
3915 irx
3916 irregex-basic-string-chunker
3917 init
3918 src
3919 i
3920 matches)))
3921 (if (not m)
3922 (finish from acc)
3923 (let ((j-start (%irregex-match-start-index m 0))
3924 (j (%irregex-match-end-index m 0))
3925 (acc (kons from m acc)))
3926 (irregex-reset-matches! matches)
3927 (cond
3928 ((flag-set? (irregex-flags irx) ~consumer?)
3929 (finish j acc))
3930 ((= j j-start)
3931 ;; skip one char forward if we match the empty string
3932 (lp (list str j end) j (+ j 1) acc))
3933 (else
3934 (lp (list str j end) j j acc))))))))))
3935
3936(define (irregex-fold irx kons . args)
3937 (if (not (procedure? kons)) (error 'irregex-fold "not a procedure" kons))
3938 (let ((kons2 (lambda (i m acc) (kons i (irregex-copy-matches m) acc))))
3939 (apply irregex-fold/fast irx kons2 args)))
3940
3941(define (irregex-fold/chunked/fast irx kons knil cnk start . o)
3942 (let* ((irx (irregex irx))
3943 (matches (irregex-new-matches irx))
3944 (finish (or (and (pair? o) (car o)) (lambda (src i acc) acc)))
3945 (i (if (and (pair? o) (pair? (cdr o)))
3946 (cadr o)
3947 ((chunker-get-start cnk) start)))
3948 (init (cons start i)))
3949 (if (not (integer? i)) (error 'irregex-fold/chunked "not an integer" i))
3950 (irregex-match-chunker-set! matches cnk)
3951 (let lp ((start start) (i i) (acc knil))
3952 (if (not start)
3953 (finish start i acc)
3954 (let ((m (irregex-search/matches irx cnk init start i matches)))
3955 (if (not m)
3956 (finish start i acc)
3957 (let ((end-src (%irregex-match-end-chunk m 0))
3958 (end-index (%irregex-match-end-index m 0)))
3959 (if (and (eq? end-src start) (= end-index i))
3960 (if (>= end-index ((chunker-get-end cnk) end-src ))
3961 (let ((next ((chunker-get-next cnk) end-src)))
3962 (lp next ((chunker-get-start cnk) next) acc))
3963 (lp end-src (+ end-index 1) acc))
3964 (let ((acc (kons start i m acc)))
3965 (irregex-reset-matches! matches)
3966 (if (flag-set? (irregex-flags irx) ~consumer?)
3967 (finish end-src end-index acc)
3968 (lp end-src end-index acc)))))))))))
3969
3970(define (irregex-fold/chunked irx kons . args)
3971 (if (not (procedure? kons)) (error 'irregex-fold/chunked "not a procedure" kons))
3972 (let ((kons2 (lambda (s i m acc) (kons s i (irregex-copy-matches m) acc))))
3973 (apply irregex-fold/chunked/fast irx kons2 args)))
3974
3975(define (irregex-replace irx str . o)
3976 (if (not (string? str)) (error 'irregex-replace "not a string" str))
3977 (let ((m (irregex-search irx str)))
3978 (if m
3979 (string-cat-reverse
3980 (cons (substring str (%irregex-match-end-index m 0) (string-length str))
3981 (append (irregex-apply-match m o)
3982 (list (substring str 0 (%irregex-match-start-index m 0)))
3983 )))
3984 str)))
3985
3986(define (irregex-replace/all irx str . o)
3987 (if (not (string? str)) (error 'irregex-replace/all "not a string" str))
3988 (irregex-fold/fast
3989 irx
3990 (lambda (i m acc)
3991 (let ((m-start (%irregex-match-start-index m 0)))
3992 (if (>= i m-start)
3993 (append (irregex-apply-match m o) acc)
3994 (append (irregex-apply-match m o)
3995 (cons (substring str i m-start) acc)))))
3996 '()
3997 str
3998 (lambda (i acc)
3999 (let ((end (string-length str)))
4000 (string-cat-reverse (if (>= i end)
4001 acc
4002 (cons (substring str i end) acc)))))))
4003
4004(define (irregex-apply-match m ls)
4005 (let lp ((ls ls) (res '()))
4006 (if (null? ls)
4007 res
4008 (cond
4009 ((integer? (car ls))
4010 (lp (cdr ls)
4011 (cons (or (irregex-match-substring m (car ls)) "") res)))
4012 ((procedure? (car ls))
4013 (lp (cdr ls) (cons ((car ls) m) res)))
4014 ((symbol? (car ls))
4015 (case (car ls)
4016 ((pre)
4017 (lp (cdr ls)
4018 (cons (substring (car (%irregex-match-start-chunk m 0))
4019 0
4020 (%irregex-match-start-index m 0))
4021 res)))
4022 ((post)
4023 (let ((str (car (%irregex-match-start-chunk m 0))))
4024 (lp (cdr ls)
4025 (cons (substring str
4026 (%irregex-match-end-index m 0)
4027 (string-length str))
4028 res))))
4029 (else
4030 (cond
4031 ((assq (car ls) (irregex-match-names m))
4032 => (lambda (x) (lp (cons (cdr x) (cdr ls)) res)))
4033 (else
4034 (error "unknown match replacement" (car ls)))))))
4035 (else
4036 (lp (cdr ls) (cons (car ls) res)))))))
4037
4038(define (irregex-extract irx str . o)
4039 (if (not (string? str)) (error 'irregex-extract "not a string" str))
4040 (apply irregex-fold/fast
4041 irx
4042 (lambda (i m a) (cons (irregex-match-substring m) a))
4043 '()
4044 str
4045 (lambda (i a) (reverse a))
4046 o))
4047
4048(define (irregex-split irx str . o)
4049 (if (not (string? str)) (error 'irregex-split "not a string" str))
4050 (let ((start (if (pair? o) (car o) 0))
4051 (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str))))
4052 (irregex-fold/fast
4053 irx
4054 (lambda (i m a)
4055 (cond
4056 ((= i (%irregex-match-start-index m 0))
4057 a)
4058 (else
4059 (cons (substring str i (%irregex-match-start-index m 0)) a))))
4060 '()
4061 str
4062 (lambda (i a)
4063 (let lp ((ls (if (= i end) a (cons (substring str i end) a)))
4064 (res '())
4065 (was-char? #f))
4066 (cond
4067 ((null? ls) res)
4068 ((char? (car ls))
4069 (lp (cdr ls)
4070 (if (or was-char? (null? res))
4071 (cons (string (car ls)) res)
4072 (cons (string-append (string (car ls)) (car res))
4073 (cdr res)))
4074 #t))
4075 (else (lp (cdr ls) (cons (car ls) res) #f)))))
4076 start
4077 end)))