~ chicken-core (chicken-5) 4a3cec73c9a723bcf5ea656c9599a3b793cb6ecf
commit 4a3cec73c9a723bcf5ea656c9599a3b793cb6ecf Author: felix <felix@call-with-current-continuation.org> AuthorDate: Thu Jul 15 15:08:03 2010 +0200 Commit: felix <felix@call-with-current-continuation.org> CommitDate: Tue Jul 27 13:09:33 2010 +0200 added irregex manual (thanks to sjamaan) and made various changes for completing irregex promotion diff --git a/distribution/manifest b/distribution/manifest index c0687166..2f3e2898 100644 --- a/distribution/manifest +++ b/distribution/manifest @@ -29,6 +29,7 @@ compiler-syntax.c scrutinizer.c unboxing.c regex.c +irregex.c posixunix.c posixwin.c profiler.c @@ -79,6 +80,7 @@ scrutinizer.scm unboxing.scm regex.scm irregex.scm +irregex-core.scm posixunix.scm posixwin.scm posix-common.scm @@ -290,6 +292,7 @@ manual/Unit lolevel manual/Unit ports manual/Unit posix manual/Unit regex +manual/Unit irregex manual/Unit srfi-1 manual/Unit srfi-13 manual/Unit srfi-14 diff --git a/files.scm b/files.scm index 2c1c167f..c0a47c66 100644 --- a/files.scm +++ b/files.scm @@ -277,10 +277,10 @@ EOF (##sys#check-string pn 'decompose-pathname) (if (fx= 0 (##sys#size pn)) (values #f #f #f) - (let ([ms (string-match rx1 pn)]) + (let ([ms (string-search rx1 pn)]) (if ms (values (strip-pds (cadr ms)) (caddr ms) (car (cddddr ms))) - (let ([ms (string-match rx2 pn)]) + (let ([ms (string-search rx2 pn)]) (if ms (values (strip-pds (cadr ms)) (caddr ms) #f) (values (strip-pds pn) #f #f) ) ) ) ) ) ) ) ) ) diff --git a/irregex-core.scm b/irregex-core.scm index 7fc539b4..ed3be22d 100644 --- a/irregex-core.scm +++ b/irregex-core.scm @@ -3635,6 +3635,7 @@ (lp end-src end-index acc)))))))) (define (irregex-fold/chunked irx kons . args) + (if (not (procedure? 
kons)) (error "irregex-fold/chunked: not a procedure" kons)) (let ((kons2 (lambda (s i m acc) (kons s i (irregex-copy-matches m) acc)))) (apply irregex-fold/chunked/fast irx kons2 args))) diff --git a/irregex.import.scm b/irregex.import.scm index 71f406c9..53e001d0 100644 --- a/irregex.import.scm +++ b/irregex.import.scm @@ -31,28 +31,43 @@ irregex-dfa irregex-dfa/extract irregex-dfa/search + irregex-extract irregex-flags irregex-fold + irregex-fold/chunked irregex-lengths irregex-match + irregex-match? irregex-match-data? irregex-match-end + irregex-match-end-chunk irregex-match-end-index + irregex-match-names irregex-match-num-submatches irregex-match-start + irregex-match-start-chunk irregex-match-start-index irregex-match-string + irregex-match-subchunk irregex-match-substring + irregex-match/chunked irregex-names irregex-new-matches irregex-nfa + irregex-num-submatches + irregex-opt + irregex-quote irregex-replace irregex-replace/all irregex-reset-matches! irregex-search + irregex-search/chunked irregex-search/matches + irregex-split irregex-submatches irregex? + make-irregex-chunker + maybe-string->sre sre->irregex string->irregex string->sre diff --git a/irregex.scm b/irregex.scm new file mode 100644 index 00000000..bd37aba9 --- /dev/null +++ b/irregex.scm @@ -0,0 +1,87 @@ +;;;; irregex.scm - container for irregex-core.scm +; +; Copyright (c) 2010, The Chicken Team +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following +; conditions are met: +; +; Redistributions of source code must retain the above copyright notice, this list of conditions and the following +; disclaimer. +; Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +; disclaimer in the documentation and/or other materials provided with the distribution. 
+; Neither the name of the author nor the names of its contributors may be used to endorse or promote +; products derived from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS +; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +; AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +; POSSIBILITY OF SUCH DAMAGE. + + + +(declare (unit irregex)) + +(declare + (disable-interrupts) + (no-procedure-checks) + (fixnum) + (export + irregex + irregex-apply-match + irregex-dfa + irregex-dfa/extract + irregex-dfa/search + irregex-extract + irregex-flags + irregex-fold + irregex-fold/chunked + irregex-lengths + irregex-match + irregex-match? + irregex-match-data? + irregex-match-end + irregex-match-end-chunk + irregex-match-end-index + irregex-match-names + irregex-match-num-submatches + irregex-match-start + irregex-match-start-chunk + irregex-match-start-index + irregex-match-string + irregex-match-subchunk + irregex-match-substring + irregex-match/chunked + irregex-names + irregex-new-matches + irregex-nfa + irregex-num-submatches + irregex-opt + irregex-quote + irregex-replace + irregex-replace/all + irregex-reset-matches! + irregex-search + irregex-search/chunked + irregex-search/matches + irregex-split + irregex-submatches + irregex? 
+ make-irregex-chunker + maybe-string->sre + irregex-search/chunked + sre->irregex + string->irregex + string->sre + )) + +(include "common-declarations.scm") + +(register-feature! 'irregex) + +(include "irregex-core.scm") diff --git a/library.scm b/library.scm index 9d725885..53036b6e 100644 --- a/library.scm +++ b/library.scm @@ -76,7 +76,8 @@ #define C_a_get_current_seconds(ptr, c, dummy) C_flonum(ptr, time(NULL)) #define C_peek_c_string_at(ptr, i) ((C_char *)(((C_char **)ptr)[ i ])) -static C_word fast_read_line_from_file(C_word str, C_word port, C_word size) { +static C_word +fast_read_line_from_file(C_word str, C_word port, C_word size) { int n = C_unfix(size); int i; int c; @@ -101,7 +102,7 @@ static C_word fast_read_line_from_file(C_word str, C_word port, C_word size) { } static C_word -fast_read_string_from_file (C_word dest, C_word port, C_word len, C_word pos) +fast_read_string_from_file(C_word dest, C_word port, C_word len, C_word pos) { int n = C_unfix (len); char * buf = ((char *)C_data_pointer (dest) + C_unfix (pos)); @@ -1731,7 +1732,8 @@ EOF (define (##sys#check-port x . loc) (unless (%port? x) - (##sys#signal-hook #:type-error (and (pair? loc) (car loc)) "argument is not a port" x) ) ) + (##sys#signal-hook + #:type-error (and (pair? loc) (car loc)) "argument is not a port" x) ) ) (define (##sys#check-port-mode port mode . loc) (unless (eq? 
mode (##sys#slot port 1)) diff --git a/manual/Supported language b/manual/Supported language index 3fe014cd..07794a9e 100644 --- a/manual/Supported language +++ b/manual/Supported language @@ -19,7 +19,8 @@ * [[Unit ports]] I/O ports * [[Unit files]] File and pathname operations * [[Unit extras]] Useful utility definitions -* [[Unit regex]] Regular expressions +* [[Unit irregex]] Regular expressions +* [[Unit regex]] Regular expression utilities * [[Unit srfi-1]] List Library * [[Unit srfi-4]] Homogeneous numeric vectors * [[Unit srfi-13]] String library diff --git a/manual/Unit extras b/manual/Unit extras index 72081c1f..be49fa4f 100644 --- a/manual/Unit extras +++ b/manual/Unit extras @@ -196,4 +196,4 @@ false. Returns a string with the accumulated characters. --- Previous: [[Unit files]] -Next: [[Unit regex]] +Next: [[Unit irregex]] diff --git a/manual/Unit irregex b/manual/Unit irregex new file mode 100644 index 00000000..f8c37036 --- /dev/null +++ b/manual/Unit irregex @@ -0,0 +1,819 @@ +[[tags: manual]] +[[toc:]] + +== Unit irregex + +This library unit provides support for regular expressions, using the +powerful ''irregex'' regular expression engine by Alex Shinn. It +supports both POSIX syntax with various (irregular) PCRE extensions, +as well as SCSH's SRE syntax, with various aliases for commonly used +patterns. DFA matching is used when possible, otherwise a +closure-compiled NFA approach is used. Matching may be performed over +standard Scheme strings, or over arbitrarily chunked streams of +strings. 
+ +On systems that support dynamic loading, the {{irregex}} unit can +be made available in the Chicken interpreter ({{csi}}) by entering + +<enscript highlight=scheme> +(require-extension irregex) +</enscript> + +[[toc:]] + +=== Specification + +==== Procedures + +===== irregex +===== string->irregex +===== sre->irregex + +<procedure>(irregex <posix-string-or-sre> [<options> ...])</procedure><br> +<procedure>(string->irregex <posix-string> [<options> ...])</procedure><br> +<procedure>(sre->irregex <sre> [<options> ...])</procedure><br> + +Compiles a regular expression from either a POSIX-style regular +expression string (with most PCRE extensions) or an SCSH-style SRE. +There is no {{(rx ...)}} syntax - just use normal Scheme lists, with +{{quasiquote}} if you like. + +Technically a string by itself could be considered a valid (though +rather silly) SRE, so if you want to just match a literal string you +should use something like {{(irregex `(: ,str))}}, or use the explicit +{{(sre->irregex str)}}. + +The options are a list of any of the following symbols: + +; {{'i}}, {{'case-insensitive}} : match case-insensitively +; {{'m}}, {{'multi-line}} : treat string as multiple lines (affects {{^}} and {{$}}) +; {{'s}}, {{'single-line}} : treat string as a single line ({{.}} can match newline) +; {{'utf8}} : utf8-mode (assumes strings are byte-strings) +; {{'fast}} : try to optimize the regular expression +; {{'small}} : try to compile a smaller regular expression +; {{'backtrack}} : enforce a backtracking implementation + +The {{'fast}} and {{'small}} options are heuristic guidelines and will +not necessarily make the compiled expression faster or smaller. + +===== string->sre +===== maybe-string->sre + +<procedure>(string->sre <str>)</procedure><br> +<procedure>(maybe-string->sre <obj>)</procedure><br> + +For backwards compatibility, procedures to convert a POSIX string into +an SRE.
+ +{{maybe-string->sre}} does the same thing, but only if the argument is +a string, otherwise it assumes {{<obj>}} is an SRE and returns it +as-is. This is useful when you want to provide an API that allows +either a POSIX string or SRE (like {{irregex}} or {{irregex-search}} +below) - it ensures the result is an SRE. + +===== irregex? + +<procedure>(irregex? <obj>)</procedure><br> + +Returns {{#t}} iff the object is a regular expression. + +===== irregex-search + +<procedure>(irregex-search <irx> <str> [<start> <end>])</procedure> + +Searches for any instances of the pattern {{<irx>}} (a POSIX string, SRE +sexp, or pre-compiled regular expression) in {{<str>}}, optionally between +the given range. If a match is found, returns a match object, +otherwise returns {{#f}}. + +Match objects can be used to query the original range of the string or +its submatches using the {{irregex-match-*}} procedures below. + +Examples: + +<enscript highlight=scheme> +(irregex-search "foobar" "abcFOOBARdef") => #f + +(irregex-search (irregex "foobar" 'i) "abcFOOBARdef") => #<match> + +(irregex-search '(w/nocase "foobar") "abcFOOBARdef") => #<match> +</enscript> + +Note, the actual match result is represented by a vector in the +default implementation. Throughout this manual, we'll just write +{{#<match>}} to show that a successful match was returned when the +details are not important. + +Matching follows the POSIX leftmost, longest semantics, when +searching. That is, of all possible matches in the string, +{{irregex-search}} will return the match at the first position +(leftmost). If multiple matches are possible from that same first +position, the longest match is returned. + +===== irregex-match + +<procedure>(irregex-match <irx> <str>)</procedure> + +Like {{irregex-search}}, but performs an anchored match against the +beginning and end of the string, without searching.
+ +Examples: + +<enscript highlight=scheme> +(irregex-match '(w/nocase "foobar") "abcFOOBARdef") => #f + +(irregex-match '(w/nocase "foobar") "FOOBAR") => #<match> +</enscript> + +===== irregex-match-data? + +<procedure>(irregex-match-data? <obj>)</procedure> + +Returns {{#t}} iff the object is a successful match result from +{{irregex-search}} or {{irregex-match}}. + +===== irregex-num-submatches +===== irregex-match-num-submatches + +<procedure>(irregex-num-submatches <irx>)</procedure><br> +<procedure>(irregex-match-num-submatches <match>)</procedure> + +Returns the number of numbered submatches that are defined in the +irregex or match object. + +===== irregex-names +===== irregex-match-names + +<procedure>(irregex-names <irx>)</procedure><br> +<procedure>(irregex-match-names <match>)</procedure> + +Returns an association list of named submatches that are defined in +the irregex or match object. The {{car}} of each item in this list is +the name of a submatch, the {{cdr}} of each item is the numerical +submatch corresponding to this name. If a named submatch occurs +multiple times in the irregex, it will also occur multiple times in +this list. + +===== irregex-match-substring +===== irregex-match-start-index +===== irregex-match-end-index + +<procedure>(irregex-match-substring <match> [<index-or-name>])</procedure><br> +<procedure>(irregex-match-start-index <match> <index-or-name>)</procedure><br> +<procedure>(irregex-match-end-index <match> <index-or-name>)</procedure> + +Fetches the matched substring (or its start or end offset) at the +given submatch index, or named submatch. The entire match is index 0, +the first 1, etc. The default is index 0. + +===== irregex-match-subchunk + +<procedure>(irregex-match-subchunk <match> [<index-or-name>])</procedure> + +Generates a chunked data-type for the given match item, of the same +type as the underlying chunk type (see Chunked String Matching below). 
+This is only available if the chunk type specifies the get-subchunk +API, otherwise an error is raised. + +===== irregex-replace +===== irregex-replace/all + +<procedure>(irregex-replace <irx> <str> [<replacements> ...])</procedure><br> +<procedure>(irregex-replace/all <irx> <str> [<replacements> ...])</procedure> + +Matches a pattern in a string, and replaces it with a (possibly empty) +list of substitutions. Each {{<replacement>}} can be either a string +literal, a numeric index, a symbol (as a named submatch), or a +procedure which takes one argument (the match object) and returns a +string. + +Examples: + +<enscript highlight=scheme> +(irregex-replace "[aeiou]" "hello world" "*") => "h*llo world" + +(irregex-replace/all "[aeiou]" "hello world" "*") => "h*ll* w*rld" +</enscript> + +===== irregex-split +===== irregex-extract + +<procedure>(irregex-split <irx> <str> [<start> <end>])</procedure><br> +<procedure>(irregex-extract <irx> <str> [<start> <end>])</procedure> + +{{irregex-split}} splits the string {{<str>}} into substrings divided +by the pattern in {{<irx>}}. {{irregex-extract}} does the opposite, +returning a list of each instance of the pattern matched disregarding +the substrings in between. + +===== irregex-fold + +<procedure>(irregex-fold <irx> <kons> <knil> <str> [<finish> <start> <end>])</procedure> + +This performs a fold operation over every non-overlapping place +{{<irx>}} occurs in the string {{str}}. + +The {{<kons>}} procedure takes the following signature: + +<enscript highlight=scheme> +(<kons> <from-index> <match> <seed>) +</enscript> + +where {{<from-index>}} is the index from where we started searching +(initially {{<start>}} and thereafter the end index of the last +match), {{<match>}} is the resulting match-data object, and {{<seed>}} +is the accumulated fold result starting with {{<knil>}}. 
+ +The rationale for providing the {{<from-index>}} (which is not +provided in the SCSH {{regexp-fold}} utility), is because this +information is useful (e.g. for extracting the unmatched portion of +the string before the current match, as needed in +{{irregex-replace}}), and not otherwise directly accessible. + +The optional {{<finish>}} takes two arguments: + +<enscript highlight=scheme> +(<finish> <from-index> <seed>) +</enscript> + +which similarly allows you to pick up the unmatched tail of the string, +and defaults to just returning the {{<seed>}}. + +{{<start>}} and {{<end>}} are numeric indices letting you specify the +boundaries of the string on which you want to fold. + +To extract all instances of a match out of a string, you can use + +<enscript highlight=scheme> +(map irregex-match-substring + (irregex-fold <irx> + (lambda (i m s) (cons m s)) + '() + <str> + (lambda (i s) (reverse s)))) +</enscript> + +==== Extended SRE Syntax + +Irregex provides the first native implementation of SREs (Scheme +Regular Expressions), and includes many extensions necessary both for +minimal POSIX compatibility, as well as for modern extensions found in +libraries such as PCRE. + +The following table summarizes the SRE syntax, with detailed +explanations following. + + ;; basic patterns + <string> ; literal string + (seq <sre> ...) ; sequence + (: <sre> ...) + (or <sre> ...) ; alternation + + ;; optional/multiple patterns + (? <sre> ...) ; 0 or 1 matches + (* <sre> ...) ; 0 or more matches + (+ <sre> ...) ; 1 or more matches + (= <n> <sre> ...) ; exactly <n> matches + (>= <n> <sre> ...) ; <n> or more matches + (** <from> <to> <sre> ...) ; <from> to <to> matches + (?? <sre> ...) ; non-greedy pattern: (0 or 1) + (*? <sre> ...) ; non-greedy kleene star + (**? <from> <to> <sre> ...) ; non-greedy range + + ;; submatch patterns + (submatch <sre> ...) ; numbered submatch + ($ <sre> ...) + (submatch-named <name> <sre> ...) ; named submatch + (=> <name> <sre> ...)
+ (backref <n-or-name>) ; match a previous submatch + + ;; toggling case-sensitivity + (w/case <sre> ...) ; enclosed <sre>s are case-sensitive + (w/nocase <sre> ...) ; enclosed <sre>s are case-insensitive + + ;; character sets + <char> ; singleton char set + (<string>) ; set of chars + (or <cset-sre> ...) ; set union + (~ <cset-sre> ...) ; set complement (i.e. [^...]) + (- <cset-sre> ...) ; set difference + (& <cset-sre> ...) ; set intersection + (/ <range-spec> ...) ; pairs of chars as ranges + + ;; named character sets + any + nonl + ascii + lower-case lower + upper-case upper + alphabetic alpha + numeric num + alphanumeric alphanum alnum + punctuation punct + graphic graph + whitespace white space + printing print + control cntrl + hex-digit xdigit + + ;; assertions and conditionals + bos eos ; beginning/end of string + bol eol ; beginning/end of line + bow eow ; beginning/end of word + nwb ; non-word-boundary + (look-ahead <sre> ...) ; zero-width look-ahead assertion + (look-behind <sre> ...) ; zero-width look-behind assertion + (neg-look-ahead <sre> ...) ; zero-width negative look-ahead assertion + (neg-look-behind <sre> ...) ; zero-width negative look-behind assertion + (atomic <sre> ...) ; for (?>...) independent patterns + (if <test> <pass> [<fail>]) ; conditional patterns + commit ; don't backtrack beyond this (i.e. cut) + + ;; backwards compatibility + (posix-string <string>) ; embed a POSIX string literal + +===== Basic SRE Patterns + +The simplest SRE is a literal string, which matches that string +exactly. 
+ +<enscript highlight=scheme> +(irregex-search "needle" "hayneedlehay") => #<match> +</enscript> + +By default the match is case-sensitive, though you can control this +either with the compiler flags or local overrides: + +<enscript highlight=scheme> +(irregex-search "needle" "haynEEdlehay") => #f + +(irregex-search (irregex "needle" 'i) "haynEEdlehay") => #<match> + +(irregex-search '(w/nocase "needle") "haynEEdlehay") => #<match> +</enscript> + +You can use {{w/case}} to switch back to case-sensitivity inside a +{{w/nocase}} or when the SRE was compiled with {{'i}}: + +<enscript highlight=scheme> +(irregex-search '(w/nocase "SMALL" (w/case "BIG")) "smallBIGsmall") => #<match> + +(irregex-search '(w/nocase "small" (w/case "big")) "smallBIGsmall") => #f +</enscript> + +Of course, literal strings by themselves aren't very interesting +regular expressions, so we want to be able to compose them. The most +basic way to do this is with the {{seq}} operator (or its abbreviation +{{:}}), which matches one or more patterns consecutively: + +<enscript highlight=scheme> +(irregex-search '(: "one" space "two" space "three") "one two three") => #<match> +</enscript> + +As you may have noticed above, the {{w/case}} and {{w/nocase}} +operators allowed multiple SREs in a sequence - other operators that +take any number of arguments (e.g. the repetition operators below) +allow such implicit sequences. + +To match any one of a set of patterns use the {{or}} alternation +operator: + +<enscript highlight=scheme> +(irregex-search '(or "eeney" "meeney" "miney") "meeney") => #<match> + +(irregex-search '(or "eeney" "meeney" "miney") "moe") => #f +</enscript> + +===== SRE Repetition Patterns + +There are also several ways to control the number of times a pattern +is matched. The simplest of these is {{?}} which just optionally +matches the pattern: + +<enscript highlight=scheme> +(irregex-search '(: "match" (? "es") "!") "matches!") => #<match> + +(irregex-search '(: "match" (? 
"es") "!") "match!") => #<match> + +(irregex-search '(: "match" (? "es") "!") "matche!") => #f +</enscript> + +To optionally match any number of times, use {{*}}, the Kleene star: + +<enscript highlight=scheme> +(irregex-search '(: "<" (* (~ #\>)) ">") "<html>") => #<match> + +(irregex-search '(: "<" (* (~ #\>)) ">") "<>") => #<match> + +(irregex-search '(: "<" (* (~ #\>)) ">") "<html") => #f +</enscript> + +Often you want to match any number of times, but at least one time is +required, and for that you use {{+}}: + +<enscript highlight=scheme> +(irregex-search '(: "<" (+ (~ #\>)) ">") "<html>") => #<match> + +(irregex-search '(: "<" (+ (~ #\>)) ">") "<a>") => #<match> + +(irregex-search '(: "<" (+ (~ #\>)) ">") "<>") => #f +</enscript> + +More generally, to match at least a given number of times, use {{>=}}: + +<enscript highlight=scheme> +(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<table>") => #<match> + +(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<pre>") => #<match> + +(irregex-search '(: "<" (>= 3 (~ #\>)) ">") "<tr>") => #f +</enscript> + +To match a specific number of times exactly, use {{=}}: + +<enscript highlight=scheme> +(irregex-search '(: "<" (= 4 (~ #\>)) ">") "<html>") => #<match> + +(irregex-search '(: "<" (= 4 (~ #\>)) ">") "<table>") => #f +</enscript> + +And finally, the most general form is {{**}} which specifies a range +of times to match. All of the earlier forms are special cases of this. + +<enscript highlight=scheme> +(irregex-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.168.1.10") => #<match> + +(irregex-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.0168.1.10") => #f +</enscript> + +There are also so-called "non-greedy" variants of these repetition +operators, by convention suffixed with an additional {{?}}. Since the +normal repetition patterns can match any of the allotted repetition +range, these operators will match a string if and only if the normal +versions matched.
However, when the endpoints of which submatch +matched where are taken into account (specifically, all matches when +using irregex-search since the endpoints of the match itself matter), +the use of a non-greedy repetition can change the result. + +So, whereas {{?}} can be thought to mean "match or don't match," +{{??}} means "don't match or match." {{*}} typically consumes as much +as possible, but {{*?}} tries first to match zero times, and only +consumes one at a time if that fails. If you have a greedy operator +followed by a non-greedy operator in the same pattern, they can +produce surprising results as they compete to make the match longer or +shorter. If this seems confusing, that's because it is. Non-greedy +repetitions are defined only in terms of the specific backtracking +algorithm used to implement them, which for compatibility purposes +always means the Perl algorithm. Thus, when using these patterns you +force IrRegex to use a backtracking engine, and can't rely on +efficient execution. + +===== SRE Character Sets + +Perhaps more common than matching specific strings is matching any of +a set of characters. You can use the {{or}} alternation pattern on a +list of single-character strings to simulate a character set, but this +is too clumsy for everyday use so SRE syntax allows a number of +shortcuts. + +A single character matches that character literally, a trivial +character class. More conveniently, a list holding a single element +which is a string refers to the character set composed of every +character in the string. + +<enscript highlight=scheme> +(irregex-match '(* #\-) "---") => #<match> + +(irregex-match '(* #\-) "-_-") => #f + +(irregex-match '(* ("aeiou")) "oui") => #<match> + +(irregex-match '(* ("aeiou")) "ouais") => #f +</enscript> + +Ranges are introduced with the {{/}} operator. Any strings or +characters in the {{/}} are flattened and then taken in pairs to +represent the start and end points, inclusive, of character ranges.
+ +<enscript highlight=scheme> +(irregex-match '(* (/ "AZ09")) "R2D2") => #<match> + +(irregex-match '(* (/ "AZ09")) "C-3PO") => #f +</enscript> + +In addition, a number of set algebra operations are provided. {{or}}, +of course, has the same meaning, but when all the options are +character sets it can be thought of as the set union operator. This +is further extended by the {{&}} set intersection, {{-}} set +difference, and {{~}} set complement operators. + +<enscript highlight=scheme> +(irregex-match '(* (& (/ "az") (~ ("aeiou")))) "xyzzy") => #<match> + +(irregex-match '(* (& (/ "az") (~ ("aeiou")))) "vowels") => #f + +(irregex-match '(* (- (/ "az") ("aeiou"))) "xyzzy") => #<match> + +(irregex-match '(* (- (/ "az") ("aeiou"))) "vowels") => #f +</enscript> + +===== SRE Assertion Patterns + +There are a number of times it can be useful to assert something about +the area around a pattern without explicitly making it part of the +pattern. The most common cases are specifically anchoring some +pattern to the beginning or end of a word or line or even the whole +string. For example, to match on the end of a word: + +<enscript highlight=scheme> +(irregex-search '(: "foo" eow) "foo") => #<match> + +(irregex-search '(: "foo" eow) "foo!") => #<match> + +(irregex-search '(: "foo" eow) "foof") => #f +</enscript> + +The {{bow}}, {{bol}}, {{eol}}, {{bos}} and {{eos}} work similarly. +{{nwb}} asserts that you are not in a word-boundary - if substituted for +{{eow}} in the above examples it would reverse all the results. + +There is no {{wb}}, since you tend to know from context whether it +would be the beginning or end of a word, but if you need it you can +always use {{(or bow eow)}}. + +Somewhat more generally, Perl introduced positive and negative +look-ahead and look-behind patterns. Perl look-behind patterns are +limited to a fixed length, however the IrRegex versions have no such +limit.
+ +<enscript highlight=scheme> +(irregex-search '(: "regular" (look-ahead " expression")) + "regular expression") + => #<match> +</enscript> + +The most general case, of course, would be an {{and}} pattern to +complement the {{or}} pattern - all the patterns must match or the +whole pattern fails. This may be provided in a future release, +although it (and look-ahead and look-behind assertions) are unlikely +to be compiled efficiently. + +===== SRE Utility Patterns + +The following utility regular expressions are also provided for common +patterns that people are eternally reinventing. They are not +necessarily the official patterns matching the RFC definitions of the +given data, because of the way that such patterns tend to be used. +There are three general usages for regexps: + +; searching : search for a pattern matching a desired object in a larger text + +; validation : determine whether an entire string matches a pattern + +; extraction : given a string already known to be valid, extract certain fields from it as submatches + +In some cases, but not always, these will overlap. When they are +different, {{irregex-search}} will naturally always want the searching +version, so IrRegex provides that version. + +As an example where these might be different, consider a URL. If you +want to match all the URLs in some arbitrary text, you probably want +to exclude a period or comma at the tail end of a URL, since it's more +likely being used as punctuation rather than part of the URL, despite +the fact that it would be valid URL syntax. + +Another problem with the RFC definitions is the standard itself may +have become irrelevant. For example, the pattern IrRegex provides for +email addresses doesn't match quoted local parts (e.g. +{{"first last"@domain.com}}) because these are increasingly rare, and +unsupported by enough software that it's better to discourage their use. +Conversely, technically consecutive periods +(e.g.
{{first..last@domain.com}}) are not allowed in email addresses, but +most email software does allow this, and in fact such addresses are +quite common in Japan. + +The current patterns provided are: + + newline ; general newline pattern (crlf, cr, lf) + integer ; an integer + real ; a real number (including scientific) + string ; a "quoted" string + symbol ; an R5RS Scheme symbol + ipv4-address ; a numeric decimal ipv4 address + ipv6-address ; a numeric hexadecimal ipv6 address + domain ; a domain name + email ; an email address + http-url ; a URL beginning with https?:// + +Because of these issues the exact definitions of these patterns are +subject to change, but will be documented clearly when they are +finalized. More common patterns are also planned, but as what you +want increases in complexity it's probably better to use a real +parser. + +==== Supported PCRE Syntax + +Since the PCRE syntax is so overwhelmingly complex, it's easier to just +list what we *don't* support for now. Refer to the +[[http://pcre.org/pcre.txt|PCRE documentation]] for details. You +should be using the SRE syntax anyway! + +Unicode character classes ({{\P}}) are not supported, but will be +in an upcoming release. {{\C}} named characters are not supported. + +Callbacks, subroutine patterns and recursive patterns are not +supported. ({{*FOO}}) patterns are not supported and may never be. + +{{\G}} and {{\K}} are not supported. + +Octal character escapes are not supported because they are ambiguous +with back-references - just use hex character escapes. + +Other than that everything should work, including named submatches, +zero-width assertions, conditional patterns, etc. + +In addition, {{\<}} and {{\>}} act as beginning-of-word and end-of-word +marks, respectively, as in Emacs regular expressions. + +Also, two escapes are provided to embed SRE patterns inside PCRE +strings, {{"\'<sre>"}} and {{"(*'<sre>)"}}.
For example, to match a +comma-delimited list of integers you could use + +<enscript highlight=scheme> +"\\'integer(,\\'integer)*" +</enscript> + +and to match a URL in angle brackets you could use + +<enscript highlight=scheme> +"<(*'http-url)>" +</enscript> + +Note in the second example the enclosing {{"(*'...)"}} syntax is needed +because the Scheme reader would consider the closing {{">"}} as part of +the SRE symbol. + +The following chart gives a quick reference from PCRE form to the SRE +equivalent: + + ;; basic syntax + "^" ;; bos (or bol inside (?m: ...)) + "$" ;; eos (or eol inside (?m: ...)) + "." ;; nonl + "a?" ;; (? a) + "a*" ;; (* a) + "a+" ;; (+ a) + "a??" ;; (?? a) + "a*?" ;; (*? a) + "a+?" ;; (+? a) + "a{n,m}" ;; (** n m a) + + ;; grouping + "(...)" ;; (submatch ...) + "(?:...)" ;; (: ...) + "(?i:...)" ;; (w/nocase ...) + "(?-i:...)" ;; (w/case ...) + "(?<name>...)" ;; (=> <name>...) + + ;; character classes + "[aeiou]" ;; ("aeiou") + "[^aeiou]" ;; (~ "aeiou") + "[a-z]" ;; (/ "az") or (/ "a" "z") + "[[:alpha:]]" ;; alpha + + ;; assertions + "(?=...)" ;; (look-ahead ...) + "(?!...)" ;; (neg-look-ahead ...) + "(?<=...)" ;; (look-behind ...) + "(?<!...)" ;; (neg-look-behind ...) + "(?(test)pass|fail)" ;; (if test pass fail) + "(*COMMIT)" ;; commit + +==== Chunked String Matching + +It's often desirable to perform regular expression matching over +sequences of characters not represented as a single string. The most +obvious example is a text-buffer data structure, but you may also want +to match over lists or trees of strings (i.e. ropes), over only +certain ranges within a string, over an input port, etc. With +existing regular expression libraries, the only way to accomplish this +is by converting the abstract sequence into a freshly allocated +string. This can be expensive, or even impossible if the object is a +text-buffer opened onto a 500MB file. + +IrRegex provides a chunked string API specifically for this purpose.
+You define a chunking API with {{make-irregex-chunker}}: + +===== make-irregex-chunker + +<procedure>(make-irregex-chunker <get-next> <get-string> [<get-start> <get-end> <get-substring> <get-subchunk>])</procedure> + +where + +{{(<get-next> chunk) => }} returns the next chunk, or {{#f}} if there are no more chunks + +{{(<get-string> chunk) => }} a string source for the chunk + +{{(<get-start> chunk) => }} the start index of the result of {{<get-string>}} (defaults to always 0) + +{{(<get-end> chunk) => }} the end (exclusive) of the string (defaults to {{string-length}} of the source string) + +{{(<get-substring> cnk1 i cnk2 j) => }} a substring for the range between the chunk {{cnk1}} starting at index {{i}} and ending at {{cnk2}} at index {{j}} + +{{(<get-subchunk> cnk1 i cnk2 j) => }} as above but returns a new chunked data type instead of a string (optional) + +There are two important constraints on the {{<get-next>}} procedure. +It must return an {{eq?}} identical object when called multiple times +on the same chunk, and it must not return a chunk with an empty string +(start == end). This second constraint is for performance reasons - +we push the work of possibly filtering empty chunks to the chunker +since there are many chunk types for which empty strings aren't +possible, and this work is thus not needed. Note that the initial +chunk passed to match on is allowed to be empty. + +{{<get-substring>}} is provided for possible performance improvements +- without it a default is used. {{<get-subchunk>}} is optional - +without it you may not use {{irregex-match-subchunk}} described above. + +You can then match chunks of these types with the following +procedures: + +===== irregex-search/chunked +===== irregex-match/chunked + +<procedure>(irregex-search/chunked <irx> <chunker> <chunk> [<start>])</procedure><br> +<procedure>(irregex-match/chunked <irx> <chunker> <chunk> [<start>])</procedure> + +These return normal match-data objects. 
+ +Example: + +To match against a simple, flat list of strings use: + +<enscript highlight=scheme> + (define (rope->string rope1 start rope2 end) + (if (eq? rope1 rope2) + (substring (car rope1) start end) + (let loop ((rope (cdr rope1)) + (res (list (substring (car rope1) start)))) + (if (eq? rope rope2) + (string-concatenate-reverse ; from SRFI-13 + (cons (substring (car rope) 0 end) res)) + (loop (cdr rope) (cons (car rope) res)))))) + + (define rope-chunker + (make-irregex-chunker (lambda (x) (and (pair? (cdr x)) (cdr x))) + car + (lambda (x) 0) + (lambda (x) (string-length (car x))) + rope->string)) + + (irregex-search/chunked <pat> rope-chunker <list-of-strings>) +</enscript> + +Here we are just using the default start, end and substring behaviors, +so the above chunker could simply be defined as: + +<enscript highlight=scheme> + (define rope-chunker + (make-irregex-chunker (lambda (x) (and (pair? (cdr x)) (cdr x))) car)) +</enscript> + +===== irregex-fold/chunked + +<procedure>(irregex-fold/chunked <irx> <kons> <knil> <chunker> <chunk> [<finish> [<start-index>]])</procedure> + +Chunked version of {{irregex-fold}}. + +==== Utilities + +The following procedures are also available. + +===== irregex-quote + +<procedure>(irregex-quote <str>)</procedure> + +Returns a new string with any special regular expression characters +escaped, to match the original string literally in POSIX regular +expressions. + +===== irregex-opt + +<procedure>(irregex-opt <list-of-strings>)</procedure> + +Returns an optimized SRE matching any of the literal strings +in the list, like Emacs' {{regexp-opt}}. Note this optimization +doesn't help when irregex is able to build a DFA. + +===== sre->string + +<procedure>(sre->string <sre>)</procedure> + +Convert an SRE to a POSIX-style regular expression string, if +possible.
+ + +--- +Previous: [[Unit extras]] + +Next: [[Unit regex]] diff --git a/manual/Unit regex b/manual/Unit regex index 2d0c249e..bd6eb479 100644 --- a/manual/Unit regex +++ b/manual/Unit regex @@ -3,21 +3,15 @@ == Unit regex -This library unit provides support for regular expressions. The regular -expression package used is {{irregex}} -written by Alex Shinn. Irregex supports most Perl-extensions and is -written completely in Scheme. -This library unit exposes two APIs: the standard Chicken API described below, and the -original irregex API. You may use either API or both: +This library unit provides some high-level operations for regular +expressions and operations that are kept for backward compatibility +with older versions of CHICKEN. - (require-library regex) ; required for either API, or both - (import regex) ; import the Chicken regex API - (import irregex) ; import the original irregex API +This unit uses the {{irregex}} unit internally. It is recommended +to use the {{irregex}} API where possible, since it provides a +more featureful interface. -Regular expressions may be either POSIX-style strings (with most PCRE -extensions) or an SCSH-style SRE. There is no {{(rx ...)}} syntax - -just use normal Scheme lists, with quasiquote if you like. === grep @@ -196,266 +190,7 @@ into a regular expression. => "\\^\\[0-9\\]\\+:.\n.\\*\\$" </enscript> -=== Extended SRE Syntax - -The following table summarizes the SRE syntax, with detailed explanations following. - - ;; basic patterns - <string> ; literal string - (seq <sre> ...) ; sequence - (: <sre> ...) - (or <sre> ...) ; alternation - - ;; optional/multiple patterns - (? <sre> ...) ; 0 or 1 matches - (* <sre> ...) ; 0 or more matches - (+ <sre> ...) ; 1 or more matches - (= <n> <sre> ...) ; exactly <n> matches - (>= <n> <sre> ...) ; <n> or more matches - (** <from> <to> <sre> ...) ; <n> to <m> matches - (?? <sre> ...) ; non-greedy (non-greedy) pattern: (0 or 1) - (*? <sre> ...) ; non-greedy kleene star - (**?
<from> <to> <sre> ...) ; non-greedy range - - ;; submatch patterns - (submatch <sre> ...) ; numbered submatch - (submatch-named <name> <sre> ...) ; named submatch - (=> <name> <sre> ...) - (backref <n-or-name>) ; match a previous submatch - - ;; toggling case-sensitivity - (w/case <sre> ...) ; enclosed <sre>s are case-sensitive - (w/nocase <sre> ...) ; enclosed <sre>s are case-insensitive - - ;; character sets - <char> ; singleton char set - (<string>) ; set of chars - (or <cset-sre> ...) ; set union - (~ <cset-sre> ...) ; set complement (i.e. [^...]) - (- <cset-sre> ...) ; set difference - (& <cset-sre> ...) ; set intersection - (/ <range-spec> ...) ; pairs of chars as ranges - - ;; named character sets - any - nonl - ascii - lower-case lower - upper-case upper - alphabetic alpha - numeric num - alphanumeric alphanum alnum - punctuation punct - graphic graph - whitespace white space - printing print - control cntrl - hex-digit xdigit - - ;; assertions and conditionals - bos eos ; beginning/end of string - bol eol ; beginning/end of line - bow eow ; beginning/end of word - nwb ; non-word-boundary - (look-ahead <sre> ...) ; zero-width look-ahead assertion - (look-behind <sre> ...) ; zero-width look-behind assertion - (neg-look-ahead <sre> ...) ; zero-width negative look-ahead assertion - (neg-look-behind <sre> ...) ; zero-width negative look-behind assertion - (atomic <sre> ...) ; for (?>...) independent patterns - (if <test> <pass> [<fail>]) ; conditional patterns - commit ; don't backtrack beyond this (i.e. cut) - - ;; backwards compatibility - (posix-string <string>) ; embed a POSIX string literal - -==== Basic SRE Patterns - -The simplest SRE is a literal string, which matches that string exactly. 
- - (string-search "needle" "hayneedlehay") => <match> - -By default the match is case-sensitive, though you can control this either with the compiler flags or local overrides: - - (string-search "needle" "haynEEdlehay") => #f - - (string-search (irregex "needle" 'i) "haynEEdlehay") => <match> - - (string-search '(w/nocase "needle") "haynEEdlehay") => <match> - -You can use {{w/case}} to switch back to case-sensitivity inside a {{w/nocase}}: - - (string-search '(w/nocase "SMALL" (w/case "BIG")) "smallBIGsmall") => <match> - - (string-search '(w/nocase "small" (w/case "big")) "smallBIGsmall") => #f - -Of course, literal strings by themselves aren't very interesting -regular expressions, so we want to be able to compose them. The most -basic way to do this is with the {{seq}} operator (or its abbreviation {{:}}), -which matches one or more patterns consecutively: - - (string-search '(: "one" space "two" space "three") "one two three") => <match> - -As you may have noticed above, the {{w/case}} and {{w/nocase}} operators -allowed multiple SREs in a sequence - other operators that take any -number of arguments (e.g. the repetition operators below) allow such -implicit sequences. - -To match any one of a set of patterns use the or alternation operator: - - (string-search '(or "eeney" "meeney" "miney") "meeney") => <match> - - (string-search '(or "eeney" "meeney" "miney") "moe") => #f - -==== SRE Repetition Patterns - -There are also several ways to control the number of times a pattern -is matched. The simplest of these is {{?}} which just optionally matches -the pattern: - - (string-search '(: "match" (? "es") "!") "matches!") => <match> - - (string-search '(: "match" (? "es") "!") "match!") => <match> - - (string-search '(: "match" (? 
"es") "!") "matche!") => #f - -To optionally match any number of times, use {{*}}, the Kleene star: - - (string-search '(: "<" (* (~ #\>)) ">") "<html>") => <match> - - (string-search '(: "<" (* (~ #\>)) ">") "<>") => <match> - - (string-search '(: "<" (* (~ #\>)) ">") "<html") => #f - -Often you want to match any number of times, but at least one time is required, and for that you use {{+}}: - - (string-search '(: "<" (+ (~ #\>)) ">") "<html>") => <match> - - (string-search '(: "<" (+ (~ #\>)) ">") "<a>") => <match> - - (string-search '(: "<" (+ (~ #\>)) ">") "<>") => #f - -More generally, to match at least a given number of times, use {{>=}}: - - (string-search '(: "<" (>= 3 (~ #\>)) ">") "<table>") => <match> - - (string-search '(: "<" (>= 3 (~ #\>)) ">") "<pre>") => <match> - - (string-search '(: "<" (>= 3 (~ #\>)) ">") "<tr>") => #f - -To match a specific number of times exactly, use {=}: - - (string-search '(: "<" (= 4 (~ #\>)) ">") "<html>") => <match> - - (string-search '(: "<" (= 4 (~ #\>)) ">") "<table>") => #f - -And finally, the most general form is {{**}} which specifies a range -of times to match. All of the earlier forms are special cases of this. - - (string-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.168.1.10") => <match> - - (string-search '(: (= 3 (** 1 3 numeric) ".") (** 1 3 numeric)) "192.0168.1.10") => #f - -There are also so-called "non-greedy" variants of these repetition -operators, by convention suffixed with an additional {{?}}. Since the -normal repetition patterns can match any of the allotted repetition -range, these operators will match a string if and only if the normal -versions matched. However, when the endpoints of which submatch -matched where are taken into account (specifically, all matches when -using string-search since the endpoints of the match itself matter), -the use of a non-greedy repetition can change the result. 
- -So, whereas {{?}} can be thought to mean "match or don't match," {{??}} means -"don't match or match." {{*}} typically consumes as much as possible, but -{{*?}} tries first to match zero times, and only consumes one at a time if -that fails. If you have a greedy operator followed by a non-greedy -operator in the same pattern, they can produce surprisins results as -they compete to make the match longer or shorter. If this seems -confusing, that's because it is. Non-greedy repetitions are defined -only in terms of the specific backtracking algorithm used to implement -them, which for compatibility purposes always means the Perl -algorithm. Thus, when using these patterns you force IrRegex to use a -backtracking engine, and can't rely on efficient execution. - -==== SRE Character Sets - -Perhaps more common than matching specific strings is matching any of -a set of characters. You can use the or alternation pattern on a list -of single-character strings to simulate a character set, but this is -too clumsy for everyday use so SRE syntax allows a number of -shortcuts. - -A single character matches that character literally, a trivial -character class. More conveniently, a list holding a single element -which is a string refers to the character set composed of every -character in the string. - - (string-match '(* #\-) "---") => <match> - - (string-match '(* #\-) "-_-") => #f - - (string-match '(* ("aeiou")) "oui") => <match> - - (string-match '(* ("aeiou")) "ouais") => #f - -Ranges are introduced with the {{/}} operator. Any strings or characters -in the {{/}} are flattened and then taken in pairs to represent the start -and end points, inclusive, of character ranges. - - (string-match '(* (/ "AZ09")) "R2D2") => <match> - - (string-match '(* (/ "AZ09")) "C-3PO") => #f - -In addition, a number of set algebra operations are provided. or, of -course, has the same meaning, but when all the options are character -sets it can be thought of as the set union operator. 
This is further -extended by the {{&}} set intersection, {{-}} set difference, and {{~}} set -complement operators. - - (string-match '(* (& (/ "az") (~ ("aeiou")))) "xyzzy") => <match> - - (string-match '(* (& (/ "az") (~ ("aeiou")))) "vowels") => #f - - (string-match '(* (- (/ "az") ("aeiou"))) "xyzzy") => <match> - - (string-match '(* (- (/ "az") ("aeiou"))) "vowels") => #f - -==== SRE Assertion Patterns - -There are a number of times it can be useful to assert something about -the area around a pattern without explicitly making it part of the -pattern. The most common cases are specifically anchoring some pattern -to the beginning or end of a word or line or even the whole -string. For example, to match on the end of a word: - - (string-match '(: "foo" eow) "foo") => <match> - - (string-match '(: "foo" eow) "foo!") => <match> - - (string-match '(: "foo" eow) "foof") => #f - -The {{bow}}, {{bol}}, {{eol}}, {{bos}} and {{eos}} work similarly. {{nwb}} asserts that you -are not in a word-boundary - if replaced for {{eow}} in the above examples -it would reverse all the results. - -There is no {{wb}}, since you tend to know from context whether it -would be the beginning or end of a word, but if you need it you can -always use (or bow eow). - -Somewhat more generally, Perl introduced positive and negative -look-ahead and look-behind patterns. Perl look-behind patterns are -limited to a fixed length, however the IrRegex versions have no such -limit. - - (string-match '(: "regular" (look-ahead " expression")) "regular expression") => <match> - -The most general case, of course, would be an and pattern to -complement the or pattern - all the patterns must match or the whole -pattern fails. This may be provided in a future release, although it -(and look-ahead and look-behind assertions) are unlikely to be -compiled efficiently. 
- - --- -Previous: [[Unit extras]] +Previous: [[Unit irregex]] Next: [[Unit srfi-1]] diff --git a/posix-common.scm b/posix-common.scm index 55f9f488..724d3283 100644 --- a/posix-common.scm +++ b/posix-common.scm @@ -245,12 +245,12 @@ EOF '() (let ((path (car paths))) (let-values (((dir fil ext) (decompose-pathname path))) - (let* ((patt (glob->regexp (make-pathname #f (or fil "*") ext))) - (rx (regexp patt))) + (let ((rx (glob->regexp (make-pathname #f (or fil "*") ext)))) (let loop ((fns (directory (or dir ".") #t))) (cond ((null? fns) (conc-loop (cdr paths))) ((string-match rx (car fns)) - => (lambda (m) (cons (make-pathname dir (car m)) (loop (cdr fns)))) ) + => (lambda (m) + (cons (make-pathname dir (car m)) (loop (cdr fns)))) ) (else (loop (cdr fns))) ) ) ) ) ) ) ) ) ) ) diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm index 4a62befb..8118ad5f 100644 --- a/tests/test-irregex.scm +++ b/tests/test-irregex.scm @@ -43,19 +43,21 @@ (lp))))))))))) (define (test-re matcher line) - (match (string-split line "\t" #t) - ((pattern input result subst output) - (let ((name (sprintf "~A ~A ~A ~A" pattern input result subst))) - (cond - ((equal? "c" result) - (test-error name (matcher pattern input))) - ((equal? "n" result) - (test-assert name (not (matcher pattern input)))) - (else - (test name output - (subst-matches (matcher pattern input) subst)))))) - (else - (warning "invalid regex test line" line)))) + (let ((splt (string-split line "\t" #t))) + (if (list? splt) + (apply + (lambda (pattern input result subst output) + (let ((name (sprintf "~A ~A ~A ~A" pattern input result subst))) + (cond + ((equal? "c" result) + (test-error name (matcher pattern input))) + ((equal? 
"n" result) + (test-assert name (not (matcher pattern input)))) + (else + (test-equal name output + (subst-matches (matcher pattern input) subst)))))) + splt) + (warning "invalid regex test line" line)))) (test-begin) @@ -151,40 +153,43 @@ (for-each (lambda (opts) - (test-group (sprintf "irregex/chunked - ~S" opts) - (with-input-from-file "re-tests.txt" - (lambda () - (port-for-each - (lambda (line) - (match (string-split line "\t" #t) - ((pattern input result subst output) - (let ((name - (sprintf "~A ~A ~A ~A" pattern input result subst))) - (cond - ((equal? "c" result)) - ((equal? "n" result) - (for-each - (lambda (rope) - (test-assert name - (not (irregex-search/chunked pattern - rope-chunker - rope)))) - (append (make-ropes input) - (make-shared-ropes input)))) - (else - (for-each - (lambda (rope) - (test name output - (subst-matches (irregex-search/chunked pattern - rope-chunker - rope) - subst))) - (append (make-ropes input) - (make-shared-ropes input))))))) - (else - (warning "invalid regex test line" line))) - ) - read-line))))) + (test-group + (sprintf "irregex/chunked - ~S" opts) + (with-input-from-file "re-tests.txt" + (lambda () + (port-for-each + (lambda (line) + (let ((splt (string-split line "\t" #t))) + (if (list? splt) + (apply + (lambda (pattern input result subst output) + (let ((name + (sprintf "~A ~A ~A ~A" pattern input result subst))) + (cond + ((equal? "c" result)) + ((equal? "n" result) + (for-each + (lambda (rope) + (test-assert name + (not (irregex-search/chunked pattern + rope-chunker + rope)))) + (append (make-ropes input) + (make-shared-ropes input)))) + (else + (for-each + (lambda (rope) + (test-equal + name output + (subst-matches (irregex-search/chunked pattern + rope-chunker + rope) + subst))) + (append (make-ropes input) + (make-shared-ropes input))))))) + splt) + (warning "invalid regex test line" line)))) + read-line))))) '((backtrack) (fast) )) @@ -252,32 +257,32 @@ (test-assert (irregex-match-data? 
(irregex-match "a.*b" "axxxb"))) (test-assert (not (irregex-match-data? (vector '*irregex-match-tag* #f #f #f #f #f #f #f #f #f)))) (test-assert (not (irregex-match-data? (vector #f #f #f #f #f #f #f #f #f #f #f)))) - (test 0 (irregex-num-submatches (irregex "a.*b"))) - (test 1 (irregex-num-submatches (irregex "a(.*)b"))) - (test 2 (irregex-num-submatches (irregex "(a(.*))b"))) - (test 2 (irregex-num-submatches (irregex "a(.*)(b)"))) - (test 10 (irregex-num-submatches (irregex "((((((((((a))))))))))"))) - (test 0 (irregex-match-num-submatches (irregex-search "a.*b" "axxxb"))) - (test 1 (irregex-match-num-submatches (irregex-search "a(.*)b" "axxxb"))) - (test 2 (irregex-match-num-submatches (irregex-search "(a(.*))b" "axxxb"))) - (test 2 (irregex-match-num-submatches (irregex-search "a(.*)(b)" "axxxb"))) - (test 10 (irregex-match-num-submatches (irregex-search "((((((((((a))))))))))" "a"))) + (test-equal 0 (irregex-num-submatches (irregex "a.*b"))) + (test-equal 1 (irregex-num-submatches (irregex "a(.*)b"))) + (test-equal 2 (irregex-num-submatches (irregex "(a(.*))b"))) + (test-equal 2 (irregex-num-submatches (irregex "a(.*)(b)"))) + (test-equal 10 (irregex-num-submatches (irregex "((((((((((a))))))))))"))) + (test-equal 0 (irregex-match-num-submatches (irregex-search "a.*b" "axxxb"))) + (test-equal 1 (irregex-match-num-submatches (irregex-search "a(.*)b" "axxxb"))) + (test-equal 2 (irregex-match-num-submatches (irregex-search "(a(.*))b" "axxxb"))) + (test-equal 2 (irregex-match-num-submatches (irregex-search "a(.*)(b)" "axxxb"))) + (test-equal 10 (irregex-match-num-submatches (irregex-search "((((((((((a))))))))))" "a"))) ) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (test-group "utils" - (test "h*llo world" + (test-equal "h*llo world" (irregex-replace "[aeiou]" "hello world" "*")) - (test "h*ll* w*rld" + (test-equal "h*ll* w*rld" (irregex-replace/all "[aeiou]" "hello world" "*")) - (test '("bob@test.com" "fred@example.com") + 
(test-equal '("bob@test.com" "fred@example.com") (irregex-fold 'email (lambda (i m s) (cons (irregex-match-substring m) s)) '() "bob@test.com and fred@example.com" (lambda (i s) (reverse s)))) - (test '("bob@test.com" "fred@example.com") + (test-equal '("bob@test.com" "fred@example.com") (irregex-fold/chunked 'email (lambda (src i m s) (cons (irregex-match-substring m) s)) diff --git a/tests/test.scm b/tests/test.scm index e9b43c14..c16de6a5 100644 --- a/tests/test.scm +++ b/tests/test.scm @@ -77,7 +77,8 @@ (define-syntax test-equal (syntax-rules () ((_ name expr value eq) (run-equal name (lambda () expr) value eq)) - ((_ name expr value) (run-equal name (lambda () expr) value equal?)))) + ((_ name expr value) (run-equal name (lambda () expr) value equal?)) + ((_ expr value) (run-equal (->string value) (lambda () expr) value equal?)))) (define-syntax test-error (syntax-rules () @@ -89,7 +90,8 @@ (define-syntax test-assert (syntax-rules () - ((_ name expr) (run-equal name (lambda () (if expr #t #f)) #t eq?)))) + ((_ name expr) (run-equal name (lambda () (if expr #t #f)) #t eq?)) + ((_ expr) (run-equal (->string expr) (lambda () (if expr #t #f)) #t eq?)))) (define-syntax test-group (syntax-rules ()Trap