~ chicken-core (chicken-5) 2ef671b6d4f3266815f7795b0d81d1d579f2056b
commit 2ef671b6d4f3266815f7795b0d81d1d579f2056b
Author: Peter Bex <peter.bex@xs4all.nl>
AuthorDate: Sun Jun 22 18:02:33 2014 +0200
Commit: Mario Domenech Goulart <mario.goulart@gmail.com>
CommitDate: Tue Jul 1 09:49:15 2014 -0300
Fix #878 which was indeed a bug, caused by an incorrect hand-rolled optimisation.
This adds some more "integration" test cases so that we can verify
more easily that the combination of make-kmp-restart-vector and
kmp-step is looping through the pattern correctly as it advances
through the search string.
The optimisation was due to a mistaken reading of the reference
implementation: the pattern was indexed as pat[k+1] at the j=-1 case
and as pat[k] at the pat[k]=pat[j+start] case, but the optimisation
changed the code to use pat[k] in both cases.
Signed-off-by: Mario Domenech Goulart <mario.goulart@gmail.com>
diff --git a/NEWS b/NEWS
index c8ce4276..0d62ee04 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,7 @@
require extras but use procedures from it.
- SRFI-13: fix string-copy! in cases source and destination strings'
memory areas overlap (#1135).
+ - Fixed bug in make-kmp-restart-vector from SRFI-13.
- Unit lolevel:
- Restore long-lost but still documented "vector-like?" procedure (#983)
diff --git a/srfi-13.scm b/srfi-13.scm
index 65b748f9..dec54b27 100644
--- a/srfi-13.scm
+++ b/srfi-13.scm
@@ -1404,35 +1404,36 @@
((c= char=?) rest) ; (procedure? c=))
(receive (rest2 start end) (string-parse-start+end make-kmp-restart-vector pattern rest)
(let* ((rvlen (- end start))
- (rv (make-vector rvlen -1)))
+ (rv (make-vector rvlen -1)))
(if (> rvlen 0)
(let ((rvlen-1 (- rvlen 1))
(c0 (string-ref pattern start)))
;; Here's the main loop. We have set rv[0] ... rv[i].
;; K = I + START -- it is the corresponding index into PATTERN.
- (let lp1 ((i 0) (j -1) (k start))
+ (let lp1 ((i 0) (j -1) (k start))
(if (< i rvlen-1)
- (let ((ck (string-ref pattern k)))
- ;; lp2 invariant:
- ;; pat[(k-j) .. k-1] matches pat[start .. start+j-1]
- ;; or j = -1.
- (let lp2 ((j j))
-
- (cond ((= j -1)
- (let ((i1 (+ i 1)))
- (vector-set! rv i1 (if (c= ck c0) -1 0))
- (lp1 i1 0 (+ k 1))))
-
- ;; pat[(k-j) .. k] matches pat[start..start+j].
- ((c= ck (string-ref pattern (+ j start)))
- (let* ((i1 (+ 1 i))
- (j1 (+ 1 j)))
- (vector-set! rv i1 j1)
- (lp1 i1 j1 (+ k 1))))
-
- (else (lp2 (vector-ref rv j))))))))))
+ ;; lp2 invariant:
+ ;; pat[(k-j) .. k-1] matches pat[start .. start+j-1]
+ ;; or j = -1.
+ (let lp2 ((j j))
+
+ (cond ((= j -1)
+ (let ((i1 (+ i 1))
+ (ck+1 (string-ref pattern (add1 k))))
+ (vector-set! rv i1 (if (c= ck+1 c0) -1 0))
+ (lp1 i1 0 (+ k 1))))
+
+ ;; pat[(k-j) .. k] matches pat[start..start+j].
+ ((c= (string-ref pattern k)
+ (string-ref pattern (+ j start)))
+ (let* ((i1 (+ 1 i))
+ (j1 (+ 1 j)))
+ (vector-set! rv i1 j1)
+ (lp1 i1 j1 (+ k 1))))
+
+ (else (lp2 (vector-ref rv j)))))))))
rv))))
diff --git a/tests/srfi-13-tests.scm b/tests/srfi-13-tests.scm
index 8df378ac..bc32885b 100644
--- a/tests/srfi-13-tests.scm
+++ b/tests/srfi-13-tests.scm
@@ -619,17 +619,65 @@
(test "make-kmp-restart-vector" '#(-1) (make-kmp-restart-vector "a"))
-;;; The following two tests for make-kmp-restart-vector are
-;;; intentionally commented (see http://bugs.call-cc.org/ticket/878)
-;;; -- mario
-
-; This seems right to me, but is it?
-; (test "make-kmp-restart-vector" '#(-1 0) (make-kmp-restart-vector "ab"))
-
-; The following is from an example in the code, but I expect it is not right.
-; (test "make-kmp-restart-vector" '#(-1 0 0 -1 1 2) (make-kmp-restart-vector "abdabx"))
-
-
+(test "make-kmp-restart-vector" '#(-1 0) (make-kmp-restart-vector "ab"))
+
+; The following is from an example in the code. It is the "optimised"
+; version; it's also valid to return #(-1 0 0 0 1 2), but that will
+; needlessly check the "a" twice before giving up.
+(test "make-kmp-restart-vector"
+ '#(-1 0 0 -1 1 2)
+ (make-kmp-restart-vector "abdabx"))
+
+;; Each entry in kmp-cases is a pattern, a string to match against and
+;; the expected run of the algorithm through the positions in the
+;; pattern. So for example 0 1 2 means it looks at position 0 first,
+;; then at 1 and then at 2.
+;;
+;; This is easy to verify in simple cases; If there's a shared
+;; substring and matching fails, you try matching again starting at
+;; the end of the shared substring, otherwise you rewind. For more
+;; complex cases, it's increasingly difficult for humans to verify :)
+(define kmp-cases
+ '(("abc" "xx" #f 0 0)
+ ("abc" "abc" #t 0 1 2)
+ ("abcd" "abc" #f 0 1 2)
+ ("abc" "abcd" #t 0 1 2)
+ ("abc" "aabc" #t 0 1 1 2)
+ ("ab" "aa" #f 0 1)
+ ("ab" "aab" #t 0 1 1)
+ ("abdabx" "abdbbabda" #f 0 1 2 3 0 0 1 2 3)
+ ("aabc" "axaabc" #t 0 1 0 1 2 3)
+ ("aabac" "aabaabac" #t 0 1 2 3 4 2 3 4)))
+
+(for-each
+ (lambda (test-case)
+ (let* ((pat (car test-case))
+ (n (string-length pat))
+ (str (cadr test-case))
+ (match? (caddr test-case))
+ (steps (cdddr test-case))
+ (rv (make-kmp-restart-vector pat)))
+ (call-with-input-string
+ str
+ (lambda (p)
+ (let lp ((i 0)
+ (step 0)
+ (steps steps))
+ (cond
+ ((or (= i n) (eof-object? (peek-char p)))
+ (test-assert (sprintf "KMP match? ~S, case: ~S" match? test-case)
+ (eq? (= i n) match?))
+ (test-assert (sprintf "KMP empty remaining steps: ~S, case: ~S"
+ steps test-case)
+ (null? steps)))
+ (else
+ (let ((new-i (kmp-step pat rv (read-char p) i char=? 0))
+ (expected-i (and (not (null? steps)) (car steps))))
+ (test (sprintf "KMP step ~S (exp: ~S, act: ~S), case: ~S"
+ step expected-i i test-case)
+ expected-i i)
+ (lp new-i (add1 step) (cdr steps))))))))))
+ kmp-cases)
; FIXME! Implement tests for these:
; string-kmp-partial-search
Trap