From ca16f0eac8a90c4404f874976433d2b672536754 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 18 Aug 2017 12:27:17 +0100 Subject: [PATCH] [functional-tests] start working on a little regex matcher --- functional-tests/regex.scm | 94 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 functional-tests/regex.scm diff --git a/functional-tests/regex.scm b/functional-tests/regex.scm new file mode 100644 index 0000000..be133b0 --- /dev/null +++ b/functional-tests/regex.scm @@ -0,0 +1,94 @@ +(library + (regex) + (export compile-rx) + (import (chezscheme) + (matchable)) + + ;; Simple regex library, because it's friday and I'm bored. + ;; Playing with the ideas in: https://swtch.com/~rsc/regexp/regexp2.html + ;; which reminded me of reading through the source code to Sam in '93. + + ;; Rather than parsing a string we'll use expressions. + ;; (lit ) + ;; (cat rx1 rx2) + ;; (alt rx1 rx2) + ;; (opt rx) + ;; (star rx) + ;; (plus rx) + ;; + ;; The expressions get compile-rxd into a vector of vm instructions. + ;; (char c) + ;; (match) + ;; (jmp x) + ;; (split x y) + + ;; instructions are closures that manipulate the thread + + ;; FIXME: slow + (define (append-instr code . i) + (append code i)) + + (define (label-instr l) + `(label ,l)) + + (define (jmp-instr l) + `(jmp ,l)) + + (define (char-instr c) + `(char ,c)) + + (define (split-instr l1 l2) + `(split ,l1 ,l2)) + + (define (match-instr) + '(match)) + + (define (label-code label code) + (cons (label-instr label) code)) + + ;; Compiles to a list of labelled instructions that can later be flattened + ;; into a linear sequence. + (define (compile-rx exp) + (match exp + (('lit str) + (map char-instr (string->list str))) + + (('cat rx1 rx2) + (append (compile-rx rx1) (compile-rx rx2))) + + (('alt rx1 rx2) + (let ((label1 (gensym)) + (label2 (gensym))) + (let ((c1 (label-code label1 (compile-rx rx1))) + (c2 (label-code label2 (compile-rx rx2)))) + (cons (split-instr label1 label2) + (append c1 c2))))) + + (('opt rx) + (let ((head (gensym)) + (tail (gensym))) + (cons (split-instr head tail) + (label-code head (append-instr (compile-rx rx) + (label-instr tail)))))) + + (('star rx) + (let ((head (gensym)) + (tail (gensym))) + (cons (split-instr head tail) + (label-code head (append-instr (compile-rx rx) + (jmp-instr head) + (label-instr tail)))))) + + (('plus rx) + (let ((head (gensym)) + (tail (gensym))) + (label-code head + (append-instr (compile-rx rx) + (split-instr head tail) + (label-instr tail))))))) + + ;; A 'thread' consists of an index into the instructions. A 'bundle' holds + ;; the current threads. Note there cannot be more threads than instructions, + ;; so a bundle is represented as a bitvector the same length as the + ;; instructions. Threads are run in lock step, all taking the same input. + )