392 lines
12 KiB
Scheme
392 lines
12 KiB
Scheme
(library
|
|
(regex)
|
|
(export regex)
|
|
(import (chezscheme)
|
|
(fmt fmt)
|
|
(loops)
|
|
(prefix (parser-combinators) p:)
|
|
(srfi s8 receive)
|
|
(matchable)
|
|
(utils))
|
|
|
|
;; Simple regex library, because it's friday and I'm bored.
|
|
;; Playing with the ideas in: https://swtch.com/~rsc/regexp/regexp2.html
|
|
;; which reminded me of reading through the source code to Sam in '93.
|
|
|
|
;; Rather than parsing a string we'll use expressions.
|
|
;; (lit <string>)
|
|
;; (seq rx1 rx2)
|
|
;; (alt rx1 rx2)
|
|
;; (opt rx)
|
|
;; (star rx)
|
|
;; (plus rx)
|
|
;;
|
|
;; The expressions get compiled into a vector of vm instructions.
|
|
;; (char pred) ; where fn :: char -> bool
|
|
;; (match)
|
|
;; (jmp x)
|
|
;; (split x y)
|
|
|
|
(define (append-instr code . i) (append code i))
|
|
(define (label-instr l) `(label ,l))
|
|
(define (jmp-instr l) `(jmp ,l))
|
|
(define (char-instr fn) `(char ,fn))
|
|
(define (split-instr l1 l2) `(split ,l1 ,l2))
|
|
(define (match-instr) '(match))
|
|
(define (match-instr? instr) (equal? '(match) instr))
|
|
|
|
(define (label-code label code)
|
|
(cons (label-instr label) code))
|
|
|
|
;; Compiles to a list of labelled instructions that can later be flattened
|
|
;; into a linear sequence.
|
|
(define (lit str)
|
|
(map (lambda (c1)
|
|
(char-instr
|
|
(lambda (c2)
|
|
(char=? c1 c2))))
|
|
(string->list str)))
|
|
|
|
(define (seq rx1 rx2)
|
|
(append rx1 rx2))
|
|
|
|
(define (alt rx1 rx2)
|
|
(let ((label1 (gensym))
|
|
(label2 (gensym))
|
|
(tail (gensym)))
|
|
(let ((c1 (label-code label1
|
|
(append-instr rx1 (jmp-instr tail))))
|
|
(c2 (label-code label2 rx2)))
|
|
(cons (split-instr label1 label2)
|
|
(append-instr (append c1 c2) (label-instr tail))))))
|
|
|
|
(define (opt rx)
|
|
(let ((head (gensym))
|
|
(tail (gensym)))
|
|
(cons (split-instr head tail)
|
|
(label-code head
|
|
(append-instr rx (label-instr tail))))))
|
|
|
|
(define (star rx)
|
|
(let ((head (gensym))
|
|
(body (gensym))
|
|
(tail (gensym)))
|
|
(label-code head
|
|
(cons (split-instr body tail)
|
|
(label-code body
|
|
(append-instr rx
|
|
(jmp-instr head)
|
|
(label-instr tail)))))))
|
|
|
|
(define (plus rx)
|
|
(let ((head (gensym))
|
|
(tail (gensym)))
|
|
(label-code head
|
|
(append-instr rx
|
|
(split-instr head tail)
|
|
(label-instr tail)))))
|
|
|
|
(define (label-locations code)
|
|
(let ((locs (make-eq-hashtable)))
|
|
(let loop ((pc 0)
|
|
(code code))
|
|
(if (null? code)
|
|
locs
|
|
(match (car code)
|
|
(('label l)
|
|
(begin
|
|
(hashtable-set! locs l pc)
|
|
(loop pc (cdr code))))
|
|
(instr
|
|
(loop (+ 1 pc) (cdr code))))))))
|
|
|
|
(define (remove-labels code locs)
|
|
(let loop ((pc 0)
|
|
(code code)
|
|
(acc '()))
|
|
(if (null? code)
|
|
(reverse acc)
|
|
(match (car code)
|
|
(('label l)
|
|
(loop pc (cdr code) acc))
|
|
|
|
(('jmp l)
|
|
(loop (+ 1 pc) (cdr code)
|
|
(cons `(jmp ,(hashtable-ref locs l #f)) acc)))
|
|
|
|
(('split l1 l2)
|
|
(loop (+ 1 pc) (cdr code)
|
|
(cons `(split ,(hashtable-ref locs l1 #f)
|
|
,(hashtable-ref locs l2 #f))
|
|
acc)))
|
|
|
|
(instr (loop (+ 1 pc) (cdr code) (cons instr acc)))))))
|
|
|
|
(define (optimise-jumps! code)
|
|
(define (single-pass)
|
|
(let ((changed #f))
|
|
(upto (n (vector-length code))
|
|
(match (vector-ref code n)
|
|
(('jmp l)
|
|
(when (match-instr? (vector-ref code l))
|
|
(set! changed #t)
|
|
(vector-set! code n (match-instr))))
|
|
|
|
(('split l1 l2)
|
|
(when (or (match-instr? (vector-ref code l1))
|
|
(match-instr? (vector-ref code l2)))
|
|
(set! changed #t)
|
|
(vector-set! code n (match-instr))))
|
|
|
|
(_ _)))
|
|
changed))
|
|
|
|
(let loop ()
|
|
(when (single-pass)
|
|
(loop)))
|
|
code)
|
|
|
|
(define (compile-to-symbols rx)
|
|
(let ((rx (append-instr rx (match-instr))))
|
|
(optimise-jumps!
|
|
(list->vector
|
|
(remove-labels rx (label-locations rx))))))
|
|
|
|
;; A 'thread' consists of an index into the instructions. A 'yarn holds the
|
|
;; current threads. Note there cannot be more threads than instructions, so
|
|
;; a yarn is represented as a vector the same length as the instructions.
|
|
;; Threads are run in lock step, all taking the same input.
|
|
(define-record-type yarn
|
|
(fields (mutable size)
|
|
(mutable stack)
|
|
(mutable seen)))
|
|
|
|
(define (mk-yarn count)
|
|
(make-yarn 0 (make-vector count) (make-vector count #f)))
|
|
|
|
(define (clear-yarn! y)
|
|
(yarn-size-set! y 0)
|
|
(vector-fill! (yarn-seen y) #f))
|
|
|
|
(define (add-thread! y i)
|
|
(unless (vector-ref (yarn-seen y) i)
|
|
(vector-set! (yarn-seen y) i #t)
|
|
(vector-set! (yarn-stack y) (yarn-size y) i)
|
|
(yarn-size-set! y (+ 1 (yarn-size y)))))
|
|
|
|
(define (pop-thread! y)
|
|
(if (zero? (yarn-size y))
|
|
#f
|
|
(begin
|
|
(yarn-size-set! y (- (yarn-size y) 1))
|
|
(vector-ref (yarn-stack y) (yarn-size y)))))
|
|
|
|
(define (no-threads? y)
|
|
(zero? (yarn-size y)))
|
|
|
|
;; FIXME: hack
|
|
(define end-of-string #\x0)
|
|
|
|
(define (compile-rx rx)
|
|
(let* ((sym-code (compile-to-symbols rx))
|
|
(code-len (vector-length sym-code))
|
|
(threads (mk-yarn code-len))
|
|
(next-threads (mk-yarn code-len))
|
|
(code #f))
|
|
|
|
(define (compile-instr instr)
|
|
(match instr
|
|
(('match)
|
|
(lambda (in-c pc) 'match))
|
|
|
|
(('char fn)
|
|
(lambda (in-c pc)
|
|
;; use eq? because in-c isn't always a char
|
|
(when (fn in-c)
|
|
(add-thread! next-threads (+ 1 pc)))))
|
|
|
|
(('jmp l)
|
|
(lambda (in-c pc)
|
|
(add-thread! threads l)))
|
|
|
|
(('split l1 l2)
|
|
(lambda (in-c pc)
|
|
(add-thread! threads l1)
|
|
(add-thread! threads l2)))))
|
|
|
|
(define (step in-c)
|
|
(let loop ((pc (pop-thread! threads)))
|
|
(and pc
|
|
(if (eq? 'match ((vector-ref code pc) in-c pc))
|
|
'match
|
|
(loop (pop-thread! threads))))))
|
|
|
|
;(fmt #t (dsp "running ") (pretty code) nl)
|
|
|
|
;; compile to closures to avoid calling match in the loop.
|
|
(upto (n code-len)
|
|
(set! code (vector-map compile-instr sym-code)))
|
|
|
|
(lambda (txt)
|
|
(add-thread! threads 0)
|
|
(let ((txt-len (string-length txt)))
|
|
(let c-loop ((c-index 0))
|
|
(if (< c-index txt-len)
|
|
;; FIXME: make step return a bool
|
|
(if (eq? 'match (step (string-ref txt c-index)))
|
|
#t
|
|
(if (no-threads? next-threads)
|
|
#f
|
|
(begin
|
|
(swap! threads next-threads)
|
|
(clear-yarn! next-threads)
|
|
(c-loop (+ 1 c-index)))))
|
|
(eq? 'match (step end-of-string))))))))
|
|
|
|
;;;--------------------------------------------------------
|
|
;;; Parser
|
|
|
|
;; FIXME: ^ and ? aren't in the grammar, and eos/$ isn't wired up
|
|
|
|
(define raw-char
|
|
(let ((meta-chars (string->list "\\^$*+?[]()|")))
|
|
(define (not-meta c)
|
|
(not (member c meta-chars)))
|
|
|
|
(p:alt (p:parse-m (p:<- c (p:accept-char not-meta))
|
|
(p:pure c))
|
|
(p:>> (p:lit "\\")
|
|
(p:accept-char (lambda (c) #t))))))
|
|
|
|
(define (bracket before after ma)
|
|
(p:>> before (p:<* ma after)))
|
|
|
|
(define (negate fn)
|
|
(lambda (c)
|
|
(not (fn c))))
|
|
|
|
;;-----------------------------------------------------------
|
|
;; Low level char combinators. These build char predicates.
|
|
|
|
;; char-rx := any non metacharacter | "\" metacharacter
|
|
;; builds a predicate that accepts the char
|
|
(define char-rx
|
|
(p:parse-m (p:<- c1 raw-char)
|
|
(p:pure (lambda (c2)
|
|
(char=? c1 c2)))))
|
|
|
|
;; range := char-rx "-" char-rx
|
|
(define range
|
|
(p:parse-m (p:<- c1 raw-char)
|
|
(p:lit "-")
|
|
(p:<- c2 raw-char)
|
|
(p:pure (lambda (c)
|
|
(char<=? c1 c c2)))))
|
|
|
|
;; set-items := range | char-rx
|
|
(define set-item (p:alt range char-rx))
|
|
|
|
(define (or-preds preds)
|
|
(lambda (c)
|
|
(let loop ((preds preds))
|
|
(if (null? preds)
|
|
#f
|
|
(or ((car preds) c)
|
|
(loop (cdr preds)))))))
|
|
|
|
;; set-items := set-item+
|
|
(define set-items
|
|
(p:lift or-preds (p:many+ set-item)))
|
|
|
|
;; negative-set := "[^" set-items "]"
|
|
(define negative-set
|
|
(bracket (p:lit "[^")
|
|
(p:lit "]")
|
|
(p:lift negate set-items)))
|
|
|
|
;; positive-set := "[" set-items "]"
|
|
(define positive-set
|
|
(bracket (p:lit "[")
|
|
(p:lit "]")
|
|
set-items))
|
|
|
|
;; set := positive-set | negative-set
|
|
(define set (p:alt positive-set negative-set))
|
|
|
|
;; eos := "$"
|
|
;; FIXME: ???
|
|
(define eos (p:lit "$"))
|
|
|
|
;; any := "."
|
|
(define any (p:>> (p:lit ".") (p:pure (lambda (_) #t))))
|
|
|
|
(define (combine rs)
|
|
(fold-left seq (car rs) (cdr rs)))
|
|
|
|
;;-----------------------------------------------------------
|
|
;; Higher level combinators, these build a symbolic rx
|
|
|
|
;; There's mutual recursion here which would send the combinators into an
|
|
;; infinite loop whilst they are being built (not during parsing). So we hot
|
|
;; patch rx, making it available for construction, and then redefine it on
|
|
;; first use.
|
|
(define rx
|
|
(indirect-lambda ()
|
|
(p:error-m "rx not bound")))
|
|
|
|
;; group := "(" rx ")"
|
|
(define group
|
|
(bracket (p:lit "(")
|
|
(p:lit ")")
|
|
rx))
|
|
|
|
;; elementary-rx := group | any | eos | char-rx | set
|
|
;; FIXME: put eos and group back in
|
|
(define elementary-rx
|
|
(p:alt (p:lift (lambda (fn)
|
|
(list (char-instr fn)))
|
|
(p:one-of any char-rx set))
|
|
group))
|
|
|
|
;; plus-rx := elementary-rx "+"
|
|
(define plus-rx
|
|
(p:lift plus (p:<* elementary-rx (p:lit "+"))))
|
|
|
|
;; star-rx := elementary-rx "*"
|
|
(define star-rx
|
|
(p:lift star (p:<* elementary-rx (p:lit "*"))))
|
|
|
|
;; basic-rx := star-rx | plus-rx | elementary-rx
|
|
(define basic-rx
|
|
(p:one-of star-rx plus-rx elementary-rx))
|
|
|
|
;; simple-rx := basic-rx+
|
|
(define simple-rx
|
|
(p:lift combine (p:many+ basic-rx)))
|
|
|
|
;; rx := simple-rx ("|" simple-rx)*
|
|
(define hotpatch-rx
|
|
(let ((patched #f))
|
|
(lambda ()
|
|
(unless patched
|
|
(set! patched #t)
|
|
(set-lambda! rx
|
|
(p:lift2 (lambda (r rs)
|
|
(fold-left alt r rs))
|
|
simple-rx
|
|
(p:many* (p:>> (p:lit "|") simple-rx))))))))
|
|
|
|
;;-----------------------------------------------------------------------
|
|
;; The top level routine, parses the regex string and compiles it into a
|
|
;; matcher, or returns false if the parse failed.
|
|
;; regex :: string -> (matcher <string>)
|
|
;; FIXME: it's tempting to return a function that raises if there's a parse error.
|
|
(define (regex str)
|
|
(hotpatch-rx)
|
|
(receive (v st) (p:parse rx str)
|
|
(if (p:success? st)
|
|
(compile-rx v)
|
|
#f))))
|
|
|
|
|