codegen.scm 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. ;;;; codegen.scm --- code generation for composable parsers
  2. ;;;;
  3. ;;;; Copyright (C) 2011 Free Software Foundation, Inc.
  4. ;;;;
  5. ;;;; This library is free software; you can redistribute it and/or
  6. ;;;; modify it under the terms of the GNU Lesser General Public
  7. ;;;; License as published by the Free Software Foundation; either
  8. ;;;; version 3 of the License, or (at your option) any later version.
  9. ;;;;
  10. ;;;; This library is distributed in the hope that it will be useful,
  11. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. ;;;; Lesser General Public License for more details.
  14. ;;;;
  15. ;;;; You should have received a copy of the GNU Lesser General Public
  16. ;;;; License along with this library; if not, write to the Free Software
  17. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  18. ;;;;
  19. (define-module (ice-9 peg codegen)
  20. #:export (compile-peg-pattern wrap-parser-for-users add-peg-compiler!)
  21. #:use-module (ice-9 pretty-print)
  22. #:use-module (system base pmatch))
  23. (define-syntax single?
  24. (syntax-rules ()
  25. "Return #t if X is a list of one element."
  26. ((_ x)
  27. (pmatch x
  28. ((_) #t)
  29. (else #f)))))
  30. (define-syntax single-filter
  31. (syntax-rules ()
  32. "If EXP is a list of one element, return the element. Otherwise
  33. return EXP."
  34. ((_ exp)
  35. (pmatch exp
  36. ((,elt) elt)
  37. (,elts elts)))))
  38. (define-syntax push-not-null!
  39. (syntax-rules ()
  40. "If OBJ is non-null, push it onto LST, otherwise do nothing."
  41. ((_ lst obj)
  42. (if (not (null? obj))
  43. (push! lst obj)))))
  44. (define-syntax push!
  45. (syntax-rules ()
  46. "Push an object onto a list."
  47. ((_ lst obj)
  48. (set! lst (cons obj lst)))))
  49. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  50. ;;;;; CODE GENERATORS
  51. ;; These functions generate scheme code for parsing PEGs.
  52. ;; Conventions:
  53. ;; accum: (all name body none)
  54. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  55. ;; Code we generate will have a certain return structure depending on how we're
  56. ;; accumulating (the ACCUM variable).
  57. (define (cg-generic-ret accum name body-uneval at)
  58. ;; name, body-uneval and at are syntax
  59. #`(let ((body #,body-uneval))
  60. #,(cond
  61. ((and (eq? accum 'all) name)
  62. #`(list #,at
  63. (cond
  64. ((not (list? body)) (list '#,name body))
  65. ((null? body) '#,name)
  66. ((symbol? (car body)) (list '#,name body))
  67. (else (cons '#,name body)))))
  68. ((eq? accum 'name)
  69. #`(list #,at '#,name))
  70. ((eq? accum 'body)
  71. #`(list #,at
  72. (cond
  73. ((single? body) (car body))
  74. (else body))))
  75. ((eq? accum 'none)
  76. #`(list #,at '()))
  77. (else
  78. (begin
  79. (pretty-print `(cg-generic-ret-error ,accum ,name ,body-uneval ,at))
  80. (pretty-print "Defaulting to accum of none.\n")
  81. #`(list #,at '()))))))
  82. ;; The short name makes the formatting below much easier to read.
  83. (define cggr cg-generic-ret)
  84. ;; Generates code that matches a particular string.
  85. ;; E.g.: (cg-string syntax "abc" 'body)
  86. (define (cg-string pat accum)
  87. (let ((plen (string-length pat)))
  88. #`(lambda (str len pos)
  89. (let ((end (+ pos #,plen)))
  90. (and (<= end len)
  91. (string= str #,pat pos end)
  92. #,(case accum
  93. ((all) #`(list end (list 'cg-string #,pat)))
  94. ((name) #`(list end 'cg-string))
  95. ((body) #`(list end #,pat))
  96. ((none) #`(list end '()))
  97. (else (error "bad accum" accum))))))))
  98. ;; Generates code for matching any character.
  99. ;; E.g.: (cg-peg-any syntax 'body)
  100. (define (cg-peg-any accum)
  101. #`(lambda (str len pos)
  102. (and (< pos len)
  103. #,(case accum
  104. ((all) #`(list (1+ pos)
  105. (list 'cg-peg-any (substring str pos (1+ pos)))))
  106. ((name) #`(list (1+ pos) 'cg-peg-any))
  107. ((body) #`(list (1+ pos) (substring str pos (1+ pos))))
  108. ((none) #`(list (1+ pos) '()))
  109. (else (error "bad accum" accum))))))
  110. ;; Generates code for matching a range of characters between start and end.
  111. ;; E.g.: (cg-range syntax #\a #\z 'body)
  112. (define (cg-range pat accum)
  113. (syntax-case pat ()
  114. ((start end)
  115. (if (not (and (char? (syntax->datum #'start))
  116. (char? (syntax->datum #'end))))
  117. (error "range PEG should have characters after it; instead got"
  118. #'start #'end))
  119. #`(lambda (str len pos)
  120. (and (< pos len)
  121. (let ((c (string-ref str pos)))
  122. (and (char>=? c start)
  123. (char<=? c end)
  124. #,(case accum
  125. ((all) #`(list (1+ pos) (list 'cg-range (string c))))
  126. ((name) #`(list (1+ pos) 'cg-range))
  127. ((body) #`(list (1+ pos) (string c)))
  128. ((none) #`(list (1+ pos) '()))
  129. (else (error "bad accum" accum))))))))))
  130. ;; Generates code for matching a range of characters not between start and end.
  131. ;; E.g.: (cg-not-in-range syntax #\a #\z 'body)
  132. (define (cg-not-in-range pat accum)
  133. (syntax-case pat ()
  134. ((start end)
  135. (if (not (and (char? (syntax->datum #'start))
  136. (char? (syntax->datum #'end))))
  137. (error "range PEG should have characters after it; instead got"
  138. #'start #'end))
  139. #`(lambda (str len pos)
  140. (and (< pos len)
  141. (let ((c (string-ref str pos)))
  142. (and (or (char<? c start) (char>? c end))
  143. #,(case accum
  144. ((all) #`(list (1+ pos)
  145. (list 'cg-not-in-range (string c))))
  146. ((name) #`(list (1+ pos) 'cg-not-in-range))
  147. ((body) #`(list (1+ pos) (string c)))
  148. ((none) #`(list (1+ pos) '()))
  149. (else (error "bad accum" accum))))))))))
  150. ;; Generate code to match a pattern and do nothing with the result
  151. (define (cg-ignore pat accum)
  152. (syntax-case pat ()
  153. ((inner)
  154. (compile-peg-pattern #'inner 'none))))
  155. (define (cg-capture pat accum)
  156. (syntax-case pat ()
  157. ((inner)
  158. (compile-peg-pattern #'inner 'body))))
  159. ;; Filters the accum argument to compile-peg-pattern for buildings like string
  160. ;; literals (since we don't want to tag them with their name if we're doing an
  161. ;; "all" accum).
  162. (define (builtin-accum-filter accum)
  163. (cond
  164. ((eq? accum 'all) 'body)
  165. ((eq? accum 'name) 'name)
  166. ((eq? accum 'body) 'body)
  167. ((eq? accum 'none) 'none)))
  168. (define baf builtin-accum-filter)
  169. ;; Top-level function builder for AND. Reduces to a call to CG-AND-INT.
  170. (define (cg-and clauses accum)
  171. #`(lambda (str len pos)
  172. (let ((body '()))
  173. #,(cg-and-int clauses (baf accum) #'str #'len #'pos #'body))))
  174. ;; Internal function builder for AND (calls itself).
  175. (define (cg-and-int clauses accum str strlen at body)
  176. (syntax-case clauses ()
  177. (()
  178. (cggr accum 'cg-and #`(reverse #,body) at))
  179. ((first rest ...)
  180. #`(let ((res (#,(compile-peg-pattern #'first accum) #,str #,strlen #,at)))
  181. (and res
  182. ;; update AT and BODY then recurse
  183. (let ((newat (car res))
  184. (newbody (cadr res)))
  185. (set! #,at newat)
  186. (push-not-null! #,body (single-filter newbody))
  187. #,(cg-and-int #'(rest ...) accum str strlen at body)))))))
  188. ;; Top-level function builder for OR. Reduces to a call to CG-OR-INT.
  189. (define (cg-or clauses accum)
  190. #`(lambda (str len pos)
  191. #,(cg-or-int clauses (baf accum) #'str #'len #'pos)))
  192. ;; Internal function builder for OR (calls itself).
  193. (define (cg-or-int clauses accum str strlen at)
  194. (syntax-case clauses ()
  195. (()
  196. #f)
  197. ((first rest ...)
  198. #`(or (#,(compile-peg-pattern #'first accum) #,str #,strlen #,at)
  199. #,(cg-or-int #'(rest ...) accum str strlen at)))))
  200. (define (cg-* args accum)
  201. (syntax-case args ()
  202. ((pat)
  203. #`(lambda (str strlen at)
  204. (let ((body '()))
  205. (let lp ((end at) (count 0))
  206. (let* ((match (#,(compile-peg-pattern #'pat (baf accum))
  207. str strlen end))
  208. (new-end (if match (car match) end))
  209. (count (if (> new-end end) (1+ count) count)))
  210. (if (> new-end end)
  211. (push-not-null! body (single-filter (cadr match))))
  212. (if (and (> new-end end)
  213. #,#t)
  214. (lp new-end count)
  215. (let ((success #,#t))
  216. #,#`(and success
  217. #,(cggr (baf accum) 'cg-body
  218. #'(reverse body) #'new-end)))))))))))
  219. (define (cg-+ args accum)
  220. (syntax-case args ()
  221. ((pat)
  222. #`(lambda (str strlen at)
  223. (let ((body '()))
  224. (let lp ((end at) (count 0))
  225. (let* ((match (#,(compile-peg-pattern #'pat (baf accum))
  226. str strlen end))
  227. (new-end (if match (car match) end))
  228. (count (if (> new-end end) (1+ count) count)))
  229. (if (> new-end end)
  230. (push-not-null! body (single-filter (cadr match))))
  231. (if (and (> new-end end)
  232. #,#t)
  233. (lp new-end count)
  234. (let ((success #,#'(>= count 1)))
  235. #,#`(and success
  236. #,(cggr (baf accum) 'cg-body
  237. #'(reverse body) #'new-end)))))))))))
  238. (define (cg-? args accum)
  239. (syntax-case args ()
  240. ((pat)
  241. #`(lambda (str strlen at)
  242. (let ((body '()))
  243. (let lp ((end at) (count 0))
  244. (let* ((match (#,(compile-peg-pattern #'pat (baf accum))
  245. str strlen end))
  246. (new-end (if match (car match) end))
  247. (count (if (> new-end end) (1+ count) count)))
  248. (if (> new-end end)
  249. (push-not-null! body (single-filter (cadr match))))
  250. (if (and (> new-end end)
  251. #,#'(< count 1))
  252. (lp new-end count)
  253. (let ((success #,#t))
  254. #,#`(and success
  255. #,(cggr (baf accum) 'cg-body
  256. #'(reverse body) #'new-end)))))))))))
  257. (define (cg-followed-by args accum)
  258. (syntax-case args ()
  259. ((pat)
  260. #`(lambda (str strlen at)
  261. (let ((body '()))
  262. (let lp ((end at) (count 0))
  263. (let* ((match (#,(compile-peg-pattern #'pat (baf accum))
  264. str strlen end))
  265. (new-end (if match (car match) end))
  266. (count (if (> new-end end) (1+ count) count)))
  267. (if (> new-end end)
  268. (push-not-null! body (single-filter (cadr match))))
  269. (if (and (> new-end end)
  270. #,#'(< count 1))
  271. (lp new-end count)
  272. (let ((success #,#'(= count 1)))
  273. #,#`(and success
  274. #,(cggr (baf accum) 'cg-body #''() #'at)))))))))))
  275. (define (cg-not-followed-by args accum)
  276. (syntax-case args ()
  277. ((pat)
  278. #`(lambda (str strlen at)
  279. (let ((body '()))
  280. (let lp ((end at) (count 0))
  281. (let* ((match (#,(compile-peg-pattern #'pat (baf accum))
  282. str strlen end))
  283. (new-end (if match (car match) end))
  284. (count (if (> new-end end) (1+ count) count)))
  285. (if (> new-end end)
  286. (push-not-null! body (single-filter (cadr match))))
  287. (if (and (> new-end end)
  288. #,#'(< count 1))
  289. (lp new-end count)
  290. (let ((success #,#'(= count 1)))
  291. #,#`(if success
  292. #f
  293. #,(cggr (baf accum) 'cg-body #''() #'at)))))))))))
  294. ;; Association list of functions to handle different expressions as PEGs
  295. (define peg-compiler-alist '())
  296. (define (add-peg-compiler! symbol function)
  297. (set! peg-compiler-alist
  298. (assq-set! peg-compiler-alist symbol function)))
  299. (add-peg-compiler! 'range cg-range)
  300. (add-peg-compiler! 'not-in-range cg-not-in-range)
  301. (add-peg-compiler! 'ignore cg-ignore)
  302. (add-peg-compiler! 'capture cg-capture)
  303. (add-peg-compiler! 'and cg-and)
  304. (add-peg-compiler! 'or cg-or)
  305. (add-peg-compiler! '* cg-*)
  306. (add-peg-compiler! '+ cg-+)
  307. (add-peg-compiler! '? cg-?)
  308. (add-peg-compiler! 'followed-by cg-followed-by)
  309. (add-peg-compiler! 'not-followed-by cg-not-followed-by)
  310. ;; Takes an arbitrary expressions and accumulation variable, then parses it.
  311. ;; E.g.: (compile-peg-pattern syntax '(and "abc" (or "-" (range #\a #\z))) 'all)
  312. (define (compile-peg-pattern pat accum)
  313. (syntax-case pat (peg-any)
  314. (peg-any
  315. (cg-peg-any (baf accum)))
  316. (sym (identifier? #'sym) ;; nonterminal
  317. #'sym)
  318. (str (string? (syntax->datum #'str)) ;; literal string
  319. (cg-string (syntax->datum #'str) (baf accum)))
  320. ((name . args) (let* ((nm (syntax->datum #'name))
  321. (entry (assq-ref peg-compiler-alist nm)))
  322. (if entry
  323. (entry #'args accum)
  324. (error "Bad peg form" nm #'args
  325. "Not one of" (map car peg-compiler-alist)))))))
  326. ;; Packages the results of a parser
  327. (define (wrap-parser-for-users for-syntax parser accumsym s-syn)
  328. #`(lambda (str strlen at)
  329. (let ((res (#,parser str strlen at)))
  330. ;; Try to match the nonterminal.
  331. (if res
  332. ;; If we matched, do some post-processing to figure out
  333. ;; what data to propagate upward.
  334. (let ((at (car res))
  335. (body (cadr res)))
  336. #,(cond
  337. ((eq? accumsym 'name)
  338. #`(list at '#,s-syn))
  339. ((eq? accumsym 'all)
  340. #`(list (car res)
  341. (cond
  342. ((not (list? body))
  343. (list '#,s-syn body))
  344. ((null? body) '#,s-syn)
  345. ((symbol? (car body))
  346. (list '#,s-syn body))
  347. (else (cons '#,s-syn body)))))
  348. ((eq? accumsym 'none) #`(list (car res) '()))
  349. (else #`(begin res))))
  350. ;; If we didn't match, just return false.
  351. #f))))