nxml-parse.el 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. ;;; nxml-parse.el --- XML parser, sharing infrastructure with nxml-mode
  2. ;; Copyright (C) 2003, 2007-2012 Free Software Foundation, Inc.
  3. ;; Author: James Clark
  4. ;; Keywords: XML
  5. ;; This file is part of GNU Emacs.
  6. ;; GNU Emacs is free software: you can redistribute it and/or modify
  7. ;; it under the terms of the GNU General Public License as published by
  8. ;; the Free Software Foundation, either version 3 of the License, or
  9. ;; (at your option) any later version.
  10. ;; GNU Emacs is distributed in the hope that it will be useful,
  11. ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. ;; GNU General Public License for more details.
  14. ;; You should have received a copy of the GNU General Public License
  15. ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
  16. ;;; Commentary:
  17. ;; Entry point is `nxml-parse-file'.
  18. ;;; Code:
  19. (require 'nxml-util)
  20. (require 'xmltok)
  21. (require 'nxml-enc)
  22. (require 'nxml-ns)
;; File name reported in parse errors.  `nxml-parse-file' binds this to
;; its FILE argument around the parse; `nxml-parse-error' and
;; `nxml-check-xmltok-errors' pass it on to
;; `nxml-signal-file-parse-error'.
(defvar nxml-parse-file-name nil)
(defvar nxml-validate-function nil
  "Either nil or a function called by `nxml-parse-file' to perform validation.
The function will be called once for each start-tag or end-tag.  The
function is passed two arguments TEXT and START-TAG.  For a start-tag,
START-TAG is a list (NAME ATTRIBUTES) where NAME and ATTRIBUTES are in
the same form as returned by `nxml-parse-file'.  For an end-tag,
START-TAG is nil.  TEXT is a string containing the text immediately
preceding the tag, or nil if there was no such text.  An empty element
is treated as a start-tag followed by an end-tag.

For a start-tag, the namespace state will be the state after
processing the namespace declarations in the start-tag.  For an
end-tag, the namespace state will be the state before popping the
namespace declarations for the corresponding start-tag.

The function must return nil if no error is detected or a
cons (MESSAGE . LOCATION) where MESSAGE is a string containing
an error message and LOCATION indicates what caused the error
as follows:

- nil indicates the tag as a whole caused it; this is always allowed;

- text indicates the text caused it; this is allowed only if
  TEXT is non-nil;

- tag-close indicates the close of the tag caused it; this is
  allowed only if START-TAG is non-nil;

- (attribute-name . N) indicates that the name of the Nth attribute
  caused it; N counts from 0; this is allowed only if START-TAG is non-nil
  and N must be less than the number of attributes;

- (attribute-value . N) indicates that the value of the Nth attribute
  caused it; N counts from 0; this is allowed only if START-TAG is non-nil
  and N must be less than the number of attributes.")
  52. (defun nxml-parse-file (file)
  53. "Parse the XML document in FILE and return it as a list.
  54. An XML element is represented as a list (NAME ATTRIBUTES . CHILDREN).
  55. NAME is either a string, in the case where the name does not have a
  56. namespace, or a cons (NAMESPACE . LOCAL-NAME), where NAMESPACE is a
  57. symbol and LOCAL-NAME is a string, in the case where the name does
  58. have a namespace. NAMESPACE is a keyword whose name is `:URI', where
  59. URI is the namespace name. ATTRIBUTES is an alist of attributes where
  60. each attribute has the form (NAME . VALUE), where NAME has the same
  61. form as an element name, and VALUE is a string. A namespace
  62. declaration is represented as an attribute whose name is
  63. \(:http://www.w3.org/2000/xmlns/ . LOCAL-NAME). CHILDREN is a list
  64. containing strings and child elements; CHILDREN never contains two
  65. consecutive strings and never contains an empty string. Processing
  66. instructions and comments are not represented. The return value is a
  67. list representing the document element.
  68. If the XML document is not well-formed, an error having the condition
  69. `nxml-file-parse-error' will be signaled; the error data will be a
  70. list of the form \(FILE POSITION MESSAGE), where POSITION is an
  71. integer specifying the position where the error was detected, and
  72. MESSAGE is a string describing the error.
  73. The current contents of FILE will be parsed even if there is a
  74. modified buffer currently visiting FILE.
  75. If the variable `nxml-validate-function' is non-nil, it will be called
  76. twice for each element, and any reported error will be signaled in the
  77. same way as well-formedness error."
  78. (with-current-buffer (nxml-parse-find-file file)
  79. (unwind-protect
  80. (let ((nxml-parse-file-name file))
  81. (nxml-parse-instance))
  82. (kill-buffer nil))))
  83. (defun nxml-parse-find-file (file)
  84. (with-current-buffer (get-buffer-create " *nXML Parse*")
  85. (erase-buffer)
  86. (let ((set-auto-coding-function 'nxml-set-xml-coding))
  87. (insert-file-contents file))
  88. (current-buffer)))
  89. (defun nxml-parse-instance ()
  90. (let (xmltok-dtd)
  91. (xmltok-save
  92. (xmltok-forward-prolog)
  93. (nxml-check-xmltok-errors)
  94. (nxml-ns-save
  95. (nxml-parse-instance-1)))))
  96. (defun nxml-parse-instance-1 ()
  97. (let* ((top (cons nil nil))
  98. ;; tail is a cons cell, whose cdr is nil
  99. ;; additional elements will destructively appended to tail
  100. (tail top)
  101. ;; stack of tails one for each open element
  102. tail-stack
  103. ;; list of QNames of open elements
  104. open-element-tags
  105. ;; list of strings buffering a text node, in reverse order
  106. text
  107. ;; position of beginning of first (in buffer) string in text
  108. text-pos)
  109. (while (xmltok-forward)
  110. (nxml-check-xmltok-errors)
  111. (cond ((memq xmltok-type '(start-tag end-tag empty-element))
  112. (when text
  113. (setq text (apply 'concat (nreverse text)))
  114. (setcdr tail (cons text nil))
  115. (setq tail (cdr tail)))
  116. (when (not (eq xmltok-type 'end-tag))
  117. (when (and (not open-element-tags)
  118. (not (eq tail top)))
  119. (nxml-parse-error nil "Multiple top-level elements"))
  120. (setq open-element-tags
  121. (cons (xmltok-start-tag-qname)
  122. open-element-tags))
  123. (nxml-ns-push-state)
  124. (let ((tag (nxml-parse-start-tag)))
  125. (nxml-validate-tag text text-pos tag)
  126. (setq text nil)
  127. (setcdr tail (cons tag nil))
  128. (setq tail (cdr tail))
  129. (setq tail-stack (cons tail tail-stack))
  130. (setq tail (last tag))))
  131. (when (not (eq xmltok-type 'start-tag))
  132. (or (eq xmltok-type 'empty-element)
  133. (equal (car open-element-tags)
  134. (xmltok-end-tag-qname))
  135. (if open-element-tags
  136. (nxml-parse-error nil
  137. "Unbalanced end-tag; expected </%s>"
  138. (car open-element-tags))
  139. (nxml-parse-error nil "Extra end-tag")))
  140. (nxml-validate-tag text text-pos nil)
  141. (setq text nil)
  142. (nxml-ns-pop-state)
  143. (setq open-element-tags (cdr open-element-tags))
  144. (setq tail (car tail-stack))
  145. (setq tail-stack (cdr tail-stack)))
  146. (setq text-pos nil))
  147. ((memq xmltok-type '(space data entity-ref char-ref cdata-section))
  148. (cond (open-element-tags
  149. (unless text-pos
  150. (setq text-pos xmltok-start))
  151. (setq text
  152. (cons (nxml-current-text-string) text)))
  153. ((not (eq xmltok-type 'space))
  154. (nxml-parse-error
  155. nil
  156. "%s at top-level"
  157. (cdr (assq xmltok-type
  158. '((data . "Text characters")
  159. (entity-ref . "Entity reference")
  160. (char-ref . "Character reference")
  161. (cdata-section . "CDATA section"))))))))))
  162. (unless (cdr top)
  163. (nxml-parse-error (point-max) "Missing document element"))
  164. (cadr top)))
  165. (defun nxml-parse-start-tag ()
  166. (let (parsed-attributes
  167. parsed-namespace-attributes
  168. atts att prefixes prefix ns value name)
  169. (setq atts xmltok-namespace-attributes)
  170. (while atts
  171. (setq att (car atts))
  172. (setq value (or (xmltok-attribute-value att)
  173. (nxml-parse-error nil "Invalid attribute value")))
  174. (setq ns (nxml-make-namespace value))
  175. (setq prefix (and (xmltok-attribute-prefix att)
  176. (xmltok-attribute-local-name att)))
  177. (cond ((member prefix prefixes)
  178. (nxml-parse-error nil "Duplicate namespace declaration"))
  179. ((not prefix)
  180. (nxml-ns-set-default ns))
  181. (ns
  182. (nxml-ns-set-prefix prefix ns))
  183. (t (nxml-parse-error nil "Cannot undeclare namespace prefix")))
  184. (setq prefixes (cons prefix prefixes))
  185. (setq parsed-namespace-attributes
  186. (cons (cons (nxml-make-name nxml-xmlns-namespace-uri
  187. (xmltok-attribute-local-name att))
  188. value)
  189. parsed-namespace-attributes))
  190. (setq atts (cdr atts)))
  191. (setq name
  192. (nxml-make-name
  193. (let ((prefix (xmltok-start-tag-prefix)))
  194. (if prefix
  195. (or (nxml-ns-get-prefix prefix)
  196. (nxml-parse-error (1+ xmltok-start)
  197. "Prefix `%s' undeclared"
  198. prefix))
  199. (nxml-ns-get-default)))
  200. (xmltok-start-tag-local-name)))
  201. (setq atts xmltok-attributes)
  202. (while atts
  203. (setq att (car atts))
  204. (setq ns
  205. (let ((prefix (xmltok-attribute-prefix att)))
  206. (and prefix
  207. (or (nxml-ns-get-prefix prefix)
  208. (nxml-parse-error (xmltok-attribute-name-start att)
  209. "Prefix `%s' undeclared"
  210. prefix)))))
  211. (setq parsed-attributes
  212. (let ((nm (nxml-make-name ns
  213. (xmltok-attribute-local-name att))))
  214. (when (assoc nm parsed-attributes)
  215. (nxml-parse-error (xmltok-attribute-name-start att)
  216. "Duplicate attribute"))
  217. (cons (cons nm (or (xmltok-attribute-value att)
  218. (nxml-parse-error nil "Invalid attribute value")))
  219. parsed-attributes)))
  220. (setq atts (cdr atts)))
  221. ;; We want to end up with the attributes followed by the
  222. ;; the namespace attributes in the same order as
  223. ;; xmltok-attributes and xmltok-namespace-attributes respectively.
  224. (when parsed-namespace-attributes
  225. (setq parsed-attributes
  226. (nconc parsed-namespace-attributes parsed-attributes)))
  227. (list name (nreverse parsed-attributes))))
  228. (defun nxml-validate-tag (text text-pos tag)
  229. (when nxml-validate-function
  230. (let ((err (funcall nxml-validate-function text tag))
  231. pos)
  232. (when err
  233. (setq pos (nxml-validate-error-position (cdr err)
  234. (and text text-pos)
  235. tag))
  236. (or pos (error "Incorrect return value from %s"
  237. nxml-validate-function))
  238. (nxml-parse-error pos (car err))))))
  239. (defun nxml-validate-error-position (location text-pos tag)
  240. (cond ((null location) xmltok-start)
  241. ((eq location 'text) text-pos)
  242. ((eq location 'tag-close)
  243. (and tag (- (point) (if (eq xmltok-type 'empty-element ) 2 1))))
  244. ((consp location)
  245. (let ((att (nth (cdr location) xmltok-attributes)))
  246. (when (not att)
  247. (setq att (nth (- (cdr location) (length xmltok-attributes))
  248. xmltok-namespace-attributes)))
  249. (cond ((not att))
  250. ((eq (car location) 'attribute-name)
  251. (xmltok-attribute-name-start att))
  252. ((eq (car location) 'attribute-value)
  253. (xmltok-attribute-value-start att)))))))
  254. (defun nxml-make-name (ns local-name)
  255. (if ns
  256. (cons ns local-name)
  257. local-name))
  258. (defun nxml-current-text-string ()
  259. (cond ((memq xmltok-type '(space data))
  260. (buffer-substring-no-properties xmltok-start
  261. (point)))
  262. ((eq xmltok-type 'cdata-section)
  263. (buffer-substring-no-properties (+ xmltok-start 9)
  264. (- (point) 3)))
  265. ((memq xmltok-type '(char-ref entity-ref))
  266. (unless xmltok-replacement
  267. (nxml-parse-error nil
  268. (if (eq xmltok-type 'char-ref)
  269. "Reference to unsupported Unicode character"
  270. "Unresolvable entity reference")))
  271. xmltok-replacement)))
  272. (defun nxml-parse-error (position &rest args)
  273. (nxml-signal-file-parse-error nxml-parse-file-name
  274. (or position xmltok-start)
  275. (apply 'format args)))
  276. (defun nxml-check-xmltok-errors ()
  277. (when xmltok-errors
  278. (let ((err (car (last xmltok-errors))))
  279. (nxml-signal-file-parse-error nxml-parse-file-name
  280. (xmltok-error-start err)
  281. (xmltok-error-message err)))))
  282. (provide 'nxml-parse)
  283. ;;; nxml-parse.el ends here