xml2hash.pl 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. #!/usr/bin/perl
  2. # Daniel "Trizen" Șuteu
  3. # Date: 27 December 2013
  4. # Edit: 01 January 2018
  5. # License: GPLv3
  6. # https://github.com/trizen
  7. # A tiny pure-Perl XML parser.
  8. use utf8;
  9. use 5.010;
  10. use strict;
  11. use warnings;
  12. {
  13. my %entities = (
  14. 'amp' => '&',
  15. 'quot' => '"',
  16. 'apos' => "'",
  17. 'gt' => '>',
  18. 'lt' => '<',
  19. );
  20. state $ent_re = do {
  21. local $" = '|';
  22. qr/&(@{[keys %entities]});/;
  23. };
  24. sub _decode_entities {
  25. $_[0] =~ s/$ent_re/$entities{$1}/gor;
  26. }
  27. }
  28. sub xml2hash {
  29. my $xml = shift(@_) // '';
  30. my $xml_ref = {};
  31. $xml = "$xml";
  32. my %args = (
  33. attr => '-',
  34. text => '#text',
  35. empty => q{},
  36. @_
  37. );
  38. my %ctags;
  39. my $ref = $xml_ref;
  40. state $inv_chars = q{!"#$@%&'()*+,/;\\<=>?\]\[^`{|}~};
  41. state $valid_tag = qr{[^\-.\s0-9$inv_chars][^$inv_chars\s]*};
  42. {
  43. if (
  44. $xml =~ m{\G< \s*
  45. ($valid_tag) \s*
  46. ((?>$valid_tag\s*=\s*(?>".*?"|'.*?')|\s+)+)? \s*
  47. (/)?\s*> \s*
  48. }gcsxo
  49. ) {
  50. my ($tag, $attrs, $closed) = ($1, $2, $3);
  51. if (defined $attrs) {
  52. push @{$ctags{$tag}}, $ref;
  53. $ref =
  54. ref $ref eq 'HASH'
  55. ? ref $ref->{$tag}
  56. ? $ref->{$tag}
  57. : (
  58. defined $ref->{$tag}
  59. ? ($ref->{$tag} = [$ref->{$tag}])
  60. : ($ref->{$tag} //= [])
  61. )
  62. : ref $ref eq 'ARRAY' ? ref $ref->[-1]{$tag}
  63. ? $ref->[-1]{$tag}
  64. : (
  65. defined $ref->[-1]{$tag}
  66. ? ($ref->[-1]{$tag} = [$ref->[-1]{$tag}])
  67. : ($ref->[-1]{$tag} //= [])
  68. )
  69. : [];
  70. ++$#{$ref} if ref $ref eq 'ARRAY';
  71. while (
  72. $attrs =~ m{\G
  73. ($valid_tag) \s*=\s*
  74. (?>
  75. "(.*?)"
  76. |
  77. '(.*?)'
  78. ) \s*
  79. }gsxo
  80. ) {
  81. my ($key, $value) = ($1, $+);
  82. $key = join(q{}, $args{attr}, $key);
  83. if (ref $ref eq 'ARRAY') {
  84. $ref->[-1]{$key} = _decode_entities($value);
  85. }
  86. elsif (ref $ref eq 'HASH') {
  87. $ref->{$key} = $value;
  88. }
  89. }
  90. if (defined $closed) {
  91. $ref = pop @{$ctags{$tag}};
  92. }
  93. if ($xml =~ m{\G<\s*/\s*\Q$tag\E\s*>\s*}gc) {
  94. $ref = pop @{$ctags{$tag}};
  95. }
  96. elsif ($xml =~ m{\G([^<]+)(?=<)}gsc) {
  97. if (ref $ref eq 'ARRAY') {
  98. $ref->[-1]{$args{text}} .= _decode_entities($1);
  99. $ref = pop @{$ctags{$tag}};
  100. }
  101. elsif (ref $ref eq 'HASH') {
  102. $ref->{$args{text}} .= $1;
  103. $ref = pop @{$ctags{$tag}};
  104. }
  105. }
  106. }
  107. elsif (defined $closed) {
  108. if (ref $ref eq 'ARRAY') {
  109. if (exists $ref->[-1]{$tag}) {
  110. if (ref $ref->[-1]{$tag} ne 'ARRAY') {
  111. $ref->[-1]{$tag} = [$ref->[-1]{$tag}];
  112. }
  113. push @{$ref->[-1]{$tag}}, $args{empty};
  114. }
  115. else {
  116. $ref->[-1]{$tag} = $args{empty};
  117. }
  118. }
  119. }
  120. else {
  121. if ($xml =~ /\G(?=<(?!!))/) {
  122. push @{$ctags{$tag}}, $ref;
  123. $ref =
  124. ref $ref eq 'HASH'
  125. ? ref $ref->{$tag}
  126. ? $ref->{$tag}
  127. : (
  128. defined $ref->{$tag}
  129. ? ($ref->{$tag} = [$ref->{$tag}])
  130. : ($ref->{$tag} //= [])
  131. )
  132. : ref $ref eq 'ARRAY' ? ref $ref->[-1]{$tag}
  133. ? $ref->[-1]{$tag}
  134. : (
  135. defined $ref->[-1]{$tag}
  136. ? ($ref->[-1]{$tag} = [$ref->[-1]{$tag}])
  137. : ($ref->[-1]{$tag} //= [])
  138. )
  139. : [];
  140. ++$#{$ref} if ref $ref eq 'ARRAY';
  141. redo;
  142. }
  143. elsif ($xml =~ /\G<!\[CDATA\[(.*?)\]\]>\s*/gcs or $xml =~ /\G([^<]+)(?=<)/gsc) {
  144. my ($text) = $1;
  145. if ($xml =~ m{\G<\s*/\s*\Q$tag\E\s*>\s*}gc) {
  146. if (ref $ref eq 'ARRAY') {
  147. if (exists $ref->[-1]{$tag}) {
  148. if (ref $ref->[-1]{$tag} ne 'ARRAY') {
  149. $ref->[-1]{$tag} = [$ref->[-1]{$tag}];
  150. }
  151. push @{$ref->[-1]{$tag}}, $text;
  152. }
  153. else {
  154. $ref->[-1]{$tag} .= _decode_entities($text);
  155. }
  156. }
  157. elsif (ref $ref eq 'HASH') {
  158. $ref->{$tag} .= $text;
  159. }
  160. }
  161. else {
  162. push @{$ctags{$tag}}, $ref;
  163. $ref =
  164. ref $ref eq 'HASH'
  165. ? ref $ref->{$tag}
  166. ? $ref->{$tag}
  167. : (
  168. defined $ref->{$tag}
  169. ? ($ref->{$tag} = [$ref->{$tag}])
  170. : ($ref->{$tag} //= [])
  171. )
  172. : ref $ref eq 'ARRAY' ? ref $ref->[-1]{$tag}
  173. ? $ref->[-1]{$tag}
  174. : (
  175. defined $ref->[-1]{$tag}
  176. ? ($ref->[-1]{$tag} = [$ref->[-1]{$tag}])
  177. : ($ref->[-1]{$tag} //= [])
  178. )
  179. : [];
  180. ++$#{$ref} if ref $ref eq 'ARRAY';
  181. if (ref $ref eq 'ARRAY') {
  182. if (exists $ref->[-1]{$tag}) {
  183. if (ref $ref->[-1]{$tag} ne 'ARRAY') {
  184. $ref->[-1] = [$ref->[-1]{$tag}];
  185. }
  186. push @{$ref->[-1]}, {$args{text} => $text};
  187. }
  188. else {
  189. $ref->[-1]{$args{text}} .= $text;
  190. }
  191. }
  192. elsif (ref $ref eq 'HASH') {
  193. $ref->{$tag} .= $text;
  194. }
  195. }
  196. }
  197. }
  198. if ($xml =~ m{\G<\s*/\s*\Q$tag\E\s*>\s*}gc) {
  199. ## tag closed - ok
  200. }
  201. redo;
  202. }
  203. elsif ($xml =~ m{\G<\s*/\s*($valid_tag)\s*>\s*}gco) {
  204. if (exists $ctags{$1} and @{$ctags{$1}}) {
  205. $ref = pop @{$ctags{$1}};
  206. }
  207. redo;
  208. }
  209. elsif ($xml =~ /\G<!\[CDATA\[(.*?)\]\]>\s*/gcs or $xml =~ m{\G([^<]+)(?=<)}gsc) {
  210. if (ref $ref eq 'ARRAY') {
  211. $ref->[-1]{$args{text}} .= $1;
  212. }
  213. elsif (ref $ref eq 'HASH') {
  214. $ref->{$args{text}} .= $1;
  215. }
  216. redo;
  217. }
  218. elsif ($xml =~ /\G<\?/gc) {
  219. $xml =~ /\G.*?\?>\s*/gcs or die "Invalid XML!";
  220. redo;
  221. }
  222. elsif ($xml =~ /\G<!--/gc) {
  223. $xml =~ /\G.*?-->\s*/gcs or die "Comment not closed!";
  224. redo;
  225. }
  226. elsif ($xml =~ /\G<!DOCTYPE\s+/gc) {
  227. $xml =~ /\G(?>$valid_tag|\s+|".*?"|'.*?')*\[.*?\]>\s*/sgco
  228. or $xml =~ /\G.*?>\s*/sgc
  229. or die "DOCTYPE not closed!";
  230. redo;
  231. }
  232. elsif ($xml =~ /\G\z/gc) {
  233. ## ok
  234. }
  235. elsif ($xml =~ /\G\s+/gc) {
  236. redo;
  237. }
  238. else {
  239. die "Syntax error near: --> ", [split(/\n/, substr($xml, pos($xml), 2**6))]->[0], " <--\n";
  240. }
  241. }
  242. return $xml_ref;
  243. }
  244. #
  245. ## Usage: $hash = xml2hash($xml)
  246. #
  247. use Data::Dump qw(pp);
  248. pp xml2hash(
  249. do { local $/; <DATA> }
  250. );
  251. __DATA__
  252. <?xml version="1.0"?>
  253. <?xml-stylesheet href="catalog.xsl" type="text/xsl"?>
  254. <!DOCTYPE catalog SYSTEM "catalog.dtd">
  255. <catalog>
  256. <product description="Cardigan Sweater" product_image="cardigan.jpg">
  257. <catalog_item gender="Men's">
  258. <item_number>QWZ5671</item_number>
  259. <price>39.95</price>
  260. <size description="Medium">
  261. <color_swatch image="red_cardigan.jpg">Red</color_swatch>
  262. <color_swatch image="burgundy_cardigan.jpg">Burgundy</color_swatch>
  263. </size>
  264. <size description="Large">
  265. <color_swatch image="red_cardigan.jpg">Red</color_swatch>
  266. <color_swatch image="burgundy_cardigan.jpg">Burgundy</color_swatch>
  267. </size>
  268. </catalog_item>
  269. <catalog_item gender="Women's">
  270. <item_number>RRX9856</item_number>
  271. <price>42.50</price>
  272. <size description="Small">
  273. <color_swatch image="red_cardigan.jpg">Red</color_swatch>
  274. <color_swatch image="navy_cardigan.jpg">Navy</color_swatch>
  275. <color_swatch image="burgundy_cardigan.jpg">Burgundy</color_swatch>
  276. </size>
  277. <size description="Medium">
  278. <color_swatch image="red_cardigan.jpg">Red</color_swatch>
  279. <color_swatch image="navy_cardigan.jpg">Navy</color_swatch>
  280. <color_swatch image="burgundy_cardigan.jpg">Burgundy</color_swatch>
  281. <color_swatch image="black_cardigan.jpg">Black</color_swatch>
  282. </size>
  283. <size description="Large">
  284. <color_swatch image="navy_cardigan.jpg">Navy</color_swatch>
  285. <color_swatch image="black_cardigan.jpg">Black</color_swatch>
  286. </size>
  287. <size description="Extra Large">
  288. <color_swatch image="burgundy_cardigan.jpg">Burgundy</color_swatch>
  289. <color_swatch image="black_cardigan.jpg">Black</color_swatch>
  290. </size>
  291. </catalog_item>
  292. </product>
  293. </catalog>