/* xml.c: callback-driven XML parser */
  1. #include <errno.h>
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <string.h>
  5. #include "xml.h"
  6. #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
  7. #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
  8. static void
  9. xml_parseattrs(XMLParser *x)
  10. {
  11. size_t namelen = 0, valuelen;
  12. int c, endsep, endname = 0, valuestart = 0;
  13. while ((c = GETNEXT()) != EOF) {
  14. if (ISSPACE(c)) {
  15. if (namelen)
  16. endname = 1;
  17. continue;
  18. } else if (c == '?')
  19. ; /* ignore */
  20. else if (c == '=') {
  21. x->name[namelen] = '\0';
  22. valuestart = 1;
  23. endname = 1;
  24. } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
  25. /* attribute without value */
  26. x->name[namelen] = '\0';
  27. if (x->xmlattrstart)
  28. x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
  29. if (x->xmlattr)
  30. x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
  31. if (x->xmlattrend)
  32. x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
  33. endname = 0;
  34. x->name[0] = c;
  35. namelen = 1;
  36. } else if (namelen && valuestart) {
  37. /* attribute with value */
  38. if (x->xmlattrstart)
  39. x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
  40. valuelen = 0;
  41. if (c == '\'' || c == '"') {
  42. endsep = c;
  43. } else {
  44. endsep = ' '; /* ISSPACE() */
  45. goto startvalue;
  46. }
  47. while ((c = GETNEXT()) != EOF) {
  48. startvalue:
  49. if (c == '&') { /* entities */
  50. x->data[valuelen] = '\0';
  51. /* call data function with data before entity if there is data */
  52. if (valuelen && x->xmlattr)
  53. x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
  54. x->data[0] = c;
  55. valuelen = 1;
  56. while ((c = GETNEXT()) != EOF) {
  57. if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
  58. break;
  59. if (valuelen < sizeof(x->data) - 1)
  60. x->data[valuelen++] = c;
  61. else {
  62. /* entity too long for buffer, handle as normal data */
  63. x->data[valuelen] = '\0';
  64. if (x->xmlattr)
  65. x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
  66. x->data[0] = c;
  67. valuelen = 1;
  68. break;
  69. }
  70. if (c == ';') {
  71. x->data[valuelen] = '\0';
  72. if (x->xmlattrentity)
  73. x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
  74. valuelen = 0;
  75. break;
  76. }
  77. }
  78. } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
  79. if (valuelen < sizeof(x->data) - 1) {
  80. x->data[valuelen++] = c;
  81. } else {
  82. x->data[valuelen] = '\0';
  83. if (x->xmlattr)
  84. x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
  85. x->data[0] = c;
  86. valuelen = 1;
  87. }
  88. }
  89. if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
  90. x->data[valuelen] = '\0';
  91. if (x->xmlattr)
  92. x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
  93. if (x->xmlattrend)
  94. x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
  95. break;
  96. }
  97. }
  98. namelen = endname = valuestart = 0;
  99. } else if (namelen < sizeof(x->name) - 1) {
  100. x->name[namelen++] = c;
  101. }
  102. if (c == '>') {
  103. break;
  104. } else if (c == '/') {
  105. x->isshorttag = 1;
  106. x->name[0] = '\0';
  107. namelen = 0;
  108. }
  109. }
  110. }
  111. static void
  112. xml_parsecomment(XMLParser *x)
  113. {
  114. int c, i = 0;
  115. while ((c = GETNEXT()) != EOF) {
  116. if (c == '-') {
  117. if (++i > 2)
  118. i = 2;
  119. continue;
  120. } else if (c == '>' && i == 2) {
  121. return;
  122. } else if (i) {
  123. i = 0;
  124. }
  125. }
  126. }
  127. static void
  128. xml_parsecdata(XMLParser *x)
  129. {
  130. size_t datalen = 0, i = 0;
  131. int c;
  132. while ((c = GETNEXT()) != EOF) {
  133. if (c == ']' || c == '>') {
  134. if (x->xmlcdata && datalen) {
  135. x->data[datalen] = '\0';
  136. x->xmlcdata(x, x->data, datalen);
  137. datalen = 0;
  138. }
  139. }
  140. if (c == ']') {
  141. if (++i > 2) {
  142. if (x->xmlcdata)
  143. for (; i > 2; i--)
  144. x->xmlcdata(x, "]", 1);
  145. i = 2;
  146. }
  147. continue;
  148. } else if (c == '>' && i == 2) {
  149. return;
  150. } else if (i) {
  151. if (x->xmlcdata)
  152. for (; i > 0; i--)
  153. x->xmlcdata(x, "]", 1);
  154. i = 0;
  155. }
  156. if (datalen < sizeof(x->data) - 1) {
  157. x->data[datalen++] = c;
  158. } else {
  159. x->data[datalen] = '\0';
  160. if (x->xmlcdata)
  161. x->xmlcdata(x, x->data, datalen);
  162. x->data[0] = c;
  163. datalen = 1;
  164. }
  165. }
  166. }
  167. static int
  168. codepointtoutf8(long r, char *s)
  169. {
  170. if (r == 0) {
  171. return 0; /* NUL byte */
  172. } else if (r <= 0x7F) {
  173. /* 1 byte: 0aaaaaaa */
  174. s[0] = r;
  175. return 1;
  176. } else if (r <= 0x07FF) {
  177. /* 2 bytes: 00000aaa aabbbbbb */
  178. s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
  179. s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
  180. return 2;
  181. } else if (r <= 0xFFFF) {
  182. /* 3 bytes: aaaabbbb bbcccccc */
  183. s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
  184. s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
  185. s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
  186. return 3;
  187. } else {
  188. /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
  189. s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
  190. s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
  191. s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
  192. s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
  193. return 4;
  194. }
  195. }
  196. static int
  197. namedentitytostr(const char *e, char *buf, size_t bufsiz)
  198. {
  199. static const struct {
  200. const char *entity;
  201. int c;
  202. } entities[] = {
  203. { "amp;", '&' },
  204. { "lt;", '<' },
  205. { "gt;", '>' },
  206. { "apos;", '\'' },
  207. { "quot;", '"' },
  208. };
  209. size_t i;
  210. /* buffer is too small */
  211. if (bufsiz < 2)
  212. return -1;
  213. for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
  214. if (!strcmp(e, entities[i].entity)) {
  215. buf[0] = entities[i].c;
  216. buf[1] = '\0';
  217. return 1;
  218. }
  219. }
  220. return -1;
  221. }
  222. static int
  223. numericentitytostr(const char *e, char *buf, size_t bufsiz)
  224. {
  225. long l;
  226. int len;
  227. char *end;
  228. /* buffer is too small */
  229. if (bufsiz < 5)
  230. return -1;
  231. errno = 0;
  232. /* hex (16) or decimal (10) */
  233. if (*e == 'x')
  234. l = strtol(++e, &end, 16);
  235. else
  236. l = strtol(e, &end, 10);
  237. /* invalid value or not a well-formed entity or invalid code point */
  238. if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
  239. (l >= 0xd800 && l <= 0xdfff))
  240. return -1;
  241. len = codepointtoutf8(l, buf);
  242. buf[len] = '\0';
  243. return len;
  244. }
  245. /* convert named- or numeric entity string to buffer string
  246. * returns byte-length of string or -1 on failure. */
  247. int
  248. xml_entitytostr(const char *e, char *buf, size_t bufsiz)
  249. {
  250. /* doesn't start with & */
  251. if (e[0] != '&')
  252. return -1;
  253. /* numeric entity */
  254. if (e[1] == '#')
  255. return numericentitytostr(e + 2, buf, bufsiz);
  256. else /* named entity */
  257. return namedentitytostr(e + 1, buf, bufsiz);
  258. }
  259. void
  260. xml_parse(XMLParser *x)
  261. {
  262. size_t datalen, tagdatalen;
  263. int c, isend;
  264. while ((c = GETNEXT()) != EOF && c != '<')
  265. ; /* skip until < */
  266. while (c != EOF) {
  267. if (c == '<') { /* parse tag */
  268. if ((c = GETNEXT()) == EOF)
  269. return;
  270. if (c == '!') { /* CDATA and comments */
  271. for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
  272. /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
  273. if (tagdatalen <= sizeof("[CDATA[") - 1)
  274. x->data[tagdatalen++] = c;
  275. if (c == '>')
  276. break;
  277. else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
  278. (x->data[0] == '-')) {
  279. xml_parsecomment(x);
  280. break;
  281. } else if (c == '[') {
  282. if (tagdatalen == sizeof("[CDATA[") - 1 &&
  283. !strncmp(x->data, "[CDATA[", tagdatalen)) {
  284. xml_parsecdata(x);
  285. break;
  286. }
  287. }
  288. }
  289. } else {
  290. /* normal tag (open, short open, close), processing instruction. */
  291. x->tag[0] = c;
  292. x->taglen = 1;
  293. x->isshorttag = isend = 0;
  294. /* treat processing instruction as shorttag, don't strip "?" prefix. */
  295. if (c == '?') {
  296. x->isshorttag = 1;
  297. } else if (c == '/') {
  298. if ((c = GETNEXT()) == EOF)
  299. return;
  300. x->tag[0] = c;
  301. isend = 1;
  302. }
  303. while ((c = GETNEXT()) != EOF) {
  304. if (c == '/')
  305. x->isshorttag = 1; /* short tag */
  306. else if (c == '>' || ISSPACE(c)) {
  307. x->tag[x->taglen] = '\0';
  308. if (isend) { /* end tag, starts with </ */
  309. if (x->xmltagend)
  310. x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
  311. x->tag[0] = '\0';
  312. x->taglen = 0;
  313. } else {
  314. /* start tag */
  315. if (x->xmltagstart)
  316. x->xmltagstart(x, x->tag, x->taglen);
  317. if (ISSPACE(c))
  318. xml_parseattrs(x);
  319. if (x->xmltagstartparsed)
  320. x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
  321. }
  322. /* call tagend for shortform or processing instruction */
  323. if (x->isshorttag) {
  324. if (x->xmltagend)
  325. x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
  326. x->tag[0] = '\0';
  327. x->taglen = 0;
  328. }
  329. break;
  330. } else if (x->taglen < sizeof(x->tag) - 1)
  331. x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
  332. }
  333. }
  334. } else {
  335. /* parse tag data */
  336. datalen = 0;
  337. while ((c = GETNEXT()) != EOF) {
  338. if (c == '&') {
  339. if (datalen) {
  340. x->data[datalen] = '\0';
  341. if (x->xmldata)
  342. x->xmldata(x, x->data, datalen);
  343. }
  344. x->data[0] = c;
  345. datalen = 1;
  346. while ((c = GETNEXT()) != EOF) {
  347. if (c == '<')
  348. break;
  349. if (datalen < sizeof(x->data) - 1)
  350. x->data[datalen++] = c;
  351. else {
  352. /* entity too long for buffer, handle as normal data */
  353. x->data[datalen] = '\0';
  354. if (x->xmldata)
  355. x->xmldata(x, x->data, datalen);
  356. x->data[0] = c;
  357. datalen = 1;
  358. break;
  359. }
  360. if (c == ';') {
  361. x->data[datalen] = '\0';
  362. if (x->xmldataentity)
  363. x->xmldataentity(x, x->data, datalen);
  364. datalen = 0;
  365. break;
  366. }
  367. }
  368. } else if (c != '<') {
  369. if (datalen < sizeof(x->data) - 1) {
  370. x->data[datalen++] = c;
  371. } else {
  372. x->data[datalen] = '\0';
  373. if (x->xmldata)
  374. x->xmldata(x, x->data, datalen);
  375. x->data[0] = c;
  376. datalen = 1;
  377. }
  378. }
  379. if (c == '<') {
  380. x->data[datalen] = '\0';
  381. if (x->xmldata && datalen)
  382. x->xmldata(x, x->data, datalen);
  383. break;
  384. }
  385. }
  386. }
  387. }
  388. }