/* xmltok_impl.c */
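/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
   See the file COPYING for copying permission.
*/

/* This file is not compiled on its own: it is #included by a translation
   unit that defines XML_TOK_IMPL_C (and, optionally, PREFIX and
   IS_INVALID_CHAR) before the inclusion. */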
  5. #ifdef XML_TOK_IMPL_C
  6. #ifndef IS_INVALID_CHAR
  7. #define IS_INVALID_CHAR(enc, ptr, n) (0)
  8. #endif
  9. #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  10. case BT_LEAD ## n: \
  11. if (end - ptr < n) \
  12. return XML_TOK_PARTIAL_CHAR; \
  13. if (IS_INVALID_CHAR(enc, ptr, n)) { \
  14. *(nextTokPtr) = (ptr); \
  15. return XML_TOK_INVALID; \
  16. } \
  17. ptr += n; \
  18. break;
  19. #define INVALID_CASES(ptr, nextTokPtr) \
  20. INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  21. INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  22. INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  23. case BT_NONXML: \
  24. case BT_MALFORM: \
  25. case BT_TRAIL: \
  26. *(nextTokPtr) = (ptr); \
  27. return XML_TOK_INVALID;
  28. #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  29. case BT_LEAD ## n: \
  30. if (end - ptr < n) \
  31. return XML_TOK_PARTIAL_CHAR; \
  32. if (!IS_NAME_CHAR(enc, ptr, n)) { \
  33. *nextTokPtr = ptr; \
  34. return XML_TOK_INVALID; \
  35. } \
  36. ptr += n; \
  37. break;
  38. #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  39. case BT_NONASCII: \
  40. if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
  41. *nextTokPtr = ptr; \
  42. return XML_TOK_INVALID; \
  43. } \
  44. case BT_NMSTRT: \
  45. case BT_HEX: \
  46. case BT_DIGIT: \
  47. case BT_NAME: \
  48. case BT_MINUS: \
  49. ptr += MINBPC(enc); \
  50. break; \
  51. CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  52. CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  53. CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
  54. #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  55. case BT_LEAD ## n: \
  56. if (end - ptr < n) \
  57. return XML_TOK_PARTIAL_CHAR; \
  58. if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
  59. *nextTokPtr = ptr; \
  60. return XML_TOK_INVALID; \
  61. } \
  62. ptr += n; \
  63. break;
  64. #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  65. case BT_NONASCII: \
  66. if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
  67. *nextTokPtr = ptr; \
  68. return XML_TOK_INVALID; \
  69. } \
  70. case BT_NMSTRT: \
  71. case BT_HEX: \
  72. ptr += MINBPC(enc); \
  73. break; \
  74. CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  75. CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  76. CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
  77. #ifndef PREFIX
  78. #define PREFIX(ident) ident
  79. #endif
  80. /* ptr points to character following "<!-" */
  81. static int PTRCALL
  82. PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
  83. const char *end, const char **nextTokPtr)
  84. {
  85. if (ptr != end) {
  86. if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  87. *nextTokPtr = ptr;
  88. return XML_TOK_INVALID;
  89. }
  90. ptr += MINBPC(enc);
  91. while (ptr != end) {
  92. switch (BYTE_TYPE(enc, ptr)) {
  93. INVALID_CASES(ptr, nextTokPtr)
  94. case BT_MINUS:
  95. if ((ptr += MINBPC(enc)) == end)
  96. return XML_TOK_PARTIAL;
  97. if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  98. if ((ptr += MINBPC(enc)) == end)
  99. return XML_TOK_PARTIAL;
  100. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  101. *nextTokPtr = ptr;
  102. return XML_TOK_INVALID;
  103. }
  104. *nextTokPtr = ptr + MINBPC(enc);
  105. return XML_TOK_COMMENT;
  106. }
  107. break;
  108. default:
  109. ptr += MINBPC(enc);
  110. break;
  111. }
  112. }
  113. }
  114. return XML_TOK_PARTIAL;
  115. }
  116. /* ptr points to character following "<!" */
  117. static int PTRCALL
  118. PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
  119. const char *end, const char **nextTokPtr)
  120. {
  121. if (ptr == end)
  122. return XML_TOK_PARTIAL;
  123. switch (BYTE_TYPE(enc, ptr)) {
  124. case BT_MINUS:
  125. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  126. case BT_LSQB:
  127. *nextTokPtr = ptr + MINBPC(enc);
  128. return XML_TOK_COND_SECT_OPEN;
  129. case BT_NMSTRT:
  130. case BT_HEX:
  131. ptr += MINBPC(enc);
  132. break;
  133. default:
  134. *nextTokPtr = ptr;
  135. return XML_TOK_INVALID;
  136. }
  137. while (ptr != end) {
  138. switch (BYTE_TYPE(enc, ptr)) {
  139. case BT_PERCNT:
  140. if (ptr + MINBPC(enc) == end)
  141. return XML_TOK_PARTIAL;
  142. /* don't allow <!ENTITY% foo "whatever"> */
  143. switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
  144. case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
  145. *nextTokPtr = ptr;
  146. return XML_TOK_INVALID;
  147. }
  148. /* fall through */
  149. case BT_S: case BT_CR: case BT_LF:
  150. *nextTokPtr = ptr;
  151. return XML_TOK_DECL_OPEN;
  152. case BT_NMSTRT:
  153. case BT_HEX:
  154. ptr += MINBPC(enc);
  155. break;
  156. default:
  157. *nextTokPtr = ptr;
  158. return XML_TOK_INVALID;
  159. }
  160. }
  161. return XML_TOK_PARTIAL;
  162. }
  163. static int PTRCALL
  164. PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
  165. const char *end, int *tokPtr)
  166. {
  167. int upper = 0;
  168. *tokPtr = XML_TOK_PI;
  169. if (end - ptr != MINBPC(enc)*3)
  170. return 1;
  171. switch (BYTE_TO_ASCII(enc, ptr)) {
  172. case ASCII_x:
  173. break;
  174. case ASCII_X:
  175. upper = 1;
  176. break;
  177. default:
  178. return 1;
  179. }
  180. ptr += MINBPC(enc);
  181. switch (BYTE_TO_ASCII(enc, ptr)) {
  182. case ASCII_m:
  183. break;
  184. case ASCII_M:
  185. upper = 1;
  186. break;
  187. default:
  188. return 1;
  189. }
  190. ptr += MINBPC(enc);
  191. switch (BYTE_TO_ASCII(enc, ptr)) {
  192. case ASCII_l:
  193. break;
  194. case ASCII_L:
  195. upper = 1;
  196. break;
  197. default:
  198. return 1;
  199. }
  200. if (upper)
  201. return 0;
  202. *tokPtr = XML_TOK_XML_DECL;
  203. return 1;
  204. }
  205. /* ptr points to character following "<?" */
  206. static int PTRCALL
  207. PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
  208. const char *end, const char **nextTokPtr)
  209. {
  210. int tok;
  211. const char *target = ptr;
  212. if (ptr == end)
  213. return XML_TOK_PARTIAL;
  214. switch (BYTE_TYPE(enc, ptr)) {
  215. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  216. default:
  217. *nextTokPtr = ptr;
  218. return XML_TOK_INVALID;
  219. }
  220. while (ptr != end) {
  221. switch (BYTE_TYPE(enc, ptr)) {
  222. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  223. case BT_S: case BT_CR: case BT_LF:
  224. if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  225. *nextTokPtr = ptr;
  226. return XML_TOK_INVALID;
  227. }
  228. ptr += MINBPC(enc);
  229. while (ptr != end) {
  230. switch (BYTE_TYPE(enc, ptr)) {
  231. INVALID_CASES(ptr, nextTokPtr)
  232. case BT_QUEST:
  233. ptr += MINBPC(enc);
  234. if (ptr == end)
  235. return XML_TOK_PARTIAL;
  236. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  237. *nextTokPtr = ptr + MINBPC(enc);
  238. return tok;
  239. }
  240. break;
  241. default:
  242. ptr += MINBPC(enc);
  243. break;
  244. }
  245. }
  246. return XML_TOK_PARTIAL;
  247. case BT_QUEST:
  248. if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  249. *nextTokPtr = ptr;
  250. return XML_TOK_INVALID;
  251. }
  252. ptr += MINBPC(enc);
  253. if (ptr == end)
  254. return XML_TOK_PARTIAL;
  255. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  256. *nextTokPtr = ptr + MINBPC(enc);
  257. return tok;
  258. }
  259. /* fall through */
  260. default:
  261. *nextTokPtr = ptr;
  262. return XML_TOK_INVALID;
  263. }
  264. }
  265. return XML_TOK_PARTIAL;
  266. }
  267. static int PTRCALL
  268. PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
  269. const char *end, const char **nextTokPtr)
  270. {
  271. static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
  272. ASCII_T, ASCII_A, ASCII_LSQB };
  273. int i;
  274. /* CDATA[ */
  275. if (end - ptr < 6 * MINBPC(enc))
  276. return XML_TOK_PARTIAL;
  277. for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
  278. if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
  279. *nextTokPtr = ptr;
  280. return XML_TOK_INVALID;
  281. }
  282. }
  283. *nextTokPtr = ptr;
  284. return XML_TOK_CDATA_SECT_OPEN;
  285. }
  286. static int PTRCALL
  287. PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
  288. const char *end, const char **nextTokPtr)
  289. {
  290. if (ptr == end)
  291. return XML_TOK_NONE;
  292. if (MINBPC(enc) > 1) {
  293. size_t n = end - ptr;
  294. if (n & (MINBPC(enc) - 1)) {
  295. n &= ~(MINBPC(enc) - 1);
  296. if (n == 0)
  297. return XML_TOK_PARTIAL;
  298. end = ptr + n;
  299. }
  300. }
  301. switch (BYTE_TYPE(enc, ptr)) {
  302. case BT_RSQB:
  303. ptr += MINBPC(enc);
  304. if (ptr == end)
  305. return XML_TOK_PARTIAL;
  306. if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  307. break;
  308. ptr += MINBPC(enc);
  309. if (ptr == end)
  310. return XML_TOK_PARTIAL;
  311. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  312. ptr -= MINBPC(enc);
  313. break;
  314. }
  315. *nextTokPtr = ptr + MINBPC(enc);
  316. return XML_TOK_CDATA_SECT_CLOSE;
  317. case BT_CR:
  318. ptr += MINBPC(enc);
  319. if (ptr == end)
  320. return XML_TOK_PARTIAL;
  321. if (BYTE_TYPE(enc, ptr) == BT_LF)
  322. ptr += MINBPC(enc);
  323. *nextTokPtr = ptr;
  324. return XML_TOK_DATA_NEWLINE;
  325. case BT_LF:
  326. *nextTokPtr = ptr + MINBPC(enc);
  327. return XML_TOK_DATA_NEWLINE;
  328. INVALID_CASES(ptr, nextTokPtr)
  329. default:
  330. ptr += MINBPC(enc);
  331. break;
  332. }
  333. while (ptr != end) {
  334. switch (BYTE_TYPE(enc, ptr)) {
  335. #define LEAD_CASE(n) \
  336. case BT_LEAD ## n: \
  337. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  338. *nextTokPtr = ptr; \
  339. return XML_TOK_DATA_CHARS; \
  340. } \
  341. ptr += n; \
  342. break;
  343. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  344. #undef LEAD_CASE
  345. case BT_NONXML:
  346. case BT_MALFORM:
  347. case BT_TRAIL:
  348. case BT_CR:
  349. case BT_LF:
  350. case BT_RSQB:
  351. *nextTokPtr = ptr;
  352. return XML_TOK_DATA_CHARS;
  353. default:
  354. ptr += MINBPC(enc);
  355. break;
  356. }
  357. }
  358. *nextTokPtr = ptr;
  359. return XML_TOK_DATA_CHARS;
  360. }
  361. /* ptr points to character following "</" */
  362. static int PTRCALL
  363. PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
  364. const char *end, const char **nextTokPtr)
  365. {
  366. if (ptr == end)
  367. return XML_TOK_PARTIAL;
  368. switch (BYTE_TYPE(enc, ptr)) {
  369. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  370. default:
  371. *nextTokPtr = ptr;
  372. return XML_TOK_INVALID;
  373. }
  374. while (ptr != end) {
  375. switch (BYTE_TYPE(enc, ptr)) {
  376. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  377. case BT_S: case BT_CR: case BT_LF:
  378. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  379. switch (BYTE_TYPE(enc, ptr)) {
  380. case BT_S: case BT_CR: case BT_LF:
  381. break;
  382. case BT_GT:
  383. *nextTokPtr = ptr + MINBPC(enc);
  384. return XML_TOK_END_TAG;
  385. default:
  386. *nextTokPtr = ptr;
  387. return XML_TOK_INVALID;
  388. }
  389. }
  390. return XML_TOK_PARTIAL;
  391. #ifdef XML_NS
  392. case BT_COLON:
  393. /* no need to check qname syntax here,
  394. since end-tag must match exactly */
  395. ptr += MINBPC(enc);
  396. break;
  397. #endif
  398. case BT_GT:
  399. *nextTokPtr = ptr + MINBPC(enc);
  400. return XML_TOK_END_TAG;
  401. default:
  402. *nextTokPtr = ptr;
  403. return XML_TOK_INVALID;
  404. }
  405. }
  406. return XML_TOK_PARTIAL;
  407. }
  408. /* ptr points to character following "&#X" */
  409. static int PTRCALL
  410. PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
  411. const char *end, const char **nextTokPtr)
  412. {
  413. if (ptr != end) {
  414. switch (BYTE_TYPE(enc, ptr)) {
  415. case BT_DIGIT:
  416. case BT_HEX:
  417. break;
  418. default:
  419. *nextTokPtr = ptr;
  420. return XML_TOK_INVALID;
  421. }
  422. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  423. switch (BYTE_TYPE(enc, ptr)) {
  424. case BT_DIGIT:
  425. case BT_HEX:
  426. break;
  427. case BT_SEMI:
  428. *nextTokPtr = ptr + MINBPC(enc);
  429. return XML_TOK_CHAR_REF;
  430. default:
  431. *nextTokPtr = ptr;
  432. return XML_TOK_INVALID;
  433. }
  434. }
  435. }
  436. return XML_TOK_PARTIAL;
  437. }
  438. /* ptr points to character following "&#" */
  439. static int PTRCALL
  440. PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
  441. const char *end, const char **nextTokPtr)
  442. {
  443. if (ptr != end) {
  444. if (CHAR_MATCHES(enc, ptr, ASCII_x))
  445. return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  446. switch (BYTE_TYPE(enc, ptr)) {
  447. case BT_DIGIT:
  448. break;
  449. default:
  450. *nextTokPtr = ptr;
  451. return XML_TOK_INVALID;
  452. }
  453. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  454. switch (BYTE_TYPE(enc, ptr)) {
  455. case BT_DIGIT:
  456. break;
  457. case BT_SEMI:
  458. *nextTokPtr = ptr + MINBPC(enc);
  459. return XML_TOK_CHAR_REF;
  460. default:
  461. *nextTokPtr = ptr;
  462. return XML_TOK_INVALID;
  463. }
  464. }
  465. }
  466. return XML_TOK_PARTIAL;
  467. }
  468. /* ptr points to character following "&" */
  469. static int PTRCALL
  470. PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
  471. const char **nextTokPtr)
  472. {
  473. if (ptr == end)
  474. return XML_TOK_PARTIAL;
  475. switch (BYTE_TYPE(enc, ptr)) {
  476. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  477. case BT_NUM:
  478. return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  479. default:
  480. *nextTokPtr = ptr;
  481. return XML_TOK_INVALID;
  482. }
  483. while (ptr != end) {
  484. switch (BYTE_TYPE(enc, ptr)) {
  485. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  486. case BT_SEMI:
  487. *nextTokPtr = ptr + MINBPC(enc);
  488. return XML_TOK_ENTITY_REF;
  489. default:
  490. *nextTokPtr = ptr;
  491. return XML_TOK_INVALID;
  492. }
  493. }
  494. return XML_TOK_PARTIAL;
  495. }
  496. /* ptr points to character following first character of attribute name */
  497. static int PTRCALL
  498. PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
  499. const char **nextTokPtr)
  500. {
  501. #ifdef XML_NS
  502. int hadColon = 0;
  503. #endif
  504. while (ptr != end) {
  505. switch (BYTE_TYPE(enc, ptr)) {
  506. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  507. #ifdef XML_NS
  508. case BT_COLON:
  509. if (hadColon) {
  510. *nextTokPtr = ptr;
  511. return XML_TOK_INVALID;
  512. }
  513. hadColon = 1;
  514. ptr += MINBPC(enc);
  515. if (ptr == end)
  516. return XML_TOK_PARTIAL;
  517. switch (BYTE_TYPE(enc, ptr)) {
  518. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  519. default:
  520. *nextTokPtr = ptr;
  521. return XML_TOK_INVALID;
  522. }
  523. break;
  524. #endif
  525. case BT_S: case BT_CR: case BT_LF:
  526. for (;;) {
  527. int t;
  528. ptr += MINBPC(enc);
  529. if (ptr == end)
  530. return XML_TOK_PARTIAL;
  531. t = BYTE_TYPE(enc, ptr);
  532. if (t == BT_EQUALS)
  533. break;
  534. switch (t) {
  535. case BT_S:
  536. case BT_LF:
  537. case BT_CR:
  538. break;
  539. default:
  540. *nextTokPtr = ptr;
  541. return XML_TOK_INVALID;
  542. }
  543. }
  544. /* fall through */
  545. case BT_EQUALS:
  546. {
  547. int open;
  548. #ifdef XML_NS
  549. hadColon = 0;
  550. #endif
  551. for (;;) {
  552. ptr += MINBPC(enc);
  553. if (ptr == end)
  554. return XML_TOK_PARTIAL;
  555. open = BYTE_TYPE(enc, ptr);
  556. if (open == BT_QUOT || open == BT_APOS)
  557. break;
  558. switch (open) {
  559. case BT_S:
  560. case BT_LF:
  561. case BT_CR:
  562. break;
  563. default:
  564. *nextTokPtr = ptr;
  565. return XML_TOK_INVALID;
  566. }
  567. }
  568. ptr += MINBPC(enc);
  569. /* in attribute value */
  570. for (;;) {
  571. int t;
  572. if (ptr == end)
  573. return XML_TOK_PARTIAL;
  574. t = BYTE_TYPE(enc, ptr);
  575. if (t == open)
  576. break;
  577. switch (t) {
  578. INVALID_CASES(ptr, nextTokPtr)
  579. case BT_AMP:
  580. {
  581. int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
  582. if (tok <= 0) {
  583. if (tok == XML_TOK_INVALID)
  584. *nextTokPtr = ptr;
  585. return tok;
  586. }
  587. break;
  588. }
  589. case BT_LT:
  590. *nextTokPtr = ptr;
  591. return XML_TOK_INVALID;
  592. default:
  593. ptr += MINBPC(enc);
  594. break;
  595. }
  596. }
  597. ptr += MINBPC(enc);
  598. if (ptr == end)
  599. return XML_TOK_PARTIAL;
  600. switch (BYTE_TYPE(enc, ptr)) {
  601. case BT_S:
  602. case BT_CR:
  603. case BT_LF:
  604. break;
  605. case BT_SOL:
  606. goto sol;
  607. case BT_GT:
  608. goto gt;
  609. default:
  610. *nextTokPtr = ptr;
  611. return XML_TOK_INVALID;
  612. }
  613. /* ptr points to closing quote */
  614. for (;;) {
  615. ptr += MINBPC(enc);
  616. if (ptr == end)
  617. return XML_TOK_PARTIAL;
  618. switch (BYTE_TYPE(enc, ptr)) {
  619. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  620. case BT_S: case BT_CR: case BT_LF:
  621. continue;
  622. case BT_GT:
  623. gt:
  624. *nextTokPtr = ptr + MINBPC(enc);
  625. return XML_TOK_START_TAG_WITH_ATTS;
  626. case BT_SOL:
  627. sol:
  628. ptr += MINBPC(enc);
  629. if (ptr == end)
  630. return XML_TOK_PARTIAL;
  631. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  632. *nextTokPtr = ptr;
  633. return XML_TOK_INVALID;
  634. }
  635. *nextTokPtr = ptr + MINBPC(enc);
  636. return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
  637. default:
  638. *nextTokPtr = ptr;
  639. return XML_TOK_INVALID;
  640. }
  641. break;
  642. }
  643. break;
  644. }
  645. default:
  646. *nextTokPtr = ptr;
  647. return XML_TOK_INVALID;
  648. }
  649. }
  650. return XML_TOK_PARTIAL;
  651. }
  652. /* ptr points to character following "<" */
  653. static int PTRCALL
  654. PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
  655. const char **nextTokPtr)
  656. {
  657. #ifdef XML_NS
  658. int hadColon;
  659. #endif
  660. if (ptr == end)
  661. return XML_TOK_PARTIAL;
  662. switch (BYTE_TYPE(enc, ptr)) {
  663. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  664. case BT_EXCL:
  665. if ((ptr += MINBPC(enc)) == end)
  666. return XML_TOK_PARTIAL;
  667. switch (BYTE_TYPE(enc, ptr)) {
  668. case BT_MINUS:
  669. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  670. case BT_LSQB:
  671. return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
  672. end, nextTokPtr);
  673. }
  674. *nextTokPtr = ptr;
  675. return XML_TOK_INVALID;
  676. case BT_QUEST:
  677. return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  678. case BT_SOL:
  679. return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  680. default:
  681. *nextTokPtr = ptr;
  682. return XML_TOK_INVALID;
  683. }
  684. #ifdef XML_NS
  685. hadColon = 0;
  686. #endif
  687. /* we have a start-tag */
  688. while (ptr != end) {
  689. switch (BYTE_TYPE(enc, ptr)) {
  690. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  691. #ifdef XML_NS
  692. case BT_COLON:
  693. if (hadColon) {
  694. *nextTokPtr = ptr;
  695. return XML_TOK_INVALID;
  696. }
  697. hadColon = 1;
  698. ptr += MINBPC(enc);
  699. if (ptr == end)
  700. return XML_TOK_PARTIAL;
  701. switch (BYTE_TYPE(enc, ptr)) {
  702. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  703. default:
  704. *nextTokPtr = ptr;
  705. return XML_TOK_INVALID;
  706. }
  707. break;
  708. #endif
  709. case BT_S: case BT_CR: case BT_LF:
  710. {
  711. ptr += MINBPC(enc);
  712. while (ptr != end) {
  713. switch (BYTE_TYPE(enc, ptr)) {
  714. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  715. case BT_GT:
  716. goto gt;
  717. case BT_SOL:
  718. goto sol;
  719. case BT_S: case BT_CR: case BT_LF:
  720. ptr += MINBPC(enc);
  721. continue;
  722. default:
  723. *nextTokPtr = ptr;
  724. return XML_TOK_INVALID;
  725. }
  726. return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
  727. }
  728. return XML_TOK_PARTIAL;
  729. }
  730. case BT_GT:
  731. gt:
  732. *nextTokPtr = ptr + MINBPC(enc);
  733. return XML_TOK_START_TAG_NO_ATTS;
  734. case BT_SOL:
  735. sol:
  736. ptr += MINBPC(enc);
  737. if (ptr == end)
  738. return XML_TOK_PARTIAL;
  739. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  740. *nextTokPtr = ptr;
  741. return XML_TOK_INVALID;
  742. }
  743. *nextTokPtr = ptr + MINBPC(enc);
  744. return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
  745. default:
  746. *nextTokPtr = ptr;
  747. return XML_TOK_INVALID;
  748. }
  749. }
  750. return XML_TOK_PARTIAL;
  751. }
  752. static int PTRCALL
  753. PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
  754. const char **nextTokPtr)
  755. {
  756. if (ptr == end)
  757. return XML_TOK_NONE;
  758. if (MINBPC(enc) > 1) {
  759. size_t n = end - ptr;
  760. if (n & (MINBPC(enc) - 1)) {
  761. n &= ~(MINBPC(enc) - 1);
  762. if (n == 0)
  763. return XML_TOK_PARTIAL;
  764. end = ptr + n;
  765. }
  766. }
  767. switch (BYTE_TYPE(enc, ptr)) {
  768. case BT_LT:
  769. return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  770. case BT_AMP:
  771. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  772. case BT_CR:
  773. ptr += MINBPC(enc);
  774. if (ptr == end)
  775. return XML_TOK_TRAILING_CR;
  776. if (BYTE_TYPE(enc, ptr) == BT_LF)
  777. ptr += MINBPC(enc);
  778. *nextTokPtr = ptr;
  779. return XML_TOK_DATA_NEWLINE;
  780. case BT_LF:
  781. *nextTokPtr = ptr + MINBPC(enc);
  782. return XML_TOK_DATA_NEWLINE;
  783. case BT_RSQB:
  784. ptr += MINBPC(enc);
  785. if (ptr == end)
  786. return XML_TOK_TRAILING_RSQB;
  787. if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  788. break;
  789. ptr += MINBPC(enc);
  790. if (ptr == end)
  791. return XML_TOK_TRAILING_RSQB;
  792. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  793. ptr -= MINBPC(enc);
  794. break;
  795. }
  796. *nextTokPtr = ptr;
  797. return XML_TOK_INVALID;
  798. INVALID_CASES(ptr, nextTokPtr)
  799. default:
  800. ptr += MINBPC(enc);
  801. break;
  802. }
  803. while (ptr != end) {
  804. switch (BYTE_TYPE(enc, ptr)) {
  805. #define LEAD_CASE(n) \
  806. case BT_LEAD ## n: \
  807. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  808. *nextTokPtr = ptr; \
  809. return XML_TOK_DATA_CHARS; \
  810. } \
  811. ptr += n; \
  812. break;
  813. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  814. #undef LEAD_CASE
  815. case BT_RSQB:
  816. if (ptr + MINBPC(enc) != end) {
  817. if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
  818. ptr += MINBPC(enc);
  819. break;
  820. }
  821. if (ptr + 2*MINBPC(enc) != end) {
  822. if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
  823. ptr += MINBPC(enc);
  824. break;
  825. }
  826. *nextTokPtr = ptr + 2*MINBPC(enc);
  827. return XML_TOK_INVALID;
  828. }
  829. }
  830. /* fall through */
  831. case BT_AMP:
  832. case BT_LT:
  833. case BT_NONXML:
  834. case BT_MALFORM:
  835. case BT_TRAIL:
  836. case BT_CR:
  837. case BT_LF:
  838. *nextTokPtr = ptr;
  839. return XML_TOK_DATA_CHARS;
  840. default:
  841. ptr += MINBPC(enc);
  842. break;
  843. }
  844. }
  845. *nextTokPtr = ptr;
  846. return XML_TOK_DATA_CHARS;
  847. }
  848. /* ptr points to character following "%" */
  849. static int PTRCALL
  850. PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
  851. const char **nextTokPtr)
  852. {
  853. if (ptr == end)
  854. return -XML_TOK_PERCENT;
  855. switch (BYTE_TYPE(enc, ptr)) {
  856. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  857. case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
  858. *nextTokPtr = ptr;
  859. return XML_TOK_PERCENT;
  860. default:
  861. *nextTokPtr = ptr;
  862. return XML_TOK_INVALID;
  863. }
  864. while (ptr != end) {
  865. switch (BYTE_TYPE(enc, ptr)) {
  866. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  867. case BT_SEMI:
  868. *nextTokPtr = ptr + MINBPC(enc);
  869. return XML_TOK_PARAM_ENTITY_REF;
  870. default:
  871. *nextTokPtr = ptr;
  872. return XML_TOK_INVALID;
  873. }
  874. }
  875. return XML_TOK_PARTIAL;
  876. }
  877. static int PTRCALL
  878. PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
  879. const char **nextTokPtr)
  880. {
  881. if (ptr == end)
  882. return XML_TOK_PARTIAL;
  883. switch (BYTE_TYPE(enc, ptr)) {
  884. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  885. default:
  886. *nextTokPtr = ptr;
  887. return XML_TOK_INVALID;
  888. }
  889. while (ptr != end) {
  890. switch (BYTE_TYPE(enc, ptr)) {
  891. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  892. case BT_CR: case BT_LF: case BT_S:
  893. case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
  894. *nextTokPtr = ptr;
  895. return XML_TOK_POUND_NAME;
  896. default:
  897. *nextTokPtr = ptr;
  898. return XML_TOK_INVALID;
  899. }
  900. }
  901. return -XML_TOK_POUND_NAME;
  902. }
  903. static int PTRCALL
  904. PREFIX(scanLit)(int open, const ENCODING *enc,
  905. const char *ptr, const char *end,
  906. const char **nextTokPtr)
  907. {
  908. while (ptr != end) {
  909. int t = BYTE_TYPE(enc, ptr);
  910. switch (t) {
  911. INVALID_CASES(ptr, nextTokPtr)
  912. case BT_QUOT:
  913. case BT_APOS:
  914. ptr += MINBPC(enc);
  915. if (t != open)
  916. break;
  917. if (ptr == end)
  918. return -XML_TOK_LITERAL;
  919. *nextTokPtr = ptr;
  920. switch (BYTE_TYPE(enc, ptr)) {
  921. case BT_S: case BT_CR: case BT_LF:
  922. case BT_GT: case BT_PERCNT: case BT_LSQB:
  923. return XML_TOK_LITERAL;
  924. default:
  925. return XML_TOK_INVALID;
  926. }
  927. default:
  928. ptr += MINBPC(enc);
  929. break;
  930. }
  931. }
  932. return XML_TOK_PARTIAL;
  933. }
  934. static int PTRCALL
  935. PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
  936. const char **nextTokPtr)
  937. {
  938. int tok;
  939. if (ptr == end)
  940. return XML_TOK_NONE;
  941. if (MINBPC(enc) > 1) {
  942. size_t n = end - ptr;
  943. if (n & (MINBPC(enc) - 1)) {
  944. n &= ~(MINBPC(enc) - 1);
  945. if (n == 0)
  946. return XML_TOK_PARTIAL;
  947. end = ptr + n;
  948. }
  949. }
  950. switch (BYTE_TYPE(enc, ptr)) {
  951. case BT_QUOT:
  952. return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  953. case BT_APOS:
  954. return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  955. case BT_LT:
  956. {
  957. ptr += MINBPC(enc);
  958. if (ptr == end)
  959. return XML_TOK_PARTIAL;
  960. switch (BYTE_TYPE(enc, ptr)) {
  961. case BT_EXCL:
  962. return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  963. case BT_QUEST:
  964. return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  965. case BT_NMSTRT:
  966. case BT_HEX:
  967. case BT_NONASCII:
  968. case BT_LEAD2:
  969. case BT_LEAD3:
  970. case BT_LEAD4:
  971. *nextTokPtr = ptr - MINBPC(enc);
  972. return XML_TOK_INSTANCE_START;
  973. }
  974. *nextTokPtr = ptr;
  975. return XML_TOK_INVALID;
  976. }
  977. case BT_CR:
  978. if (ptr + MINBPC(enc) == end) {
  979. *nextTokPtr = end;
  980. /* indicate that this might be part of a CR/LF pair */
  981. return -XML_TOK_PROLOG_S;
  982. }
  983. /* fall through */
  984. case BT_S: case BT_LF:
  985. for (;;) {
  986. ptr += MINBPC(enc);
  987. if (ptr == end)
  988. break;
  989. switch (BYTE_TYPE(enc, ptr)) {
  990. case BT_S: case BT_LF:
  991. break;
  992. case BT_CR:
  993. /* don't split CR/LF pair */
  994. if (ptr + MINBPC(enc) != end)
  995. break;
  996. /* fall through */
  997. default:
  998. *nextTokPtr = ptr;
  999. return XML_TOK_PROLOG_S;
  1000. }
  1001. }
  1002. *nextTokPtr = ptr;
  1003. return XML_TOK_PROLOG_S;
  1004. case BT_PERCNT:
  1005. return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1006. case BT_COMMA:
  1007. *nextTokPtr = ptr + MINBPC(enc);
  1008. return XML_TOK_COMMA;
  1009. case BT_LSQB:
  1010. *nextTokPtr = ptr + MINBPC(enc);
  1011. return XML_TOK_OPEN_BRACKET;
  1012. case BT_RSQB:
  1013. ptr += MINBPC(enc);
  1014. if (ptr == end)
  1015. return -XML_TOK_CLOSE_BRACKET;
  1016. if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
  1017. if (ptr + MINBPC(enc) == end)
  1018. return XML_TOK_PARTIAL;
  1019. if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
  1020. *nextTokPtr = ptr + 2*MINBPC(enc);
  1021. return XML_TOK_COND_SECT_CLOSE;
  1022. }
  1023. }
  1024. *nextTokPtr = ptr;
  1025. return XML_TOK_CLOSE_BRACKET;
  1026. case BT_LPAR:
  1027. *nextTokPtr = ptr + MINBPC(enc);
  1028. return XML_TOK_OPEN_PAREN;
  1029. case BT_RPAR:
  1030. ptr += MINBPC(enc);
  1031. if (ptr == end)
  1032. return -XML_TOK_CLOSE_PAREN;
  1033. switch (BYTE_TYPE(enc, ptr)) {
  1034. case BT_AST:
  1035. *nextTokPtr = ptr + MINBPC(enc);
  1036. return XML_TOK_CLOSE_PAREN_ASTERISK;
  1037. case BT_QUEST:
  1038. *nextTokPtr = ptr + MINBPC(enc);
  1039. return XML_TOK_CLOSE_PAREN_QUESTION;
  1040. case BT_PLUS:
  1041. *nextTokPtr = ptr + MINBPC(enc);
  1042. return XML_TOK_CLOSE_PAREN_PLUS;
  1043. case BT_CR: case BT_LF: case BT_S:
  1044. case BT_GT: case BT_COMMA: case BT_VERBAR:
  1045. case BT_RPAR:
  1046. *nextTokPtr = ptr;
  1047. return XML_TOK_CLOSE_PAREN;
  1048. }
  1049. *nextTokPtr = ptr;
  1050. return XML_TOK_INVALID;
  1051. case BT_VERBAR:
  1052. *nextTokPtr = ptr + MINBPC(enc);
  1053. return XML_TOK_OR;
  1054. case BT_GT:
  1055. *nextTokPtr = ptr + MINBPC(enc);
  1056. return XML_TOK_DECL_CLOSE;
  1057. case BT_NUM:
  1058. return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1059. #define LEAD_CASE(n) \
  1060. case BT_LEAD ## n: \
  1061. if (end - ptr < n) \
  1062. return XML_TOK_PARTIAL_CHAR; \
  1063. if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
  1064. ptr += n; \
  1065. tok = XML_TOK_NAME; \
  1066. break; \
  1067. } \
  1068. if (IS_NAME_CHAR(enc, ptr, n)) { \
  1069. ptr += n; \
  1070. tok = XML_TOK_NMTOKEN; \
  1071. break; \
  1072. } \
  1073. *nextTokPtr = ptr; \
  1074. return XML_TOK_INVALID;
  1075. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1076. #undef LEAD_CASE
  1077. case BT_NMSTRT:
  1078. case BT_HEX:
  1079. tok = XML_TOK_NAME;
  1080. ptr += MINBPC(enc);
  1081. break;
  1082. case BT_DIGIT:
  1083. case BT_NAME:
  1084. case BT_MINUS:
  1085. #ifdef XML_NS
  1086. case BT_COLON:
  1087. #endif
  1088. tok = XML_TOK_NMTOKEN;
  1089. ptr += MINBPC(enc);
  1090. break;
  1091. case BT_NONASCII:
  1092. if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
  1093. ptr += MINBPC(enc);
  1094. tok = XML_TOK_NAME;
  1095. break;
  1096. }
  1097. if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
  1098. ptr += MINBPC(enc);
  1099. tok = XML_TOK_NMTOKEN;
  1100. break;
  1101. }
  1102. /* fall through */
  1103. default:
  1104. *nextTokPtr = ptr;
  1105. return XML_TOK_INVALID;
  1106. }
  1107. while (ptr != end) {
  1108. switch (BYTE_TYPE(enc, ptr)) {
  1109. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1110. case BT_GT: case BT_RPAR: case BT_COMMA:
  1111. case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
  1112. case BT_S: case BT_CR: case BT_LF:
  1113. *nextTokPtr = ptr;
  1114. return tok;
  1115. #ifdef XML_NS
  1116. case BT_COLON:
  1117. ptr += MINBPC(enc);
  1118. switch (tok) {
  1119. case XML_TOK_NAME:
  1120. if (ptr == end)
  1121. return XML_TOK_PARTIAL;
  1122. tok = XML_TOK_PREFIXED_NAME;
  1123. switch (BYTE_TYPE(enc, ptr)) {
  1124. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1125. default:
  1126. tok = XML_TOK_NMTOKEN;
  1127. break;
  1128. }
  1129. break;
  1130. case XML_TOK_PREFIXED_NAME:
  1131. tok = XML_TOK_NMTOKEN;
  1132. break;
  1133. }
  1134. break;
  1135. #endif
  1136. case BT_PLUS:
  1137. if (tok == XML_TOK_NMTOKEN) {
  1138. *nextTokPtr = ptr;
  1139. return XML_TOK_INVALID;
  1140. }
  1141. *nextTokPtr = ptr + MINBPC(enc);
  1142. return XML_TOK_NAME_PLUS;
  1143. case BT_AST:
  1144. if (tok == XML_TOK_NMTOKEN) {
  1145. *nextTokPtr = ptr;
  1146. return XML_TOK_INVALID;
  1147. }
  1148. *nextTokPtr = ptr + MINBPC(enc);
  1149. return XML_TOK_NAME_ASTERISK;
  1150. case BT_QUEST:
  1151. if (tok == XML_TOK_NMTOKEN) {
  1152. *nextTokPtr = ptr;
  1153. return XML_TOK_INVALID;
  1154. }
  1155. *nextTokPtr = ptr + MINBPC(enc);
  1156. return XML_TOK_NAME_QUESTION;
  1157. default:
  1158. *nextTokPtr = ptr;
  1159. return XML_TOK_INVALID;
  1160. }
  1161. }
  1162. return -tok;
  1163. }
  1164. static int PTRCALL
  1165. PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
  1166. const char *end, const char **nextTokPtr)
  1167. {
  1168. const char *start;
  1169. if (ptr == end)
  1170. return XML_TOK_NONE;
  1171. start = ptr;
  1172. while (ptr != end) {
  1173. switch (BYTE_TYPE(enc, ptr)) {
  1174. #define LEAD_CASE(n) \
  1175. case BT_LEAD ## n: ptr += n; break;
  1176. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1177. #undef LEAD_CASE
  1178. case BT_AMP:
  1179. if (ptr == start)
  1180. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1181. *nextTokPtr = ptr;
  1182. return XML_TOK_DATA_CHARS;
  1183. case BT_LT:
  1184. /* this is for inside entity references */
  1185. *nextTokPtr = ptr;
  1186. return XML_TOK_INVALID;
  1187. case BT_LF:
  1188. if (ptr == start) {
  1189. *nextTokPtr = ptr + MINBPC(enc);
  1190. return XML_TOK_DATA_NEWLINE;
  1191. }
  1192. *nextTokPtr = ptr;
  1193. return XML_TOK_DATA_CHARS;
  1194. case BT_CR:
  1195. if (ptr == start) {
  1196. ptr += MINBPC(enc);
  1197. if (ptr == end)
  1198. return XML_TOK_TRAILING_CR;
  1199. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1200. ptr += MINBPC(enc);
  1201. *nextTokPtr = ptr;
  1202. return XML_TOK_DATA_NEWLINE;
  1203. }
  1204. *nextTokPtr = ptr;
  1205. return XML_TOK_DATA_CHARS;
  1206. case BT_S:
  1207. if (ptr == start) {
  1208. *nextTokPtr = ptr + MINBPC(enc);
  1209. return XML_TOK_ATTRIBUTE_VALUE_S;
  1210. }
  1211. *nextTokPtr = ptr;
  1212. return XML_TOK_DATA_CHARS;
  1213. default:
  1214. ptr += MINBPC(enc);
  1215. break;
  1216. }
  1217. }
  1218. *nextTokPtr = ptr;
  1219. return XML_TOK_DATA_CHARS;
  1220. }
  1221. static int PTRCALL
  1222. PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
  1223. const char *end, const char **nextTokPtr)
  1224. {
  1225. const char *start;
  1226. if (ptr == end)
  1227. return XML_TOK_NONE;
  1228. start = ptr;
  1229. while (ptr != end) {
  1230. switch (BYTE_TYPE(enc, ptr)) {
  1231. #define LEAD_CASE(n) \
  1232. case BT_LEAD ## n: ptr += n; break;
  1233. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1234. #undef LEAD_CASE
  1235. case BT_AMP:
  1236. if (ptr == start)
  1237. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1238. *nextTokPtr = ptr;
  1239. return XML_TOK_DATA_CHARS;
  1240. case BT_PERCNT:
  1241. if (ptr == start) {
  1242. int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
  1243. end, nextTokPtr);
  1244. return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
  1245. }
  1246. *nextTokPtr = ptr;
  1247. return XML_TOK_DATA_CHARS;
  1248. case BT_LF:
  1249. if (ptr == start) {
  1250. *nextTokPtr = ptr + MINBPC(enc);
  1251. return XML_TOK_DATA_NEWLINE;
  1252. }
  1253. *nextTokPtr = ptr;
  1254. return XML_TOK_DATA_CHARS;
  1255. case BT_CR:
  1256. if (ptr == start) {
  1257. ptr += MINBPC(enc);
  1258. if (ptr == end)
  1259. return XML_TOK_TRAILING_CR;
  1260. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1261. ptr += MINBPC(enc);
  1262. *nextTokPtr = ptr;
  1263. return XML_TOK_DATA_NEWLINE;
  1264. }
  1265. *nextTokPtr = ptr;
  1266. return XML_TOK_DATA_CHARS;
  1267. default:
  1268. ptr += MINBPC(enc);
  1269. break;
  1270. }
  1271. }
  1272. *nextTokPtr = ptr;
  1273. return XML_TOK_DATA_CHARS;
  1274. }
  1275. #ifdef XML_DTD
  1276. static int PTRCALL
  1277. PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
  1278. const char *end, const char **nextTokPtr)
  1279. {
  1280. int level = 0;
  1281. if (MINBPC(enc) > 1) {
  1282. size_t n = end - ptr;
  1283. if (n & (MINBPC(enc) - 1)) {
  1284. n &= ~(MINBPC(enc) - 1);
  1285. end = ptr + n;
  1286. }
  1287. }
  1288. while (ptr != end) {
  1289. switch (BYTE_TYPE(enc, ptr)) {
  1290. INVALID_CASES(ptr, nextTokPtr)
  1291. case BT_LT:
  1292. if ((ptr += MINBPC(enc)) == end)
  1293. return XML_TOK_PARTIAL;
  1294. if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
  1295. if ((ptr += MINBPC(enc)) == end)
  1296. return XML_TOK_PARTIAL;
  1297. if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
  1298. ++level;
  1299. ptr += MINBPC(enc);
  1300. }
  1301. }
  1302. break;
  1303. case BT_RSQB:
  1304. if ((ptr += MINBPC(enc)) == end)
  1305. return XML_TOK_PARTIAL;
  1306. if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
  1307. if ((ptr += MINBPC(enc)) == end)
  1308. return XML_TOK_PARTIAL;
  1309. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  1310. ptr += MINBPC(enc);
  1311. if (level == 0) {
  1312. *nextTokPtr = ptr;
  1313. return XML_TOK_IGNORE_SECT;
  1314. }
  1315. --level;
  1316. }
  1317. }
  1318. break;
  1319. default:
  1320. ptr += MINBPC(enc);
  1321. break;
  1322. }
  1323. }
  1324. return XML_TOK_PARTIAL;
  1325. }
  1326. #endif /* XML_DTD */
  1327. static int PTRCALL
  1328. PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
  1329. const char **badPtr)
  1330. {
  1331. ptr += MINBPC(enc);
  1332. end -= MINBPC(enc);
  1333. for (; ptr != end; ptr += MINBPC(enc)) {
  1334. switch (BYTE_TYPE(enc, ptr)) {
  1335. case BT_DIGIT:
  1336. case BT_HEX:
  1337. case BT_MINUS:
  1338. case BT_APOS:
  1339. case BT_LPAR:
  1340. case BT_RPAR:
  1341. case BT_PLUS:
  1342. case BT_COMMA:
  1343. case BT_SOL:
  1344. case BT_EQUALS:
  1345. case BT_QUEST:
  1346. case BT_CR:
  1347. case BT_LF:
  1348. case BT_SEMI:
  1349. case BT_EXCL:
  1350. case BT_AST:
  1351. case BT_PERCNT:
  1352. case BT_NUM:
  1353. #ifdef XML_NS
  1354. case BT_COLON:
  1355. #endif
  1356. break;
  1357. case BT_S:
  1358. if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
  1359. *badPtr = ptr;
  1360. return 0;
  1361. }
  1362. break;
  1363. case BT_NAME:
  1364. case BT_NMSTRT:
  1365. if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
  1366. break;
  1367. default:
  1368. switch (BYTE_TO_ASCII(enc, ptr)) {
  1369. case 0x24: /* $ */
  1370. case 0x40: /* @ */
  1371. break;
  1372. default:
  1373. *badPtr = ptr;
  1374. return 0;
  1375. }
  1376. break;
  1377. }
  1378. }
  1379. return 1;
  1380. }
  1381. /* This must only be called for a well-formed start-tag or empty
  1382. element tag. Returns the number of attributes. Pointers to the
  1383. first attsMax attributes are stored in atts.
  1384. */
  1385. static int PTRCALL
  1386. PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
  1387. int attsMax, ATTRIBUTE *atts)
  1388. {
  1389. enum { other, inName, inValue } state = inName;
  1390. int nAtts = 0;
  1391. int open = 0; /* defined when state == inValue;
  1392. initialization just to shut up compilers */
  1393. for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
  1394. switch (BYTE_TYPE(enc, ptr)) {
  1395. #define START_NAME \
  1396. if (state == other) { \
  1397. if (nAtts < attsMax) { \
  1398. atts[nAtts].name = ptr; \
  1399. atts[nAtts].normalized = 1; \
  1400. } \
  1401. state = inName; \
  1402. }
  1403. #define LEAD_CASE(n) \
  1404. case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
  1405. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1406. #undef LEAD_CASE
  1407. case BT_NONASCII:
  1408. case BT_NMSTRT:
  1409. case BT_HEX:
  1410. START_NAME
  1411. break;
  1412. #undef START_NAME
  1413. case BT_QUOT:
  1414. if (state != inValue) {
  1415. if (nAtts < attsMax)
  1416. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1417. state = inValue;
  1418. open = BT_QUOT;
  1419. }
  1420. else if (open == BT_QUOT) {
  1421. state = other;
  1422. if (nAtts < attsMax)
  1423. atts[nAtts].valueEnd = ptr;
  1424. nAtts++;
  1425. }
  1426. break;
  1427. case BT_APOS:
  1428. if (state != inValue) {
  1429. if (nAtts < attsMax)
  1430. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1431. state = inValue;
  1432. open = BT_APOS;
  1433. }
  1434. else if (open == BT_APOS) {
  1435. state = other;
  1436. if (nAtts < attsMax)
  1437. atts[nAtts].valueEnd = ptr;
  1438. nAtts++;
  1439. }
  1440. break;
  1441. case BT_AMP:
  1442. if (nAtts < attsMax)
  1443. atts[nAtts].normalized = 0;
  1444. break;
  1445. case BT_S:
  1446. if (state == inName)
  1447. state = other;
  1448. else if (state == inValue
  1449. && nAtts < attsMax
  1450. && atts[nAtts].normalized
  1451. && (ptr == atts[nAtts].valuePtr
  1452. || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
  1453. || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
  1454. || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
  1455. atts[nAtts].normalized = 0;
  1456. break;
  1457. case BT_CR: case BT_LF:
  1458. /* This case ensures that the first attribute name is counted
  1459. Apart from that we could just change state on the quote. */
  1460. if (state == inName)
  1461. state = other;
  1462. else if (state == inValue && nAtts < attsMax)
  1463. atts[nAtts].normalized = 0;
  1464. break;
  1465. case BT_GT:
  1466. case BT_SOL:
  1467. if (state != inValue)
  1468. return nAtts;
  1469. break;
  1470. default:
  1471. break;
  1472. }
  1473. }
  1474. /* not reached */
  1475. }
  1476. static int PTRFASTCALL
  1477. PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
  1478. {
  1479. int result = 0;
  1480. /* skip &# */
  1481. ptr += 2*MINBPC(enc);
  1482. if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
  1483. for (ptr += MINBPC(enc);
  1484. !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
  1485. ptr += MINBPC(enc)) {
  1486. int c = BYTE_TO_ASCII(enc, ptr);
  1487. switch (c) {
  1488. case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
  1489. case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
  1490. result <<= 4;
  1491. result |= (c - ASCII_0);
  1492. break;
  1493. case ASCII_A: case ASCII_B: case ASCII_C:
  1494. case ASCII_D: case ASCII_E: case ASCII_F:
  1495. result <<= 4;
  1496. result += 10 + (c - ASCII_A);
  1497. break;
  1498. case ASCII_a: case ASCII_b: case ASCII_c:
  1499. case ASCII_d: case ASCII_e: case ASCII_f:
  1500. result <<= 4;
  1501. result += 10 + (c - ASCII_a);
  1502. break;
  1503. }
  1504. if (result >= 0x110000)
  1505. return -1;
  1506. }
  1507. }
  1508. else {
  1509. for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
  1510. int c = BYTE_TO_ASCII(enc, ptr);
  1511. result *= 10;
  1512. result += (c - ASCII_0);
  1513. if (result >= 0x110000)
  1514. return -1;
  1515. }
  1516. }
  1517. return checkCharRefNumber(result);
  1518. }
  1519. static int PTRCALL
  1520. PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
  1521. const char *end)
  1522. {
  1523. switch ((end - ptr)/MINBPC(enc)) {
  1524. case 2:
  1525. if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
  1526. switch (BYTE_TO_ASCII(enc, ptr)) {
  1527. case ASCII_l:
  1528. return ASCII_LT;
  1529. case ASCII_g:
  1530. return ASCII_GT;
  1531. }
  1532. }
  1533. break;
  1534. case 3:
  1535. if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
  1536. ptr += MINBPC(enc);
  1537. if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
  1538. ptr += MINBPC(enc);
  1539. if (CHAR_MATCHES(enc, ptr, ASCII_p))
  1540. return ASCII_AMP;
  1541. }
  1542. }
  1543. break;
  1544. case 4:
  1545. switch (BYTE_TO_ASCII(enc, ptr)) {
  1546. case ASCII_q:
  1547. ptr += MINBPC(enc);
  1548. if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
  1549. ptr += MINBPC(enc);
  1550. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1551. ptr += MINBPC(enc);
  1552. if (CHAR_MATCHES(enc, ptr, ASCII_t))
  1553. return ASCII_QUOT;
  1554. }
  1555. }
  1556. break;
  1557. case ASCII_a:
  1558. ptr += MINBPC(enc);
  1559. if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
  1560. ptr += MINBPC(enc);
  1561. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1562. ptr += MINBPC(enc);
  1563. if (CHAR_MATCHES(enc, ptr, ASCII_s))
  1564. return ASCII_APOS;
  1565. }
  1566. }
  1567. break;
  1568. }
  1569. }
  1570. return 0;
  1571. }
  1572. static int PTRCALL
  1573. PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
  1574. {
  1575. for (;;) {
  1576. switch (BYTE_TYPE(enc, ptr1)) {
  1577. #define LEAD_CASE(n) \
  1578. case BT_LEAD ## n: \
  1579. if (*ptr1++ != *ptr2++) \
  1580. return 0;
  1581. LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
  1582. #undef LEAD_CASE
  1583. /* fall through */
  1584. if (*ptr1++ != *ptr2++)
  1585. return 0;
  1586. break;
  1587. case BT_NONASCII:
  1588. case BT_NMSTRT:
  1589. #ifdef XML_NS
  1590. case BT_COLON:
  1591. #endif
  1592. case BT_HEX:
  1593. case BT_DIGIT:
  1594. case BT_NAME:
  1595. case BT_MINUS:
  1596. if (*ptr2++ != *ptr1++)
  1597. return 0;
  1598. if (MINBPC(enc) > 1) {
  1599. if (*ptr2++ != *ptr1++)
  1600. return 0;
  1601. if (MINBPC(enc) > 2) {
  1602. if (*ptr2++ != *ptr1++)
  1603. return 0;
  1604. if (MINBPC(enc) > 3) {
  1605. if (*ptr2++ != *ptr1++)
  1606. return 0;
  1607. }
  1608. }
  1609. }
  1610. break;
  1611. default:
  1612. if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
  1613. return 1;
  1614. switch (BYTE_TYPE(enc, ptr2)) {
  1615. case BT_LEAD2:
  1616. case BT_LEAD3:
  1617. case BT_LEAD4:
  1618. case BT_NONASCII:
  1619. case BT_NMSTRT:
  1620. #ifdef XML_NS
  1621. case BT_COLON:
  1622. #endif
  1623. case BT_HEX:
  1624. case BT_DIGIT:
  1625. case BT_NAME:
  1626. case BT_MINUS:
  1627. return 0;
  1628. default:
  1629. return 1;
  1630. }
  1631. }
  1632. }
  1633. /* not reached */
  1634. }
  1635. static int PTRCALL
  1636. PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
  1637. const char *end1, const char *ptr2)
  1638. {
  1639. for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
  1640. if (ptr1 == end1)
  1641. return 0;
  1642. if (!CHAR_MATCHES(enc, ptr1, *ptr2))
  1643. return 0;
  1644. }
  1645. return ptr1 == end1;
  1646. }
  1647. static int PTRFASTCALL
  1648. PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
  1649. {
  1650. const char *start = ptr;
  1651. for (;;) {
  1652. switch (BYTE_TYPE(enc, ptr)) {
  1653. #define LEAD_CASE(n) \
  1654. case BT_LEAD ## n: ptr += n; break;
  1655. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1656. #undef LEAD_CASE
  1657. case BT_NONASCII:
  1658. case BT_NMSTRT:
  1659. #ifdef XML_NS
  1660. case BT_COLON:
  1661. #endif
  1662. case BT_HEX:
  1663. case BT_DIGIT:
  1664. case BT_NAME:
  1665. case BT_MINUS:
  1666. ptr += MINBPC(enc);
  1667. break;
  1668. default:
  1669. return (int)(ptr - start);
  1670. }
  1671. }
  1672. }
  1673. static const char * PTRFASTCALL
  1674. PREFIX(skipS)(const ENCODING *enc, const char *ptr)
  1675. {
  1676. for (;;) {
  1677. switch (BYTE_TYPE(enc, ptr)) {
  1678. case BT_LF:
  1679. case BT_CR:
  1680. case BT_S:
  1681. ptr += MINBPC(enc);
  1682. break;
  1683. default:
  1684. return ptr;
  1685. }
  1686. }
  1687. }
  1688. static void PTRCALL
  1689. PREFIX(updatePosition)(const ENCODING *enc,
  1690. const char *ptr,
  1691. const char *end,
  1692. POSITION *pos)
  1693. {
  1694. while (ptr != end) {
  1695. switch (BYTE_TYPE(enc, ptr)) {
  1696. #define LEAD_CASE(n) \
  1697. case BT_LEAD ## n: \
  1698. ptr += n; \
  1699. break;
  1700. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1701. #undef LEAD_CASE
  1702. case BT_LF:
  1703. pos->columnNumber = (XML_Size)-1;
  1704. pos->lineNumber++;
  1705. ptr += MINBPC(enc);
  1706. break;
  1707. case BT_CR:
  1708. pos->lineNumber++;
  1709. ptr += MINBPC(enc);
  1710. if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
  1711. ptr += MINBPC(enc);
  1712. pos->columnNumber = (XML_Size)-1;
  1713. break;
  1714. default:
  1715. ptr += MINBPC(enc);
  1716. break;
  1717. }
  1718. pos->columnNumber++;
  1719. }
  1720. }
  1721. #undef DO_LEAD_CASE
  1722. #undef MULTIBYTE_CASES
  1723. #undef INVALID_CASES
  1724. #undef CHECK_NAME_CASE
  1725. #undef CHECK_NAME_CASES
  1726. #undef CHECK_NMSTRT_CASE
  1727. #undef CHECK_NMSTRT_CASES
  1728. #endif /* XML_TOK_IMPL_C */