parsexml.nim 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2010 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
  10. ## parser.
  11. ## The only encoding that is supported is UTF-8. The parser has been designed
  12. ## to be somewhat error correcting, so that even most "wild HTML" found on the
  13. ## web can be parsed with it. **Note:** This parser does not check that each
  14. ## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
  15. ## implemented by the client code for various reasons:
  16. ##
  17. ## * Old HTML contains tags that have no end tag: ``<br>`` for example.
  18. ## * HTML tags are case insensitive, XML tags are case sensitive. Since this
  19. ## library can parse both, only the client knows which comparison is to be
  20. ## used.
  21. ## * Thus the checks would have been very difficult to implement properly with
  22. ## little benefit, especially since they are simple to implement in the
  23. ## client. The client should use the `errorMsgExpected` proc to generate
  24. ## a nice error message that fits the other error messages this library
  25. ## creates.
  26. ##
  27. ##
  28. ##[
  29. Example 1: Retrieve HTML title
  30. ==============================
  31. The file ``examples/htmltitle.nim`` demonstrates how to use the
  32. XML parser to accomplish a simple task: To determine the title of an HTML
  33. document.
  34. .. code-block:: nim
  35. # Example program to show the parsexml module
  36. # This program reads an HTML file and writes its title to stdout.
  37. # Errors and whitespace are ignored.
  38. import os, streams, parsexml, strutils
  39. if paramCount() < 1:
  40. quit("Usage: htmltitle filename[.html]")
  41. var filename = addFileExt(paramStr(1), "html")
  42. var s = newFileStream(filename, fmRead)
  43. if s == nil: quit("cannot open the file " & filename)
  44. var x: XmlParser
  45. open(x, s, filename)
  46. while true:
  47. x.next()
  48. case x.kind
  49. of xmlElementStart:
  50. if cmpIgnoreCase(x.elementName, "title") == 0:
  51. var title = ""
  52. x.next() # skip "<title>"
  53. while x.kind == xmlCharData:
  54. title.add(x.charData)
  55. x.next()
  56. if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0:
  57. echo("Title: " & title)
  58. quit(0) # Success!
  59. else:
  60. echo(x.errorMsgExpected("/title"))
  61. of xmlEof: break # end of file reached
  62. else: discard # ignore other events
  63. x.close()
  64. quit("Could not determine title!")
  65. ]##
  66. ##[
  67. Example 2: Retrieve all HTML links
  68. ==================================
  69. The file ``examples/htmlrefs.nim`` demonstrates how to use the
  70. XML parser to accomplish another simple task: To determine all the links
  71. an HTML document contains.
  72. .. code-block:: nim
  73. # Example program to show the new parsexml module
  74. # This program reads an HTML file and writes all its used links to stdout.
  75. # Errors and whitespace are ignored.
  76. import os, streams, parsexml, strutils
  77. proc `=?=` (a, b: string): bool =
  78. # little trick: define our own comparator that ignores case
  79. return cmpIgnoreCase(a, b) == 0
  80. if paramCount() < 1:
  81. quit("Usage: htmlrefs filename[.html]")
  82. var links = 0 # count the number of links
  83. var filename = addFileExt(paramStr(1), "html")
  84. var s = newFileStream(filename, fmRead)
  85. if s == nil: quit("cannot open the file " & filename)
  86. var x: XmlParser
  87. open(x, s, filename)
  88. next(x) # get first event
  89. block mainLoop:
  90. while true:
  91. case x.kind
  92. of xmlElementOpen:
  93. # the <a href = "xyz"> tag we are interested in always has an attribute,
  94. # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
  95. if x.elementName =?= "a":
  96. x.next()
  97. if x.kind == xmlAttribute:
  98. if x.attrKey =?= "href":
  99. var link = x.attrValue
  100. inc(links)
  101. # skip until we have an ``xmlElementClose`` event
  102. while true:
  103. x.next()
  104. case x.kind
  105. of xmlEof: break mainLoop
  106. of xmlElementClose: break
  107. else: discard
  108. x.next() # skip ``xmlElementClose``
  109. # now we have the description for the ``a`` element
  110. var desc = ""
  111. while x.kind == xmlCharData:
  112. desc.add(x.charData)
  113. x.next()
  114. echo(desc & ": " & link)
  115. else:
  116. x.next()
  117. of xmlEof: break # end of file reached
  118. of xmlError:
  119. echo(errorMsg(x))
  120. x.next()
  121. else: x.next() # skip other events
  122. echo($links & " link(s) found!")
  123. x.close()
  124. ]##
  125. import
  126. strutils, lexbase, streams, unicode
  127. # the parser treats ``<br />`` as ``<br></br>``
  128. # xmlElementCloseEnd, ## ``/>``
  129. type
  130. XmlEventKind* = enum ## enumeration of all events that may occur when parsing
  131. xmlError, ## an error occurred during parsing
  132. xmlEof, ## end of file reached
  133. xmlCharData, ## character data
  134. xmlWhitespace, ## whitespace has been parsed
  135. xmlComment, ## a comment has been parsed
  136. xmlPI, ## processing instruction (``<?name something ?>``)
  137. xmlElementStart, ## ``<elem>``
  138. xmlElementEnd, ## ``</elem>``
  139. xmlElementOpen, ## ``<elem
  140. xmlAttribute, ## ``key = "value"`` pair
  141. xmlElementClose, ## ``>``
  142. xmlCData, ## ``<![CDATA[`` ... data ... ``]]>``
  143. xmlEntity, ## &entity;
  144. xmlSpecial ## ``<! ... data ... >``
  145. XmlErrorKind* = enum ## enumeration that lists all errors that can occur
  146. errNone, ## no error
  147. errEndOfCDataExpected, ## ``]]>`` expected
  148. errNameExpected, ## name expected
  149. errSemicolonExpected, ## ``;`` expected
  150. errQmGtExpected, ## ``?>`` expected
  151. errGtExpected, ## ``>`` expected
  152. errEqExpected, ## ``=`` expected
  153. errQuoteExpected, ## ``"`` or ``'`` expected
  154. errEndOfCommentExpected ## ``-->`` expected
  155. errAttributeValueExpected ## non-empty attribute value expected
  156. ParserState = enum
  157. stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError
  158. XmlParseOption* = enum ## options for the XML parser
  159. reportWhitespace, ## report whitespace
  160. reportComments ## report comments
  161. allowUnquotedAttribs ## allow unquoted attribute values (for HTML)
  162. allowEmptyAttribs ## allow empty attributes (without explicit value)
  163. XmlParser* = object of BaseLexer ## the parser object.
  164. a, b, c: string
  165. kind: XmlEventKind
  166. err: XmlErrorKind
  167. state: ParserState
  168. cIsEmpty: bool
  169. filename: string
  170. options: set[XmlParseOption]
  171. const
  172. errorMessages: array[XmlErrorKind, string] = [
  173. "no error",
  174. "']]>' expected",
  175. "name expected",
  176. "';' expected",
  177. "'?>' expected",
  178. "'>' expected",
  179. "'=' expected",
  180. "'\"' or \"'\" expected",
  181. "'-->' expected",
  182. "attribute value expected"
  183. ]
  184. proc open*(my: var XmlParser, input: Stream, filename: string,
  185. options: set[XmlParseOption] = {}) =
  186. ## initializes the parser with an input stream. `Filename` is only used
  187. ## for nice error messages. The parser's behaviour can be controlled by
  188. ## the `options` parameter: If `options` contains ``reportWhitespace``
  189. ## a whitespace token is reported as an ``xmlWhitespace`` event.
  190. ## If `options` contains ``reportComments`` a comment token is reported as an
  191. ## ``xmlComment`` event.
  192. lexbase.open(my, input, 8192, {'\c', '\L', '/'})
  193. my.filename = filename
  194. my.state = stateStart
  195. my.kind = xmlError
  196. my.a = ""
  197. my.b = ""
  198. my.c = ""
  199. my.cIsEmpty = true
  200. my.options = options
  201. proc close*(my: var XmlParser) {.inline.} =
  202. ## closes the parser `my` and its associated input stream.
  203. lexbase.close(my)
  204. proc kind*(my: XmlParser): XmlEventKind {.inline.} =
  205. ## returns the current event type for the XML parser
  206. return my.kind
  207. template charData*(my: XmlParser): string =
  208. ## returns the character data for the events: ``xmlCharData``,
  209. ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
  210. ## Raises an assertion in debug mode if ``my.kind`` is not one
  211. ## of those events. In release mode, this will not trigger an error
  212. ## but the value returned will not be valid.
  213. assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
  214. xmlSpecial})
  215. my.a
  216. template elementName*(my: XmlParser): string =
  217. ## returns the element name for the events: ``xmlElementStart``,
  218. ## ``xmlElementEnd``, ``xmlElementOpen``
  219. ## Raises an assertion in debug mode if ``my.kind`` is not one
  220. ## of those events. In release mode, this will not trigger an error
  221. ## but the value returned will not be valid.
  222. assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
  223. my.a
  224. template entityName*(my: XmlParser): string =
  225. ## returns the entity name for the event: ``xmlEntity``
  226. ## Raises an assertion in debug mode if ``my.kind`` is not
  227. ## ``xmlEntity``. In release mode, this will not trigger an error
  228. ## but the value returned will not be valid.
  229. assert(my.kind == xmlEntity)
  230. my.a
  231. template attrKey*(my: XmlParser): string =
  232. ## returns the attribute key for the event ``xmlAttribute``
  233. ## Raises an assertion in debug mode if ``my.kind`` is not
  234. ## ``xmlAttribute``. In release mode, this will not trigger an error
  235. ## but the value returned will not be valid.
  236. assert(my.kind == xmlAttribute)
  237. my.a
  238. template attrValue*(my: XmlParser): string =
  239. ## returns the attribute value for the event ``xmlAttribute``
  240. ## Raises an assertion in debug mode if ``my.kind`` is not
  241. ## ``xmlAttribute``. In release mode, this will not trigger an error
  242. ## but the value returned will not be valid.
  243. assert(my.kind == xmlAttribute)
  244. my.b
  245. template piName*(my: XmlParser): string =
  246. ## returns the processing instruction name for the event ``xmlPI``
  247. ## Raises an assertion in debug mode if ``my.kind`` is not
  248. ## ``xmlPI``. In release mode, this will not trigger an error
  249. ## but the value returned will not be valid.
  250. assert(my.kind == xmlPI)
  251. my.a
  252. template piRest*(my: XmlParser): string =
  253. ## returns the rest of the processing instruction for the event ``xmlPI``
  254. ## Raises an assertion in debug mode if ``my.kind`` is not
  255. ## ``xmlPI``. In release mode, this will not trigger an error
  256. ## but the value returned will not be valid.
  257. assert(my.kind == xmlPI)
  258. my.b
  259. proc rawData*(my: var XmlParser): string {.inline.} =
  260. ## returns the underlying 'data' string by reference.
  261. ## This is only used for speed hacks.
  262. when defined(gcDestructors):
  263. result = move(my.a)
  264. else:
  265. shallowCopy(result, my.a)
  266. proc rawData2*(my: var XmlParser): string {.inline.} =
  267. ## returns the underlying second 'data' string by reference.
  268. ## This is only used for speed hacks.
  269. when defined(gcDestructors):
  270. result = move(my.b)
  271. else:
  272. shallowCopy(result, my.b)
  273. proc getColumn*(my: XmlParser): int {.inline.} =
  274. ## get the current column the parser has arrived at.
  275. result = getColNumber(my, my.bufpos)
  276. proc getLine*(my: XmlParser): int {.inline.} =
  277. ## get the current line the parser has arrived at.
  278. result = my.lineNumber
  279. proc getFilename*(my: XmlParser): string {.inline.} =
  280. ## get the filename of the file that the parser processes.
  281. result = my.filename
  282. proc errorMsg*(my: XmlParser): string =
  283. ## returns a helpful error message for the event ``xmlError``
  284. assert(my.kind == xmlError)
  285. result = "$1($2, $3) Error: $4" % [
  286. my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]
  287. proc errorMsgExpected*(my: XmlParser, tag: string): string =
  288. ## returns an error message "<tag> expected" in the same format as the
  289. ## other error messages
  290. result = "$1($2, $3) Error: $4" % [
  291. my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag]
  292. proc errorMsg*(my: XmlParser, msg: string): string =
  293. ## returns an error message with text `msg` in the same format as the
  294. ## other error messages
  295. result = "$1($2, $3) Error: $4" % [
  296. my.filename, $getLine(my), $getColumn(my), msg]
  297. proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
  298. my.err = kind
  299. my.state = stateError
  300. proc parseCDATA(my: var XmlParser) =
  301. var pos = my.bufpos + len("<![CDATA[")
  302. while true:
  303. case my.buf[pos]
  304. of ']':
  305. if my.buf[pos+1] == ']' and my.buf[pos+2] == '>':
  306. inc(pos, 3)
  307. break
  308. add(my.a, ']')
  309. inc(pos)
  310. of '\0':
  311. markError(my, errEndOfCDataExpected)
  312. break
  313. of '\c':
  314. pos = lexbase.handleCR(my, pos)
  315. add(my.a, '\L')
  316. of '\L':
  317. pos = lexbase.handleLF(my, pos)
  318. add(my.a, '\L')
  319. of '/':
  320. pos = lexbase.handleRefillChar(my, pos)
  321. add(my.a, '/')
  322. else:
  323. add(my.a, my.buf[pos])
  324. inc(pos)
  325. my.bufpos = pos # store back
  326. my.kind = xmlCData
  327. proc parseComment(my: var XmlParser) =
  328. var pos = my.bufpos + len("<!--")
  329. while true:
  330. case my.buf[pos]
  331. of '-':
  332. if my.buf[pos+1] == '-' and my.buf[pos+2] == '>':
  333. inc(pos, 3)
  334. break
  335. if my.options.contains(reportComments): add(my.a, '-')
  336. inc(pos)
  337. of '\0':
  338. markError(my, errEndOfCommentExpected)
  339. break
  340. of '\c':
  341. pos = lexbase.handleCR(my, pos)
  342. if my.options.contains(reportComments): add(my.a, '\L')
  343. of '\L':
  344. pos = lexbase.handleLF(my, pos)
  345. if my.options.contains(reportComments): add(my.a, '\L')
  346. of '/':
  347. pos = lexbase.handleRefillChar(my, pos)
  348. if my.options.contains(reportComments): add(my.a, '/')
  349. else:
  350. if my.options.contains(reportComments): add(my.a, my.buf[pos])
  351. inc(pos)
  352. my.bufpos = pos
  353. my.kind = xmlComment
  354. proc parseWhitespace(my: var XmlParser, skip = false) =
  355. var pos = my.bufpos
  356. while true:
  357. case my.buf[pos]
  358. of ' ', '\t':
  359. if not skip: add(my.a, my.buf[pos])
  360. inc(pos)
  361. of '\c':
  362. # the specification says that CR-LF, CR are to be transformed to LF
  363. pos = lexbase.handleCR(my, pos)
  364. if not skip: add(my.a, '\L')
  365. of '\L':
  366. pos = lexbase.handleLF(my, pos)
  367. if not skip: add(my.a, '\L')
  368. else:
  369. break
  370. my.bufpos = pos
  371. const
  372. NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'}
  373. NameChar = {'A'..'Z', 'a'..'z', '0'..'9', '.', '-', '_', ':', '\128'..'\255'}
  374. proc parseName(my: var XmlParser, dest: var string) =
  375. var pos = my.bufpos
  376. if my.buf[pos] in NameStartChar:
  377. while true:
  378. add(dest, my.buf[pos])
  379. inc(pos)
  380. if my.buf[pos] notin NameChar: break
  381. my.bufpos = pos
  382. else:
  383. markError(my, errNameExpected)
  384. proc parseEntity(my: var XmlParser, dest: var string) =
  385. var pos = my.bufpos+1
  386. my.kind = xmlCharData
  387. if my.buf[pos] == '#':
  388. var r: int
  389. inc(pos)
  390. if my.buf[pos] == 'x':
  391. inc(pos)
  392. while true:
  393. case my.buf[pos]
  394. of '0'..'9': r = (r shl 4) or (ord(my.buf[pos]) - ord('0'))
  395. of 'a'..'f': r = (r shl 4) or (ord(my.buf[pos]) - ord('a') + 10)
  396. of 'A'..'F': r = (r shl 4) or (ord(my.buf[pos]) - ord('A') + 10)
  397. else: break
  398. inc(pos)
  399. else:
  400. while my.buf[pos] in {'0'..'9'}:
  401. r = r * 10 + (ord(my.buf[pos]) - ord('0'))
  402. inc(pos)
  403. add(dest, toUTF8(Rune(r)))
  404. elif my.buf[pos] == 'l' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';':
  405. add(dest, '<')
  406. inc(pos, 2)
  407. elif my.buf[pos] == 'g' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';':
  408. add(dest, '>')
  409. inc(pos, 2)
  410. elif my.buf[pos] == 'a' and my.buf[pos+1] == 'm' and my.buf[pos+2] == 'p' and
  411. my.buf[pos+3] == ';':
  412. add(dest, '&')
  413. inc(pos, 3)
  414. elif my.buf[pos] == 'a' and my.buf[pos+1] == 'p' and my.buf[pos+2] == 'o' and
  415. my.buf[pos+3] == 's' and my.buf[pos+4] == ';':
  416. add(dest, '\'')
  417. inc(pos, 4)
  418. elif my.buf[pos] == 'q' and my.buf[pos+1] == 'u' and my.buf[pos+2] == 'o' and
  419. my.buf[pos+3] == 't' and my.buf[pos+4] == ';':
  420. add(dest, '"')
  421. inc(pos, 4)
  422. else:
  423. my.bufpos = pos
  424. var name = ""
  425. parseName(my, name)
  426. pos = my.bufpos
  427. if my.err != errNameExpected and my.buf[pos] == ';':
  428. my.kind = xmlEntity
  429. else:
  430. add(dest, '&')
  431. add(dest, name)
  432. if my.buf[pos] == ';':
  433. inc(pos)
  434. else:
  435. my.err = errSemicolonExpected
  436. # do not overwrite 'my.state' here, it's a benign error
  437. my.bufpos = pos
  438. proc parsePI(my: var XmlParser) =
  439. inc(my.bufpos, "<?".len)
  440. parseName(my, my.a)
  441. var pos = my.bufpos
  442. setLen(my.b, 0)
  443. while true:
  444. case my.buf[pos]
  445. of '\0':
  446. markError(my, errQmGtExpected)
  447. break
  448. of '?':
  449. if my.buf[pos+1] == '>':
  450. inc(pos, 2)
  451. break
  452. add(my.b, '?')
  453. inc(pos)
  454. of '\c':
  455. # the specification says that CR-LF, CR are to be transformed to LF
  456. pos = lexbase.handleCR(my, pos)
  457. add(my.b, '\L')
  458. of '\L':
  459. pos = lexbase.handleLF(my, pos)
  460. add(my.b, '\L')
  461. of '/':
  462. pos = lexbase.handleRefillChar(my, pos)
  463. add(my.b, '/')
  464. else:
  465. add(my.b, my.buf[pos])
  466. inc(pos)
  467. my.bufpos = pos
  468. my.kind = xmlPI
  469. proc parseSpecial(my: var XmlParser) =
  470. # things that start with <!
  471. var pos = my.bufpos + 2
  472. var opentags = 0
  473. while true:
  474. case my.buf[pos]
  475. of '\0':
  476. markError(my, errGtExpected)
  477. break
  478. of '<':
  479. inc(opentags)
  480. inc(pos)
  481. add(my.a, '<')
  482. of '>':
  483. if opentags <= 0:
  484. inc(pos)
  485. break
  486. dec(opentags)
  487. inc(pos)
  488. add(my.a, '>')
  489. of '\c':
  490. pos = lexbase.handleCR(my, pos)
  491. add(my.a, '\L')
  492. of '\L':
  493. pos = lexbase.handleLF(my, pos)
  494. add(my.a, '\L')
  495. of '/':
  496. pos = lexbase.handleRefillChar(my, pos)
  497. add(my.b, '/')
  498. else:
  499. add(my.a, my.buf[pos])
  500. inc(pos)
  501. my.bufpos = pos
  502. my.kind = xmlSpecial
  503. proc parseTag(my: var XmlParser) =
  504. inc(my.bufpos)
  505. parseName(my, my.a)
  506. # if we have no name, do not interpret the '<':
  507. if my.a.len == 0:
  508. my.kind = xmlCharData
  509. add(my.a, '<')
  510. return
  511. parseWhitespace(my, skip = true)
  512. if my.buf[my.bufpos] in NameStartChar:
  513. # an attribute follows:
  514. my.kind = xmlElementOpen
  515. my.state = stateAttr
  516. my.c = my.a # save for later
  517. my.cIsEmpty = false
  518. else:
  519. my.kind = xmlElementStart
  520. let slash = my.buf[my.bufpos] == '/'
  521. if slash:
  522. my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
  523. if slash and my.buf[my.bufpos] == '>':
  524. inc(my.bufpos)
  525. my.state = stateEmptyElementTag
  526. my.c = ""
  527. my.cIsEmpty = true
  528. elif my.buf[my.bufpos] == '>':
  529. inc(my.bufpos)
  530. else:
  531. markError(my, errGtExpected)
  532. proc parseEndTag(my: var XmlParser) =
  533. my.bufpos = lexbase.handleRefillChar(my, my.bufpos+1)
  534. #inc(my.bufpos, 2)
  535. parseName(my, my.a)
  536. parseWhitespace(my, skip = true)
  537. if my.buf[my.bufpos] == '>':
  538. inc(my.bufpos)
  539. else:
  540. markError(my, errGtExpected)
  541. my.kind = xmlElementEnd
  542. proc parseAttribute(my: var XmlParser) =
  543. my.kind = xmlAttribute
  544. setLen(my.a, 0)
  545. setLen(my.b, 0)
  546. parseName(my, my.a)
  547. # if we have no name, we have '<tag attr= key %&$$%':
  548. if my.a.len == 0:
  549. markError(my, errGtExpected)
  550. return
  551. let startPos = my.bufpos
  552. parseWhitespace(my, skip = true)
  553. if my.buf[my.bufpos] != '=':
  554. if allowEmptyAttribs notin my.options or
  555. (my.buf[my.bufpos] != '>' and my.bufpos == startPos):
  556. markError(my, errEqExpected)
  557. return
  558. inc(my.bufpos)
  559. parseWhitespace(my, skip = true)
  560. var pos = my.bufpos
  561. if my.buf[pos] in {'\'', '"'}:
  562. var quote = my.buf[pos]
  563. var pendingSpace = false
  564. inc(pos)
  565. while true:
  566. case my.buf[pos]
  567. of '\0':
  568. markError(my, errQuoteExpected)
  569. break
  570. of '&':
  571. if pendingSpace:
  572. add(my.b, ' ')
  573. pendingSpace = false
  574. my.bufpos = pos
  575. parseEntity(my, my.b)
  576. my.kind = xmlAttribute # parseEntity overwrites my.kind!
  577. pos = my.bufpos
  578. of ' ', '\t':
  579. pendingSpace = true
  580. inc(pos)
  581. of '\c':
  582. pos = lexbase.handleCR(my, pos)
  583. pendingSpace = true
  584. of '\L':
  585. pos = lexbase.handleLF(my, pos)
  586. pendingSpace = true
  587. of '/':
  588. pos = lexbase.handleRefillChar(my, pos)
  589. add(my.b, '/')
  590. else:
  591. if my.buf[pos] == quote:
  592. inc(pos)
  593. break
  594. else:
  595. if pendingSpace:
  596. add(my.b, ' ')
  597. pendingSpace = false
  598. add(my.b, my.buf[pos])
  599. inc(pos)
  600. elif allowUnquotedAttribs in my.options:
  601. const disallowedChars = {'"', '\'', '`', '=', '<', '>', ' ',
  602. '\0', '\t', '\L', '\F', '\f'}
  603. let startPos = pos
  604. while (let c = my.buf[pos]; c notin disallowedChars):
  605. if c == '&':
  606. my.bufpos = pos
  607. parseEntity(my, my.b)
  608. my.kind = xmlAttribute # parseEntity overwrites my.kind!
  609. pos = my.bufpos
  610. else:
  611. add(my.b, c)
  612. inc(pos)
  613. if pos == startPos:
  614. markError(my, errAttributeValueExpected)
  615. else:
  616. markError(my, errQuoteExpected)
  617. # error corrections: guess what was meant
  618. while my.buf[pos] != '>' and my.buf[pos] > ' ':
  619. add(my.b, my.buf[pos])
  620. inc pos
  621. my.bufpos = pos
  622. parseWhitespace(my, skip = true)
  623. proc parseCharData(my: var XmlParser) =
  624. var pos = my.bufpos
  625. while true:
  626. case my.buf[pos]
  627. of '\0', '<', '&': break
  628. of '\c':
  629. # the specification says that CR-LF, CR are to be transformed to LF
  630. pos = lexbase.handleCR(my, pos)
  631. add(my.a, '\L')
  632. of '\L':
  633. pos = lexbase.handleLF(my, pos)
  634. add(my.a, '\L')
  635. of '/':
  636. pos = lexbase.handleRefillChar(my, pos)
  637. add(my.a, '/')
  638. else:
  639. add(my.a, my.buf[pos])
  640. inc(pos)
  641. my.bufpos = pos
  642. my.kind = xmlCharData
  643. proc rawGetTok(my: var XmlParser) =
  644. my.kind = xmlError
  645. setLen(my.a, 0)
  646. var pos = my.bufpos
  647. case my.buf[pos]
  648. of '<':
  649. case my.buf[pos+1]
  650. of '/':
  651. parseEndTag(my)
  652. of '!':
  653. if my.buf[pos+2] == '[' and my.buf[pos+3] == 'C' and
  654. my.buf[pos+4] == 'D' and my.buf[pos+5] == 'A' and
  655. my.buf[pos+6] == 'T' and my.buf[pos+7] == 'A' and
  656. my.buf[pos+8] == '[':
  657. parseCDATA(my)
  658. elif my.buf[pos+2] == '-' and my.buf[pos+3] == '-':
  659. parseComment(my)
  660. else:
  661. parseSpecial(my)
  662. of '?':
  663. parsePI(my)
  664. else:
  665. parseTag(my)
  666. of ' ', '\t', '\c', '\l':
  667. parseWhitespace(my)
  668. my.kind = xmlWhitespace
  669. of '\0':
  670. my.kind = xmlEof
  671. of '&':
  672. parseEntity(my, my.a)
  673. else:
  674. parseCharData(my)
  675. assert my.kind != xmlError
  676. proc getTok(my: var XmlParser) =
  677. while true:
  678. let lastKind = my.kind
  679. rawGetTok(my)
  680. case my.kind
  681. of xmlComment:
  682. if my.options.contains(reportComments): break
  683. of xmlWhitespace:
  684. if my.options.contains(reportWhitespace) or lastKind in {xmlCharData,
  685. xmlComment, xmlEntity}:
  686. break
  687. else: break
  688. proc next*(my: var XmlParser) =
  689. ## retrieves the first/next event. This controls the parser.
  690. case my.state
  691. of stateNormal:
  692. getTok(my)
  693. of stateStart:
  694. my.state = stateNormal
  695. getTok(my)
  696. if my.kind == xmlPI and my.a == "xml":
  697. # just skip the first ``<?xml >`` processing instruction
  698. getTok(my)
  699. of stateAttr:
  700. # parse an attribute key-value pair:
  701. if my.buf[my.bufpos] == '>':
  702. my.kind = xmlElementClose
  703. inc(my.bufpos)
  704. my.state = stateNormal
  705. elif my.buf[my.bufpos] == '/':
  706. my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
  707. if my.buf[my.bufpos] == '>':
  708. my.kind = xmlElementClose
  709. inc(my.bufpos)
  710. my.state = stateEmptyElementTag
  711. else:
  712. markError(my, errGtExpected)
  713. else:
  714. parseAttribute(my)
  715. # state remains the same
  716. of stateEmptyElementTag:
  717. my.state = stateNormal
  718. my.kind = xmlElementEnd
  719. if not my.cIsEmpty:
  720. my.a = my.c
  721. of stateError:
  722. my.kind = xmlError
  723. my.state = stateNormal
  724. when not defined(testing) and isMainModule:
  725. import os
  726. var s = newFileStream(paramStr(1), fmRead)
  727. if s == nil: quit("cannot open the file" & paramStr(1))
  728. var x: XmlParser
  729. open(x, s, paramStr(1))
  730. while true:
  731. next(x)
  732. case x.kind
  733. of xmlError: echo(x.errorMsg())
  734. of xmlEof: break
  735. of xmlCharData: echo(x.charData)
  736. of xmlWhitespace: echo("|$1|" % x.charData)
  737. of xmlComment: echo("<!-- $1 -->" % x.charData)
  738. of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest])
  739. of xmlElementStart: echo("<$1>" % x.elementName)
  740. of xmlElementEnd: echo("</$1>" % x.elementName)
  741. of xmlElementOpen: echo("<$1" % x.elementName)
  742. of xmlAttribute:
  743. echo("Key: " & x.attrKey)
  744. echo("Value: " & x.attrValue)
  745. of xmlElementClose: echo(">")
  746. of xmlCData:
  747. echo("<![CDATA[$1]]>" % x.charData)
  748. of xmlEntity:
  749. echo("&$1;" % x.entityName)
  750. of xmlSpecial:
  751. echo("SPECIAL: " & x.charData)
  752. close(x)