htmlparser.nim 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2013 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module parses an HTML document and creates its XML tree representation.
  10. ## It is supposed to handle the *wild* HTML the real world uses.
  11. ##
  12. ## It can be used to parse a wild HTML document and output it as valid XHTML
  13. ## document (well, if you are lucky):
  14. ##
## .. code-block:: Nim
##
##   echo loadHtml("mydirty.html")
  18. ##
  19. ## Every tag in the resulting tree is in lower case.
  20. ##
  21. ## **Note:** The resulting ``XmlNode`` already uses the ``clientData`` field,
  22. ## so it cannot be used by clients of this library.
  23. ##
  24. ## Example: Transforming hyperlinks
  25. ## ================================
  26. ##
  27. ## This code demonstrates how you can iterate over all the tags in an HTML file
  28. ## and write back the modified version. In this case we look for hyperlinks
  29. ## ending with the extension ``.rst`` and convert them to ``.html``.
  30. ##
## .. code-block:: Nim
##
##   import htmlparser
##   import xmltree  # To use '$' for XmlNode
##   import strtabs  # To access XmlAttributes
##   import os       # To use splitFile
##   import strutils # To use cmpIgnoreCase
##
##   proc transformHyperlinks() =
##     let html = loadHTML("input.html")
##
##     for a in html.findAll("a"):
##       let href = a.attrs["href"]
##       if not href.isNil:
##         let (dir, filename, ext) = splitFile(href)
##         if cmpIgnoreCase(ext, ".rst") == 0:
##           a.attrs["href"] = dir / filename & ".html"
##
##     writeFile("output.html", $html)
  50. import strutils, streams, parsexml, xmltree, unicode, strtabs
  51. type
  52. HtmlTag* = enum ## list of all supported HTML tags; order will always be
  53. ## alphabetically
  54. tagUnknown, ## unknown HTML element
  55. tagA, ## the HTML ``a`` element
  56. tagAbbr, ## the deprecated HTML ``abbr`` element
  57. tagAcronym, ## the HTML ``acronym`` element
  58. tagAddress, ## the HTML ``address`` element
  59. tagApplet, ## the deprecated HTML ``applet`` element
  60. tagArea, ## the HTML ``area`` element
  61. tagArticle, ## the HTML ``article`` element
  62. tagAside, ## the HTML ``aside`` element
  63. tagAudio, ## the HTML ``audio`` element
  64. tagB, ## the HTML ``b`` element
  65. tagBase, ## the HTML ``base`` element
  66. tagBdi, ## the HTML ``bdi`` element
  67. tagBdo, ## the deprecated HTML ``dbo`` element
  68. tagBasefont, ## the deprecated HTML ``basefont`` element
  69. tagBig, ## the HTML ``big`` element
  70. tagBlockquote, ## the HTML ``blockquote`` element
  71. tagBody, ## the HTML ``body`` element
  72. tagBr, ## the HTML ``br`` element
  73. tagButton, ## the HTML ``button`` element
  74. tagCanvas, ## the HTML ``canvas`` element
  75. tagCaption, ## the HTML ``caption`` element
  76. tagCenter, ## the deprecated HTML ``center`` element
  77. tagCite, ## the HTML ``cite`` element
  78. tagCode, ## the HTML ``code`` element
  79. tagCol, ## the HTML ``col`` element
  80. tagColgroup, ## the HTML ``colgroup`` element
  81. tagCommand, ## the HTML ``command`` element
  82. tagDatalist, ## the HTML ``datalist`` element
  83. tagDd, ## the HTML ``dd`` element
  84. tagDel, ## the HTML ``del`` element
  85. tagDetails, ## the HTML ``details`` element
  86. tagDfn, ## the HTML ``dfn`` element
  87. tagDialog, ## the HTML ``dialog`` element
  88. tagDiv, ## the HTML ``div`` element
  89. tagDir, ## the deprecated HTLM ``dir`` element
  90. tagDl, ## the HTML ``dl`` element
  91. tagDt, ## the HTML ``dt`` element
  92. tagEm, ## the HTML ``em`` element
  93. tagEmbed, ## the HTML ``embed`` element
  94. tagFieldset, ## the HTML ``fieldset`` element
  95. tagFigcaption, ## the HTML ``figcaption`` element
  96. tagFigure, ## the HTML ``figure`` element
  97. tagFont, ## the deprecated HTML ``font`` element
  98. tagFooter, ## the HTML ``footer`` element
  99. tagForm, ## the HTML ``form`` element
  100. tagFrame, ## the HTML ``frame`` element
  101. tagFrameset, ## the deprecated HTML ``frameset`` element
  102. tagH1, ## the HTML ``h1`` element
  103. tagH2, ## the HTML ``h2`` element
  104. tagH3, ## the HTML ``h3`` element
  105. tagH4, ## the HTML ``h4`` element
  106. tagH5, ## the HTML ``h5`` element
  107. tagH6, ## the HTML ``h6`` element
  108. tagHead, ## the HTML ``head`` element
  109. tagHeader, ## the HTML ``header`` element
  110. tagHgroup, ## the HTML ``hgroup`` element
  111. tagHtml, ## the HTML ``html`` element
  112. tagHr, ## the HTML ``hr`` element
  113. tagI, ## the HTML ``i`` element
  114. tagIframe, ## the deprecated HTML ``iframe`` element
  115. tagImg, ## the HTML ``img`` element
  116. tagInput, ## the HTML ``input`` element
  117. tagIns, ## the HTML ``ins`` element
  118. tagIsindex, ## the deprecated HTML ``isindex`` element
  119. tagKbd, ## the HTML ``kbd`` element
  120. tagKeygen, ## the HTML ``keygen`` element
  121. tagLabel, ## the HTML ``label`` element
  122. tagLegend, ## the HTML ``legend`` element
  123. tagLi, ## the HTML ``li`` element
  124. tagLink, ## the HTML ``link`` element
  125. tagMap, ## the HTML ``map`` element
  126. tagMark, ## the HTML ``mark`` element
  127. tagMenu, ## the deprecated HTML ``menu`` element
  128. tagMeta, ## the HTML ``meta`` element
  129. tagMeter, ## the HTML ``meter`` element
  130. tagNav, ## the HTML ``nav`` element
  131. tagNobr, ## the deprecated HTML ``nobr`` element
  132. tagNoframes, ## the deprecated HTML ``noframes`` element
  133. tagNoscript, ## the HTML ``noscript`` element
  134. tagObject, ## the HTML ``object`` element
  135. tagOl, ## the HTML ``ol`` element
  136. tagOptgroup, ## the HTML ``optgroup`` element
  137. tagOption, ## the HTML ``option`` element
  138. tagOutput, ## the HTML ``output`` element
  139. tagP, ## the HTML ``p`` element
  140. tagParam, ## the HTML ``param`` element
  141. tagPre, ## the HTML ``pre`` element
  142. tagProgress, ## the HTML ``progress`` element
  143. tagQ, ## the HTML ``q`` element
  144. tagRp, ## the HTML ``rp`` element
  145. tagRt, ## the HTML ``rt`` element
  146. tagRuby, ## the HTML ``ruby`` element
  147. tagS, ## the deprecated HTML ``s`` element
  148. tagSamp, ## the HTML ``samp`` element
  149. tagScript, ## the HTML ``script`` element
  150. tagSection, ## the HTML ``section`` element
  151. tagSelect, ## the HTML ``select`` element
  152. tagSmall, ## the HTML ``small`` element
  153. tagSource, ## the HTML ``source`` element
  154. tagSpan, ## the HTML ``span`` element
  155. tagStrike, ## the deprecated HTML ``strike`` element
  156. tagStrong, ## the HTML ``strong`` element
  157. tagStyle, ## the HTML ``style`` element
  158. tagSub, ## the HTML ``sub`` element
  159. tagSummary, ## the HTML ``summary`` element
  160. tagSup, ## the HTML ``sup`` element
  161. tagTable, ## the HTML ``table`` element
  162. tagTbody, ## the HTML ``tbody`` element
  163. tagTd, ## the HTML ``td`` element
  164. tagTextarea, ## the HTML ``textarea`` element
  165. tagTfoot, ## the HTML ``tfoot`` element
  166. tagTh, ## the HTML ``th`` element
  167. tagThead, ## the HTML ``thead`` element
  168. tagTime, ## the HTML ``time`` element
  169. tagTitle, ## the HTML ``title`` element
  170. tagTr, ## the HTML ``tr`` element
  171. tagTrack, ## the HTML ``track`` element
  172. tagTt, ## the HTML ``tt`` element
  173. tagU, ## the deprecated HTML ``u`` element
  174. tagUl, ## the HTML ``ul`` element
  175. tagVar, ## the HTML ``var`` element
  176. tagVideo, ## the HTML ``video`` element
  177. tagWbr ## the HTML ``wbr`` element
  178. {.deprecated: [THtmlTag: HtmlTag].}
  179. const
  180. tagToStr* = [
  181. "a", "abbr", "acronym", "address", "applet", "area", "article",
  182. "aside", "audio",
  183. "b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body",
  184. "br", "button", "canvas", "caption", "center", "cite", "code",
  185. "col", "colgroup", "command",
  186. "datalist", "dd", "del", "details", "dfn", "dialog", "div",
  187. "dir", "dl", "dt", "em", "embed", "fieldset",
  188. "figcaption", "figure", "font", "footer",
  189. "form", "frame", "frameset", "h1", "h2", "h3",
  190. "h4", "h5", "h6", "head", "header", "hgroup", "html", "hr",
  191. "i", "iframe", "img", "input", "ins", "isindex",
  192. "kbd", "keygen", "label", "legend", "li", "link", "map", "mark",
  193. "menu", "meta", "meter", "nav", "nobr", "noframes", "noscript",
  194. "object", "ol",
  195. "optgroup", "option", "output", "p", "param", "pre", "progress", "q",
  196. "rp", "rt", "ruby", "s", "samp", "script", "section", "select", "small",
  197. "source", "span", "strike", "strong", "style",
  198. "sub", "summary", "sup", "table",
  199. "tbody", "td", "textarea", "tfoot", "th", "thead", "time",
  200. "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"]
  201. InlineTags* = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont,
  202. tagBdo, tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn,
  203. tagEm, tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd,
  204. tagLabel, tagMap, tagObject, tagQ, tagSamp, tagScript, tagSelect,
  205. tagSmall, tagSpan, tagStrong, tagSub, tagSup, tagTextarea, tagTt,
  206. tagVar, tagApplet, tagBasefont, tagFont, tagIframe, tagU, tagS,
  207. tagStrike, tagWbr}
  208. BlockTags* = {tagAddress, tagBlockquote, tagCenter, tagDel, tagDir, tagDiv,
  209. tagDl, tagFieldset, tagForm, tagH1, tagH2, tagH3, tagH4,
  210. tagH5, tagH6, tagHr, tagIns, tagIsindex, tagMenu, tagNoframes, tagNoscript,
  211. tagOl, tagP, tagPre, tagTable, tagUl, tagCenter, tagDir, tagIsindex,
  212. tagMenu, tagNoframes}
  213. SingleTags* = {tagArea, tagBase, tagBasefont,
  214. tagBr, tagCol, tagFrame, tagHr, tagImg, tagIsindex,
  215. tagLink, tagMeta, tagParam, tagWbr}
  216. Entities = [
  217. ("nbsp", 0x00A0), ("iexcl", 0x00A1), ("cent", 0x00A2), ("pound", 0x00A3),
  218. ("curren", 0x00A4), ("yen", 0x00A5), ("brvbar", 0x00A6), ("sect", 0x00A7),
  219. ("uml", 0x00A8), ("copy", 0x00A9), ("ordf", 0x00AA), ("laquo", 0x00AB),
  220. ("not", 0x00AC), ("shy", 0x00AD), ("reg", 0x00AE), ("macr", 0x00AF),
  221. ("deg", 0x00B0), ("plusmn", 0x00B1), ("sup2", 0x00B2), ("sup3", 0x00B3),
  222. ("acute", 0x00B4), ("micro", 0x00B5), ("para", 0x00B6), ("middot", 0x00B7),
  223. ("cedil", 0x00B8), ("sup1", 0x00B9), ("ordm", 0x00BA), ("raquo", 0x00BB),
  224. ("frac14", 0x00BC), ("frac12", 0x00BD), ("frac34", 0x00BE),
  225. ("iquest", 0x00BF), ("Agrave", 0x00C0), ("Aacute", 0x00C1),
  226. ("Acirc", 0x00C2), ("Atilde", 0x00C3), ("Auml", 0x00C4), ("Aring", 0x00C5),
  227. ("AElig", 0x00C6), ("Ccedil", 0x00C7), ("Egrave", 0x00C8),
  228. ("Eacute", 0x00C9), ("Ecirc", 0x00CA), ("Euml", 0x00CB), ("Igrave", 0x00CC),
  229. ("Iacute", 0x00CD), ("Icirc", 0x00CE), ("Iuml", 0x00CF), ("ETH", 0x00D0),
  230. ("Ntilde", 0x00D1), ("Ograve", 0x00D2), ("Oacute", 0x00D3),
  231. ("Ocirc", 0x00D4), ("Otilde", 0x00D5), ("Ouml", 0x00D6), ("times", 0x00D7),
  232. ("Oslash", 0x00D8), ("Ugrave", 0x00D9), ("Uacute", 0x00DA),
  233. ("Ucirc", 0x00DB), ("Uuml", 0x00DC), ("Yacute", 0x00DD), ("THORN", 0x00DE),
  234. ("szlig", 0x00DF), ("agrave", 0x00E0), ("aacute", 0x00E1),
  235. ("acirc", 0x00E2), ("atilde", 0x00E3), ("auml", 0x00E4), ("aring", 0x00E5),
  236. ("aelig", 0x00E6), ("ccedil", 0x00E7), ("egrave", 0x00E8),
  237. ("eacute", 0x00E9), ("ecirc", 0x00EA), ("euml", 0x00EB), ("igrave", 0x00EC),
  238. ("iacute", 0x00ED), ("icirc", 0x00EE), ("iuml", 0x00EF), ("eth", 0x00F0),
  239. ("ntilde", 0x00F1), ("ograve", 0x00F2), ("oacute", 0x00F3),
  240. ("ocirc", 0x00F4), ("otilde", 0x00F5), ("ouml", 0x00F6), ("divide", 0x00F7),
  241. ("oslash", 0x00F8), ("ugrave", 0x00F9), ("uacute", 0x00FA),
  242. ("ucirc", 0x00FB), ("uuml", 0x00FC), ("yacute", 0x00FD), ("thorn", 0x00FE),
  243. ("yuml", 0x00FF), ("OElig", 0x0152), ("oelig", 0x0153), ("Scaron", 0x0160),
  244. ("scaron", 0x0161), ("Yuml", 0x0178), ("fnof", 0x0192), ("circ", 0x02C6),
  245. ("tilde", 0x02DC), ("Alpha", 0x0391), ("Beta", 0x0392), ("Gamma", 0x0393),
  246. ("Delta", 0x0394), ("Epsilon", 0x0395), ("Zeta", 0x0396), ("Eta", 0x0397),
  247. ("Theta", 0x0398), ("Iota", 0x0399), ("Kappa", 0x039A), ("Lambda", 0x039B),
  248. ("Mu", 0x039C), ("Nu", 0x039D), ("Xi", 0x039E), ("Omicron", 0x039F),
  249. ("Pi", 0x03A0), ("Rho", 0x03A1), ("Sigma", 0x03A3), ("Tau", 0x03A4),
  250. ("Upsilon", 0x03A5), ("Phi", 0x03A6), ("Chi", 0x03A7), ("Psi", 0x03A8),
  251. ("Omega", 0x03A9), ("alpha", 0x03B1), ("beta", 0x03B2), ("gamma", 0x03B3),
  252. ("delta", 0x03B4), ("epsilon", 0x03B5), ("zeta", 0x03B6), ("eta", 0x03B7),
  253. ("theta", 0x03B8), ("iota", 0x03B9), ("kappa", 0x03BA), ("lambda", 0x03BB),
  254. ("mu", 0x03BC), ("nu", 0x03BD), ("xi", 0x03BE), ("omicron", 0x03BF),
  255. ("pi", 0x03C0), ("rho", 0x03C1), ("sigmaf", 0x03C2), ("sigma", 0x03C3),
  256. ("tau", 0x03C4), ("upsilon", 0x03C5), ("phi", 0x03C6), ("chi", 0x03C7),
  257. ("psi", 0x03C8), ("omega", 0x03C9), ("thetasym", 0x03D1), ("upsih", 0x03D2),
  258. ("piv", 0x03D6), ("ensp", 0x2002), ("emsp", 0x2003), ("thinsp", 0x2009),
  259. ("zwnj", 0x200C), ("zwj", 0x200D), ("lrm", 0x200E), ("rlm", 0x200F),
  260. ("ndash", 0x2013), ("mdash", 0x2014), ("lsquo", 0x2018), ("rsquo", 0x2019),
  261. ("sbquo", 0x201A), ("ldquo", 0x201C), ("rdquo", 0x201D), ("bdquo", 0x201E),
  262. ("dagger", 0x2020), ("Dagger", 0x2021), ("bull", 0x2022),
  263. ("hellip", 0x2026), ("permil", 0x2030), ("prime", 0x2032),
  264. ("Prime", 0x2033), ("lsaquo", 0x2039), ("rsaquo", 0x203A),
  265. ("oline", 0x203E), ("frasl", 0x2044), ("euro", 0x20AC),
  266. ("image", 0x2111), ("weierp", 0x2118), ("real", 0x211C),
  267. ("trade", 0x2122), ("alefsym", 0x2135), ("larr", 0x2190),
  268. ("uarr", 0x2191), ("rarr", 0x2192), ("darr", 0x2193),
  269. ("harr", 0x2194), ("crarr", 0x21B5), ("lArr", 0x21D0),
  270. ("uArr", 0x21D1), ("rArr", 0x21D2), ("dArr", 0x21D3),
  271. ("hArr", 0x21D4), ("forall", 0x2200), ("part", 0x2202),
  272. ("exist", 0x2203), ("empty", 0x2205), ("nabla", 0x2207),
  273. ("isin", 0x2208), ("notin", 0x2209), ("ni", 0x220B),
  274. ("prod", 0x220F), ("sum", 0x2211), ("minus", 0x2212),
  275. ("lowast", 0x2217), ("radic", 0x221A), ("prop", 0x221D),
  276. ("infin", 0x221E), ("ang", 0x2220), ("and", 0x2227),
  277. ("or", 0x2228), ("cap", 0x2229), ("cup", 0x222A),
  278. ("int", 0x222B), ("there4", 0x2234), ("sim", 0x223C),
  279. ("cong", 0x2245), ("asymp", 0x2248), ("ne", 0x2260),
  280. ("equiv", 0x2261), ("le", 0x2264), ("ge", 0x2265),
  281. ("sub", 0x2282), ("sup", 0x2283), ("nsub", 0x2284),
  282. ("sube", 0x2286), ("supe", 0x2287), ("oplus", 0x2295),
  283. ("otimes", 0x2297), ("perp", 0x22A5), ("sdot", 0x22C5),
  284. ("lceil", 0x2308), ("rceil", 0x2309), ("lfloor", 0x230A),
  285. ("rfloor", 0x230B), ("lang", 0x2329), ("rang", 0x232A),
  286. ("loz", 0x25CA), ("spades", 0x2660), ("clubs", 0x2663),
  287. ("hearts", 0x2665), ("diams", 0x2666)]
  288. proc allLower(s: string): bool =
  289. for c in s:
  290. if c < 'a' or c > 'z': return false
  291. return true
  292. proc toHtmlTag(s: string): HtmlTag =
  293. case s
  294. of "a": tagA
  295. of "abbr": tagAbbr
  296. of "acronym": tagAcronym
  297. of "address": tagAddress
  298. of "applet": tagApplet
  299. of "area": tagArea
  300. of "article": tagArticle
  301. of "aside": tagAside
  302. of "audio": tagAudio
  303. of "b": tagB
  304. of "base": tagBase
  305. of "basefont": tagBasefont
  306. of "bdi": tagBdi
  307. of "bdo": tagBdo
  308. of "big": tagBig
  309. of "blockquote": tagBlockquote
  310. of "body": tagBody
  311. of "br": tagBr
  312. of "button": tagButton
  313. of "canvas": tagCanvas
  314. of "caption": tagCaption
  315. of "center": tagCenter
  316. of "cite": tagCite
  317. of "code": tagCode
  318. of "col": tagCol
  319. of "colgroup": tagColgroup
  320. of "command": tagCommand
  321. of "datalist": tagDatalist
  322. of "dd": tagDd
  323. of "del": tagDel
  324. of "details": tagDetails
  325. of "dfn": tagDfn
  326. of "dialog": tagDialog
  327. of "div": tagDiv
  328. of "dir": tagDir
  329. of "dl": tagDl
  330. of "dt": tagDt
  331. of "em": tagEm
  332. of "embed": tagEmbed
  333. of "fieldset": tagFieldset
  334. of "figcaption": tagFigcaption
  335. of "figure": tagFigure
  336. of "font": tagFont
  337. of "footer": tagFooter
  338. of "form": tagForm
  339. of "frame": tagFrame
  340. of "frameset": tagFrameset
  341. of "h1": tagH1
  342. of "h2": tagH2
  343. of "h3": tagH3
  344. of "h4": tagH4
  345. of "h5": tagH5
  346. of "h6": tagH6
  347. of "head": tagHead
  348. of "header": tagHeader
  349. of "hgroup": tagHgroup
  350. of "html": tagHtml
  351. of "hr": tagHr
  352. of "i": tagI
  353. of "iframe": tagIframe
  354. of "img": tagImg
  355. of "input": tagInput
  356. of "ins": tagIns
  357. of "isindex": tagIsindex
  358. of "kbd": tagKbd
  359. of "keygen": tagKeygen
  360. of "label": tagLabel
  361. of "legend": tagLegend
  362. of "li": tagLi
  363. of "link": tagLink
  364. of "map": tagMap
  365. of "mark": tagMark
  366. of "menu": tagMenu
  367. of "meta": tagMeta
  368. of "meter": tagMeter
  369. of "nav": tagNav
  370. of "nobr": tagNobr
  371. of "noframes": tagNoframes
  372. of "noscript": tagNoscript
  373. of "object": tagObject
  374. of "ol": tagOl
  375. of "optgroup": tagOptgroup
  376. of "option": tagOption
  377. of "output": tagOutput
  378. of "p": tagP
  379. of "param": tagParam
  380. of "pre": tagPre
  381. of "progress": tagProgress
  382. of "q": tagQ
  383. of "rp": tagRp
  384. of "rt": tagRt
  385. of "ruby": tagRuby
  386. of "s": tagS
  387. of "samp": tagSamp
  388. of "script": tagScript
  389. of "section": tagSection
  390. of "select": tagSelect
  391. of "small": tagSmall
  392. of "source": tagSource
  393. of "span": tagSpan
  394. of "strike": tagStrike
  395. of "strong": tagStrong
  396. of "style": tagStyle
  397. of "sub": tagSub
  398. of "summary": tagSummary
  399. of "sup": tagSup
  400. of "table": tagTable
  401. of "tbody": tagTbody
  402. of "td": tagTd
  403. of "textarea": tagTextarea
  404. of "tfoot": tagTfoot
  405. of "th": tagTh
  406. of "thead": tagThead
  407. of "time": tagTime
  408. of "title": tagTitle
  409. of "tr": tagTr
  410. of "track": tagTrack
  411. of "tt": tagTt
  412. of "u": tagU
  413. of "ul": tagUl
  414. of "var": tagVar
  415. of "video": tagVideo
  416. of "wbr": tagWbr
  417. else: tagUnknown
  418. proc htmlTag*(n: XmlNode): HtmlTag =
  419. ## gets `n`'s tag as a ``HtmlTag``.
  420. if n.clientData == 0:
  421. n.clientData = toHtmlTag(n.tag).ord
  422. result = HtmlTag(n.clientData)
  423. proc htmlTag*(s: string): HtmlTag =
  424. ## converts `s` to a ``HtmlTag``. If `s` is no HTML tag, ``tagUnknown`` is
  425. ## returned.
  426. let s = if allLower(s): s else: toLowerAscii(s)
  427. result = toHtmlTag(s)
  428. proc entityToUtf8*(entity: string): string =
  429. ## converts an HTML entity name like ``&Uuml;`` to its UTF-8 equivalent.
  430. ## "" is returned if the entity name is unknown. The HTML parser
  431. ## already converts entities to UTF-8.
  432. for name, val in items(Entities):
  433. if name == entity: return toUTF8(Rune(val))
  434. result = ""
  435. proc addNode(father, son: XmlNode) =
  436. if son != nil: add(father, son)
  437. proc parse(x: var XmlParser, errors: var seq[string]): XmlNode
  438. proc expected(x: var XmlParser, n: XmlNode): string =
  439. result = errorMsg(x, "</" & n.tag & "> expected")
  440. template elemName(x: untyped): untyped = rawData(x)
  441. template adderr(x: untyped) =
  442. errors.add(x)
  443. proc untilElementEnd(x: var XmlParser, result: XmlNode,
  444. errors: var seq[string]) =
  445. # we parsed e.g. ``<br>`` and don't really expect a ``</br>``:
  446. if result.htmlTag in SingleTags:
  447. if x.kind != xmlElementEnd or cmpIgnoreCase(x.elemName, result.tag) != 0:
  448. return
  449. while true:
  450. case x.kind
  451. of xmlElementStart, xmlElementOpen:
  452. case result.htmlTag
  453. of tagP, tagInput, tagOption:
  454. # some tags are common to have no ``</end>``, like ``<li>`` but
  455. # allow ``<p>`` in `<dd>`, `<dt>` and ``<li>`` in next case
  456. if htmlTag(x.elemName) in {tagLi, tagP, tagDt, tagDd, tagInput,
  457. tagOption}:
  458. adderr(expected(x, result))
  459. break
  460. of tagDd, tagDt, tagLi:
  461. if htmlTag(x.elemName) in {tagLi, tagDt, tagDd, tagInput,
  462. tagOption}:
  463. adderr(expected(x, result))
  464. break
  465. of tagTd, tagTh:
  466. if htmlTag(x.elemName) in {tagTr, tagTd, tagTh, tagTfoot, tagThead}:
  467. adderr(expected(x, result))
  468. break
  469. of tagTr:
  470. if htmlTag(x.elemName) == tagTr:
  471. adderr(expected(x, result))
  472. break
  473. of tagOptgroup:
  474. if htmlTag(x.elemName) in {tagOption, tagOptgroup}:
  475. adderr(expected(x, result))
  476. break
  477. else: discard
  478. result.addNode(parse(x, errors))
  479. of xmlElementEnd:
  480. if cmpIgnoreCase(x.elemName, result.tag) != 0:
  481. #echo "5; expected: ", result.htmltag, " ", x.elemName
  482. adderr(expected(x, result))
  483. # this seems to do better match error corrections in browsers:
  484. while x.kind in {xmlElementEnd, xmlWhitespace}:
  485. if x.kind == xmlElementEnd and cmpIgnoreCase(x.elemName, result.tag) == 0:
  486. break
  487. next(x)
  488. next(x)
  489. break
  490. of xmlEof:
  491. adderr(expected(x, result))
  492. break
  493. else:
  494. result.addNode(parse(x, errors))
  495. proc parse(x: var XmlParser, errors: var seq[string]): XmlNode =
  496. case x.kind
  497. of xmlComment:
  498. result = newComment(x.rawData)
  499. next(x)
  500. of xmlCharData, xmlWhitespace:
  501. result = newText(x.rawData)
  502. next(x)
  503. of xmlPI, xmlSpecial:
  504. # we just ignore processing instructions for now
  505. next(x)
  506. of xmlError:
  507. adderr(errorMsg(x))
  508. next(x)
  509. of xmlElementStart:
  510. result = newElement(toLowerAscii(x.elemName))
  511. next(x)
  512. untilElementEnd(x, result, errors)
  513. of xmlElementEnd:
  514. adderr(errorMsg(x, "unexpected ending tag: " & x.elemName))
  515. of xmlElementOpen:
  516. result = newElement(toLowerAscii(x.elemName))
  517. next(x)
  518. result.attrs = newStringTable()
  519. while true:
  520. case x.kind
  521. of xmlAttribute:
  522. result.attrs[x.rawData] = x.rawData2
  523. next(x)
  524. of xmlElementClose:
  525. next(x)
  526. break
  527. of xmlError:
  528. adderr(errorMsg(x))
  529. next(x)
  530. break
  531. else:
  532. adderr(errorMsg(x, "'>' expected"))
  533. next(x)
  534. break
  535. untilElementEnd(x, result, errors)
  536. of xmlAttribute, xmlElementClose:
  537. adderr(errorMsg(x, "<some_tag> expected"))
  538. next(x)
  539. of xmlCData:
  540. result = newCData(x.rawData)
  541. next(x)
  542. of xmlEntity:
  543. var u = entityToUtf8(x.rawData)
  544. if u.len != 0: result = newText(u)
  545. next(x)
  546. of xmlEof: discard
  547. proc parseHtml*(s: Stream, filename: string,
  548. errors: var seq[string]): XmlNode =
  549. ## parses the XML from stream `s` and returns a ``PXmlNode``. Every
  550. ## occurred parsing error is added to the `errors` sequence.
  551. var x: XmlParser
  552. open(x, s, filename, {reportComments, reportWhitespace})
  553. next(x)
  554. # skip the DOCTYPE:
  555. if x.kind == xmlSpecial: next(x)
  556. result = newElement("document")
  557. result.addNode(parse(x, errors))
  558. #if x.kind != xmlEof:
  559. # adderr(errorMsg(x, "EOF expected"))
  560. while x.kind != xmlEof:
  561. var oldPos = x.bufpos # little hack to see if we made any progess
  562. result.addNode(parse(x, errors))
  563. if x.bufpos == oldPos:
  564. # force progress!
  565. next(x)
  566. close(x)
  567. if result.len == 1:
  568. result = result[0]
  569. proc parseHtml*(s: Stream): XmlNode =
  570. ## parses the XTML from stream `s` and returns a ``PXmlNode``. All parsing
  571. ## errors are ignored.
  572. var errors: seq[string] = @[]
  573. result = parseHtml(s, "unknown_html_doc", errors)
  574. proc loadHtml*(path: string, errors: var seq[string]): XmlNode =
  575. ## Loads and parses HTML from file specified by ``path``, and returns
  576. ## a ``PXmlNode``. Every occurred parsing error is added to
  577. ## the `errors` sequence.
  578. var s = newFileStream(path, fmRead)
  579. if s == nil: raise newException(IOError, "Unable to read file: " & path)
  580. result = parseHtml(s, path, errors)
  581. proc loadHtml*(path: string): XmlNode =
  582. ## Loads and parses HTML from file specified by ``path``, and returns
  583. ## a ``PXmlNode``. All parsing errors are ignored.
  584. var errors: seq[string] = @[]
  585. result = loadHtml(path, errors)
  586. when not defined(testing) and isMainModule:
  587. import os
  588. var errors: seq[string] = @[]
  589. var x = loadHtml(paramStr(1), errors)
  590. for e in items(errors): echo e
  591. var f: File
  592. if open(f, "test.txt", fmWrite):
  593. f.write($x)
  594. f.close()
  595. else:
  596. quit("cannot write test.txt")