parsecsv.nim 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2009 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a simple high performance `CSV`:idx:
  10. ## (`comma separated value`:idx:) parser.
  11. ##
  12. ## Basic usage
  13. ## ===========
  14. ##
  15. ## .. code-block:: nim
  16. ## import parsecsv
  17. ## from os import paramStr
  18. ## from streams import newFileStream
  19. ##
  20. ## var s = newFileStream(paramStr(1), fmRead)
  21. ## if s == nil:
  22. ## quit("cannot open the file" & paramStr(1))
  23. ##
  24. ## var x: CsvParser
  25. ## open(x, s, paramStr(1))
  26. ## while readRow(x):
  27. ## echo "new row: "
  28. ## for val in items(x.row):
  29. ## echo "##", val, "##"
  30. ## close(x)
  31. ##
  32. ## For CSV files with a header row, the header can be read and then used as a
  33. ## reference for item access with `rowEntry <#rowEntry,CsvParser,string>`_:
  34. ##
  35. ## .. code-block:: nim
  36. ## import parsecsv
  37. ##
  38. ## # Prepare a file
  39. ## let content = """One,Two,Three,Four
  40. ## 1,2,3,4
  41. ## 10,20,30,40
  42. ## 100,200,300,400
  43. ## """
  44. ## writeFile("temp.csv", content)
  45. ##
  46. ## var p: CsvParser
  47. ## p.open("temp.csv")
  48. ## p.readHeaderRow()
  49. ## while p.readRow():
  50. ## echo "new row: "
  51. ## for col in items(p.headers):
  52. ## echo "##", col, ":", p.rowEntry(col), "##"
  53. ## p.close()
  54. ##
  55. ## See also
  56. ## ========
  57. ##
  58. ## * `streams module <streams.html>`_ for using
  59. ## `open proc <#open,CsvParser,Stream,string,char,char,char>`_
  60. ## and other stream processing (like `close proc <streams.html#close,Stream>`_)
  61. ## * `parseopt module <parseopt.html>`_ for a command line parser
  62. ## * `parsecfg module <parsecfg.html>`_ for a configuration file parser
  63. ## * `parsexml module <parsexml.html>`_ for a XML / HTML parser
  64. ## * `parsesql module <parsesql.html>`_ for a SQL parser
  65. ## * `other parsers <lib.html#pure-libraries-parsers>`_ for other parsers
  66. import
  67. lexbase, streams
  68. type
  69. CsvRow* = seq[string] ## A row in a CSV file.
  70. CsvParser* = object of BaseLexer ## The parser object.
  71. ##
  72. ## It consists of two public fields:
  73. ## * `row` is the current row
  74. ## * `headers` are the columns that are defined in the csv file
  75. ## (read using `readHeaderRow <#readHeaderRow,CsvParser>`_).
  76. ## Used with `rowEntry <#rowEntry,CsvParser,string>`_).
  77. row*: CsvRow
  78. filename: string
  79. sep, quote, esc: char
  80. skipWhite: bool
  81. currRow: int
  82. headers*: seq[string]
  83. CsvError* = object of IOError ## An exception that is raised if
  84. ## a parsing error occurs.
  85. proc raiseEInvalidCsv(filename: string, line, col: int,
  86. msg: string) {.noreturn.} =
  87. var e: ref CsvError
  88. new(e)
  89. if filename.len == 0:
  90. e.msg = "Error: " & msg
  91. else:
  92. e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg
  93. raise e
  94. proc error(my: CsvParser, pos: int, msg: string) =
  95. raiseEInvalidCsv(my.filename, my.lineNumber, getColNumber(my, pos), msg)
  96. proc open*(my: var CsvParser, input: Stream, filename: string,
  97. separator = ',', quote = '"', escape = '\0',
  98. skipInitialSpace = false) =
  99. ## Initializes the parser with an input stream. `Filename` is only used
  100. ## for nice error messages. The parser's behaviour can be controlled by
  101. ## the diverse optional parameters:
  102. ## - `separator`: character used to separate fields
  103. ## - `quote`: Used to quote fields containing special characters like
  104. ## `separator`, `quote` or new-line characters. '\0' disables the parsing
  105. ## of quotes.
  106. ## - `escape`: removes any special meaning from the following character;
  107. ## '\0' disables escaping; if escaping is disabled and `quote` is not '\0',
  108. ## two `quote` characters are parsed one literal `quote` character.
  109. ## - `skipInitialSpace`: If true, whitespace immediately following the
  110. ## `separator` is ignored.
  111. ##
  112. ## See also:
  113. ## * `open proc <#open,CsvParser,string,char,char,char>`_ which creates the
  114. ## file stream for you
  115. runnableExamples:
  116. import streams
  117. var strm = newStringStream("One,Two,Three\n1,2,3\n10,20,30")
  118. var parser: CsvParser
  119. parser.open(strm, "tmp.csv")
  120. parser.close()
  121. strm.close()
  122. lexbase.open(my, input)
  123. my.filename = filename
  124. my.sep = separator
  125. my.quote = quote
  126. my.esc = escape
  127. my.skipWhite = skipInitialSpace
  128. my.row = @[]
  129. my.currRow = 0
  130. proc open*(my: var CsvParser, filename: string,
  131. separator = ',', quote = '"', escape = '\0',
  132. skipInitialSpace = false) =
  133. ## Similar to the `other open proc<#open,CsvParser,Stream,string,char,char,char>`_,
  134. ## but creates the file stream for you.
  135. runnableExamples:
  136. from os import removeFile
  137. writeFile("tmp.csv", "One,Two,Three\n1,2,3\n10,20,300")
  138. var parser: CsvParser
  139. parser.open("tmp.csv")
  140. parser.close()
  141. removeFile("tmp.csv")
  142. var s = newFileStream(filename, fmRead)
  143. if s == nil: my.error(0, "cannot open: " & filename)
  144. open(my, s, filename, separator,
  145. quote, escape, skipInitialSpace)
  146. proc parseField(my: var CsvParser, a: var string) =
  147. var pos = my.bufpos
  148. if my.skipWhite:
  149. while my.buf[pos] in {' ', '\t'}: inc(pos)
  150. setLen(a, 0) # reuse memory
  151. if my.buf[pos] == my.quote and my.quote != '\0':
  152. inc(pos)
  153. while true:
  154. let c = my.buf[pos]
  155. if c == '\0':
  156. my.bufpos = pos # can continue after exception?
  157. error(my, pos, my.quote & " expected")
  158. break
  159. elif c == my.quote:
  160. if my.esc == '\0' and my.buf[pos+1] == my.quote:
  161. add(a, my.quote)
  162. inc(pos, 2)
  163. else:
  164. inc(pos)
  165. break
  166. elif c == my.esc:
  167. add(a, my.buf[pos+1])
  168. inc(pos, 2)
  169. else:
  170. case c
  171. of '\c':
  172. pos = handleCR(my, pos)
  173. add(a, "\n")
  174. of '\l':
  175. pos = handleLF(my, pos)
  176. add(a, "\n")
  177. else:
  178. add(a, c)
  179. inc(pos)
  180. else:
  181. while true:
  182. let c = my.buf[pos]
  183. if c == my.sep: break
  184. if c in {'\c', '\l', '\0'}: break
  185. add(a, c)
  186. inc(pos)
  187. my.bufpos = pos
  188. proc processedRows*(my: var CsvParser): int =
  189. ## Returns number of the processed rows.
  190. ##
  191. ## But even if `readRow <#readRow,CsvParser,int>`_ arrived at EOF then
  192. ## processed rows counter is incremented.
  193. runnableExamples:
  194. import streams
  195. var strm = newStringStream("One,Two,Three\n1,2,3")
  196. var parser: CsvParser
  197. parser.open(strm, "tmp.csv")
  198. doAssert parser.readRow()
  199. doAssert parser.processedRows() == 1
  200. doAssert parser.readRow()
  201. doAssert parser.processedRows() == 2
  202. ## Even if `readRow` arrived at EOF then `processedRows` is incremented.
  203. doAssert parser.readRow() == false
  204. doAssert parser.processedRows() == 3
  205. doAssert parser.readRow() == false
  206. doAssert parser.processedRows() == 4
  207. parser.close()
  208. strm.close()
  209. return my.currRow
  210. proc readRow*(my: var CsvParser, columns = 0): bool =
  211. ## Reads the next row; if `columns` > 0, it expects the row to have
  212. ## exactly this many columns. Returns false if the end of the file
  213. ## has been encountered else true.
  214. ##
  215. ## Blank lines are skipped.
  216. runnableExamples:
  217. import streams
  218. var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
  219. var parser: CsvParser
  220. parser.open(strm, "tmp.csv")
  221. doAssert parser.readRow()
  222. doAssert parser.row == @["One", "Two", "Three"]
  223. doAssert parser.readRow()
  224. doAssert parser.row == @["1", "2", "3"]
  225. ## Blank lines are skipped.
  226. doAssert parser.readRow()
  227. doAssert parser.row == @["10", "20", "30"]
  228. var emptySeq: seq[string]
  229. doAssert parser.readRow() == false
  230. doAssert parser.row == emptySeq
  231. doAssert parser.readRow() == false
  232. doAssert parser.row == emptySeq
  233. parser.close()
  234. strm.close()
  235. var col = 0 # current column
  236. let oldpos = my.bufpos
  237. # skip initial empty lines #8365
  238. while true:
  239. case my.buf[my.bufpos]
  240. of '\c': my.bufpos = handleCR(my, my.bufpos)
  241. of '\l': my.bufpos = handleLF(my, my.bufpos)
  242. else: break
  243. while my.buf[my.bufpos] != '\0':
  244. let oldlen = my.row.len
  245. if oldlen < col+1:
  246. setLen(my.row, col+1)
  247. my.row[col] = ""
  248. parseField(my, my.row[col])
  249. inc(col)
  250. if my.buf[my.bufpos] == my.sep:
  251. inc(my.bufpos)
  252. else:
  253. case my.buf[my.bufpos]
  254. of '\c', '\l':
  255. # skip empty lines:
  256. while true:
  257. case my.buf[my.bufpos]
  258. of '\c': my.bufpos = handleCR(my, my.bufpos)
  259. of '\l': my.bufpos = handleLF(my, my.bufpos)
  260. else: break
  261. of '\0': discard
  262. else: error(my, my.bufpos, my.sep & " expected")
  263. break
  264. setLen(my.row, col)
  265. result = col > 0
  266. if result and col != columns and columns > 0:
  267. error(my, oldpos+1, $columns & " columns expected, but found " &
  268. $col & " columns")
  269. inc(my.currRow)
  270. proc close*(my: var CsvParser) {.inline.} =
  271. ## Closes the parser `my` and its associated input stream.
  272. lexbase.close(my)
  273. proc readHeaderRow*(my: var CsvParser) =
  274. ## Reads the first row and creates a look-up table for column numbers
  275. ## See also:
  276. ## * `rowEntry proc <#rowEntry,CsvParser,string>`_
  277. runnableExamples:
  278. import streams
  279. var strm = newStringStream("One,Two,Three\n1,2,3")
  280. var parser: CsvParser
  281. parser.open(strm, "tmp.csv")
  282. parser.readHeaderRow()
  283. doAssert parser.headers == @["One", "Two", "Three"]
  284. doAssert parser.row == @["One", "Two", "Three"]
  285. doAssert parser.readRow()
  286. doAssert parser.headers == @["One", "Two", "Three"]
  287. doAssert parser.row == @["1", "2", "3"]
  288. parser.close()
  289. strm.close()
  290. let present = my.readRow()
  291. if present:
  292. my.headers = my.row
  293. proc rowEntry*(my: var CsvParser, entry: string): var string =
  294. ## Accesses a specified `entry` from the current row.
  295. ##
  296. ## Assumes that `readHeaderRow <#readHeaderRow,CsvParser>`_ has already been
  297. ## called.
  298. runnableExamples:
  299. import streams
  300. var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
  301. var parser: CsvParser
  302. parser.open(strm, "tmp.csv")
  303. ## Need calling `readHeaderRow`.
  304. parser.readHeaderRow()
  305. doAssert parser.readRow()
  306. doAssert parser.rowEntry("One") == "1"
  307. doAssert parser.rowEntry("Two") == "2"
  308. doAssert parser.rowEntry("Three") == "3"
  309. ## `parser.rowEntry("NotExistEntry")` causes SIGSEGV fault.
  310. parser.close()
  311. strm.close()
  312. let index = my.headers.find(entry)
  313. if index >= 0:
  314. result = my.row[index]
  315. when not defined(testing) and isMainModule:
  316. import os
  317. var s = newFileStream(paramStr(1), fmRead)
  318. if s == nil: quit("cannot open the file" & paramStr(1))
  319. var x: CsvParser
  320. open(x, s, paramStr(1))
  321. while readRow(x):
  322. echo "new row: "
  323. for val in items(x.row):
  324. echo "##", val, "##"
  325. close(x)
  326. when isMainModule:
  327. import os
  328. import strutils
  329. block: # Tests for reading the header row
  330. let content = "\nOne,Two,Three,Four\n1,2,3,4\n10,20,30,40,\n100,200,300,400\n"
  331. writeFile("temp.csv", content)
  332. var p: CsvParser
  333. p.open("temp.csv")
  334. p.readHeaderRow()
  335. while p.readRow():
  336. let zeros = repeat('0', p.currRow-2)
  337. doAssert p.rowEntry("One") == "1" & zeros
  338. doAssert p.rowEntry("Two") == "2" & zeros
  339. doAssert p.rowEntry("Three") == "3" & zeros
  340. doAssert p.rowEntry("Four") == "4" & zeros
  341. p.close()
  342. when not defined(testing):
  343. var parser: CsvParser
  344. parser.open("temp.csv")
  345. parser.readHeaderRow()
  346. while parser.readRow():
  347. echo "new row: "
  348. for col in items(parser.headers):
  349. echo "##", col, ":", parser.rowEntry(col), "##"
  350. parser.close()
  351. removeFile("temp.csv")
  352. # Tidy up
  353. removeFile("temp.csv")