nimlexbase.nim 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. #
  2. #
  3. # The Nim Compiler
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. # Base Object of a lexer with efficient buffer handling. In fact
  10. # I believe that this is the most efficient method of buffer
  11. # handling that exists! Only at line endings checks are necessary
  12. # if the buffer needs refilling.
  13. import llstream
  14. import std/strutils
  15. when defined(nimPreviewSlimSystem):
  16. import std/assertions
  17. const
  18. Lrz* = ' '
  19. Apo* = '\''
  20. Tabulator* = '\x09'
  21. ESC* = '\x1B'
  22. CR* = '\x0D'
  23. FF* = '\x0C'
  24. LF* = '\x0A'
  25. BEL* = '\x07'
  26. BACKSPACE* = '\x08'
  27. VT* = '\x0B'
  28. const
  29. EndOfFile* = '\0' # end of file marker
  30. # A little picture makes everything clear :-)
  31. # buf:
  32. # "Example Text\n ha!" bufLen = 17
  33. # ^pos = 0 ^ sentinel = 12
  34. #
  35. NewLines* = {CR, LF}
type
  TBaseLexer* = object of RootObj
    ## Base object for lexers: the input buffer together with the position
    ## and line bookkeeping maintained by the procs of this module.
    bufpos*: int              # position within `buf`; set to 0 on open and
                              # again after every buffer refill
    buf*: cstring             # raw view of `bufStorage`; this is what the
                              # scanning code indexes
    bufStorage: string        # owns the memory that `buf` points into
    bufLen: int               # current buffer size; doubled by the refill
                              # logic when no line end fits into the buffer
    stream*: PLLStream        # we read from this stream
    lineNumber*: int          # the current line number
                              # private data (bookkeeping; note that the
                              # fields below are nevertheless exported):
    sentinel*: int            # index in `buf` of the last line-end character
                              # read, or of the EndOfFile marker; reaching it
                              # is what triggers a refill
    lineStart*: int           # index of last line start in buffer
    offsetBase*: int          # use ``offsetBase + bufpos`` to get the offset
proc openBaseLexer*(L: var TBaseLexer, inputstream: PLLStream,
                    bufLen: int = 8192)
  # Initializes `L` to read from `inputstream` with a buffer of `bufLen`
  # characters, performs the first buffer fill and skips a leading
  # UTF-8 byte order mark.
  # 8K is a reasonable buffer size
proc closeBaseLexer*(L: var TBaseLexer)
  # Closes the stream that `L` reads from.
proc getCurrentLine*(L: TBaseLexer, marker: bool = true): string
  # Returns the text of the current line followed by a newline. If `marker`
  # is true, a second line is appended that places a '^' under the column
  # of `L.bufpos`.
proc getColNumber*(L: TBaseLexer, pos: int): int
  # Returns the distance between `pos` and the start of the current line.
proc handleCR*(L: var TBaseLexer, pos: int): int
  # Call this if you scanned over CR in the buffer; it returns the
  # position to continue the scanning from. `pos` must be the position
  # of the CR.
proc handleLF*(L: var TBaseLexer, pos: int): int
  # Call this if you scanned over LF in the buffer; it returns the
  # position to continue the scanning from. `pos` must be the position
  # of the LF.
  62. # implementation
  63. proc closeBaseLexer(L: var TBaseLexer) =
  64. llStreamClose(L.stream)
  65. proc fillBuffer(L: var TBaseLexer) =
  66. var
  67. charsRead, toCopy, s: int # all are in characters,
  68. # not bytes (in case this
  69. # is not the same)
  70. oldBufLen: int
  71. # we know here that pos == L.sentinel, but not if this proc
  72. # is called the first time by initBaseLexer()
  73. assert(L.sentinel < L.bufLen)
  74. toCopy = L.bufLen - L.sentinel - 1
  75. assert(toCopy >= 0)
  76. if toCopy > 0:
  77. moveMem(addr L.buf[0], addr L.buf[L.sentinel + 1], toCopy)
  78. # "moveMem" handles overlapping regions
  79. charsRead = llStreamRead(L.stream, addr L.buf[toCopy], L.sentinel + 1)
  80. s = toCopy + charsRead
  81. if charsRead < L.sentinel + 1:
  82. L.buf[s] = EndOfFile # set end marker
  83. L.sentinel = s
  84. else:
  85. # compute sentinel:
  86. dec(s) # BUGFIX (valgrind)
  87. while true:
  88. assert(s < L.bufLen)
  89. while (s >= 0) and not (L.buf[s] in NewLines): dec(s)
  90. if s >= 0:
  91. # we found an appropriate character for a sentinel:
  92. L.sentinel = s
  93. break
  94. else:
  95. # rather than to give up here because the line is too long,
  96. # double the buffer's size and try again:
  97. oldBufLen = L.bufLen
  98. L.bufLen = L.bufLen * 2
  99. L.bufStorage.setLen(L.bufLen)
  100. L.buf = L.bufStorage.cstring
  101. assert(L.bufLen - oldBufLen == oldBufLen)
  102. charsRead = llStreamRead(L.stream, addr(L.buf[oldBufLen]),
  103. oldBufLen)
  104. if charsRead < oldBufLen:
  105. L.buf[oldBufLen + charsRead] = EndOfFile
  106. L.sentinel = oldBufLen + charsRead
  107. break
  108. s = L.bufLen - 1
  109. proc fillBaseLexer(L: var TBaseLexer, pos: int): int =
  110. assert(pos <= L.sentinel)
  111. if pos < L.sentinel:
  112. result = pos + 1 # nothing to do
  113. else:
  114. fillBuffer(L)
  115. L.offsetBase += pos + 1
  116. L.bufpos = 0
  117. result = 0
  118. L.lineStart = result
  119. proc handleCR(L: var TBaseLexer, pos: int): int =
  120. assert(L.buf[pos] == CR)
  121. inc(L.lineNumber)
  122. result = fillBaseLexer(L, pos)
  123. if L.buf[result] == LF:
  124. result = fillBaseLexer(L, result)
  125. proc handleLF(L: var TBaseLexer, pos: int): int =
  126. assert(L.buf[pos] == LF)
  127. inc(L.lineNumber)
  128. result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
  129. proc skipUTF8BOM(L: var TBaseLexer) =
  130. if L.buf[0] == '\xEF' and L.buf[1] == '\xBB' and L.buf[2] == '\xBF':
  131. inc(L.bufpos, 3)
  132. inc(L.lineStart, 3)
  133. proc openBaseLexer(L: var TBaseLexer, inputstream: PLLStream, bufLen = 8192) =
  134. assert(bufLen > 0)
  135. L.bufpos = 0
  136. L.offsetBase = 0
  137. L.bufStorage = newString(bufLen)
  138. L.buf = L.bufStorage.cstring
  139. L.bufLen = bufLen
  140. L.sentinel = bufLen - 1
  141. L.lineStart = 0
  142. L.lineNumber = 1 # lines start at 1
  143. L.stream = inputstream
  144. fillBuffer(L)
  145. skipUTF8BOM(L)
  146. proc getColNumber(L: TBaseLexer, pos: int): int =
  147. result = abs(pos - L.lineStart)
  148. proc getCurrentLine(L: TBaseLexer, marker: bool = true): string =
  149. result = ""
  150. var i = L.lineStart
  151. while L.buf[i] notin {CR, LF, EndOfFile}:
  152. result.add L.buf[i]
  153. inc i
  154. result.add "\n"
  155. if marker:
  156. result.add spaces(getColNumber(L, L.bufpos)) & '^' & "\n"