#
#
#           The Nim Compiler
#        (c) Copyright 2015 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

# This lexer is handwritten for efficiency. I used an elegant buffering
# scheme which I have not seen anywhere else:
# We guarantee that a whole line is in the buffer. Thus only when scanning
# the \n or \r character we have to check whether we need to read in the next
# chunk. (\n or \r already need special handling for incrementing the line
# counter; choosing both \n and \r allows the lexer to properly read Unix,
# DOS or Macintosh text files, even when it is not the native format.)

import
  options, msgs, platform, idents, nimlexbase, llstream,
  wordrecg, lineinfos, pathutils

import std/[hashes, parseutils, strutils]

when defined(nimPreviewSlimSystem):
  import std/[assertions, formatfloat]
const
  # Character classes used throughout the lexer.
  numChars*: set[char] = {'0'..'9', 'a'..'z', 'A'..'Z'}  # chars that may occur in numeric literals
  SymChars*: set[char] = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}  # non-leading identifier chars
  SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}  # chars an identifier may start with
  # bytes >= \x80 are allowed so that UTF-8 encoded identifiers pass through
  OpChars*: set[char] = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
    '|', '=', '%', '&', '$', '@', '~', ':'}  # chars that form user-definable operators
  UnaryMinusWhitelist = {' ', '\t', '\n', '\r', ',', ';', '(', '[', '{'}
    # chars that may precede a '-' for it to be lexed as part of a number literal
# don't forget to update the 'highlite' module if these charsets should change
type
  TokType* = enum
    # NOTE: the declaration order is load-bearing: keyword ranges
    # (tokKeywordLow..tokKeywordHigh) and the literal subranges used in
    # `$` below are derived from it.
    tkInvalid = "tkInvalid", tkEof = "[EOF]", # order is important here!
    tkSymbol = "tkSymbol", # keywords:
    tkAddr = "addr", tkAnd = "and", tkAs = "as", tkAsm = "asm",
    tkBind = "bind", tkBlock = "block", tkBreak = "break", tkCase = "case", tkCast = "cast",
    tkConcept = "concept", tkConst = "const", tkContinue = "continue", tkConverter = "converter",
    tkDefer = "defer", tkDiscard = "discard", tkDistinct = "distinct", tkDiv = "div", tkDo = "do",
    tkElif = "elif", tkElse = "else", tkEnd = "end", tkEnum = "enum", tkExcept = "except", tkExport = "export",
    tkFinally = "finally", tkFor = "for", tkFrom = "from", tkFunc = "func",
    tkIf = "if", tkImport = "import", tkIn = "in", tkInclude = "include", tkInterface = "interface",
    tkIs = "is", tkIsnot = "isnot", tkIterator = "iterator",
    tkLet = "let",
    tkMacro = "macro", tkMethod = "method", tkMixin = "mixin", tkMod = "mod", tkNil = "nil", tkNot = "not", tkNotin = "notin",
    tkObject = "object", tkOf = "of", tkOr = "or", tkOut = "out",
    tkProc = "proc", tkPtr = "ptr", tkRaise = "raise", tkRef = "ref", tkReturn = "return",
    tkShl = "shl", tkShr = "shr", tkStatic = "static",
    tkTemplate = "template",
    tkTry = "try", tkTuple = "tuple", tkType = "type", tkUsing = "using",
    tkVar = "var", tkWhen = "when", tkWhile = "while", tkXor = "xor",
    tkYield = "yield", # end of keywords
    # numeric literals, classified by width/signedness:
    tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit",
    tkInt32Lit = "tkInt32Lit", tkInt64Lit = "tkInt64Lit",
    tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit",
    tkUInt32Lit = "tkUInt32Lit", tkUInt64Lit = "tkUInt64Lit",
    tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit",
    tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit",
    # string/char literals (R = raw, G = generalized, Triple = """..."""):
    tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit",
    tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
    tkCustomLit = "tkCustomLit",
    # punctuation; the stringified value is the literal spelling:
    tkParLe = "(", tkParRi = ")", tkBracketLe = "[",
    tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}",
    tkBracketDotLe = "[.", tkBracketDotRi = ".]",
    tkCurlyDotLe = "{.", tkCurlyDotRi = ".}",
    tkParDotLe = "(.", tkParDotRi = ".)",
    tkComma = ",", tkSemiColon = ";",
    tkColon = ":", tkColonColon = "::", tkEquals = "=",
    tkDot = ".", tkDotDot = "..", tkBracketLeColon = "[:",
    tkOpr, tkComment, tkAccent = "`",
    # these are fake tokens used by renderer.nim
    tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr, tkHideableStart, tkHideableEnd

  TokTypes* = set[TokType]
const
  weakTokens = {tkComma, tkSemiColon, tkColon,
                tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi,
                tkCurlyRi} # \
    # tokens that should not be considered for previousToken
  tokKeywordLow* = succ(tkSymbol)  # first keyword in TokType (tkAddr)
  tokKeywordHigh* = pred(tkIntLit) # last keyword in TokType (tkYield)
type
  NumericalBase* = enum
    base10,                    # base10 is listed as the first element,
                               # so that it is the correct default value
    base2, base8, base16

  TokenSpacing* = enum
    tsLeading, tsTrailing, tsEof  # whitespace before/after a token; tsEof marks end of file

  Token* = object              # a Nim token
    tokType*: TokType          # the type of the token
    base*: NumericalBase       # the numerical base; only valid for int
                               # or float literals
    spacing*: set[TokenSpacing] # spaces around token
    indent*: int               # the indentation; != -1 if the token has been
                               # preceded with indentation
    ident*: PIdent             # the parsed identifier
    iNumber*: BiggestInt       # the parsed integer literal
    fNumber*: BiggestFloat     # the parsed floating point literal
    literal*: string           # the parsed (string) literal; and
                               # documentation comments are here too
    line*, col*: int           # position of the token (0-based column)
    when defined(nimpretty):
      offsetA*, offsetB*: int  # used for pretty printing so that literals
                               # like 0b01 or r"\L" are unaffected
      commentOffsetA*, commentOffsetB*: int

  # Callback that, when set, receives diagnostics instead of the default
  # msgs pipeline (see dispMessage below).
  ErrorHandler* = proc (conf: ConfigRef; info: TLineInfo; msg: TMsgKind; arg: string)
  Lexer* = object of TBaseLexer
    fileIdx*: FileIndex        # index of the file being lexed
    indentAhead*: int          # if > 0 an indentation has already been read
                               # this is needed because scanning comments
                               # needs so much look-ahead
    currLineIndent*: int       # indentation of the current line
    errorHandler*: ErrorHandler # overrides the default error reporting if non-nil
    cache*: IdentCache         # cache for interned identifiers
    when defined(nimsuggest):
      previousToken: TLineInfo # position of the previous token (for IDE tracking)
    tokenEnd*: TLineInfo
    previousTokenEnd*: TLineInfo
    config*: ConfigRef
proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} =
  ## Builds a `TLineInfo` for `tok` in `L`'s file; under nimpretty the
  ## token and comment byte offsets are carried over as well.
  result = newLineInfo(L.fileIdx, tok.line, tok.col)
  when defined(nimpretty):
    result.offsetA = tok.offsetA
    result.offsetB = tok.offsetB
    result.commentOffsetA = tok.commentOffsetA
    result.commentOffsetB = tok.commentOffsetB
  124. proc isKeyword*(kind: TokType): bool =
  125. (kind >= tokKeywordLow) and (kind <= tokKeywordHigh)
# Produces a mask with the `n` low bits set.
template ones(n): untyped = ((1 shl n)-1) # for utf-8 conversion
  127. proc isNimIdentifier*(s: string): bool =
  128. let sLen = s.len
  129. if sLen > 0 and s[0] in SymStartChars:
  130. var i = 1
  131. while i < sLen:
  132. if s[i] == '_': inc(i)
  133. if i < sLen and s[i] notin SymChars: return false
  134. inc(i)
  135. result = true
  136. else:
  137. result = false
proc `$`*(tok: Token): string =
  ## Renders a token for messages/debugging: numeric literals by value,
  ## string-ish tokens by their literal text, punctuation by its fixed
  ## spelling, and anything else by its identifier (or "" if none).
  case tok.tokType
  of tkIntLit..tkInt64Lit: $tok.iNumber
  of tkFloatLit..tkFloat64Lit: $tok.fNumber
  of tkInvalid, tkStrLit..tkCharLit, tkComment: tok.literal
  of tkParLe..tkColon, tkEof, tkAccent: $tok.tokType
  else:
    if tok.ident != nil:
      tok.ident.s
    else:
      ""
  149. proc prettyTok*(tok: Token): string =
  150. if isKeyword(tok.tokType): "keyword " & tok.ident.s
  151. else: $tok
proc printTok*(conf: ConfigRef; tok: Token) =
  ## Debug helper: writes "line:col<TAB>kind value" through the msgs module.
  # xxx factor with toLocation
  msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" & $tok.tokType & " " & $tok)
proc openLexer*(lex: var Lexer, fileIdx: FileIndex, inputstream: PLLStream;
                cache: IdentCache; config: ConfigRef) =
  ## Initializes `lex` for tokenizing `inputstream` as file `fileIdx`.
  openBaseLexer(lex, inputstream)
  lex.fileIdx = fileIdx
  lex.indentAhead = -1              # no buffered indentation token yet
  lex.currLineIndent = 0
  inc(lex.lineNumber, inputstream.lineOffset)  # honor the stream's starting line
  lex.cache = cache
  when defined(nimsuggest):
    lex.previousToken.fileIndex = fileIdx
  lex.config = config
proc openLexer*(lex: var Lexer, filename: AbsoluteFile, inputstream: PLLStream;
                cache: IdentCache; config: ConfigRef) =
  ## Convenience overload: resolves `filename` to a FileIndex first.
  openLexer(lex, fileInfoIdx(config, filename), inputstream, cache, config)
proc closeLexer*(lex: var Lexer) =
  ## Shuts the lexer down, accounting the lines it processed in the
  ## global compilation statistics.
  if lex.config != nil:
    inc(lex.config.linesCompiled, lex.lineNumber)
  closeBaseLexer(lex)
proc getLineInfo(L: Lexer): TLineInfo =
  ## Line info for the lexer's current buffer position.
  result = newLineInfo(L.fileIdx, L.lineNumber, getColNumber(L, L.bufpos))
  175. proc dispMessage(L: Lexer; info: TLineInfo; msg: TMsgKind; arg: string) =
  176. if L.errorHandler.isNil:
  177. msgs.message(L.config, info, msg, arg)
  178. else:
  179. L.errorHandler(L.config, info, msg, arg)
proc lexMessage*(L: Lexer, msg: TMsgKind, arg = "") =
  ## Reports `msg` at the lexer's current position.
  L.dispMessage(getLineInfo(L), msg, arg)
  182. proc lexMessageTok*(L: Lexer, msg: TMsgKind, tok: Token, arg = "") =
  183. var info = newLineInfo(L.fileIdx, tok.line, tok.col)
  184. L.dispMessage(info, msg, arg)
proc lexMessagePos(L: var Lexer, msg: TMsgKind, pos: int, arg = "") =
  ## Reports `msg` at absolute buffer position `pos` on the current line.
  var info = newLineInfo(L.fileIdx, L.lineNumber, pos - L.lineStart)
  L.dispMessage(info, msg, arg)
  188. proc matchTwoChars(L: Lexer, first: char, second: set[char]): bool =
  189. result = (L.buf[L.bufpos] == first) and (L.buf[L.bufpos + 1] in second)
# Marks the start of a token. Deliberately non-hygienic ({.dirty.}): it
# declares `colA`, which the matching tokenEnd*/tokenEndIgnore/
# tokenEndPrevious template in the same scope reads and resets.
template tokenBegin(tok, pos) {.dirty.} =
  when defined(nimsuggest):
    var colA = getColNumber(L, pos)
  when defined(nimpretty):
    tok.offsetA = L.offsetBase + pos
# Marks the end of a token; if the IDE's tracked cursor position falls
# inside this token's span, snap the tracked column to the token start.
# Relies on `colA` declared by a preceding tokenBegin.
template tokenEnd(tok, pos) {.dirty.} =
  when defined(nimsuggest):
    let colB = getColNumber(L, pos)+1
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos.col = colA.int16
    colA = 0
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos
# Like tokenEnd, but for spans where suggestions make no sense (e.g. the
# inside of string literals): a tracked cursor inside the span is
# invalidated instead of being attached to a token.
template tokenEndIgnore(tok, pos) =
  when defined(nimsuggest):
    let colB = getColNumber(L, pos)
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos.fileIndex = trackPosInvalidFileIdx
      L.config.m.trackPos.line = 0'u16
    colA = 0
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos
template tokenEndPrevious(tok, pos) =
  when defined(nimsuggest):
    # when we detect the cursor in whitespace, we attach the track position
    # to the token that came before that, but only if we haven't detected
    # the cursor in a string literal or comment:
    let colB = getColNumber(L, pos)
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos = L.previousToken
      L.config.m.trackPosAttached = true
    colA = 0
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos
# Appends `replacementChar` (instead of the actual buffer character) to the
# token's literal, then advances the lexer one character.
template eatChar(L: var Lexer, t: var Token, replacementChar: char) =
  t.literal.add(replacementChar)
  inc(L.bufpos)
# Appends the current buffer character to the token's literal and advances
# the lexer one character.
template eatChar(L: var Lexer, t: var Token) =
  t.literal.add(L.buf[L.bufpos])
  inc(L.bufpos)
proc getNumber(L: var Lexer, result: var Token) =
  ## Scans a numeric literal at `L.bufpos` in three stages:
  ## 1. determine the base and build `result.literal`,
  ## 2. consume an optional type suffix ('i8, 'u64, f32, ... or a custom
  ##    literal suffix) and classify `result.tokType`,
  ## 3. parse the value into `result.iNumber`/`result.fNumber` with
  ##    range checking.

  # Consumes chars from `chars`, allowing single '_' separators between
  # them; returns how many chars (excluding underscores) were matched.
  proc matchUnderscoreChars(L: var Lexer, tok: var Token, chars: set[char]): Natural =
    var pos = L.bufpos              # use registers for pos, buf
    result = 0
    while true:
      if L.buf[pos] in chars:
        tok.literal.add(L.buf[pos])
        inc(pos)
        inc(result)
      else:
        break
      if L.buf[pos] == '_':
        if L.buf[pos+1] notin chars:
          lexMessage(L, errGenerated,
            "only single underscores may occur in a token and token may not " &
            "end with an underscore: e.g. '1__1' and '1_' are invalid")
          break
        tok.literal.add('_')
        inc(pos)
    L.bufpos = pos

  # Consumes a maximal run of chars from `chars` (no underscore handling).
  proc matchChars(L: var Lexer, tok: var Token, chars: set[char]) =
    var pos = L.bufpos              # use registers for pos, buf
    while L.buf[pos] in chars:
      tok.literal.add(L.buf[pos])
      inc(pos)
    L.bufpos = pos

  # Re-scans the whole malformed literal starting at `startpos` so the
  # error message can quote it in full, then restores the position.
  proc lexMessageLitNum(L: var Lexer, msg: string, startpos: int, msgKind = errGenerated) =
    # Used to get slightly human friendlier err messages.
    const literalishChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '.', '\''}
    var msgPos = L.bufpos
    var t = Token(literal: "")
    L.bufpos = startpos # Use L.bufpos as pos because of matchChars
    matchChars(L, t, literalishChars)
    # We must verify +/- specifically so that we're not past the literal
    if L.buf[L.bufpos] in {'+', '-'} and
        L.buf[L.bufpos - 1] in {'e', 'E'}:
      t.literal.add(L.buf[L.bufpos])
      inc(L.bufpos)
      matchChars(L, t, literalishChars)
    if L.buf[L.bufpos] in literalishChars:
      t.literal.add(L.buf[L.bufpos])
      inc(L.bufpos)
      matchChars(L, t, {'0'..'9'})
    L.bufpos = msgPos
    lexMessage(L, msgKind, msg % t.literal)

  var
    xi: BiggestInt                  # accumulator for non-decimal values
    isBase10 = true
    numDigits = 0
  const
    # 'c', 'C' is deprecated
    baseCodeChars = {'X', 'x', 'o', 'b', 'B', 'c', 'C'}
    literalishChars = baseCodeChars + {'A'..'F', 'a'..'f', '0'..'9', '_', '\''}
    floatTypes = {tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit}
  result.tokType = tkIntLit   # int literal until we know better
  result.literal = ""
  result.base = base10
  tokenBegin(result, L.bufpos)

  var isPositive = true
  if L.buf[L.bufpos] == '-':
    eatChar(L, result)
    isPositive = false

  let startpos = L.bufpos

  # Applies the sign recorded above when storing the parsed value.
  template setNumber(field, value) =
    field = (if isPositive: value else: -value)

  # First stage: find out base, make verifications, build token literal string
  # {'c', 'C'} is added for deprecation reasons to provide a clear error message
  if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}:
    isBase10 = false
    eatChar(L, result, '0')
    case L.buf[L.bufpos]
    of 'c', 'C':
      lexMessageLitNum(L,
                       "$1 will soon be invalid for oct literals; Use '0o' " &
                       "for octals. 'c', 'C' prefix",
                       startpos,
                       warnDeprecated)
      eatChar(L, result, 'c')
      numDigits = matchUnderscoreChars(L, result, {'0'..'7'})
    of 'O':
      lexMessageLitNum(L, "$1 is an invalid int literal; For octal literals " &
                          "use the '0o' prefix.", startpos)
    of 'x', 'X':
      eatChar(L, result, 'x')
      numDigits = matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'})
    of 'o':
      eatChar(L, result, 'o')
      numDigits = matchUnderscoreChars(L, result, {'0'..'7'})
    of 'b', 'B':
      eatChar(L, result, 'b')
      numDigits = matchUnderscoreChars(L, result, {'0'..'1'})
    else:
      # unreachable: the guard above restricted the next char
      internalError(L.config, getLineInfo(L), "getNumber")
    if numDigits == 0:
      lexMessageLitNum(L, "invalid number: '$1'", startpos)
  else:
    # decimal int or float literal
    discard matchUnderscoreChars(L, result, {'0'..'9'})
    if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
      result.tokType = tkFloatLit
      eatChar(L, result, '.')
      discard matchUnderscoreChars(L, result, {'0'..'9'})
    if L.buf[L.bufpos] in {'e', 'E'}:
      result.tokType = tkFloatLit
      eatChar(L, result)
      if L.buf[L.bufpos] in {'+', '-'}:
        eatChar(L, result)
      discard matchUnderscoreChars(L, result, {'0'..'9'})
  let endpos = L.bufpos

  # Second stage, find out if there's a datatype suffix and handle it
  var postPos = endpos

  if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
    let errPos = postPos
    var customLitPossible = false
    if L.buf[postPos] == '\'':
      inc(postPos)
      customLitPossible = true   # only apostrophe suffixes may be custom literals

    if L.buf[postPos] in SymChars:
      var suffix = newStringOfCap(10)
      while true:
        suffix.add L.buf[postPos]
        inc postPos
        if L.buf[postPos] notin SymChars+{'_'}: break
      let suffixAsLower = suffix.toLowerAscii
      case suffixAsLower
      of "f", "f32": result.tokType = tkFloat32Lit
      of "d", "f64": result.tokType = tkFloat64Lit
      of "f128": result.tokType = tkFloat128Lit
      of "i8": result.tokType = tkInt8Lit
      of "i16": result.tokType = tkInt16Lit
      of "i32": result.tokType = tkInt32Lit
      of "i64": result.tokType = tkInt64Lit
      of "u": result.tokType = tkUIntLit
      of "u8": result.tokType = tkUInt8Lit
      of "u16": result.tokType = tkUInt16Lit
      of "u32": result.tokType = tkUInt32Lit
      of "u64": result.tokType = tkUInt64Lit
      elif customLitPossible:
        # remember the position of the `'` so that the parser doesn't
        # have to reparse the custom literal:
        result.iNumber = len(result.literal)
        result.literal.add '\''
        result.literal.add suffix
        result.tokType = tkCustomLit
      else:
        lexMessageLitNum(L, "invalid number suffix: '$1'", errPos)
    else:
      lexMessageLitNum(L, "invalid number suffix: '$1'", errPos)

  # Is there still a literalish char awaiting? Then it's an error!
  if L.buf[postPos] in literalishChars or
     (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}):
    lexMessageLitNum(L, "invalid number: '$1'", startpos)

  if result.tokType != tkCustomLit:
    # Third stage, extract actual number
    L.bufpos = startpos            # restore position
    var pos = startpos
    try:
      if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars):
        # non-decimal: accumulate the bit pattern into xi
        inc(pos, 2)
        xi = 0                     # it is a base prefix
        case L.buf[pos - 1]
        of 'b', 'B':
          result.base = base2
          while pos < endpos:
            if L.buf[pos] != '_':
              xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
        # 'c', 'C' is deprecated (a warning is issued elsewhere)
        of 'o', 'c', 'C':
          result.base = base8
          while pos < endpos:
            if L.buf[pos] != '_':
              xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
        of 'x', 'X':
          result.base = base16
          while pos < endpos:
            case L.buf[pos]
            of '_':
              inc(pos)
            of '0'..'9':
              xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
              inc(pos)
            of 'a'..'f':
              xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
              inc(pos)
            of 'A'..'F':
              xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
              inc(pos)
            else:
              break
        else:
          internalError(L.config, getLineInfo(L), "getNumber")

        # reinterpret the accumulated bit pattern for the target type:
        case result.tokType
        of tkIntLit, tkInt64Lit: setNumber result.iNumber, xi
        of tkInt8Lit: setNumber result.iNumber, ashr(xi shl 56, 56)
        of tkInt16Lit: setNumber result.iNumber, ashr(xi shl 48, 48)
        of tkInt32Lit: setNumber result.iNumber, ashr(xi shl 32, 32)
        of tkUIntLit, tkUInt64Lit: setNumber result.iNumber, xi
        of tkUInt8Lit: setNumber result.iNumber, xi and 0xff
        of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff
        of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff
        of tkFloat32Lit:
          setNumber result.fNumber, (cast[ptr float32](addr(xi)))[]
          # note: this code is endian neutral!
          # XXX: Test this on big endian machine!
        of tkFloat64Lit, tkFloatLit:
          setNumber result.fNumber, (cast[ptr float64](addr(xi)))[]
        else: internalError(L.config, getLineInfo(L), "getNumber")

        # Bounds checks. Non decimal literals are allowed to overflow the range of
        # the datatype as long as their pattern don't overflow _bitwise_, hence
        # below checks of signed sizes against uint*.high is deliberate:
        # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK)
        if result.tokType notin floatTypes:
          let outOfRange =
            case result.tokType
            of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi
            of tkInt8Lit: (xi > BiggestInt(uint8.high))
            of tkInt16Lit: (xi > BiggestInt(uint16.high))
            of tkInt32Lit: (xi > BiggestInt(uint32.high))
            else: false
          if outOfRange:
            #echo "out of range num: ", result.iNumber, " vs ", xi
            lexMessageLitNum(L, "number out of range: '$1'", startpos)
      else:
        # decimal: parse the literal string directly
        case result.tokType
        of floatTypes:
          result.fNumber = parseFloat(result.literal)
        of tkUInt64Lit, tkUIntLit:
          var iNumber: uint64 = uint64(0)
          var len: int = 0
          try:
            len = parseBiggestUInt(result.literal, iNumber)
          except ValueError:
            raise newException(OverflowDefect, "number out of range: " & result.literal)
          if len != result.literal.len:
            raise newException(ValueError, "invalid integer: " & result.literal)
          result.iNumber = cast[int64](iNumber)
        else:
          var iNumber: int64 = int64(0)
          var len: int = 0
          try:
            len = parseBiggestInt(result.literal, iNumber)
          except ValueError:
            raise newException(OverflowDefect, "number out of range: " & result.literal)
          if len != result.literal.len:
            raise newException(ValueError, "invalid integer: " & result.literal)
          result.iNumber = iNumber

        # Explicit bounds checks.
        let outOfRange =
          case result.tokType
          of tkInt8Lit: result.iNumber > int8.high or result.iNumber < int8.low
          of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high) or result.iNumber < 0
          of tkInt16Lit: result.iNumber > int16.high or result.iNumber < int16.low
          of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high) or result.iNumber < 0
          of tkInt32Lit: result.iNumber > int32.high or result.iNumber < int32.low
          of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high) or result.iNumber < 0
          else: false

        if outOfRange:
          lexMessageLitNum(L, "number out of range: '$1'", startpos)

        # Promote int literal to int64? Not always necessary, but more consistent
        if result.tokType == tkIntLit:
          if result.iNumber > high(int32) or result.iNumber < low(int32):
            result.tokType = tkInt64Lit
    except ValueError:
      lexMessageLitNum(L, "invalid number: '$1'", startpos)
    except OverflowDefect, RangeDefect:
      lexMessageLitNum(L, "number out of range: '$1'", startpos)
  tokenEnd(result, postPos-1)
  L.bufpos = postPos
proc handleHexChar(L: var Lexer, xi: var int; position: range[0..4]) =
  ## Consumes one hex digit into `xi` (shifting left by 4 bits). `position`
  ## is the 1-based index of the digit within a fixed-length escape;
  ## 0 is used by the variable-length \u{...} form. It determines how a
  ## terminating quote is reported and whether it is consumed.
  template invalid() =
    lexMessage(L, errGenerated,
      "expected a hex digit, but found: " & L.buf[L.bufpos] &
      "; maybe prepend with 0")
  case L.buf[L.bufpos]
  of '0'..'9':
    xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0'))
    inc(L.bufpos)
  of 'a'..'f':
    xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('a') + 10)
    inc(L.bufpos)
  of 'A'..'F':
    xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10)
    inc(L.bufpos)
  of '"', '\'':
    if position <= 1: invalid()
    # do not progress the bufpos here.
    if position == 0: inc(L.bufpos)
  else:
    invalid()
    # Need to progress for `nim check`
    inc(L.bufpos)
  525. proc handleDecChars(L: var Lexer, xi: var int) =
  526. while L.buf[L.bufpos] in {'0'..'9'}:
  527. xi = (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0'))
  528. inc(L.bufpos)
  529. proc addUnicodeCodePoint(s: var string, i: int) =
  530. let i = cast[uint](i)
  531. # inlined toUTF-8 to avoid unicode and strutils dependencies.
  532. let pos = s.len
  533. if i <= 127:
  534. s.setLen(pos+1)
  535. s[pos+0] = chr(i)
  536. elif i <= 0x07FF:
  537. s.setLen(pos+2)
  538. s[pos+0] = chr((i shr 6) or 0b110_00000)
  539. s[pos+1] = chr((i and ones(6)) or 0b10_0000_00)
  540. elif i <= 0xFFFF:
  541. s.setLen(pos+3)
  542. s[pos+0] = chr(i shr 12 or 0b1110_0000)
  543. s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  544. s[pos+2] = chr(i and ones(6) or 0b10_0000_00)
  545. elif i <= 0x001FFFFF:
  546. s.setLen(pos+4)
  547. s[pos+0] = chr(i shr 18 or 0b1111_0000)
  548. s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  549. s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  550. s[pos+3] = chr(i and ones(6) or 0b10_0000_00)
  551. elif i <= 0x03FFFFFF:
  552. s.setLen(pos+5)
  553. s[pos+0] = chr(i shr 24 or 0b111110_00)
  554. s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  555. s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  556. s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  557. s[pos+4] = chr(i and ones(6) or 0b10_0000_00)
  558. elif i <= 0x7FFFFFFF:
  559. s.setLen(pos+6)
  560. s[pos+0] = chr(i shr 30 or 0b1111110_0)
  561. s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00)
  562. s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  563. s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  564. s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  565. s[pos+5] = chr(i and ones(6) or 0b10_0000_00)
proc getEscapedChar(L: var Lexer, tok: var Token) =
  ## Decodes one backslash escape sequence inside a character or string
  ## literal: `L.bufpos` is on the '\' on entry and past the escape on
  ## exit; the decoded character(s) are appended to `tok.literal`.
  inc(L.bufpos)               # skip '\'
  case L.buf[L.bufpos]
  of 'n', 'N':
    tok.literal.add('\L')
    inc(L.bufpos)
  of 'p', 'P':
    # platform newline; meaningless in a single-char literal
    if tok.tokType == tkCharLit:
      lexMessage(L, errGenerated, "\\p not allowed in character literal")
    tok.literal.add(L.config.target.tnl)
    inc(L.bufpos)
  of 'r', 'R', 'c', 'C':
    tok.literal.add(CR)
    inc(L.bufpos)
  of 'l', 'L':
    tok.literal.add(LF)
    inc(L.bufpos)
  of 'f', 'F':
    tok.literal.add(FF)
    inc(L.bufpos)
  of 'e', 'E':
    tok.literal.add(ESC)
    inc(L.bufpos)
  of 'a', 'A':
    tok.literal.add(BEL)
    inc(L.bufpos)
  of 'b', 'B':
    tok.literal.add(BACKSPACE)
    inc(L.bufpos)
  of 'v', 'V':
    tok.literal.add(VT)
    inc(L.bufpos)
  of 't', 'T':
    tok.literal.add('\t')
    inc(L.bufpos)
  of '\'', '\"':
    tok.literal.add(L.buf[L.bufpos])
    inc(L.bufpos)
  of '\\':
    tok.literal.add('\\')
    inc(L.bufpos)
  of 'x', 'X':
    # \xHH: exactly two hex digits
    inc(L.bufpos)
    var xi = 0
    handleHexChar(L, xi, 1)
    handleHexChar(L, xi, 2)
    tok.literal.add(chr(xi))
  of 'u', 'U':
    # \uHHHH or \u{H+}: Unicode code point, emitted as UTF-8
    if tok.tokType == tkCharLit:
      lexMessage(L, errGenerated, "\\u not allowed in character literal")
    inc(L.bufpos)
    var xi = 0
    if L.buf[L.bufpos] == '{':
      # variable-length form: hex digits until '}'
      inc(L.bufpos)
      var start = L.bufpos
      while L.buf[L.bufpos] != '}':
        handleHexChar(L, xi, 0)
      if start == L.bufpos:
        lexMessage(L, errGenerated,
          "Unicode codepoint cannot be empty")
      inc(L.bufpos)
      if xi > 0x10FFFF:
        let hex = ($L.buf)[start..L.bufpos-2]
        lexMessage(L, errGenerated,
          "Unicode codepoint must be lower than 0x10FFFF, but was: " & hex)
    else:
      # fixed-length form: exactly four hex digits
      handleHexChar(L, xi, 1)
      handleHexChar(L, xi, 2)
      handleHexChar(L, xi, 3)
      handleHexChar(L, xi, 4)
    addUnicodeCodePoint(tok.literal, xi)
  of '0'..'9':
    # decimal escape \DDD; \0 followed by a digit gets an octal warning
    if matchTwoChars(L, '0', {'0'..'9'}):
      lexMessage(L, warnOctalEscape)
    var xi = 0
    handleDecChars(L, xi)
    if (xi <= 255): tok.literal.add(chr(xi))
    else: lexMessage(L, errGenerated, "invalid character constant")
  else: lexMessage(L, errGenerated, "invalid character constant")
proc handleCRLF(L: var Lexer, pos: int): int =
  ## Handles an end-of-line at `pos`: for CR or LF it delegates to
  ## `nimlexbase.handleCR`/`handleLF` (which update the lexer's line
  ## bookkeeping and may refill the buffer) and returns the position of the
  ## first character of the next line. For any other character `pos` is
  ## returned unchanged.
  template registerLine =
    let col = L.getColNumber(pos)
    # NOTE(review): `col` is computed but not used in this template —
    # presumably a leftover from a removed line-length/line-registration
    # check; confirm before removing.

  case L.buf[pos]
  of CR:
    registerLine()
    result = nimlexbase.handleCR(L, pos)
  of LF:
    registerLine()
    result = nimlexbase.handleLF(L, pos)
  else: result = pos
type
  StringMode = enum
    normal,      ## ordinary string literal: backslash escapes are decoded
    raw,         ## raw string literal (r"..."): backslashes stay literal
    generalized  ## generalized raw string literal (ident"...")
proc getString(L: var Lexer, tok: var Token, mode: StringMode) =
  ## Scans a string literal starting at `L.bufpos` (pointing at the opening
  ## `"`). Sets `tok.tokType` to tkTripleStrLit for `"""..."""` literals,
  ## tkRStrLit for single-line raw/generalized literals, or tkStrLit
  ## otherwise, and accumulates the decoded text in `tok.literal`.
  ## Emits a lexer error when the closing quote(s) are missing.
  var pos = L.bufpos
  var line = L.lineNumber # save linenumber for better error message
  tokenBegin(tok, pos - ord(mode == raw))
  inc pos # skip "
  if L.buf[pos] == '\"' and L.buf[pos+1] == '\"':
    tok.tokType = tkTripleStrLit # long string literal:
    inc(pos, 2) # skip ""
    # skip leading newline:
    if L.buf[pos] in {' ', '\t'}:
      var newpos = pos+1
      while L.buf[newpos] in {' ', '\t'}: inc newpos
      if L.buf[newpos] in {CR, LF}: pos = newpos
    pos = handleCRLF(L, pos)
    while true:
      case L.buf[pos]
      of '\"':
        # only a run of exactly three quotes closes the literal; a fourth
        # quote means the first one still belongs to the string content
        if L.buf[pos+1] == '\"' and L.buf[pos+2] == '\"' and
            L.buf[pos+3] != '\"':
          tokenEndIgnore(tok, pos+2)
          L.bufpos = pos + 3 # skip the three """
          break
        tok.literal.add('\"')
        inc(pos)
      of CR, LF:
        tokenEndIgnore(tok, pos)
        pos = handleCRLF(L, pos)
        # all platform line endings are normalized to '\n' in the literal:
        tok.literal.add("\n")
      of nimlexbase.EndOfFile:
        tokenEndIgnore(tok, pos)
        # temporarily restore the line where the literal began so the
        # error points at the unterminated opening quotes:
        var line2 = L.lineNumber
        L.lineNumber = line
        lexMessagePos(L, errGenerated, L.lineStart, "closing \"\"\" expected, but end of file reached")
        L.lineNumber = line2
        L.bufpos = pos
        break
      else:
        tok.literal.add(L.buf[pos])
        inc(pos)
  else:
    # ordinary string literal
    if mode != normal: tok.tokType = tkRStrLit
    else: tok.tokType = tkStrLit
    while true:
      let c = L.buf[pos]
      if c == '\"':
        # in raw literals a doubled "" denotes a single embedded quote:
        if mode != normal and L.buf[pos+1] == '\"':
          inc(pos, 2)
          tok.literal.add('"')
        else:
          tokenEndIgnore(tok, pos)
          inc(pos) # skip '"'
          break
      elif c in {CR, LF, nimlexbase.EndOfFile}:
        # single-line literal may not span lines:
        tokenEndIgnore(tok, pos)
        lexMessage(L, errGenerated, "closing \" expected")
        break
      elif (c == '\\') and mode == normal:
        # getEscapedChar works on L.bufpos, so sync it around the call:
        L.bufpos = pos
        getEscapedChar(L, tok)
        pos = L.bufpos
      else:
        tok.literal.add(c)
        inc(pos)
    L.bufpos = pos
proc getCharacter(L: var Lexer; tok: var Token) =
  ## Scans a character literal `'x'` starting at `L.bufpos` (pointing at the
  ## opening apostrophe). The decoded character ends up in `tok.literal`.
  ## Emits errors for control characters, a bare `''`, and a missing closing
  ## apostrophe — except for the special backtick-quoted `'` operator, which
  ## is recovered as the literal "'" (see below).
  tokenBegin(tok, L.bufpos)
  let startPos = L.bufpos
  inc(L.bufpos) # skip '
  let c = L.buf[L.bufpos]
  case c
  of '\0'..pred(' '), '\'':
    lexMessage(L, errGenerated, "invalid character literal")
    tok.literal = $c
  of '\\': getEscapedChar(L, tok)
  else:
    tok.literal = $c
    inc(L.bufpos)
  if L.buf[L.bufpos] == '\'':
    tokenEndIgnore(tok, L.bufpos)
    inc(L.bufpos) # skip '
  else:
    if startPos > 0 and L.buf[startPos-1] == '`':
      # inside backticks (`'`) the apostrophe is an operator name, not an
      # unterminated char literal; rewind to just past the apostrophe
      tok.literal = "'"
      L.bufpos = startPos+1
    else:
      lexMessage(L, errGenerated, "missing closing ' for character literal")
    tokenEndIgnore(tok, L.bufpos)
const
  UnicodeOperatorStartChars = {'\226', '\194', '\195'}
    # the allowed unicode characters ("∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔")
    # all start with one of these.
    # (These are the possible first bytes of the operators' UTF-8 encodings.)
type
  UnicodeOprPred = enum
    Mul, ## unicode operator binds with multiplicative ("*"-level) precedence
    Add  ## unicode operator binds with additive ("+"-level) precedence
  756. proc unicodeOprLen(buf: cstring; pos: int): (int8, UnicodeOprPred) =
  757. template m(len): untyped = (int8(len), Mul)
  758. template a(len): untyped = (int8(len), Add)
  759. result = 0.m
  760. case buf[pos]
  761. of '\226':
  762. if buf[pos+1] == '\136':
  763. if buf[pos+2] == '\152': result = 3.m # ∘
  764. elif buf[pos+2] == '\153': result = 3.m # ∙
  765. elif buf[pos+2] == '\167': result = 3.m # ∧
  766. elif buf[pos+2] == '\168': result = 3.a # ∨
  767. elif buf[pos+2] == '\169': result = 3.m # ∩
  768. elif buf[pos+2] == '\170': result = 3.a # ∪
  769. elif buf[pos+1] == '\138':
  770. if buf[pos+2] == '\147': result = 3.m # ⊓
  771. elif buf[pos+2] == '\148': result = 3.a # ⊔
  772. elif buf[pos+2] == '\149': result = 3.a # ⊕
  773. elif buf[pos+2] == '\150': result = 3.a # ⊖
  774. elif buf[pos+2] == '\151': result = 3.m # ⊗
  775. elif buf[pos+2] == '\152': result = 3.m # ⊘
  776. elif buf[pos+2] == '\153': result = 3.m # ⊙
  777. elif buf[pos+2] == '\155': result = 3.m # ⊛
  778. elif buf[pos+2] == '\158': result = 3.a # ⊞
  779. elif buf[pos+2] == '\159': result = 3.a # ⊟
  780. elif buf[pos+2] == '\160': result = 3.m # ⊠
  781. elif buf[pos+2] == '\161': result = 3.m # ⊡
  782. elif buf[pos+1] == '\152' and buf[pos+2] == '\133': result = 3.m # ★
  783. of '\194':
  784. if buf[pos+1] == '\177': result = 2.a # ±
  785. of '\195':
  786. if buf[pos+1] == '\151': result = 2.m # ×
  787. else:
  788. discard
proc getSymbol(L: var Lexer, tok: var Token) =
  ## Scans an identifier or keyword starting at `L.bufpos`. Builds the
  ## style-insensitive hash on the fly (uppercase letters are lowered,
  ## underscores are skipped) and interns the original spelling in
  ## `L.cache`. Keywords get their dedicated token kind, everything else
  ## becomes tkSymbol. `suspicious` triggers a style-lint report when
  ## style checking is enabled.
  var h: Hash = 0
  var pos = L.bufpos
  tokenBegin(tok, pos)
  var suspicious = false
  while true:
    var c = L.buf[pos]
    case c
    of 'a'..'z', '0'..'9':
      h = h !& ord(c)
      inc(pos)
    of 'A'..'Z':
      c = chr(ord(c) + (ord('a') - ord('A'))) # toLower()
      h = h !& ord(c)
      inc(pos)
      suspicious = true # uppercase may deviate from the configured style
    of '_':
      if L.buf[pos+1] notin SymChars:
        lexMessage(L, errGenerated, "invalid token: trailing underscore")
        break
      inc(pos) # underscores are not hashed: foo_bar hashes like foobar
      suspicious = true
    of '\x80'..'\xFF':
      # a recognized unicode operator ends the symbol; any other high
      # byte is treated as part of a (UTF-8) identifier
      if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, pos)[0] != 0:
        break
      else:
        h = h !& ord(c)
        inc(pos)
    else: break
  tokenEnd(tok, pos-1)
  h = !$h
  tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h)
  # keyword idents occupy a contiguous id range aligned with the token enum:
  if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or
      (tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)):
    tok.tokType = tkSymbol
  else:
    tok.tokType = TokType(tok.ident.id + ord(tkSymbol))
  if suspicious and {optStyleHint, optStyleError} * L.config.globalOptions != {}:
    lintReport(L.config, getLineInfo(L), tok.ident.s.normalize, tok.ident.s)
  L.bufpos = pos
proc endOperator(L: var Lexer, tok: var Token, pos: int,
                 hash: Hash) {.inline.} =
  ## Finalizes an operator spanning `L.bufpos ..< pos`: finishes the running
  ## `hash`, interns the spelling, and maps well-known operators (ids in
  ## `oprLow..oprHigh`) to their dedicated token kinds; all others become
  ## tkOpr. Advances `L.bufpos` past the operator.
  var h = !$hash
  tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h)
  if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh): tok.tokType = tkOpr
  else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon))
  L.bufpos = pos
proc getOperator(L: var Lexer, tok: var Token) =
  ## Scans an operator made of `OpChars` and/or recognized unicode operator
  ## sequences, then records whether the operator is followed by spaces
  ## and/or end-of-line in `tok.spacing` (the parser uses this to
  ## distinguish e.g. binary from trailing operators).
  var pos = L.bufpos
  tokenBegin(tok, pos)
  var h: Hash = 0
  while true:
    let c = L.buf[pos]
    if c in OpChars:
      h = h !& ord(c)
      inc(pos)
    elif c in UnicodeOperatorStartChars:
      let oprLen = unicodeOprLen(L.buf, pos)[0]
      if oprLen == 0: break
      # hash every byte of the multi-byte operator:
      for i in 0..<oprLen:
        h = h !& ord(L.buf[pos])
        inc pos
    else:
      break
  endOperator(L, tok, pos, h)
  tokenEnd(tok, pos-1)
  # advance pos but don't store it in L.bufpos so the next token (which might
  # be an operator too) gets the preceding spaces:
  tok.spacing = tok.spacing - {tsTrailing, tsEof}
  var trailing = false
  while L.buf[pos] == ' ':
    inc pos
    trailing = true
  if L.buf[pos] in {CR, LF, nimlexbase.EndOfFile}:
    tok.spacing.incl(tsEof)
  elif trailing:
    tok.spacing.incl(tsTrailing)
proc getPrecedence*(tok: Token): int =
  ## Calculates the precedence of the given token.
  ## Operators classify by their first character; a trailing `=` (except in
  ## comparison-like operators) demotes to assignment precedence 1, and
  ## arrow-like operators (`->`, `~>`, `=>`) have precedence 0. Keyword
  ## operators get fixed levels; non-operators return -10.
  const
    MulPred = 9
    PlusPred = 8
  case tok.tokType
  of tkOpr:
    let relevantChar = tok.ident.s[0]

    # arrow like?
    if tok.ident.s.len > 1 and tok.ident.s[^1] == '>' and
        tok.ident.s[^2] in {'-', '~', '='}: return 0

    template considerAsgn(value: untyped) =
      # `foo=` style operators act like assignment (precedence 1):
      result = if tok.ident.s[^1] == '=': 1 else: value

    case relevantChar
    of '$', '^': considerAsgn(10)
    of '*', '%', '/', '\\': considerAsgn(MulPred)
    of '~': result = 8
    of '+', '-', '|': considerAsgn(PlusPred)
    of '&': considerAsgn(7)
    of '=', '<', '>', '!': result = 5
    of '.': considerAsgn(6)
    of '?': result = 2
    of UnicodeOperatorStartChars:
      if tok.ident.s[^1] == '=':
        result = 1
      else:
        # unicode operators carry Mul- or Add-level precedence:
        let (len, pred) = unicodeOprLen(cstring(tok.ident.s), 0)
        if len != 0:
          result = if pred == Mul: MulPred else: PlusPred
        else:
          result = 2
    else: considerAsgn(2)
  of tkDiv, tkMod, tkShl, tkShr: result = 9
  of tkDotDot: result = 6
  of tkIn, tkNotin, tkIs, tkIsnot, tkOf, tkAs, tkFrom: result = 5
  of tkAnd: result = 4
  of tkOr, tkXor, tkPtr, tkRef: result = 3
  else: return -10
proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int;
                          isDoc: bool) =
  ## Skips (or, for doc comments, collects) a multi-line comment starting at
  ## `start` (just past the opening `#[` / `##[`). Tracks nesting of the
  ## same comment kind; for doc comments the common leading indentation
  ## (`toStrip`) is removed from continuation lines and the text is
  ## accumulated in `tok.literal`.
  var pos = start
  var toStrip = 0
  tokenBegin(tok, pos)
  # detect the amount of indentation:
  if isDoc:
    toStrip = getColNumber(L, pos)
    while L.buf[pos] == ' ':
      inc pos
      inc toStrip
    while L.buf[pos] in {CR, LF}: # skip blank lines
      pos = handleCRLF(L, pos)
      toStrip = 0
      while L.buf[pos] == ' ':
        inc pos
        inc toStrip
  var nesting = 0
  while true:
    case L.buf[pos]
    of '#':
      if isDoc:
        # only `##[` nests inside a doc comment:
        if L.buf[pos+1] == '#' and L.buf[pos+2] == '[':
          inc nesting
        tok.literal.add '#'
      elif L.buf[pos+1] == '[':
        inc nesting
      inc pos
    of ']':
      if isDoc:
        if L.buf[pos+1] == '#' and L.buf[pos+2] == '#':
          if nesting == 0:
            tokenEndIgnore(tok, pos+2)
            inc(pos, 3)
            break
          dec nesting
        tok.literal.add ']'
      elif L.buf[pos+1] == '#':
        if nesting == 0:
          tokenEndIgnore(tok, pos+1)
          inc(pos, 2)
          break
        dec nesting
      inc pos
    of CR, LF:
      tokenEndIgnore(tok, pos)
      pos = handleCRLF(L, pos)
      # strip leading whitespace:
      when defined(nimpretty): tok.literal.add "\L"
      if isDoc:
        when not defined(nimpretty): tok.literal.add "\n"
        # remove up to `toStrip` leading spaces from the new line:
        var c = toStrip
        while L.buf[pos] == ' ' and c > 0:
          inc pos
          dec c
    of nimlexbase.EndOfFile:
      tokenEndIgnore(tok, pos)
      lexMessagePos(L, errGenerated, pos, "end of multiline comment expected")
      break
    else:
      # nimpretty keeps even non-doc comment text for re-emission:
      if isDoc or defined(nimpretty): tok.literal.add L.buf[pos]
      inc(pos)
  L.bufpos = pos
  when defined(nimpretty):
    tok.commentOffsetB = L.offsetBase + pos - 1
proc scanComment(L: var Lexer, tok: var Token) =
  ## Scans a documentation comment (`##` — the caller guarantees the second
  ## `#`) into `tok.literal` as a tkComment token. Multi-line doc comments
  ## (`##[`) are delegated to skipMultiLineComment. Consecutive `##` lines
  ## are merged, stripping the baseline indentation detected on the first
  ## non-blank comment line.
  var pos = L.bufpos
  tok.tokType = tkComment
  assert L.buf[pos+1] == '#'
  when defined(nimpretty):
    tok.commentOffsetA = L.offsetBase + pos
  if L.buf[pos+2] == '[':
    skipMultiLineComment(L, tok, pos+3, true)
    return
  tokenBegin(tok, pos)
  inc(pos, 2)
  var toStrip = 0
  var stripInit = false
  while true:
    if not stripInit: # find baseline indentation inside comment
      while L.buf[pos] == ' ':
        inc pos
        inc toStrip
      if L.buf[pos] in {CR, LF}: # don't set toStrip in blank comment lines
        toStrip = 0
      else: # found first non-whitespace character
        stripInit = true
    # copy the rest of this comment line verbatim:
    while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}:
      tok.literal.add(L.buf[pos])
      inc(pos)
    tokenEndIgnore(tok, pos)
    pos = handleCRLF(L, pos)
    var indent = 0
    while L.buf[pos] == ' ':
      inc(pos)
      inc(indent)
    if L.buf[pos] == '#' and L.buf[pos+1] == '#':
      # continuation line: join with '\n' and strip baseline indentation
      tok.literal.add "\n"
      inc(pos, 2)
      if stripInit:
        var c = toStrip
        while L.buf[pos] == ' ' and c > 0:
          inc pos
          dec c
    else:
      # comment ends; remember the indentation of the following token
      # so rawGetTok can report it via indentAhead:
      if L.buf[pos] > ' ':
        L.indentAhead = indent
      tokenEndIgnore(tok, pos)
      break
  L.bufpos = pos
  when defined(nimpretty):
    tok.commentOffsetB = L.offsetBase + pos - 1
proc skip(L: var Lexer, tok: var Token) =
  ## Skips whitespace, newlines and non-documentation comments before the
  ## next token. On crossing a newline it measures the new line's
  ## indentation and stores it in `tok.indent`/`L.currLineIndent`. Tab
  ## characters are an error. Leading-space info is tracked in
  ## `tok.spacing`. Under nimpretty, skipped comments are preserved and
  ## the token is turned into tkComment.
  var pos = L.bufpos
  tokenBegin(tok, pos)
  tok.spacing.excl(tsLeading)
  when defined(nimpretty):
    var hasComment = false
    var commentIndent = L.currLineIndent
    tok.commentOffsetA = L.offsetBase + pos
    tok.commentOffsetB = tok.commentOffsetA
    tok.line = -1
  while true:
    case L.buf[pos]
    of ' ':
      inc(pos)
      tok.spacing.incl(tsLeading)
    of '\t':
      lexMessagePos(L, errGenerated, pos, "tabs are not allowed, use spaces instead")
      inc(pos)
    of CR, LF:
      tokenEndPrevious(tok, pos)
      pos = handleCRLF(L, pos)
      # measure the new line's indentation, skipping over any multi-line
      # comments that begin in the indentation area:
      var indent = 0
      while true:
        if L.buf[pos] == ' ':
          inc(pos)
          inc(indent)
        elif L.buf[pos] == '#' and L.buf[pos+1] == '[':
          when defined(nimpretty):
            hasComment = true
            if tok.line < 0:
              tok.line = L.lineNumber
              commentIndent = indent
          skipMultiLineComment(L, tok, pos+2, false)
          pos = L.bufpos
        else:
          break
      tok.spacing.excl(tsLeading)
      when defined(nimpretty):
        if L.buf[pos] == '#' and tok.line < 0: commentIndent = indent
      # a line that contains an actual token (or a doc comment) fixes the
      # current indentation; blank/comment-only lines do not:
      if L.buf[pos] > ' ' and (L.buf[pos] != '#' or L.buf[pos+1] == '#'):
        tok.indent = indent
        L.currLineIndent = indent
        break
    of '#':
      # do not skip documentation comment:
      if L.buf[pos+1] == '#': break
      when defined(nimpretty):
        hasComment = true
        if tok.line < 0:
          tok.line = L.lineNumber
      if L.buf[pos+1] == '[':
        skipMultiLineComment(L, tok, pos+2, false)
        pos = L.bufpos
      else:
        # single-line non-doc comment: skip to end of line
        tokenBegin(tok, pos)
        while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}:
          when defined(nimpretty): tok.literal.add L.buf[pos]
          inc(pos)
        tokenEndIgnore(tok, pos+1)
        when defined(nimpretty):
          tok.commentOffsetB = L.offsetBase + pos + 1
    else:
      break # EndOfFile also leaves the loop
  tokenEndPrevious(tok, pos-1)
  L.bufpos = pos
  when defined(nimpretty):
    if hasComment:
      tok.commentOffsetB = L.offsetBase + pos - 1
      tok.tokType = tkComment
      tok.indent = commentIndent
proc rawGetTok*(L: var Lexer, tok: var Token) =
  ## Scans the next raw token from the input into `tok`: skips leading
  ## whitespace/comments, records line/column and indentation, then
  ## dispatches on the first character to the specialized scanners
  ## (symbols, numbers, strings, chars, operators, punctuation).
  ## Under nimsuggest it also maintains cursor-tracking state.
  template atTokenEnd() {.dirty.} =
    when defined(nimsuggest):
      L.previousTokenEnd.line = L.tokenEnd.line
      L.previousTokenEnd.col = L.tokenEnd.col
      L.tokenEnd.line = tok.line.uint16
      L.tokenEnd.col = getColNumber(L, L.bufpos).int16
      # we attach the cursor to the last *strong* token
      if tok.tokType notin weakTokens:
        L.previousToken.line = tok.line.uint16
        L.previousToken.col = tok.col.int16

  reset(tok)
  # an indentation level remembered by scanComment/nimpretty takes priority:
  if L.indentAhead >= 0:
    tok.indent = L.indentAhead
    L.currLineIndent = L.indentAhead
    L.indentAhead = -1
  else:
    tok.indent = -1
  skip(L, tok)
  when defined(nimpretty):
    if tok.tokType == tkComment:
      L.indentAhead = L.currLineIndent
      return
  let c = L.buf[L.bufpos]
  tok.line = L.lineNumber
  tok.col = getColNumber(L, L.bufpos)
  # 'r'/'R' and unicode-operator start bytes need special-casing below:
  if c in SymStartChars - {'r', 'R'} - UnicodeOperatorStartChars:
    getSymbol(L, tok)
  else:
    case c
    of UnicodeOperatorStartChars:
      if unicodeOprLen(L.buf, L.bufpos)[0] != 0:
        getOperator(L, tok)
      else:
        getSymbol(L, tok)
    of '#':
      scanComment(L, tok)
    of '*':
      # '*:' is unfortunately a special case, because it is two tokens in
      # 'var v*: int'.
      if L.buf[L.bufpos+1] == ':' and L.buf[L.bufpos+2] notin OpChars:
        var h = 0 !& ord('*')
        endOperator(L, tok, L.bufpos+1, h)
      else:
        getOperator(L, tok)
    of ',':
      tok.tokType = tkComma
      inc(L.bufpos)
    of 'r', 'R':
      # r"..." is a raw string literal, otherwise an ordinary symbol:
      if L.buf[L.bufpos + 1] == '\"':
        inc(L.bufpos)
        getString(L, tok, raw)
      else:
        getSymbol(L, tok)
    of '(':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.':
        tok.tokType = tkParDotLe
        inc(L.bufpos)
      else:
        tok.tokType = tkParLe
        when defined(nimsuggest):
          if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col < L.config.m.trackPos.col and
              tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideCon:
            L.config.m.trackPos.col = tok.col.int16
    of ')':
      tok.tokType = tkParRi
      inc(L.bufpos)
    of '[':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.':
        tok.tokType = tkBracketDotLe
        inc(L.bufpos)
      elif L.buf[L.bufpos] == ':':
        tok.tokType = tkBracketLeColon
        inc(L.bufpos)
      else:
        tok.tokType = tkBracketLe
    of ']':
      tok.tokType = tkBracketRi
      inc(L.bufpos)
    of '.':
      when defined(nimsuggest):
        # a '.' right before the tracked cursor position is a tkDot for
        # ideSug (dot-completion):
        if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col+1 == L.config.m.trackPos.col and
            tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideSug:
          tok.tokType = tkDot
          L.config.m.trackPos.col = tok.col.int16
          inc(L.bufpos)
          atTokenEnd()
          return
      if L.buf[L.bufpos+1] == ']':
        tok.tokType = tkBracketDotRi
        inc(L.bufpos, 2)
      elif L.buf[L.bufpos+1] == '}':
        tok.tokType = tkCurlyDotRi
        inc(L.bufpos, 2)
      elif L.buf[L.bufpos+1] == ')':
        tok.tokType = tkParDotRi
        inc(L.bufpos, 2)
      else:
        getOperator(L, tok)
    of '{':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.':
        tok.tokType = tkCurlyDotLe
        inc(L.bufpos)
      else:
        tok.tokType = tkCurlyLe
    of '}':
      tok.tokType = tkCurlyRi
      inc(L.bufpos)
    of ';':
      tok.tokType = tkSemiColon
      inc(L.bufpos)
    of '`':
      tok.tokType = tkAccent
      inc(L.bufpos)
    of '_':
      inc(L.bufpos)
      # a lone '_' is a valid symbol; '_' followed by symbol chars was
      # already handled by getSymbol, so anything else here is invalid:
      if L.buf[L.bufpos] notin SymChars+{'_'}:
        tok.tokType = tkSymbol
        tok.ident = L.cache.getIdent("_")
      else:
        tok.literal = $c
        tok.tokType = tkInvalid
        lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')')
    of '\"':
      # check for generalized raw string literal:
      let mode = if L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars: generalized else: normal
      getString(L, tok, mode)
      if mode == generalized:
        # tkRStrLit -> tkGStrLit
        # tkTripleStrLit -> tkGTripleStrLit
        inc(tok.tokType, 2)
    of '\'':
      tok.tokType = tkCharLit
      getCharacter(L, tok)
      tok.tokType = tkCharLit
    of '0'..'9':
      getNumber(L, tok)
      # a symbol character glued to the number is an error, unless it
      # begins a unicode operator:
      let c = L.buf[L.bufpos]
      if c in SymChars+{'_'}:
        if c in UnicodeOperatorStartChars and
            unicodeOprLen(L.buf, L.bufpos)[0] != 0:
          discard
        else:
          lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
    of '-':
      if L.buf[L.bufpos+1] in {'0'..'9'} and
          (L.bufpos-1 == 0 or L.buf[L.bufpos-1] in UnaryMinusWhitelist):
        # x)-23 # binary minus
        # ,-23 # unary minus
        # \n-78 # unary minus? Yes.
        # =-3 # parsed as `=-` anyway
        getNumber(L, tok)
        let c = L.buf[L.bufpos]
        if c in SymChars+{'_'}:
          if c in UnicodeOperatorStartChars and
              unicodeOprLen(L.buf, L.bufpos)[0] != 0:
            discard
          else:
            lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
      else:
        getOperator(L, tok)
    else:
      if c in OpChars:
        getOperator(L, tok)
      elif c == nimlexbase.EndOfFile:
        tok.tokType = tkEof
        tok.indent = 0
      else:
        tok.literal = $c
        tok.tokType = tkInvalid
        lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')')
        inc(L.bufpos)
  atTokenEnd()
  1262. proc getIndentWidth*(fileIdx: FileIndex, inputstream: PLLStream;
  1263. cache: IdentCache; config: ConfigRef): int =
  1264. result = 0
  1265. var lex: Lexer = default(Lexer)
  1266. var tok: Token = default(Token)
  1267. openLexer(lex, fileIdx, inputstream, cache, config)
  1268. var prevToken = tkEof
  1269. while tok.tokType != tkEof:
  1270. rawGetTok(lex, tok)
  1271. if tok.indent > 0 and prevToken in {tkColon, tkEquals, tkType, tkConst, tkLet, tkVar, tkUsing}:
  1272. result = tok.indent
  1273. if result > 0: break
  1274. prevToken = tok.tokType
  1275. closeLexer(lex)
  1276. proc getPrecedence*(ident: PIdent): int =
  1277. ## assumes ident is binary operator already
  1278. let
  1279. tokType =
  1280. if ident.id in ord(tokKeywordLow) - ord(tkSymbol)..ord(tokKeywordHigh) - ord(tkSymbol):
  1281. TokType(ident.id + ord(tkSymbol))
  1282. else: tkOpr
  1283. tok = Token(ident: ident, tokType: tokType)
  1284. getPrecedence(tok)