# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt

"""Better tokenizing for coverage.py."""

import codecs
import keyword
import re
import sys
import token
import tokenize

from coverage import env
from coverage.backward import iternext
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens().

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream.  Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #       HEY THERE
                #       """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
            last_line = ltext
            last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
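

# A minimal usage sketch (not part of the original module): it shows the extra
# continuation token that `phys_tokens` injects for a backslash-continued
# line.  The helper name `_demo_phys_tokens` is hypothetical, added only for
# illustration.
def _demo_phys_tokens():
    """Print each physical token of a small backslash-continued snippet."""
    demo_source = u"total = 1 + \\\n    2\n"
    readline = iternext(demo_source.splitlines(True))
    for ttype, ttext, _, _, _ in phys_tokens(tokenize.generate_tokens(readline)):
        # The injected backslash token appears with the fake type 99999.
        print("%d %r" % (ttype, ttext))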


@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0

    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)

    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", u" " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
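

# A minimal usage sketch (not part of the original module): it shows the
# (token class, text) pairs that `source_token_lines` yields for a tiny
# program.  The helper name `_demo_source_token_lines` is hypothetical, added
# only for illustration.
def _demo_source_token_lines():
    """Print the token-class pairs for each line of a small snippet."""
    demo_source = u"def hello():\n    return 'hi'\n"
    for line in source_token_lines(demo_source):
        # First line: [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ...]
        print(repr(line))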


class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it.  Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iternext(text.splitlines(True))
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens


# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens
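

# A minimal usage sketch (not part of the original module): because of the
# one-element cache, tokenizing the same text twice in a row returns the very
# same list object.  The helper name `_demo_cached_tokenizer` is hypothetical,
# added only for illustration.
def _demo_cached_tokenizer():
    """Show the cache hit for repeated tokenization of identical text."""
    first = generate_tokens(u"x = 1\n")
    second = generate_tokens(u"x = 1\n")
    assert first is second       # cache hit: the same list object is returned
    third = generate_tokens(u"y = 2\n")
    assert third is not first    # different text replaces the cached entry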


COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)


@contract(source='bytes')
def _source_encoding_py2(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    assert isinstance(source, bytes)

    # Do this so the detect_encoding code we copied will work.
    readline = iternext(source.splitlines(True))

    # This is mostly code adapted from Py3.2's tokenize module.

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
    # cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    # invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.
    default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = COOKIE_RE.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behavior mimics the Python interpreter.
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # Codecs in Python 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behavior mimics the Python interpreter.
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default


@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]


if env.PY3:
    source_encoding = _source_encoding_py3
else:
    source_encoding = _source_encoding_py2
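

# A minimal usage sketch (not part of the original module): `source_encoding`
# reads a PEP 263 coding cookie or a UTF-8 BOM from the program's bytes.  The
# helper name `_demo_source_encoding` is hypothetical, added only for
# illustration.
def _demo_source_encoding():
    """Print the detected encodings for a cookie and for a BOM."""
    cookie_source = b"# -*- coding: iso-8859-1 -*-\nx = 1\n"
    print(source_encoding(cookie_source))             # iso-8859-1
    bom_source = codecs.BOM_UTF8 + b"x = 1\n"
    print(source_encoding(bom_source))                # utf-8-sig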


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it.  Why not?
    Who knows!  It also decodes to utf8, and then tries to interpret those utf8
    bytes according to the encoding declaration.  Why?  Who knows!

    This function neuters the coding declaration, and compiles it.

    """
    source = neuter_encoding_declaration(source)
    if env.PY2 and isinstance(filename, unicode):
        filename = filename.encode(sys.getfilesystemencoding(), "replace")
    code = compile(source, filename, mode)
    return code
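

# A minimal usage sketch (not part of the original module): a Unicode source
# string that still carries a coding declaration compiles cleanly through
# `compile_unicode`, even on Python 2.  The helper name
# `_demo_compile_unicode` is hypothetical, added only for illustration.
def _demo_compile_unicode():
    """Compile and execute a small Unicode source with a coding cookie."""
    src = u"# coding: utf-8\nGREETING = 'hi'\n"
    code = compile_unicode(src, "<demo>", "exec")
    namespace = {}
    exec(code, namespace)
    print(namespace['GREETING'])     # hi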


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    source = COOKIE_RE.sub("# (deleted declaration)", source, count=2)
    return source
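

# A minimal usage sketch (not part of the original module): the coding cookie
# is replaced in place rather than removed, so line numbers are unchanged.
# The helper name `_demo_neuter_encoding_declaration` is hypothetical, added
# only for illustration.
def _demo_neuter_encoding_declaration():
    """Show a coding cookie being neutered."""
    src = u"# coding: utf-8\nx = 1\n"
    print(neuter_encoding_declaration(src))
    # Prints:
    # # (deleted declaration)
    # x = 1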