# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt

"""Tests for coverage.py's improved tokenizer."""

import os.path
import re

from coverage import env
from coverage.phystokens import source_token_lines, source_encoding
from coverage.phystokens import neuter_encoding_declaration, compile_unicode
from coverage.python import get_python_source

from tests.coveragetest import CoverageTest


SIMPLE = u"""\
# yay!
def foo():
    say('two = %d' % 2)
"""

MIXED_WS = u"""\
def hello():
        a="Hello world!"
\tb="indented"
"""

HERE = os.path.dirname(__file__)


class PhysTokensTest(CoverageTest):
    """Tests for coverage.py's improved tokenizer."""

    run_in_temp_dir = False

    def check_tokenization(self, source):
        """Tokenize `source`, then put it back together, should be the same."""
        tokenized = ""
        for line in source_token_lines(source):
            text = "".join(t for _, t in line)
            tokenized += text + "\n"
        # source_token_lines doesn't preserve trailing spaces, so trim all that
        # before comparing.
        source = source.replace('\r\n', '\n')
        source = re.sub(r"(?m)[ \t]+$", "", source)
        tokenized = re.sub(r"(?m)[ \t]+$", "", tokenized)
        self.assertMultiLineEqual(source, tokenized)

    def check_file_tokenization(self, fname):
        """Use the contents of `fname` for `check_tokenization`."""
        self.check_tokenization(get_python_source(fname))

    def test_simple(self):
        self.assertEqual(list(source_token_lines(SIMPLE)),
            [
                [('com', "# yay!")],
                [('key', 'def'), ('ws', ' '), ('nam', 'foo'), ('op', '('),
                 ('op', ')'), ('op', ':')],
                [('ws', '    '), ('nam', 'say'), ('op', '('),
                 ('str', "'two = %d'"), ('ws', ' '), ('op', '%'),
                 ('ws', ' '), ('num', '2'), ('op', ')')]
            ])
        self.check_tokenization(SIMPLE)

    def test_tab_indentation(self):
        # Mixed tabs and spaces...
        self.assertEqual(list(source_token_lines(MIXED_WS)),
            [
                [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('),
                 ('op', ')'), ('op', ':')],
                [('ws', '        '), ('nam', 'a'), ('op', '='),
                 ('str', '"Hello world!"')],
                [('ws', '        '), ('nam', 'b'), ('op', '='),
                 ('str', '"indented"')],
            ])

    def test_tokenize_real_file(self):
        # Check the tokenization of a real file (large, btw).
        real_file = os.path.join(HERE, "test_coverage.py")
        self.check_file_tokenization(real_file)

    def test_stress(self):
        # Check the tokenization of a stress-test file.
        stress = os.path.join(HERE, "stress_phystoken.tok")
        self.check_file_tokenization(stress)
        stress = os.path.join(HERE, "stress_phystoken_dos.tok")
        self.check_file_tokenization(stress)
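
# For reference: source_token_lines() yields one list per physical source
# line, each a list of (category, text) pairs using the short category codes
# seen in test_simple above ('com', 'key', 'nam', 'op', 'str', 'num', 'ws').
# A minimal standalone sketch, illustrative only and not part of the suite:
#
#     for line in source_token_lines(u"x = 1\n"):
#         print(line)
#     # [('nam', 'x'), ('ws', ' '), ('op', '='), ('ws', ' '), ('num', '1')]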

# The default encoding is different in Python 2 and Python 3.
if env.PY3:
    DEF_ENCODING = "utf-8"
else:
    DEF_ENCODING = "ascii"


ENCODING_DECLARATION_SOURCES = [
    # Various forms from http://www.python.org/dev/peps/pep-0263/
    (1, b"# coding=cp850\n\n"),
    (1, b"#!/usr/bin/python\n# -*- coding: cp850 -*-\n"),
    (1, b"#!/usr/bin/python\n# vim: set fileencoding=cp850:\n"),
    (1, b"# This Python file uses this encoding: cp850\n"),
    (1, b"# This file uses a different encoding:\n# coding: cp850\n"),
    (1, b"\n# coding=cp850\n\n"),
    (2, b"# -*- coding:cp850 -*-\n# vim: fileencoding=cp850\n"),
]
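
# Each entry above pairs an encoding-declaration source with the number of
# lines neuter_encoding_declaration() is expected to change (used in
# NeuterEncodingDeclarationTest below).  Per PEP 263, source_encoding()
# only honors a declaration on the first two lines.  A sketch of the
# behavior, grounded in the tests below:
#
#     source_encoding(b"# coding=cp850\n\n")      # -> 'cp850'
#     source_encoding(b"\n\n# coding=cp850\n\n")  # -> DEF_ENCODING; line 3
#                                                 #    is too late to count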

class SourceEncodingTest(CoverageTest):
    """Tests of source_encoding() for detecting encodings."""

    run_in_temp_dir = False

    def test_detect_source_encoding(self):
        for _, source in ENCODING_DECLARATION_SOURCES:
            self.assertEqual(
                source_encoding(source),
                'cp850',
                "Wrong encoding in %r" % source
            )

    def test_detect_source_encoding_not_in_comment(self):
        if env.PYPY and env.PY3:
            # PyPy3 gets this case wrong. Not sure what I can do about it,
            # so skip the test.
            self.skipTest("PyPy3 is wrong about non-comment encoding. Skip it.")
        # Should not detect anything here: the declaration isn't in a comment.
        source = b'def parse(src, encoding=None):\n    pass'
        self.assertEqual(source_encoding(source), DEF_ENCODING)

    def test_dont_detect_source_encoding_on_third_line(self):
        # A coding declaration doesn't count on the third line.
        source = b"\n\n# coding=cp850\n\n"
        self.assertEqual(source_encoding(source), DEF_ENCODING)

    def test_detect_source_encoding_of_empty_file(self):
        # An important edge case.
        self.assertEqual(source_encoding(b""), DEF_ENCODING)

    def test_bom(self):
        # A BOM means utf-8.
        source = b"\xEF\xBB\xBFtext = 'hello'\n"
        self.assertEqual(source_encoding(source), 'utf-8-sig')

        # But it has to be the only authority.
        source = b"\xEF\xBB\xBF# coding: cp850\n"
        with self.assertRaises(SyntaxError):
            source_encoding(source)


class NeuterEncodingDeclarationTest(CoverageTest):
    """Tests of phystokens.neuter_encoding_declaration()."""

    run_in_temp_dir = False

    def test_neuter_encoding_declaration(self):
        for lines_diff_expected, source in ENCODING_DECLARATION_SOURCES:
            neutered = neuter_encoding_declaration(source.decode("ascii"))
            neutered = neutered.encode("ascii")

            # The neutered source should have the same number of lines.
            source_lines = source.splitlines()
            neutered_lines = neutered.splitlines()
            self.assertEqual(len(source_lines), len(neutered_lines))

            # Only the expected number of lines should be different.
            lines_different = sum(
                int(nline != sline)
                for nline, sline in zip(neutered_lines, source_lines)
            )
            self.assertEqual(lines_diff_expected, lines_different)

            # The neutered source will be detected as having no encoding
            # declaration.
            self.assertEqual(
                source_encoding(neutered),
                DEF_ENCODING,
                "Wrong encoding in %r" % neutered
            )
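
# To make the line-count invariant concrete: neutering rewrites the magic
# comment in place rather than deleting its line.  A hedged sketch (the
# exact replacement text is an implementation detail of phystokens):
#
#     neuter_encoding_declaration(u"# coding=cp850\nx = 1\n")
#     # -> u"# (deleted declaration)\nx = 1\n" or similar: the result still
#     #    has two lines, and source_encoding() finds no declaration in it.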

class CompileUnicodeTest(CoverageTest):
    """Tests of compiling Unicode strings."""

    run_in_temp_dir = False

    def assert_compile_unicode(self, source):
        """Assert that `source` will compile properly with `compile_unicode`."""
        source += u"a = 42\n"
        # This doesn't raise an exception:
        code = compile_unicode(source, "<string>", "exec")
        globs = {}
        exec(code, globs)
        self.assertEqual(globs['a'], 42)

    def test_cp1252(self):
        uni = u"""# coding: cp1252\n# \u201C curly \u201D\n"""
        self.assert_compile_unicode(uni)

    def test_double_coding_declaration(self):
        # Build this string in a weird way so that an actual vim won't try
        # to interpret it as a modeline...
        uni = u"# -*- coding:utf-8 -*-\n# v" "im: fileencoding=utf-8\n"
        self.assert_compile_unicode(uni)
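
# Why compile_unicode() exists: on Python 2, the built-in compile() raises
# "SyntaxError: encoding declaration in Unicode string" when given unicode
# source that carries a coding cookie.  A minimal usage sketch, following
# the same pattern as assert_compile_unicode above:
#
#     code = compile_unicode(u"# coding: utf-8\nx = 1\n", "<string>", "exec")
#     globs = {}
#     exec(code, globs)
#     assert globs['x'] == 1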