validator.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. #!/usr/bin/env python
  2. """Python script to validate that code does not contain any console platform specific
  3. references or code that should not be published. Can be used to scan any directory."""
  4. #
  5. # Copyright (c) Contributors to the Open 3D Engine Project.
  6. # For complete copyright and license terms please see the LICENSE at the root of this distribution.
  7. #
  8. # SPDX-License-Identifier: Apache-2.0 OR MIT
  9. #
  10. #
  11. # pylint: disable-msg=C0301
  12. from __future__ import absolute_import
  13. from __future__ import print_function
  14. import six
  15. import logging
  16. import os
  17. import re
  18. import sys
  19. import traceback
  20. import json
  21. from pathlib import Path
  22. from optparse import OptionParser
  23. if six.PY2:
  24. from cStringIO import StringIO
  25. else:
  26. from io import StringIO
  27. import validator_data_LEGAL_REVIEW_REQUIRED # pull in the data we need to configure this tool
  28. class Validator(object):
  29. """Class to contain the validator program"""
  30. # Set of all acceptable_use patterns actually used during the run
  31. pattern_used = set([])
  32. def validate_line(self, line, filepath, fileline, failed, errors, info):
  33. """Check that a line of text does not have any pattern that leaks IP."""
  34. # The acceptable_use_patterns must be kept up to date as the code base evolves, and it is desirable
  35. # that this list be short. The -u command line option will run the validator in a mode
  36. # where it will report acceptable use patterns that did not get used in the check.
  37. # Those patterns are candidates for elimination. Ultimately we want the acceptable use
  38. # pattern set to be empty, or as close to empty as possible.
  39. #
  40. # This is a 3 stage check. First we do a rapid check against fixed text
  41. # strings (self.prefilter). These strings must capture a superset of potential
  42. # problematic texts. The test is run by lowercasing all characters, so that
  43. # case specific strings are not required. For example "abc" tests for the
  44. # presence of any combination of the letters in any case.
  45. # This first stage is a speed optimization, because regular expression matching
  46. # in python is really horribly slow (orders of magnitude slower than unix
  47. # grep in many cases!).
  48. lower = line.lower()
  49. if any(ext in lower for ext in self.prefilter):
  50. line = line.rstrip(os.linesep)
  51. # Before we engage the accepted use machinery, make sure that the line contains a bad pattern that would
  52. # otherwise fail validation
  53. if self.compiled_bad_pattern.search(line) != None:
  54. # The second stage checks against a more specific regular expression. (self.compiled_acceptable_pattern)
  55. # if the regular expression is matched, then the line contains information
  56. # that may be acceptable use or a false positive. We check if the line is covered by an established
  57. # exception by the presence of matching pattern in the self.acceptable_use_patterns regular expression.
  58. # Make a copy of the line and replace any matching accepted uses with EXCEPTION. If something
  59. # changes, then we have a possible accepted use which we'll have to vet.
  60. origline = line
  61. line = self.compiled_acceptable_pattern.sub('EXCEPTION', line)
  62. accepting_patterns = []
  63. if line != origline:
  64. # Convert the path to forward slashes, then loop through all instances of an accepted use match
  65. # on this line
  66. line = origline
  67. canonical_filepath = os.path.abspath(filepath).replace('\\', '/')
  68. while True:
  69. m = self.compiled_acceptable_pattern.search(line)
  70. if not m:
  71. break
  72. # Find the specific matching pattern
  73. match = line[m.start():m.end()]
  74. found = False
  75. for pattern,compiledp,fileset in self.acceptable_use_patterns:
  76. if compiledp.search(match) != None:
  77. # If file testing isn't enabled, assume the match is good. Otherwise, search the file
  78. # patterns to see if this file is allowed
  79. if self.options.ignore_file_paths:
  80. found = True
  81. else:
  82. for fp in fileset:
  83. if fp.search(canonical_filepath) != None:
  84. found = True
  85. break
  86. if not found:
  87. errors.append("File rejected by pattern '{}': {}: line {}: {}".format(
  88. pattern, filepath, fileline, origline))
  89. # First pattern match is good enough
  90. if found:
  91. if self.options.check_unused_patterns:
  92. self.pattern_used.add(pattern)
  93. accepting_patterns.append(pattern)
  94. break
  95. # If no match was vetted by the filename, stop processing the line.
  96. if not found:
  97. break
  98. # We remove the accepted use from the line and replace it with "EXCEPTION" and retest the line.
  99. line = line[:m.start()] + 'EXCEPTION' + line[m.end():]
  100. # Once any possible accepted uses have been replaced, we check the resultant line to see if any bad
  101. # patterns remain in the line. If so, then we fail the line.
  102. if self.compiled_bad_pattern.search(line) != None:
  103. if self.options.list:
  104. self.output_unique_filepath(filepath)
  105. else:
  106. errors.append('validation failure in {}: line {}: {}'.format(filepath, fileline, origline))
  107. failed = 1
  108. # Otherwise, spit out the details of each match
  109. else:
  110. for a in accepting_patterns:
  111. info.append("Allowed by '{}' pattern: {}: line {}: {}".format(a, filepath, fileline, origline))
  112. if self.options.exception_file:
  113. self.exceptions_output.write("Allowed by '{}' pattern: {}: line {}: {}\n".format(a, filepath, fileline, origline))
  114. return failed
  115. def output_unique_filepath(self, filepath):
  116. """Output the name of a file exactly once in a run."""
  117. # Dictionary to use to ensure that we know which files we have already talked about
  118. global printed_filepath
  119. try:
  120. if not filepath in printed_filepath:
  121. print(filepath)
  122. printed_filepath[filepath] = 1
  123. except:
  124. # Global dict does not exist yet. Make it.
  125. printed_filepath = {}
  126. printed_filepath[filepath] = 1
  127. # Just print the filepath if we have not already printed it
  128. print(filepath)
  129. def validate_file(self, filepath):
  130. """Validate the content of a file 'filepath'.
  131. Return 0 if no issues are found, and 1 if an issue was noted."""
  132. failed = 0
  133. # Otherwise read the file off disk. Check the filename itself to make sure no naughty
  134. # bits are there.
  135. errors = []
  136. info = []
  137. failed = self.validate_line(filepath, 'filename', 0, failed, errors, info)
  138. for e in errors:
  139. logging.error(e)
  140. for i in info:
  141. logging.info(i)
  142. # Check if this file is a binary file, or an extension we always skip
  143. # These extensions are here because they sometimes look like text files,
  144. # but are not really text files in practice.
  145. if validator_data_LEGAL_REVIEW_REQUIRED.skip_file(filepath):
  146. logging.debug('Skipping %s', filepath)
  147. return
  148. # Python3 requires specific encoding but the repo is a mix of UTF-8, UTF-16, and latin-1
  149. # Just try except the possibilities until it works
  150. encodings = ["utf8", "utf-16-le", "utf-16-be", "latin-1"]
  151. for encoding_format in encodings:
  152. try:
  153. with open(filepath, encoding=encoding_format) as f:
  154. logging.debug('Validating %s', filepath)
  155. # Take care to deal with files that have unreasonably large lines.
  156. # if we don't do this, then the validator can segfault as it tries
  157. # to read a line that is insanely large. A try/except will not prevent this.
  158. # This can happen, for example, in XML file or obj files (the 3d format, not
  159. # the compiler output kind).
  160. # Quesion: if we encounter such a file, should we call it "binary" and quit?
  161. # TODO: There is a small "leak" here, in that we don't deal with crossing
  162. # a non line boundary clealy. In particular, a validation pattern
  163. # that occurs in the boundary crossing will not be properly searched.
  164. # The easiest thing to do is probably to retain the prior 128 bytes or
  165. # so from the prior portion of the line as a prefix to the tail of the rest of the line.
  166. # This will make the logic below much more complex.
  167. fileline = 0
  168. line = f.readline(10000)
  169. while line != '':
  170. fileline += 1
  171. errors = []
  172. info = []
  173. failed = self.validate_line(line, filepath, fileline, failed, errors, info)
  174. for e in errors:
  175. logging.error(e)
  176. for i in info:
  177. logging.info(i)
  178. line = f.readline(10000)
  179. return failed
  180. except UnicodeDecodeError:
  181. continue
  182. raise UnicodeError("Could not decode {0} due to an unexpected file encoding".format(filepath))
  183. # Walk directory tree and find all file paths, and run the search for bad code on each file.
  184. # We explicitly skip "SDKs" directories, "BinTemp" and "Python" directories and various others.
  185. # The first two are never part of the package, and Python is a special case SDK that we ship as is.\
  186. # TODO: Perhaps the directories to skip should become a parameter so we can use the validator
  187. # on non-Lumberyard trees.
  188. def validate_directory_tree(self, root, platform):
  189. """Walk from root to find all files to validate and call the validator on each file.
  190. Return 0 if no problems where found, and 1 if any validation failures occured."""
  191. counter = 0
  192. platform_failed = 0
  193. scanned = 0
  194. validations = 0
  195. bypassed_directories = validator_data_LEGAL_REVIEW_REQUIRED.get_bypassed_directories(self.options.all)
  196. for dirname, dirnames, filenames in os.walk(os.path.normpath(root)):
  197. # First deal with the files in the current directory
  198. for filename in filenames:
  199. filepath = os.path.join(dirname, filename)
  200. scanned += 1
  201. file_failed = self.validate_file(os.path.normpath(filepath))
  202. if file_failed:
  203. platform_failed = file_failed
  204. else:
  205. validations += 1
  206. # Trim out allowlisted subdirectories in the current directory if allowed
  207. for name in bypassed_directories:
  208. if name in dirnames:
  209. dirnames.remove(name)
  210. if scanned == 0:
  211. logging.error('No files scanned at target search directory: %s', root)
  212. platform_failed = 1
  213. else:
  214. print('validated {} of {} files'.format(validations, scanned))
  215. return platform_failed
  216. def compile_filter_patterns(self, platform):
  217. """Join together patterns listed in data file into single patterns and compile for speed."""
  218. if not platform in validator_data_LEGAL_REVIEW_REQUIRED.restricted_platforms:
  219. logging.error('platform data for platform %s not provided in validator_data_LEGAL_REVIEW_REQUIRED.py.', platform)
  220. sys.exit(1)
  221. restricted_platform = validator_data_LEGAL_REVIEW_REQUIRED.restricted_platforms[platform]
  222. if not 'prefilter' in restricted_platform:
  223. logging.error('prefilter list not found for platform %s in validator_data_LEGAL_REVIEW_REQUIRED.py', platform)
  224. sys.exit(1)
  225. self.prefilter = restricted_platform['prefilter']
  226. if not 'patterns' in restricted_platform:
  227. logging.error('patterns list not found for platform %s in validator_data_LEGAL_REVIEW_REQUIRED.py', platform)
  228. sys.exit(1)
  229. self.bad_patterns = restricted_platform['patterns']
  230. if not 'acceptable_use' in restricted_platform:
  231. logging.error('acceptable_use list not found for platform %s in validator_data_LEGAL_REVIEW_REQUIRED.py', platform)
  232. sys.exit(1)
  233. self.acceptable_use_patterns = []
  234. for p,fileset in restricted_platform['acceptable_use']:
  235. try:
  236. self.acceptable_use_patterns.append((p, re.compile(p), [re.compile(f) for f in fileset]))
  237. except:
  238. logging.error("Couldn't compile pattern {}...".format(p))
  239. traceback.print_exc()
  240. sys.exit(1)
  241. try:
  242. # Compile the search patterns for speed
  243. bad_pattern = '|'.join(self.bad_patterns)
  244. acceptable_pattern = '|'.join([p for p,compiledp,fileset in self.acceptable_use_patterns])
  245. self.compiled_bad_pattern = re.compile(bad_pattern)
  246. self.compiled_acceptable_pattern = re.compile(acceptable_pattern)
  247. except:
  248. logging.error('Could not compile patterns for validation. Check patterns in validator_data_LEGAL_REVIEW_REQUIRED.py for correctness.')
  249. traceback.print_exc()
  250. sys.exit(1)
  251. def test_prefilter_covers_bad_patterns(self):
  252. double_pattern = re.compile(r'\[(.)\1\]')
  253. for bad in self.bad_patterns:
  254. reduced = re.sub(double_pattern, r'\1', bad.lower())
  255. found = False
  256. for p in self.prefilter:
  257. if p in reduced:
  258. found = True
  259. break
  260. if not found:
  261. logging.error('Could not find a prefilter for {}.'.format(bad))
  262. return False
  263. return True
  264. def test_all_bad_patterns_active(self, platform):
  265. # Open the canary file relative to the validator script, that way we don't have to worry about temporary files and whatnot
  266. # Once we split the repos we will have to worry about multiple root points etc. but that is a problem for future us.
  267. # All the packaging safelist stuff goes away once repo is split for platforms
  268. this_path = Path(__file__).resolve()
  269. root_folder = this_path.parents[2]
  270. relative_folder = os.path.relpath(this_path.parent, root_folder)
  271. canary_file = os.path.join(root_folder, 'restricted', platform, relative_folder, platform.lower() + '_canary.txt')
  272. try:
  273. with open(canary_file) as canary:
  274. bad_patterns = self.bad_patterns
  275. canary.seek(0, 0)
  276. fileline = 0
  277. for line in canary:
  278. fileline += 1
  279. errors = []
  280. info = []
  281. # Each validation failure needs to be tracked back to the patterns that detect it. Once we find one, eliminate it
  282. # from the search list since we've found an instance where it would trigger
  283. if self.validate_line(line, canary_file, fileline, 0, errors, info) == 1:
  284. found = []
  285. for bad in bad_patterns:
  286. if re.search(bad, line):
  287. found.append(bad)
  288. for bad in found:
  289. bad_patterns.remove(bad)
  290. # If the search list isn't empty, then whatever remains needs a canary
  291. if len(bad_patterns) > 0:
  292. for bad in bad_patterns:
  293. logging.error("Could not find a canary for '{}'.".format(bad))
  294. return False
  295. except:
  296. logging.error('Could not open canary file {}.'.format(canary_file))
  297. return False
  298. return True
  299. def test(self, platform):
  300. if not self.test_prefilter_covers_bad_patterns():
  301. return False
  302. if not self.test_all_bad_patterns_active(platform):
  303. return False
  304. return True
  305. def validate(self, platform):
  306. self.compile_filter_patterns(platform)
  307. if not self.test(platform):
  308. logging.error('Validation could not pass {} self tests! Results cannot be trusted!'.format(platform))
  309. sys.exit(1)
  310. else:
  311. print('{} self tests SUCCEEDED'.format(platform))
  312. # Add the source code / SDK paths to check
  313. platform_failed = self.validate_directory_tree(os.path.abspath(self.args[0]), platform)
  314. # If the user asked, output any acceptable_use patterns that didn't get used
  315. if self.options.check_unused_patterns:
  316. for pattern,compiledp,fileset in self.acceptable_use_patterns:
  317. if not pattern in self.pattern_used:
  318. print("UNUSED ACCEPTABLE_USE PATTERN: '{}'".format(pattern))
  319. return platform_failed
  320. def __init__(self, options, args):
  321. self.options = options
  322. self.args = args
  323. self.prefilter = None
  324. self.compiled_bad_pattern = None
  325. self.compiled_acceptable_pattern = None
  326. self.bad_patterns = None
  327. self.acceptable_use_patterns = None
  328. def parse_options():
  329. """Set up the options parser, and parse the options the user gave to validator."""
  330. usage = 'usage: %prog [options] scandir'
  331. parser = OptionParser(usage)
  332. platform_choices = list(validator_data_LEGAL_REVIEW_REQUIRED.restricted_platforms_for_package.keys())
  333. platform_choices.sort()
  334. parser.add_option('--package_platform', action='store', type='choice',
  335. choices=platform_choices,
  336. dest='package_platform',
  337. help='Package platform to validate. Must be one of {}.'.format(platform_choices))
  338. parser.add_option('-s', '--store-exceptions', action='store', type='string', default='',
  339. dest='exception_file',
  340. help='Store list of lines that the validator gave exceptions to by matching accepted use patterns. These can be diffed with prior runs to see what is changing.')
  341. parser.add_option('-v', '--verbose', action='store', type='choice', choices=['0', '1', '2'], default='0',
  342. dest='verbose',
  343. help='Verbose output. Level 0 = only output lines that fail validation. '
  344. 'Level 1 = output of lines that would have failed without an accepted usage exception. '
  345. 'Level 2 also includes output of each filename being handled.')
  346. parser.add_option('-u', '--check-unused-patterns', action='store_true',
  347. dest='check_unused_patterns',
  348. help='Report on any acceptable_use patterns that are not matched.')
  349. parser.add_option('-l', '--list', action='store_true',
  350. dest='list',
  351. help='Only list filenames with validation errors. Useful as input to a set of files to edit or otherwise process.')
  352. parser.add_option('-a', '--all', action='store_true',
  353. dest='all',
  354. help='Do not skip any files or subdirectories when processing. Should be used on final clean code only. If you use this on your build tree in place lots of temp files will match.')
  355. parser.add_option('-i', '--ignore-file-paths', action='store_true',
  356. dest='ignore_file_paths',
  357. help='disable the filepath check for accepted_use patterns. Should only be when targeting a directory other than /dev/.')
  358. (options, args) = parser.parse_args()
  359. if options.verbose == '1':
  360. logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
  361. elif options.verbose == '2':
  362. logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG)
  363. else:
  364. logging.basicConfig(format='%(levelname)s: %(message)s')
  365. if len(args) != 1:
  366. parser.error('no directory to scan specified and/or incorrect number of directories specified.')
  367. return options, args
  368. def main():
  369. """Main function for validator script"""
  370. options, args = parse_options()
  371. validator = Validator(options, args)
  372. package_failed = 0
  373. package_platform = validator.options.package_platform
  374. prohibited_platforms = validator_data_LEGAL_REVIEW_REQUIRED.get_prohibited_platforms_for_package(package_platform)
  375. if validator.options.exception_file != '':
  376. try:
  377. validator.exceptions_output = open(validator.options.exception_file, 'w', errors='ignore')
  378. except:
  379. logging.error("Cannot open exceptions output file '%s'", validator.options.exception_file)
  380. sys.exit(1)
  381. for platform in prohibited_platforms:
  382. print('validating {} against {} for package platform {}'.format(args[0], platform, package_platform))
  383. platform_failed = validator.validate(platform)
  384. if platform_failed:
  385. print('{} FAILED validation against {} for package platform {}'.format(args[0], platform, package_platform))
  386. package_failed = platform_failed
  387. else:
  388. print('{} is VALIDATED against {} for package platform {}'.format(args[0], platform, package_platform))
  389. if validator.options.exception_file != '':
  390. validator.exceptions_output.close()
  391. return package_failed
  392. if __name__ == '__main__':
  393. # pylint: disable-msg=C0103
  394. main_results = main()
  395. sys.exit(main_results)