Devyatyi9
/
o3de
mirror de https://github.com/o3de/o3de


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
							#!/usr/bin/env python
"""Python script to validate that code does not contain an console platform specific
references or code that should not be published. Can be used to scan any directory."""

#
# Copyright (c) Contributors to the Open 3D Engine Project.
# For complete copyright and license terms please see the LICENSE at the root of this distribution.
#
# SPDX-License-Identifier: Apache-2.0 OR MIT
#
#

# pylint: disable-msg=C0301

from __future__ import absolute_import
from __future__ import print_function
import six
import logging
import os
import re
import sys
import traceback
import json
from pathlib import Path
from optparse import OptionParser
if six.PY2:
    from cStringIO import StringIO
else:
    from io import StringIO

import validator_data_LEGAL_REVIEW_REQUIRED # pull in the data we need to configure this tool

class Validator(object):
    """Class to contain the validator program"""
    # Set of all acceptable_use patterns actually used during the run
    pattern_used = set([])

    def validate_line(self, line, filepath, fileline, failed, errors, info):
        """Check that a line of text does not have any pattern that leaks IP."""
        # The acceptable_use_patterns must be kept up to date as the code base evolves, and it is desirable
        # that this list be short. The -u command line option will run the validator in a mode
        # where it will report acceptable use patterns that did not get used in the check.
        # Those patterns are candidates for elimination. Ultimately we want the acceptable use
        # pattern set to be empty, or as close to empty as possible.
        #
        # This is a 3 stage check. First we do a rapid check against fixed text
        # strings (self.prefilter). These strings must capture a superset of potential
        # problematic texts. The test is run by lowercasing all characters, so that
        # case specific strings are not required. For example "abc" tests for the
        # presence of any combination of the letters in any case.
        # This first stage is a speed optimization, because regular expression matching
        # in python is really horribly slow (orders of magnitude slower than unix
        # grep in many cases!).
        lower = line.lower()
        if any(ext in lower for ext in self.prefilter):
            line = line.rstrip(os.linesep)

            # Before we engage the accepted use machinery, make sure that the line contains a bad pattern that would
            # otherwise fail validation
            if self.compiled_bad_pattern.search(line) != None:

                # The second stage checks against a more specific regular expression. (self.compiled_acceptable_pattern)
                # if the regular expression is matched, then the line contains information
                # that may be acceptable use or a false positive. We check if the line is covered by an established
                # exception by the presence of matching pattern in the self.acceptable_use_patterns regular expression.
                # Make a copy of the line and replace any matching accepted uses with EXCEPTION. If something
                # changes, then we have a possible accepted use which we'll have to vet.
                origline = line
                line = self.compiled_acceptable_pattern.sub('EXCEPTION', line)
                accepting_patterns = []
                if line != origline:

                    # Convert the path to forward slashes, then loop through all instances of an accepted use match
                    # on this line
                    line = origline
                    canonical_filepath = os.path.abspath(filepath).replace('\\', '/')
                    while True:
                        m = self.compiled_acceptable_pattern.search(line)
                        if not m:
                            break

                        # Find the specific matching pattern
                        match = line[m.start():m.end()]
                        found = False
                        for pattern,compiledp,fileset in self.acceptable_use_patterns:
                            if compiledp.search(match) != None:

                                # If file testing isn't enabled, assume the match is good. Otherwise, search the file
                                # patterns to see if this file is allowed
                                if self.options.ignore_file_paths:
                                    found = True
                                else:
                                    for fp in fileset:
                                        if fp.search(canonical_filepath) != None:
                                            found = True
                                            break
                                    if not found:
                                        errors.append("File rejected by pattern '{}': {}: line {}: {}".format(
                                            pattern, filepath, fileline, origline))

                                # First pattern match is good enough
                                if found:
                                    if self.options.check_unused_patterns:
                                        self.pattern_used.add(pattern)
                                    accepting_patterns.append(pattern)
                                break

                        # If no match was vetted by the filename, stop processing the line.
                        if not found:
                            break

                        # We remove the accepted use from the line and replace it with "EXCEPTION" and retest the line.
                        line = line[:m.start()] + 'EXCEPTION' + line[m.end():]

                # Once any possible accepted uses have been replaced, we check the resultant line to see if any bad
                # patterns remain in the line. If so, then we fail the line.
                if self.compiled_bad_pattern.search(line) != None:
                    if self.options.list:
                        self.output_unique_filepath(filepath)
                    else:
                        errors.append('validation failure in {}: line {}:  {}'.format(filepath, fileline, origline))
                    failed = 1

                # Otherwise, spit out the details of each match
                else:
                    for a in accepting_patterns:
                        info.append("Allowed by '{}' pattern: {}: line {}: {}".format(a, filepath, fileline, origline))
                        if self.options.exception_file:
                            self.exceptions_output.write("Allowed by '{}' pattern: {}: line {}: {}\n".format(a, filepath, fileline, origline))
        return failed

    def output_unique_filepath(self, filepath):
        """Output the name of a file exactly once in a run."""
        # Dictionary to use to ensure that we know which files we have already talked about
        global printed_filepath
        try:
            if not filepath in printed_filepath:
                print(filepath)
                printed_filepath[filepath] = 1
        except:
            # Global dict does not exist yet. Make it.
            printed_filepath = {}
            printed_filepath[filepath] = 1
            # Just print the filepath if we have not already printed it
            print(filepath)

    def validate_file(self, filepath):
        """Validate the name and the content of the file 'filepath'.

        The path text is checked first. Unless the data module asks for the file to be
        skipped, the file is then read and every line (in pieces of at most 10000
        characters) is checked as well. All error and info messages are sent to logging.

        Return 0 if no issues are found, and 1 if an issue was noted.
        Raise UnicodeError if none of the attempted encodings can decode the file.
        """
        # Check the filename itself to make sure no naughty bits are there. The path is
        # passed as the text to scan; 'filename' and 0 take the place of the file and line.
        # NOTE(review): because 'filename' is passed as the filepath argument, messages and
        # acceptable-use file checks for this call refer to 'filename' - confirm intended.
        errors = []
        info = []
        failed = self.validate_line(filepath, 'filename', 0, 0, errors, info)
        for e in errors:
            logging.error(e)
        for i in info:
            logging.info(i)

        # Check if this file is a binary file, or an extension we always skip.
        # These extensions are here because they sometimes look like text files,
        # but are not really text files in practice.
        if validator_data_LEGAL_REVIEW_REQUIRED.skip_file(filepath):
            logging.debug('Skipping %s', filepath)
            # Hand back the result of the filename check. A bare return here used to
            # yield None, which made the caller count a failing filename as validated.
            return failed

        # Python3 requires specific encoding but the repo is a mix of UTF-8, UTF-16, and latin-1.
        # Just try the possibilities in turn until one of them decodes the whole file.
        encodings = ["utf8", "utf-16-le", "utf-16-be", "latin-1"]
        for encoding_format in encodings:
            try:
                with open(filepath, encoding=encoding_format) as f:
                    logging.debug('Validating %s', filepath)

                    # Take care to deal with files that have unreasonably large lines by reading
                    # at most 10000 characters at a time. The original author notes that the
                    # validator can otherwise segfault on an insanely large line (seen in XML
                    # files and obj files - the 3d format, not the compiler output kind) and
                    # that a try/except will not prevent this.
                    # Question: if we encounter such a file, should we call it "binary" and quit?
                    # TODO: There is a small "leak" here, in that we don't deal with crossing
                    # a non line boundary cleanly. In particular, a validation pattern
                    # that straddles the boundary will not be properly searched.
                    # The easiest thing to do is probably to retain the prior 128 characters or
                    # so of the line as a prefix to the rest of the line, at the cost of
                    # making the logic below more complex.
                    #
                    # readline returns '' only at end of file, which ends the iteration.
                    for fileline, line in enumerate(iter(lambda: f.readline(10000), ''), 1):
                        errors = []
                        info = []
                        failed = self.validate_line(line, filepath, fileline, failed, errors, info)
                        for e in errors:
                            logging.error(e)
                        for i in info:
                            logging.info(i)
                return failed
            except UnicodeDecodeError:
                # Wrong guess for this file; start over with the next encoding.
                continue

        raise UnicodeError("Could not decode {0} due to an unexpected file encoding".format(filepath))

    # Walk the directory tree, find all file paths, and run the search for bad code on each file.
    # We explicitly skip "SDKs", "BinTemp" and "Python" directories and various others.
    # The first two are never part of the package, and Python is a special case SDK that we ship as is.
    # TODO: Perhaps the directories to skip should become a parameter so we can use the validator
    # on non-Lumberyard trees.
    def validate_directory_tree(self, root, platform):
        """Walk down from 'root' and run validate_file() on every file that is found.

        'platform' is accepted for the caller's convenience and is not used here.
        Return 0 if no problems were found, the failing result of validate_file() if
        any file failed, and 1 if there were no files to scan at all.
        """
        platform_failed = 0
        scanned = 0
        validations = 0
        bypassed_directories = validator_data_LEGAL_REVIEW_REQUIRED.get_bypassed_directories(self.options.all)

        for dirname, dirnames, filenames in os.walk(os.path.normpath(root)):
            # Files of the current directory come first
            for filename in filenames:
                scanned += 1
                target = os.path.normpath(os.path.join(dirname, filename))
                outcome = self.validate_file(target)
                if outcome:
                    platform_failed = outcome
                else:
                    validations += 1

            # Removing a name from 'dirnames' in place keeps os.walk from descending into it
            for name in bypassed_directories:
                if name in dirnames:
                    dirnames.remove(name)

        if scanned != 0:
            print('validated {} of {} files'.format(validations, scanned))
        else:
            logging.error('No files scanned at target search directory: %s', root)
            platform_failed = 1
        return platform_failed


    def compile_filter_patterns(self, platform):
        """Join together patterns listed in data file into single patterns and compile for speed.

        Reads the 'prefilter', 'patterns' and 'acceptable_use' entries for 'platform' from
        validator_data_LEGAL_REVIEW_REQUIRED.restricted_platforms and fills in
        self.prefilter, self.bad_patterns, self.acceptable_use_patterns (a list of
        (pattern text, compiled pattern, list of compiled file expressions) tuples),
        self.compiled_bad_pattern and self.compiled_acceptable_pattern.

        Logs an error and exits the process with status 1 if the platform or one of its
        entries is missing, or if a pattern cannot be compiled.
        """
        if platform not in validator_data_LEGAL_REVIEW_REQUIRED.restricted_platforms:
            logging.error('platform data for platform %s not provided in validator_data_LEGAL_REVIEW_REQUIRED.py.', platform)
            sys.exit(1)
        restricted_platform = validator_data_LEGAL_REVIEW_REQUIRED.restricted_platforms[platform]
        if 'prefilter' not in restricted_platform:
            logging.error('prefilter list not found for platform %s in validator_data_LEGAL_REVIEW_REQUIRED.py', platform)
            sys.exit(1)
        self.prefilter = restricted_platform['prefilter']
        if 'patterns' not in restricted_platform:
            logging.error('patterns list not found for platform %s in validator_data_LEGAL_REVIEW_REQUIRED.py', platform)
            sys.exit(1)
        self.bad_patterns = restricted_platform['patterns']
        if 'acceptable_use' not in restricted_platform:
            logging.error('acceptable_use list not found for platform %s in validator_data_LEGAL_REVIEW_REQUIRED.py', platform)
            sys.exit(1)

        # Each acceptable use entry pairs a pattern with the file expressions it applies to.
        # Compile them one by one so a broken entry can be named in the error message.
        self.acceptable_use_patterns = []
        for p,fileset in restricted_platform['acceptable_use']:
            try:
                self.acceptable_use_patterns.append((p, re.compile(p), [re.compile(f) for f in fileset]))
            except Exception:
                # Exception rather than a bare except, so Ctrl-C is not reported as a bad pattern
                logging.error("Couldn't compile pattern {}...".format(p))
                traceback.print_exc()
                sys.exit(1)

        try:
            # Compile the search patterns for speed: one alternation for all bad patterns and
            # one for all acceptable use patterns.
            bad_pattern = '|'.join(self.bad_patterns)
            acceptable_pattern = '|'.join([p for p,compiledp,fileset in self.acceptable_use_patterns])
            self.compiled_bad_pattern = re.compile(bad_pattern)
            self.compiled_acceptable_pattern = re.compile(acceptable_pattern)
        except Exception:
            logging.error('Could not compile patterns for validation. Check patterns in validator_data_LEGAL_REVIEW_REQUIRED.py for correctness.')
            traceback.print_exc()
            sys.exit(1)

    def test_prefilter_covers_bad_patterns(self):
        double_pattern = re.compile(r'\[(.)\1\]')
        for bad in self.bad_patterns:
            reduced = re.sub(double_pattern, r'\1', bad.lower())
            found = False
            for p in self.prefilter:
                if p in reduced:
                    found = True
                    break
            if not found:
                logging.error('Could not find a prefilter for {}.'.format(bad))
                return False
        return True

    def test_all_bad_patterns_active(self, platform):

        # Open the canary file relative to the validator script, that way we don't have to worry about temporary files and whatnot
        # Once we split the repos we will have to worry about multiple root points etc. but that is a problem for future us.
        # All the packaging safelist stuff goes away once repo is split for platforms
        this_path = Path(__file__).resolve()
        root_folder = this_path.parents[2]
        relative_folder = os.path.relpath(this_path.parent, root_folder)
        canary_file = os.path.join(root_folder, 'restricted', platform, relative_folder, platform.lower() + '_canary.txt')
        try:
            with open(canary_file) as canary:
                bad_patterns = self.bad_patterns
                canary.seek(0, 0)
                fileline = 0
                for line in canary:
                    fileline += 1
                    errors = []
                    info = []

                    # Each validation failure needs to be tracked back to the patterns that detect it. Once we find one, eliminate it
                    # from the search list since we've found an instance where it would trigger
                    if self.validate_line(line, canary_file, fileline, 0, errors, info) == 1:
                        found = []
                        for bad in bad_patterns:
                            if re.search(bad, line):
                                found.append(bad)
                        for bad in found:
                            bad_patterns.remove(bad)

                # If the search list isn't empty, then whatever remains needs a canary
                if len(bad_patterns) > 0:
                    for bad in bad_patterns:
                        logging.error("Could not find a canary for '{}'.".format(bad))
                    return False
        except:
            logging.error('Could not open canary file {}.'.format(canary_file))
            return False
        return True

    def test(self, platform):
        if not self.test_prefilter_covers_bad_patterns():
            return False
        if not self.test_all_bad_patterns_active(platform):
            return False
        return True

    def validate(self, platform):
        """Validate the directory given on the command line against one platform.

        Compiles the patterns for 'platform', runs the self tests (exiting the process
        with status 1 if they fail), scans the tree rooted at self.args[0] and, when
        requested, lists the acceptable_use patterns that were never used.

        Return the result of validate_directory_tree().
        """
        self.compile_filter_patterns(platform)
        if self.test(platform):
            print('{} self tests SUCCEEDED'.format(platform))
        else:
            logging.error('Validation could not pass {} self tests! Results cannot be trusted!'.format(platform))
            sys.exit(1)

        # Scan the source code / SDK tree the user pointed us at
        scan_root = os.path.abspath(self.args[0])
        platform_failed = self.validate_directory_tree(scan_root, platform)

        # If the user asked, output any acceptable_use patterns that didn't get used
        if self.options.check_unused_patterns:
            unused = [entry[0] for entry in self.acceptable_use_patterns
                      if entry[0] not in self.pattern_used]
            for pattern in unused:
                print("UNUSED ACCEPTABLE_USE PATTERN: '{}'".format(pattern))

        return platform_failed

    def __init__(self, options, args):
        self.options = options
        self.args = args
        self.prefilter = None
        self.compiled_bad_pattern = None
        self.compiled_acceptable_pattern = None
        self.bad_patterns = None
        self.acceptable_use_patterns = None

def parse_options():
    """Build the command line parser, parse the command line and configure logging.

    The verbosity option selects the logging level: '1' enables INFO, '2' enables DEBUG,
    and '0' leaves the logging default in place. Exactly one positional argument (the
    directory to scan) is required; otherwise the parser reports an error.

    Return the (options, args) pair produced by the parser.
    """
    parser = OptionParser('usage: %prog [options] scandir')
    platform_choices = sorted(validator_data_LEGAL_REVIEW_REQUIRED.restricted_platforms_for_package.keys())

    parser.add_option('--package_platform', dest='package_platform',
                      action='store', type='choice', choices=platform_choices,
                      help='Package platform to validate. Must be one of {}.'.format(platform_choices))
    parser.add_option('-s', '--store-exceptions', dest='exception_file',
                      action='store', type='string', default='',
                      help='Store list of lines that the validator gave exceptions to by matching accepted use patterns. These can be diffed with prior runs to see what is changing.')
    parser.add_option('-v', '--verbose', dest='verbose',
                      action='store', type='choice', choices=['0', '1', '2'], default='0',
                      help='Verbose output. Level 0 = only output lines that fail validation. '
                           'Level 1 = output of lines that would have failed without an accepted usage exception. '
                           'Level 2 also includes output of each filename being handled.')
    parser.add_option('-u', '--check-unused-patterns', dest='check_unused_patterns',
                      action='store_true',
                      help='Report on any acceptable_use patterns that are not matched.')
    parser.add_option('-l', '--list', dest='list',
                      action='store_true',
                      help='Only list filenames with validation errors. Useful as input to a set of files to edit or otherwise process.')
    parser.add_option('-a', '--all', dest='all',
                      action='store_true',
                      help='Do not skip any files or subdirectories when processing. Should be used on final clean code only. If you use this on your build tree in place lots of temp files will match.')
    parser.add_option('-i', '--ignore-file-paths', dest='ignore_file_paths',
                      action='store_true',
                      help='disable the filepath check for accepted_use patterns. Should only be when targeting a directory other than /dev/.')

    options, args = parser.parse_args()

    # Only pass an explicit level for the verbose settings; level '0' keeps the default.
    log_settings = {'format': '%(levelname)s: %(message)s'}
    level_for_verbosity = {'1': logging.INFO, '2': logging.DEBUG}
    if options.verbose in level_for_verbosity:
        log_settings['level'] = level_for_verbosity[options.verbose]
    logging.basicConfig(**log_settings)

    if len(args) != 1:
        parser.error('no directory to scan specified and/or incorrect number of directories specified.')
    return options, args

def main():
    """Main function for validator script.

    Parses the command line, then validates the scan directory against every platform
    that the data module reports as prohibited for the selected package platform.

    Return 0 when every platform validated, otherwise the failing result of the last
    platform that failed. Exits the process with status 1 if the exceptions output file
    cannot be opened.
    """
    options, args = parse_options()
    validator = Validator(options, args)

    package_failed = 0
    package_platform = validator.options.package_platform
    prohibited_platforms = validator_data_LEGAL_REVIEW_REQUIRED.get_prohibited_platforms_for_package(package_platform)

    store_exceptions = validator.options.exception_file != ''
    if store_exceptions:
        try:
            validator.exceptions_output = open(validator.options.exception_file, 'w', errors='ignore')
        except Exception:
            # Exception rather than a bare except, so Ctrl-C is not reported as a file problem
            logging.error("Cannot open exceptions output file '%s'", validator.options.exception_file)
            sys.exit(1)

    try:
        for platform in prohibited_platforms:
            print('validating {} against {} for package platform {}'.format(args[0], platform, package_platform))
            platform_failed = validator.validate(platform)
            if platform_failed:
                print('{} FAILED validation against {} for package platform {}'.format(args[0], platform, package_platform))
                package_failed = platform_failed
            else:
                print('{} is VALIDATED against {} for package platform {}'.format(args[0], platform, package_platform))
    finally:
        # validate() can raise or call sys.exit(); close the exceptions file on every
        # path so that what was written to it so far is flushed to disk.
        if store_exceptions:
            validator.exceptions_output.close()

    return package_failed


if __name__ == '__main__':
    # Hand the validation result to the shell as the process exit status.
    sys.exit(main())