transform_content.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. #!/usr/bin/env python
  2. # Copyright 2008 Brett Slatkin
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. __author__ = "Brett Slatkin (bslatkin@gmail.com)"
  16. import os
  17. import re
  18. import urlparse
  19. ################################################################################
  20. # URLs that have absolute addresses
  21. ABSOLUTE_URL_REGEX = r"(http(s?):)?//(?P<url>[^\"'> \t\)]+)"
  22. # URLs that are relative to the base of the current hostname.
  23. BASE_RELATIVE_URL_REGEX = r"/(?!(/)|(http(s?)://)|(url\())(?P<url>[^\"'> \t\)]*)"
  24. # URLs that have '../' or './' to start off their paths.
  25. TRAVERSAL_URL_REGEX = r"(?P<relative>\.(\.)?)/(?!(/)|(http(s?)://)|(url\())(?P<url>[^\"'> \t\)]*)"
  26. # URLs that are in the same directory as the requested URL.
  27. SAME_DIR_URL_REGEX = r"(?!(/)|(http(s?)://)|(url\())(?P<url>[^\"'> \t\)]+)"
  28. # URL matches the root directory.
  29. ROOT_DIR_URL_REGEX = r"(?!//(?!>))/(?P<url>)(?=[ \t\n]*[\"'\)>/])"
  30. # Start of a tag using 'src' or 'href'
  31. TAG_START = r"(?i)\b(?P<tag>src|href|action|url|background)(?P<equals>[\t ]*=[\t ]*)(?P<quote>[\"']?)"
  32. # Start of a CSS import
  33. CSS_IMPORT_START = r"(?i)@import(?P<spacing>[\t ]+)(?P<quote>[\"']?)"
  34. # CSS url() call
  35. CSS_URL_START = r"(?i)\burl\((?P<quote>[\"']?)"
  36. REPLACEMENT_REGEXES = [
  37. (TAG_START + SAME_DIR_URL_REGEX,
  38. "\g<tag>\g<equals>\g<quote>%(accessed_dir)s\g<url>"),
  39. (TAG_START + TRAVERSAL_URL_REGEX,
  40. "\g<tag>\g<equals>\g<quote>%(accessed_dir)s/\g<relative>/\g<url>"),
  41. (TAG_START + BASE_RELATIVE_URL_REGEX,
  42. "\g<tag>\g<equals>\g<quote>/%(base)s/\g<url>"),
  43. (TAG_START + ROOT_DIR_URL_REGEX,
  44. "\g<tag>\g<equals>\g<quote>/%(base)s/"),
  45. # Need this because HTML tags could end with '/>', which confuses the
  46. # tag-matching regex above, since that's the end-of-match signal.
  47. (TAG_START + ABSOLUTE_URL_REGEX,
  48. "\g<tag>\g<equals>\g<quote>/\g<url>"),
  49. (CSS_IMPORT_START + SAME_DIR_URL_REGEX,
  50. "@import\g<spacing>\g<quote>%(accessed_dir)s\g<url>"),
  51. (CSS_IMPORT_START + TRAVERSAL_URL_REGEX,
  52. "@import\g<spacing>\g<quote>%(accessed_dir)s/\g<relative>/\g<url>"),
  53. (CSS_IMPORT_START + BASE_RELATIVE_URL_REGEX,
  54. "@import\g<spacing>\g<quote>/%(base)s/\g<url>"),
  55. (CSS_IMPORT_START + ABSOLUTE_URL_REGEX,
  56. "@import\g<spacing>\g<quote>/\g<url>"),
  57. (CSS_URL_START + SAME_DIR_URL_REGEX,
  58. "url(\g<quote>%(accessed_dir)s\g<url>"),
  59. (CSS_URL_START + TRAVERSAL_URL_REGEX,
  60. "url(\g<quote>%(accessed_dir)s/\g<relative>/\g<url>"),
  61. (CSS_URL_START + BASE_RELATIVE_URL_REGEX,
  62. "url(\g<quote>/%(base)s/\g<url>"),
  63. (CSS_URL_START + ABSOLUTE_URL_REGEX,
  64. "url(\g<quote>/\g<url>"),
  65. ]
  66. ################################################################################
  67. def TransformContent(base_url, accessed_url, content):
  68. url_obj = urlparse.urlparse(accessed_url)
  69. accessed_dir = os.path.dirname(url_obj.path)
  70. if not accessed_dir.endswith("/"):
  71. accessed_dir += "/"
  72. for pattern, replacement in REPLACEMENT_REGEXES:
  73. fixed_replacement = replacement % {
  74. "base": base_url,
  75. "accessed_dir": accessed_dir,
  76. }
  77. content = re.sub(pattern, fixed_replacement, content)
  78. return content