text.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
# GNU MediaGoblin -- federated, autonomous media hosting
# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. import wtforms
  17. import markdown
  18. from lxml.html.clean import Cleaner
  19. from mediagoblin import mg_globals
  20. from mediagoblin.tools import url
  21. # A super strict version of the lxml.html cleaner class
  22. HTML_CLEANER = Cleaner(
  23. scripts=True,
  24. javascript=True,
  25. comments=True,
  26. style=True,
  27. links=True,
  28. page_structure=True,
  29. processing_instructions=True,
  30. embedded=True,
  31. frames=True,
  32. forms=True,
  33. annoying_tags=True,
  34. allow_tags=[
  35. 'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br',
  36. 'pre', 'code'],
  37. remove_unknown_tags=False, # can't be used with allow_tags
  38. safe_attrs_only=True,
  39. add_nofollow=True, # for now
  40. host_whitelist=(),
  41. whitelist_tags=set([]))
  42. def clean_html(html):
  43. # clean_html barfs on an empty string
  44. if not html:
  45. return u''
  46. return HTML_CLEANER.clean_html(html)
  47. def convert_to_tag_list_of_dicts(tag_string):
  48. """
  49. Filter input from incoming string containing user tags,
  50. Strips trailing, leading, and internal whitespace, and also converts
  51. the "tags" text into an array of tags
  52. """
  53. taglist = []
  54. if tag_string:
  55. # Strip out internal, trailing, and leading whitespace
  56. stripped_tag_string = u' '.join(tag_string.strip().split())
  57. # Split the tag string into a list of tags
  58. for tag in stripped_tag_string.split(','):
  59. tag = tag.strip()
  60. # Ignore empty or duplicate tags
  61. if tag and tag not in [t['name'] for t in taglist]:
  62. taglist.append({'name': tag,
  63. 'slug': url.slugify(tag)})
  64. return taglist
  65. def media_tags_as_string(media_entry_tags):
  66. """
  67. Generate a string from a media item's tags, stored as a list of dicts
  68. This is the opposite of convert_to_tag_list_of_dicts
  69. """
  70. tags_string = ''
  71. if media_entry_tags:
  72. tags_string = u', '.join([tag['name'] for tag in media_entry_tags])
  73. return tags_string
  74. TOO_LONG_TAG_WARNING = \
  75. u'Tags must be shorter than %s characters. Tags that are too long: %s'
  76. def tag_length_validator(form, field):
  77. """
  78. Make sure tags do not exceed the maximum tag length.
  79. """
  80. tags = convert_to_tag_list_of_dicts(field.data)
  81. too_long_tags = [
  82. tag['name'] for tag in tags
  83. if len(tag['name']) > mg_globals.app_config['tags_max_length']]
  84. if too_long_tags:
  85. raise wtforms.ValidationError(
  86. TOO_LONG_TAG_WARNING % (mg_globals.app_config['tags_max_length'],
  87. ', '.join(too_long_tags)))
# Module-level Markdown converter, created once at import time and used by
# cleaned_markdown_conversion.
# Don't use the safe mode, because lxml.html.clean is better and we are using
# it anyway
UNSAFE_MARKDOWN_INSTANCE = markdown.Markdown()
  91. def cleaned_markdown_conversion(text):
  92. """
  93. Take a block of text, run it through MarkDown, and clean its HTML.
  94. """
  95. # Markdown will do nothing with and clean_html can do nothing with
  96. # an empty string :)
  97. if not text:
  98. return u''
  99. return clean_html(UNSAFE_MARKDOWN_INSTANCE.convert(text))