metadata.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. # GNU MediaGoblin -- federated, autonomous media hosting
  2. # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
  3. #
  4. # This program is free software: you can redistribute it and/or modify
  5. # it under the terms of the GNU Affero General Public License as published by
  6. # the Free Software Foundation, either version 3 of the License, or
  7. # (at your option) any later version.
  8. #
  9. # This program is distributed in the hope that it will be useful,
  10. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. # GNU Affero General Public License for more details.
  13. #
  14. # You should have received a copy of the GNU Affero General Public License
  15. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. from io import open
  17. import os
  18. import copy
  19. import json
  20. import re
  21. from pkg_resources import resource_filename
  22. import dateutil.parser
  23. from pyld import jsonld
  24. from jsonschema import validate, FormatChecker, draft4_format_checker
  25. from jsonschema.compat import str_types
  26. from mediagoblin.tools.pluginapi import hook_handle
  27. ########################################################
  28. ## Set up the MediaGoblin format checker for json-schema
  29. ########################################################
  30. URL_REGEX = re.compile(
  31. r'^[a-z]+://([^/:]+|([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]+)?(\/.*)?$',
  32. re.IGNORECASE)
  33. def is_uri(instance):
  34. """
  35. jsonschema uri validator
  36. """
  37. if not isinstance(instance, str_types):
  38. return True
  39. return URL_REGEX.match(instance)
  40. def is_datetime(instance):
  41. """
  42. Is a date or datetime readable string.
  43. """
  44. if not isinstance(instance, str_types):
  45. return True
  46. return dateutil.parser.parse(instance)
  47. class DefaultChecker(FormatChecker):
  48. """
  49. Default MediaGoblin format checker... extended to include a few extra things
  50. """
  51. checkers = copy.deepcopy(draft4_format_checker.checkers)
  52. DefaultChecker.checkers[u"uri"] = (is_uri, ())
  53. DefaultChecker.checkers[u"date-time"] = (is_datetime, (ValueError, TypeError))
  54. DEFAULT_CHECKER = DefaultChecker()
  55. # Crappy default schema, checks for things we deem important
  56. DEFAULT_SCHEMA = {
  57. "$schema": "http://json-schema.org/schema#",
  58. "type": "object",
  59. "properties": {
  60. "license": {
  61. "format": "uri",
  62. "type": "string",
  63. },
  64. "dcterms:created": {
  65. "format": "date-time",
  66. "type": "string",
  67. },
  68. "dc:created": {
  69. "format": "date-time",
  70. "type": "string",
  71. }
  72. },
  73. }
  74. def load_resource(package, resource_path):
  75. """
  76. Load a resource, return it as a string.
  77. Args:
  78. - package: package or module name. Eg "mediagoblin.media_types.audio"
  79. - resource_path: path to get to this resource, a list of
  80. directories and finally a filename. Will be joined with
  81. os.path.sep.
  82. """
  83. filename = resource_filename(package, os.path.sep.join(resource_path))
  84. return open(filename, encoding="utf-8").read()
  85. def load_resource_json(package, resource_path):
  86. """
  87. Load a resource json file, return a dictionary.
  88. Args:
  89. - package: package or module name. Eg "mediagoblin.media_types.audio"
  90. - resource_path: path to get to this resource, a list of
  91. directories and finally a filename. Will be joined with
  92. os.path.sep.
  93. """
  94. return json.loads(load_resource(package, resource_path))
  95. ##################################
  96. ## Load the MediaGoblin core files
  97. ##################################
  98. BUILTIN_CONTEXTS = {
  99. "http://www.w3.org/2013/json-ld-context/rdfa11": load_resource(
  100. "mediagoblin", ["static", "metadata", "rdfa11.jsonld"])}
  101. _CONTEXT_CACHE = {}
  102. def load_context(url):
  103. """
  104. A self-aware document loader. For those contexts MediaGoblin
  105. stores internally, load them from disk.
  106. """
  107. if url in _CONTEXT_CACHE:
  108. return _CONTEXT_CACHE[url]
  109. # See if it's one of our basic ones
  110. document = BUILTIN_CONTEXTS.get(url, None)
  111. # No? See if we have an internal schema for this
  112. if document is None:
  113. document = hook_handle(("context_url_data", url))
  114. # Okay, if we've gotten a document by now... let's package it up
  115. if document is not None:
  116. document = {'contextUrl': None,
  117. 'documentUrl': url,
  118. 'document': document}
  119. # Otherwise, use jsonld.load_document
  120. else:
  121. document = jsonld.load_document(url)
  122. # cache
  123. _CONTEXT_CACHE[url] = document
  124. return document
  125. DEFAULT_CONTEXT = "http://www.w3.org/2013/json-ld-context/rdfa11"
  126. def compact_json(metadata, context=DEFAULT_CONTEXT):
  127. """
  128. Compact json with supplied context.
  129. Note: Free floating" nodes are removed (eg a key just named
  130. "bazzzzzz" which isn't specified in the context... something like
  131. bazzzzzz:blerp will stay though. This is jsonld.compact behavior.
  132. """
  133. compacted = jsonld.compact(
  134. metadata, context,
  135. options={
  136. "documentLoader": load_context,
  137. # This allows for things like "license" and etc to be preserved
  138. "expandContext": context,
  139. "keepFreeFloatingNodes": False})
  140. return compacted
  141. def compact_and_validate(metadata, context=DEFAULT_CONTEXT,
  142. schema=DEFAULT_SCHEMA):
  143. """
  144. compact json with supplied context, check against schema for errors
  145. raises an exception (jsonschema.exceptions.ValidationError) if
  146. there's an error.
  147. Note: Free floating" nodes are removed (eg a key just named
  148. "bazzzzzz" which isn't specified in the context... something like
  149. bazzzzzz:blerp will stay though. This is jsonld.compact behavior.
  150. You may wish to do this validation yourself... this is just for convenience.
  151. """
  152. compacted = compact_json(metadata, context)
  153. validate(metadata, schema, format_checker=DEFAULT_CHECKER)
  154. return compacted
  155. def expand_json(metadata, context=DEFAULT_CONTEXT):
  156. """
  157. Expand json, but be sure to use our documentLoader.
  158. By default this expands with DEFAULT_CONTEXT, but if you do not need this,
  159. you can safely set this to None.
  160. # @@: Is the above a good idea? Maybe it should be set to None by
  161. # default.
  162. """
  163. options = {
  164. "documentLoader": load_context}
  165. if context is not None:
  166. options["expandContext"] = context
  167. return jsonld.expand(metadata, options=options)
  168. def rdfa_to_readable(rdfa_predicate):
  169. readable = rdfa_predicate.split(u":")[1].capitalize()
  170. return readable