process-json-files.py

#! /usr/bin/python3
# Copyright (C) 2020, 2021 grizzlyuser <grizzlyuser@protonmail.com>
# Based on: https://gitlab.trisquel.org/trisquel/wrapage-helpers/-/blob/81881d89b2bf7d502dd14fcccdb471fec6f6b206/helpers/DATA/firefox/reprocess-search-config.py
# Below is the notice from the original author:
#
# Copyright (C) 2020 Ruben Rodriguez <ruben@trisquel.info>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
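
# This script post-processes JSON files bundled with the application
# (Remote Settings dumps, the search config, tippy top sites): it drops or
# rewrites selected records, refreshes 'last_modified' timestamps, validates
# records against a JSON schema where one is available, and writes the
# results back in place.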
import json
import sys
import time
import copy
import argparse
import pathlib
from collections import namedtuple
from jsonschema import validate

parser = argparse.ArgumentParser()
parser.add_argument(
    'MAIN_PATH',
    type=pathlib.Path,
    help='path to main application source code directory')
parser.add_argument(
    'BRANDING_PATH',
    type=pathlib.Path,
    help='path to branding source code directory')
parser.add_argument(
    '-i',
    '--indent',
    type=int,
    default=2,
    help='indent for pretty printing of output files')
arguments = parser.parse_args()
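
# Example invocation (the paths below are purely illustrative):
#   ./process-json-files.py /path/to/application/source /path/to/branding/source --indent 2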
File = namedtuple('File', ['path', 'content'])

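# Base class for processors of Remote Settings dumps. Subclasses provide
# JSON_PATHS (input files), OUTPUT_PATH and the should_drop_record /
# process_record hooks; should_modify_collection can be overridden to leave
# some input collections untouched.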
class RemoteSettings:
    DUMPS_PATH_RELATIVE = 'services/settings/dumps'
    DUMPS_PATH_ABSOLUTE = arguments.MAIN_PATH / DUMPS_PATH_RELATIVE

    _WRAPPER_NAME = 'data'

    @classmethod
    def wrap(cls, processed):
        return File(processed.path, {cls._WRAPPER_NAME: processed.content})

    @classmethod
    def unwrap(cls, parsed_jsons):
        return [File(json.path, json.content[cls._WRAPPER_NAME])
                for json in parsed_jsons]

    @classmethod
    def should_modify_collection(cls, collection):
        return True

    @classmethod
    def now(cls):
        return int(round(time.time_ns() / 10 ** 6))
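
    # Core record loop: drop unwanted records, rewrite the rest, and give each
    # record that actually changed a unique millisecond 'last_modified'
    # timestamp before validating it against the schema (if any).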
    @classmethod
    def process_raw(cls, unwrapped_jsons, parsed_schema):
        timestamps, result = [], []

        for collection in unwrapped_jsons:
            should_modify_collection = cls.should_modify_collection(collection)

            for record in collection.content:
                if should_modify_collection:
                    if cls.should_drop_record(record):
                        continue

                    clone = copy.deepcopy(record)
                    record = cls.process_record(record)

                    if clone != record:
                        timestamp = cls.now()
                        while timestamp in timestamps:
                            timestamp += 1
                        timestamps.append(timestamp)
                        record['last_modified'] = timestamp

                if parsed_schema is not None:
                    validate(record, schema=parsed_schema)

                result.append(record)

        cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

        return File(cls.OUTPUT_PATH, result)

    @classmethod
    def process(cls, parsed_jsons, parsed_schema):
        return cls.wrap(
            cls.process_raw(
                cls.unwrap(parsed_jsons),
                parsed_schema))

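# Regenerates monitor/changes.json: one entry per collection (except the
# main/example.json placeholder) carrying the newest 'last_modified'
# timestamp found among its records.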
class Changes(RemoteSettings):
    JSON_PATHS = tuple(RemoteSettings.DUMPS_PATH_ABSOLUTE.glob('*/*.json'))
    OUTPUT_PATH = RemoteSettings.DUMPS_PATH_ABSOLUTE / 'monitor/changes.json'

    @classmethod
    def wrap(cls, processed):
        return File(
            processed.path, {
                'changes': processed.content, 'timestamp': cls.now()})

    @classmethod
    def process_raw(cls, unwrapped_jsons, parsed_schema):
        changes = []

        for collection in unwrapped_jsons:
            if collection.path != RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/example.json':
                latest_change = {}
                latest_change['last_modified'] = max(
                    (record['last_modified'] for record in collection.content), default=0)
                latest_change['bucket'] = collection.path.parent.name
                latest_change['collection'] = collection.path.stem
                changes.append(latest_change)

        cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

        return File(cls.OUTPUT_PATH, changes)

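# Trims main/search-config.json down to the DuckDuckGo and Wikipedia engines,
# strips distribution- and telemetry-specific fields, and marks DuckDuckGo as
# the default engine.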
class SearchConfig(RemoteSettings):
    JSON_PATHS = (
        RemoteSettings.DUMPS_PATH_ABSOLUTE /
        'main/search-config.json',
    )
    SCHEMA_PATH = arguments.MAIN_PATH / \
        'toolkit/components/search/schema/search-engine-config-schema.json'
    OUTPUT_PATH = JSON_PATHS[0]

    _DUCKDUCKGO_SEARCH_ENGINE_ID = 'ddg@search.mozilla.org'

    @classmethod
    def should_drop_record(cls, search_engine):
        return search_engine['webExtension']['id'] not in (
            cls._DUCKDUCKGO_SEARCH_ENGINE_ID, 'wikipedia@search.mozilla.org')

    @classmethod
    def process_record(cls, search_engine):
        for key in ('extraParams', 'telemetryId'):
            search_engine.pop(key, None)

        general_specifier = {}
        for specifier in search_engine['appliesTo'].copy():
            if 'application' in specifier:
                if 'distributions' in specifier['application']:
                    search_engine['appliesTo'].remove(specifier)
                    continue

                specifier['application'].pop('extraParams', None)

            if 'included' in specifier and 'everywhere' in specifier['included'] \
                    and specifier['included']['everywhere']:
                general_specifier = specifier

        if not general_specifier:
            general_specifier = {'included': {'everywhere': True}}
            search_engine['appliesTo'].insert(0, general_specifier)

        if search_engine['webExtension']['id'] == cls._DUCKDUCKGO_SEARCH_ENGINE_ID:
            general_specifier['default'] = 'yes'

        return search_engine

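# Not a Remote Settings dump: merges the branding tippytop top_sites.json with
# the Wikipedia entries from the main one and writes the result to the main path.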
class TippyTopSites:
    JSON_PATHS = (
        arguments.MAIN_PATH /
        'browser/components/newtab/data/content/tippytop/top_sites.json',
        arguments.BRANDING_PATH /
        'tippytop/top_sites.json')

    @classmethod
    def process(cls, parsed_jsons, parsed_schema):
        tippy_top_sites_main = parsed_jsons[0]
        tippy_top_sites_branding = parsed_jsons[1]
        result = tippy_top_sites_branding.content + \
            [site for site in tippy_top_sites_main.content
             if 'wikipedia.org' in site['domains']]
        return File(tippy_top_sites_main.path, result)

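# Merges the branding top-sites dump into the main one: branding records are
# kept as-is, while the main dump is reduced to the Wikipedia entry.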
class TopSites(RemoteSettings):
    _TOP_SITES_JSON_PATH = 'main/top-sites.json'
    _TOP_SITES_PATH_MAIN = RemoteSettings.DUMPS_PATH_ABSOLUTE / _TOP_SITES_JSON_PATH

    JSON_PATHS = (
        arguments.BRANDING_PATH /
        RemoteSettings.DUMPS_PATH_RELATIVE /
        _TOP_SITES_JSON_PATH,
        _TOP_SITES_PATH_MAIN)
    OUTPUT_PATH = _TOP_SITES_PATH_MAIN

    @classmethod
    def should_modify_collection(cls, collection):
        return cls._TOP_SITES_PATH_MAIN == collection.path

    @classmethod
    def should_drop_record(cls, site):
        return site['url'] != 'https://www.wikipedia.org/'

    @classmethod
    def process_record(cls, site):
        site.pop('exclude_regions', None)
        return site

# To reflect the latest timestamps, the Changes class should always come after
# all other RemoteSettings subclasses
processors = (TippyTopSites, SearchConfig, TopSites, Changes)

for processor in processors:
    parsed_jsons = []
    for json_path in processor.JSON_PATHS:
        with json_path.open() as file:
            parsed_jsons.append(File(json_path, json.load(file)))

    parsed_schema = None
    if hasattr(processor, "SCHEMA_PATH"):
        with processor.SCHEMA_PATH.open() as file:
            parsed_schema = json.load(file)

    processed = processor.process(parsed_jsons, parsed_schema)

    with processed.path.open('w') as file:
        json.dump(processed.content, file, indent=arguments.indent)