process-json-files.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. #! /usr/bin/python3
  2. # Copyright (C) 2020, 2021, 2022, 2023 grizzlyuser <grizzlyuser@protonmail.com>
  3. # Based on: https://gitlab.trisquel.org/trisquel/wrapage-helpers/-/blob/81881d89b2bf7d502dd14fcccdb471fec6f6b206/helpers/DATA/firefox/reprocess-search-config.py
  4. # Below is the notice from the original author:
  5. #
  6. # Copyright (C) 2020 Ruben Rodriguez <ruben@trisquel.info>
  7. #
  8. # This program is free software; you can redistribute it and/or modify
  9. # it under the terms of the GNU General Public License as published by
  10. # the Free Software Foundation; either version 2 of the License, or
  11. # (at your option) any later version.
  12. #
  13. # This program is distributed in the hope that it will be useful,
  14. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. # GNU General Public License for more details.
  17. #
  18. # You should have received a copy of the GNU General Public License
  19. # along with this program; if not, write to the Free Software
  20. # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  21. import json
  22. import sys
  23. import time
  24. import copy
  25. import argparse
  26. import pathlib
  27. from collections import namedtuple
  28. from jsonschema import validate
  29. parser = argparse.ArgumentParser()
  30. parser.add_argument(
  31. 'MAIN_PATH',
  32. type=pathlib.Path,
  33. help='path to main application source code directory')
  34. parser.add_argument(
  35. 'BRANDING_PATH',
  36. type=pathlib.Path,
  37. help='path to branding source code directory')
  38. parser.add_argument(
  39. '-i',
  40. '--indent',
  41. type=int,
  42. default=2,
  43. help='indent for pretty printing of output files')
  44. arguments = parser.parse_args()
  45. File = namedtuple('File', ['path', 'content'])
  46. class RemoteSettings:
  47. DUMPS_PATH_RELATIVE = 'services/settings/dumps'
  48. DUMPS_PATH_ABSOLUTE = arguments.MAIN_PATH / DUMPS_PATH_RELATIVE
  49. _WRAPPER_NAME = 'data'
  50. _LAST_MODIFIED_KEY_NAME = 'last_modified'
  51. @classmethod
  52. def get_collection_timestamp(cls, collection):
  53. return max((record[cls._LAST_MODIFIED_KEY_NAME]
  54. for record in collection.content), default=0)
  55. @classmethod
  56. def wrap(cls, processed):
  57. return File(processed.path,
  58. {cls._WRAPPER_NAME: processed.content,
  59. 'timestamp': cls.get_collection_timestamp(processed)})
  60. @classmethod
  61. def unwrap(cls, parsed_jsons):
  62. return [File(json.path, json.content[cls._WRAPPER_NAME])
  63. for json in parsed_jsons]
  64. @classmethod
  65. def should_modify_collection(cls, collection):
  66. return True
  67. @classmethod
  68. def now(cls):
  69. return int(round(time.time_ns() / 10 ** 6))
  70. @classmethod
  71. def process_raw(cls, unwrapped_jsons, parsed_schema):
  72. timestamps, result = [], []
  73. for collection in unwrapped_jsons:
  74. should_modify_collection = cls.should_modify_collection(collection)
  75. for record in collection.content:
  76. if should_modify_collection:
  77. if cls.should_drop_record(record):
  78. continue
  79. clone = copy.deepcopy(record)
  80. record = cls.process_record(record)
  81. if clone != record:
  82. timestamp = cls.now()
  83. while timestamp in timestamps:
  84. timestamp += 1
  85. timestamps.append(timestamp)
  86. record[cls._LAST_MODIFIED_KEY_NAME] = timestamp
  87. if parsed_schema is not None:
  88. validate(record, schema=parsed_schema)
  89. result.append(record)
  90. result.sort(
  91. key=lambda record: record[cls._LAST_MODIFIED_KEY_NAME], reverse=True)
  92. cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
  93. return File(cls.OUTPUT_PATH, result)
  94. @classmethod
  95. def process(cls, parsed_jsons, parsed_schema):
  96. return cls.wrap(
  97. cls.process_raw(
  98. cls.unwrap(parsed_jsons),
  99. parsed_schema))
  100. class Changes(RemoteSettings):
  101. JSON_PATHS = tuple(RemoteSettings.DUMPS_PATH_ABSOLUTE.glob('*/*.json'))
  102. OUTPUT_PATH = RemoteSettings.DUMPS_PATH_ABSOLUTE / 'monitor/changes'
  103. @classmethod
  104. def wrap(cls, processed):
  105. return File(
  106. processed.path, {
  107. 'changes': processed.content, 'timestamp': cls.now()})
  108. @classmethod
  109. def process_raw(cls, unwrapped_jsons, parsed_schema):
  110. changes = []
  111. for collection in unwrapped_jsons:
  112. if collection.path != RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/example.json':
  113. latest_change = {}
  114. latest_change[cls._LAST_MODIFIED_KEY_NAME] = cls.get_collection_timestamp(
  115. collection)
  116. latest_change['bucket'] = collection.path.parent.name
  117. latest_change['collection'] = collection.path.stem
  118. changes.append(latest_change)
  119. cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
  120. return File(cls.OUTPUT_PATH, changes)
  121. class SearchConfig(RemoteSettings):
  122. JSON_PATHS = (
  123. RemoteSettings.DUMPS_PATH_ABSOLUTE /
  124. 'main/search-config.json',
  125. )
  126. SCHEMA_PATH = arguments.MAIN_PATH / \
  127. 'toolkit/components/search/schema/search-engine-config-schema.json'
  128. OUTPUT_PATH = JSON_PATHS[0]
  129. _DUCKDUCKGO_SEARCH_ENGINE_ID = 'ddg@search.mozilla.org'
  130. @classmethod
  131. def should_drop_record(cls, search_engine):
  132. return search_engine['webExtension']['id'] not in (
  133. cls._DUCKDUCKGO_SEARCH_ENGINE_ID, 'wikipedia@search.mozilla.org')
  134. @classmethod
  135. def process_record(cls, search_engine):
  136. [search_engine.pop(key, None)
  137. for key in ['extraParams', 'telemetryId']]
  138. general_specifier = {}
  139. for specifier in search_engine['appliesTo'].copy():
  140. if 'application' in specifier:
  141. if 'distributions' in specifier['application']:
  142. search_engine['appliesTo'].remove(specifier)
  143. continue
  144. specifier['application'].pop('extraParams', None)
  145. if 'included' in specifier and 'everywhere' in specifier[
  146. 'included'] and specifier['included']['everywhere']:
  147. general_specifier = specifier
  148. if not general_specifier:
  149. general_specifier = {'included': {'everywhere': True}}
  150. search_engine['appliesTo'].insert(0, general_specifier)
  151. if search_engine['webExtension']['id'] == cls._DUCKDUCKGO_SEARCH_ENGINE_ID:
  152. general_specifier['default'] = 'yes'
  153. return search_engine
  154. class TippyTopSites:
  155. JSON_PATHS = (
  156. arguments.MAIN_PATH /
  157. 'browser/components/newtab/data/content/tippytop/top_sites.json',
  158. arguments.BRANDING_PATH /
  159. 'tippytop/top_sites.json')
  160. @classmethod
  161. def process(cls, parsed_jsons, parsed_schema):
  162. tippy_top_sites_main = parsed_jsons[0]
  163. tippy_top_sites_branding = parsed_jsons[1]
  164. result = tippy_top_sites_branding.content + \
  165. [site for site in tippy_top_sites_main.content if 'wikipedia.org' in site['domains']]
  166. return File(tippy_top_sites_main.path, result)
  167. class TopSites(RemoteSettings):
  168. _TOP_SITES_JSON_PATH = 'main/top-sites.json'
  169. _TOP_SITES_PATH_MAIN = RemoteSettings.DUMPS_PATH_ABSOLUTE / _TOP_SITES_JSON_PATH
  170. JSON_PATHS = (
  171. arguments.BRANDING_PATH /
  172. RemoteSettings.DUMPS_PATH_RELATIVE /
  173. _TOP_SITES_JSON_PATH,
  174. _TOP_SITES_PATH_MAIN)
  175. OUTPUT_PATH = _TOP_SITES_PATH_MAIN
  176. @classmethod
  177. def should_modify_collection(cls, collection):
  178. return cls._TOP_SITES_PATH_MAIN == collection.path
  179. @classmethod
  180. def should_drop_record(cls, site):
  181. return site['url'] != 'https://www.wikipedia.org/'
  182. @classmethod
  183. def process_record(cls, site):
  184. site.pop('exclude_regions', None)
  185. return site
  186. # To reflect the latest timestamps, Changes class should always come after
  187. # all other RemoteSettings subclasses
  188. processors = (TippyTopSites, SearchConfig, TopSites, Changes)
  189. for processor in processors:
  190. parsed_jsons = []
  191. for json_path in processor.JSON_PATHS:
  192. with json_path.open() as file:
  193. parsed_jsons.append(File(json_path, json.load(file)))
  194. parsed_schema = None
  195. if hasattr(processor, "SCHEMA_PATH"):
  196. with processor.SCHEMA_PATH.open() as file:
  197. parsed_schema = json.load(file)
  198. processed = processor.process(parsed_jsons, parsed_schema)
  199. with processed.path.open('w') as file:
  200. json.dump(processed.content, file, indent=arguments.indent)