process-json-files.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
#! /usr/bin/python3
# Copyright (C) 2020, 2021, 2022, 2023, 2024 grizzlyuser <grizzlyuser@protonmail.com>
# Based on: https://gitlab.trisquel.org/trisquel/package-helpers/-/blob/81881d89b2bf7d502dd14fcccdb471fec6f6b206/helpers/DATA/firefox/reprocess-search-config.py
# Below is the notice from the original author:
#
# Copyright (C) 2020 Ruben Rodriguez <ruben@trisquel.info>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  21. import json
  22. import sys
  23. import time
  24. import copy
  25. import argparse
  26. import pathlib
  27. import logging
  28. from collections import namedtuple
  29. from jsonschema import validate
  30. parser = argparse.ArgumentParser()
  31. parser.add_argument(
  32. 'MAIN_PATH',
  33. type=pathlib.Path,
  34. help='path to main application source code directory')
  35. parser.add_argument(
  36. 'BRANDING_PATH',
  37. type=pathlib.Path,
  38. help='path to branding source code directory')
  39. parser.add_argument(
  40. '-i',
  41. '--indent',
  42. type=int,
  43. default=2,
  44. help='indent for pretty printing of output files')
  45. parser.add_argument(
  46. '-l',
  47. '--loglevel',
  48. choices=logging._nameToLevel.keys(),
  49. default=logging.INFO,
  50. help='logging level')
  51. arguments = parser.parse_args()
  52. logging.basicConfig(level=arguments.loglevel)
  53. logger = logging.getLogger(str(pathlib.Path(__file__).name))
  54. File = namedtuple('File', ['path', 'content'])
  55. class JsonProcessor:
  56. @classmethod
  57. def process(cls):
  58. parsed_jsons = []
  59. for json_path in cls.JSON_PATHS:
  60. logger.info('Reading input: ' + str(json_path) + '...')
  61. with json_path.open() as file:
  62. parsed_jsons.append(File(json_path, json.load(file)))
  63. parsed_schema = None
  64. if hasattr(cls, "SCHEMA_PATH"):
  65. logger.info('Reading schema: ' + str(json_path) + '...')
  66. with cls.SCHEMA_PATH.open() as file:
  67. parsed_schema = json.load(file)
  68. processed = cls.process_parsed(parsed_jsons, parsed_schema)
  69. with processed.path.open('w') as file:
  70. json.dump(processed.content, file, indent=arguments.indent)
  71. logger.info('Wrote: ' + str(processed.path))
  72. class RemoteSettings(JsonProcessor):
  73. DUMPS_PATH_RELATIVE = 'services/settings/dumps'
  74. DUMPS_PATH_ABSOLUTE = arguments.MAIN_PATH / DUMPS_PATH_RELATIVE
  75. _WRAPPER_NAME = 'data'
  76. _LAST_MODIFIED_KEY_NAME = 'last_modified'
  77. @classmethod
  78. def get_collection_timestamp(cls, collection):
  79. return max((record[cls._LAST_MODIFIED_KEY_NAME]
  80. for record in collection.content), default=0)
  81. @classmethod
  82. def wrap(cls, processed):
  83. return File(processed.path,
  84. {cls._WRAPPER_NAME: processed.content,
  85. 'timestamp': cls.get_collection_timestamp(processed)})
  86. @classmethod
  87. def unwrap(cls, parsed_jsons):
  88. return [File(json.path, json.content[cls._WRAPPER_NAME])
  89. for json in parsed_jsons]
  90. @classmethod
  91. def should_modify_collection(cls, collection):
  92. return True
  93. @classmethod
  94. def now(cls):
  95. return int(round(time.time_ns() / 10 ** 6))
  96. @classmethod
  97. def process_raw(cls, unwrapped_jsons, parsed_schema):
  98. timestamps, result = [], []
  99. for collection in unwrapped_jsons:
  100. should_modify_collection = cls.should_modify_collection(collection)
  101. for record in collection.content:
  102. if should_modify_collection:
  103. if cls.should_drop_record(record):
  104. continue
  105. clone = copy.deepcopy(record)
  106. record = cls.process_record(record)
  107. if clone != record:
  108. timestamp = cls.now()
  109. while timestamp in timestamps:
  110. timestamp += 1
  111. timestamps.append(timestamp)
  112. record[cls._LAST_MODIFIED_KEY_NAME] = timestamp
  113. if parsed_schema is not None:
  114. validate(record, schema=parsed_schema)
  115. result.append(record)
  116. result.sort(
  117. key=lambda record: record[cls._LAST_MODIFIED_KEY_NAME], reverse=True)
  118. cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
  119. return File(cls.OUTPUT_PATH, result)
  120. @classmethod
  121. def process_parsed(cls, parsed_jsons, parsed_schema):
  122. return cls.wrap(
  123. cls.process_raw(
  124. cls.unwrap(parsed_jsons),
  125. parsed_schema))
  126. class EmptyRemoteSettings(RemoteSettings):
  127. @classmethod
  128. def should_drop_record(cls, search_engine):
  129. return True
  130. @classmethod
  131. def process_record(cls, record):
  132. return record
  133. class Changes(RemoteSettings):
  134. JSON_PATHS = tuple(RemoteSettings.DUMPS_PATH_ABSOLUTE.glob('*/*.json'))
  135. OUTPUT_PATH = RemoteSettings.DUMPS_PATH_ABSOLUTE / 'monitor/changes'
  136. @classmethod
  137. def wrap(cls, processed):
  138. return File(
  139. processed.path, {
  140. 'changes': processed.content, 'timestamp': cls.now()})
  141. @classmethod
  142. def process_raw(cls, unwrapped_jsons, parsed_schema):
  143. changes = []
  144. for collection in unwrapped_jsons:
  145. if collection.path != RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/example.json':
  146. latest_change = {}
  147. latest_change[cls._LAST_MODIFIED_KEY_NAME] = cls.get_collection_timestamp(
  148. collection)
  149. latest_change['bucket'] = collection.path.parent.name
  150. latest_change['collection'] = collection.path.stem
  151. changes.append(latest_change)
  152. cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
  153. return File(cls.OUTPUT_PATH, changes)
  154. class SearchConfigV2(RemoteSettings):
  155. JSON_PATHS = (
  156. RemoteSettings.DUMPS_PATH_ABSOLUTE /
  157. 'main/search-config-v2.json',
  158. )
  159. SCHEMA_PATH = arguments.MAIN_PATH / \
  160. 'toolkit/components/search/schema/search-config-v2-schema.json'
  161. OUTPUT_PATH = JSON_PATHS[0]
  162. _DUCKDUCKGO_SEARCH_ENGINE_IDENTIFIER = 'ddg'
  163. @classmethod
  164. def should_drop_record(cls, record):
  165. if record['recordType'] != 'engine':
  166. return False
  167. identifier = record['identifier']
  168. return identifier != cls._DUCKDUCKGO_SEARCH_ENGINE_IDENTIFIER and not identifier.startswith(
  169. 'wikipedia')
  170. @classmethod
  171. def process_record(cls, record):
  172. match record['recordType']:
  173. case 'defaultEngines':
  174. return cls.process_default_engines(record)
  175. case 'engine':
  176. return cls.process_engine(record)
  177. case 'engineOrders':
  178. return cls.process_engine_orders(record)
  179. case _:
  180. return record
  181. @classmethod
  182. def process_default_engines(cls, default_engines):
  183. default_engines['globalDefault'] = cls._DUCKDUCKGO_SEARCH_ENGINE_IDENTIFIER
  184. default_engines['specificDefaults'] = []
  185. return default_engines
  186. @classmethod
  187. def process_engine(cls, engine):
  188. engine['base'].pop('partnerCode', None)
  189. engine['base']['urls']['search'].pop('params', None)
  190. if engine['identifier'] == cls._DUCKDUCKGO_SEARCH_ENGINE_IDENTIFIER:
  191. engine['base']['name'] += ' HTML'
  192. engine['base']['urls']['search']['base'] = 'https://html.duckduckgo.com/html'
  193. engine['variants'] = [
  194. {'environment': {'allRegionsAndLocales': True}}]
  195. return engine
  196. @classmethod
  197. def process_engine_orders(cls, engine_orders):
  198. engine_orders['orders'] = []
  199. return engine_orders
  200. class SearchConfigOverridesV2(EmptyRemoteSettings):
  201. JSON_PATHS = (
  202. RemoteSettings.DUMPS_PATH_ABSOLUTE /
  203. 'main/search-config-overrides-v2.json',
  204. )
  205. SCHEMA_PATH = arguments.MAIN_PATH / \
  206. 'toolkit/components/search/schema/search-config-overrides-v2-schema.json'
  207. OUTPUT_PATH = JSON_PATHS[0]
  208. class SearchDefaultOverrideAllowlist(EmptyRemoteSettings):
  209. JSON_PATHS = (
  210. RemoteSettings.DUMPS_PATH_ABSOLUTE /
  211. 'main/search-default-override-allowlist.json',
  212. )
  213. SCHEMA_PATH = arguments.MAIN_PATH / \
  214. 'toolkit/components/search/schema/search-default-override-allowlist-schema.json'
  215. OUTPUT_PATH = JSON_PATHS[0]
  216. class SearchTelemetryV2(EmptyRemoteSettings):
  217. JSON_PATHS = (
  218. RemoteSettings.DUMPS_PATH_ABSOLUTE /
  219. 'main/search-telemetry-v2.json',
  220. )
  221. SCHEMA_PATH = arguments.MAIN_PATH / \
  222. 'browser/components/search/schema/search-telemetry-v2-schema.json'
  223. OUTPUT_PATH = JSON_PATHS[0]
  224. class TranslationsModels(EmptyRemoteSettings):
  225. JSON_PATHS = (
  226. RemoteSettings.DUMPS_PATH_ABSOLUTE /
  227. 'main/translations-models.json',
  228. )
  229. OUTPUT_PATH = JSON_PATHS[0]
  230. class TranslationsWasm(EmptyRemoteSettings):
  231. JSON_PATHS = (
  232. RemoteSettings.DUMPS_PATH_ABSOLUTE /
  233. 'main/translations-wasm.json',
  234. )
  235. OUTPUT_PATH = JSON_PATHS[0]
  236. class UrlClassifierSkipUrls(EmptyRemoteSettings):
  237. JSON_PATHS = (
  238. RemoteSettings.DUMPS_PATH_ABSOLUTE /
  239. 'main/url-classifier-skip-urls.json',
  240. )
  241. OUTPUT_PATH = JSON_PATHS[0]
  242. class TippyTopSites(JsonProcessor):
  243. JSON_PATHS = (
  244. arguments.MAIN_PATH /
  245. 'browser/components/newtab/data/content/tippytop/top_sites.json',
  246. arguments.BRANDING_PATH /
  247. 'tippytop/top_sites.json')
  248. @classmethod
  249. def process_parsed(cls, parsed_jsons, parsed_schema):
  250. tippy_top_sites_main = parsed_jsons[0]
  251. tippy_top_sites_branding = parsed_jsons[1]
  252. result = tippy_top_sites_branding.content + \
  253. [site for site in tippy_top_sites_main.content if 'wikipedia.org' in site['domains']]
  254. return File(tippy_top_sites_main.path, result)
  255. class TopSites(RemoteSettings):
  256. _TOP_SITES_JSON_PATH = 'main/top-sites.json'
  257. _TOP_SITES_PATH_MAIN = RemoteSettings.DUMPS_PATH_ABSOLUTE / _TOP_SITES_JSON_PATH
  258. JSON_PATHS = (
  259. arguments.BRANDING_PATH /
  260. RemoteSettings.DUMPS_PATH_RELATIVE /
  261. _TOP_SITES_JSON_PATH,
  262. _TOP_SITES_PATH_MAIN)
  263. OUTPUT_PATH = _TOP_SITES_PATH_MAIN
  264. @classmethod
  265. def should_modify_collection(cls, collection):
  266. return cls._TOP_SITES_PATH_MAIN == collection.path
  267. @classmethod
  268. def should_drop_record(cls, site):
  269. return site['url'] != 'https://www.wikipedia.org/'
  270. @classmethod
  271. def process_record(cls, site):
  272. site.pop('exclude_regions', None)
  273. return site
  274. # To reflect the latest timestamps, Changes class should always come after
  275. # all other RemoteSettings subclasses
  276. processors = (
  277. SearchConfigOverridesV2,
  278. SearchConfigV2,
  279. SearchDefaultOverrideAllowlist,
  280. SearchTelemetryV2,
  281. TippyTopSites,
  282. TopSites,
  283. TranslationsModels,
  284. TranslationsWasm,
  285. UrlClassifierSkipUrls,
  286. Changes)
  287. for processor in processors:
  288. processor.process()