uritools.js 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538
  1. /*******************************************************************************
  2. ηMatrix - a browser extension to black/white list requests.
  3. Copyright (C) 2014-2019 Raymond Hill
  4. Copyright (C) 2019-2022 Alessio Vanni
  5. This program is free software: you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation, either version 3 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program. If not, see {http://www.gnu.org/licenses/}.
  15. Home: https://gitlab.com/vannilla/ematrix
  16. uMatrix Home: https://github.com/gorhill/uMatrix
  17. */
  18. /* global publicSuffixList, punycode */
  19. 'use strict';
  20. /*******************************************************************************
  21. RFC 3986 as reference: http://tools.ietf.org/html/rfc3986#appendix-A
  22. Naming convention from https://en.wikipedia.org/wiki/URI_scheme#Examples
  23. */
  24. /******************************************************************************/
  25. ηMatrix.URI = (function() {
  26. /******************************************************************************/
  27. // Favorite regex tool: http://regex101.com/
  28. // Ref: <http://tools.ietf.org/html/rfc3986#page-50>
  29. // I removed redundant capture groups: capture less = peform faster. See
  30. // <http://jsperf.com/old-uritools-vs-new-uritools>
  31. // Performance improvements welcomed.
  32. // jsperf: <http://jsperf.com/old-uritools-vs-new-uritools>
  33. var reRFC3986 = /^([^:\/?#]+:)?(\/\/[^\/?#]*)?([^?#]*)(\?[^#]*)?(#.*)?/;
  34. // Derived
  35. var reSchemeFromURI = /^[^:\/?#]+:/;
  36. var reAuthorityFromURI = /^(?:[^:\/?#]+:)?(\/\/[^\/?#]+)/;
  37. var reOriginFromURI = /^(?:[^:\/?#]+:)?(?:\/\/[^\/?#]+)/;
  38. var reCommonHostnameFromURL = /^https?:\/\/([0-9a-z_][0-9a-z._-]*[0-9a-z])\//;
  39. var rePathFromURI = /^(?:[^:\/?#]+:)?(?:\/\/[^\/?#]*)?([^?#]*)/;
  40. var reMustNormalizeHostname = /[^0-9a-z._-]/;
  41. // These are to parse authority field, not parsed by above official regex
  42. // IPv6 is seen as an exception: a non-compatible IPv6 is first tried, and
  43. // if it fails, the IPv6 compatible regex istr used. This helps
  44. // peformance by avoiding the use of a too complicated regex first.
  45. // https://github.com/gorhill/httpswitchboard/issues/211
  46. // "While a hostname may not contain other characters, such as the
  47. // "underscore character (_), other DNS names may contain the underscore"
  48. var reHostPortFromAuthority = /^(?:[^@]*@)?([^:]*)(:\d*)?$/;
  49. var reIPv6PortFromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]*\])(:\d*)?$/i;
  50. var reHostFromNakedAuthority = /^[0-9a-z._-]+[0-9a-z]$/i;
  51. var reHostFromAuthority = /^(?:[^@]*@)?([^:]+)(?::\d*)?$/;
  52. var reIPv6FromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]+\])(?::\d*)?$/i;
  53. // Coarse (but fast) tests
  54. var reValidHostname = /^([a-z\d]+(-*[a-z\d]+)*)(\.[a-z\d]+(-*[a-z\d])*)*$/;
  55. var reIPAddressNaive = /^\d+\.\d+\.\d+\.\d+$|^\[[\da-zA-Z:]+\]$/;
  56. // Accurate tests
  57. // Source.: http://stackoverflow.com/questions/5284147/validating-ipv4-addresses-with-regexp/5284410#5284410
  58. //var reIPv4 = /^((25[0-5]|2[0-4]\d|[01]?\d\d?)(\.|$)){4}/;
  59. // Source: http://forums.intermapper.com/viewtopic.php?p=1096#1096
  60. //var reIPv6 = /^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$/;
  61. /******************************************************************************/
  62. var reset = function(o) {
  63. o.scheme = '';
  64. o.hostname = '';
  65. o._ipv4 = undefined;
  66. o._ipv6 = undefined;
  67. o.port = '';
  68. o.path = '';
  69. o.query = '';
  70. o.fragment = '';
  71. return o;
  72. };
  73. var resetAuthority = function(o) {
  74. o.hostname = '';
  75. o._ipv4 = undefined;
  76. o._ipv6 = undefined;
  77. o.port = '';
  78. return o;
  79. };
  80. /******************************************************************************/
  81. // This will be exported
  82. var URI = {
  83. scheme: '',
  84. authority: '',
  85. hostname: '',
  86. _ipv4: undefined,
  87. _ipv6: undefined,
  88. port: '',
  89. domain: undefined,
  90. path: '',
  91. query: '',
  92. fragment: '',
  93. schemeBit: (1 << 0),
  94. userBit: (1 << 1),
  95. passwordBit: (1 << 2),
  96. hostnameBit: (1 << 3),
  97. portBit: (1 << 4),
  98. pathBit: (1 << 5),
  99. queryBit: (1 << 6),
  100. fragmentBit: (1 << 7),
  101. allBits: (0xFFFF)
  102. };
  103. URI.authorityBit = (URI.userBit | URI.passwordBit | URI.hostnameBit | URI.portBit);
  104. URI.normalizeBits = (URI.schemeBit | URI.hostnameBit | URI.pathBit | URI.queryBit);
  105. /******************************************************************************/
  106. // See: https://en.wikipedia.org/wiki/URI_scheme#Examples
  107. // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  108. //
  109. // foo://example.com:8042/over/there?name=ferret#nose
  110. // \_/ \______________/\_________/ \_________/ \__/
  111. // | | | | |
  112. // scheme authority path query fragment
  113. // | _____________________|__
  114. // / \ / \
  115. // urn:example:animal:ferret:nose
  116. URI.set = function(uri) {
  117. if ( uri === undefined ) {
  118. return reset(URI);
  119. }
  120. var matches = reRFC3986.exec(uri);
  121. if ( !matches ) {
  122. return reset(URI);
  123. }
  124. this.scheme = matches[1] !== undefined ? matches[1].slice(0, -1) : '';
  125. this.authority = matches[2] !== undefined ? matches[2].slice(2).toLowerCase() : '';
  126. this.path = matches[3] !== undefined ? matches[3] : '';
  127. // <http://tools.ietf.org/html/rfc3986#section-6.2.3>
  128. // "In general, a URI that uses the generic syntax for authority
  129. // "with an empty path should be normalized to a path of '/'."
  130. if ( this.authority !== '' && this.path === '' ) {
  131. this.path = '/';
  132. }
  133. this.query = matches[4] !== undefined ? matches[4].slice(1) : '';
  134. this.fragment = matches[5] !== undefined ? matches[5].slice(1) : '';
  135. // Assume very simple authority, i.e. just a hostname (highest likelihood
  136. // case for ηMatrix)
  137. if ( reHostFromNakedAuthority.test(this.authority) ) {
  138. this.hostname = this.authority;
  139. this.port = '';
  140. return this;
  141. }
  142. // Authority contains more than just a hostname
  143. matches = reHostPortFromAuthority.exec(this.authority);
  144. if ( !matches ) {
  145. matches = reIPv6PortFromAuthority.exec(this.authority);
  146. if ( !matches ) {
  147. return resetAuthority(URI);
  148. }
  149. }
  150. this.hostname = matches[1] !== undefined ? matches[1] : '';
  151. // http://en.wikipedia.org/wiki/FQDN
  152. if ( this.hostname.slice(-1) === '.' ) {
  153. this.hostname = this.hostname.slice(0, -1);
  154. }
  155. this.port = matches[2] !== undefined ? matches[2].slice(1) : '';
  156. return this;
  157. };
  158. /******************************************************************************/
  159. // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  160. //
  161. // foo://example.com:8042/over/there?name=ferret#nose
  162. // \_/ \______________/\_________/ \_________/ \__/
  163. // | | | | |
  164. // scheme authority path query fragment
  165. // | _____________________|__
  166. // / \ / \
  167. // urn:example:animal:ferret:nose
  168. URI.assemble = function(bits) {
  169. if ( bits === undefined ) {
  170. bits = this.allBits;
  171. }
  172. var s = [];
  173. if ( this.scheme && (bits & this.schemeBit) ) {
  174. s.push(this.scheme, ':');
  175. }
  176. if ( this.hostname && (bits & this.hostnameBit) ) {
  177. s.push('//', this.hostname);
  178. }
  179. if ( this.port && (bits & this.portBit) ) {
  180. s.push(':', this.port);
  181. }
  182. if ( this.path && (bits & this.pathBit) ) {
  183. s.push(this.path);
  184. }
  185. if ( this.query && (bits & this.queryBit) ) {
  186. s.push('?', this.query);
  187. }
  188. if ( this.fragment && (bits & this.fragmentBit) ) {
  189. s.push('#', this.fragment);
  190. }
  191. return s.join('');
  192. };
  193. /******************************************************************************/
  194. URI.originFromURI = function(uri) {
  195. var matches = reOriginFromURI.exec(uri);
  196. return matches !== null ? matches[0].toLowerCase() : '';
  197. };
  198. /******************************************************************************/
  199. URI.schemeFromURI = function(uri) {
  200. var matches = reSchemeFromURI.exec(uri);
  201. if ( matches === null ) {
  202. return '';
  203. }
  204. return matches[0].slice(0, -1).toLowerCase();
  205. };
  206. /******************************************************************************/
  207. URI.isNetworkScheme = function(scheme) {
  208. return this.reNetworkScheme.test(scheme);
  209. };
  210. URI.reNetworkScheme = /^(?:https?|wss?|ftps?)\b/;
  211. /******************************************************************************/
  212. URI.isSecureScheme = function(scheme) {
  213. return this.reSecureScheme.test(scheme);
  214. };
  215. URI.reSecureScheme = /^(?:https|wss|ftps)\b/;
  216. /******************************************************************************/
  217. URI.authorityFromURI = function(uri) {
  218. var matches = reAuthorityFromURI.exec(uri);
  219. if ( !matches ) {
  220. return '';
  221. }
  222. return matches[1].slice(2).toLowerCase();
  223. };
  224. /******************************************************************************/
  225. // The most used function, so it better be fast.
  226. // https://github.com/gorhill/uBlock/issues/1559
  227. // See http://en.wikipedia.org/wiki/FQDN
  228. // https://bugzilla.mozilla.org/show_bug.cgi?id=1360285
  229. // Revisit punycode dependency when above issue is fixed in Firefox.
  230. URI.hostnameFromURI = function(uri) {
  231. var matches = reCommonHostnameFromURL.exec(uri);
  232. if ( matches !== null ) { return matches[1]; }
  233. matches = reAuthorityFromURI.exec(uri);
  234. if ( matches === null ) { return ''; }
  235. var authority = matches[1].slice(2);
  236. // Assume very simple authority (most common case for ηBlock)
  237. if ( reHostFromNakedAuthority.test(authority) ) {
  238. return authority.toLowerCase();
  239. }
  240. matches = reHostFromAuthority.exec(authority);
  241. if ( matches === null ) {
  242. matches = reIPv6FromAuthority.exec(authority);
  243. if ( matches === null ) { return ''; }
  244. }
  245. var hostname = matches[1];
  246. while ( hostname.endsWith('.') ) {
  247. hostname = hostname.slice(0, -1);
  248. }
  249. if ( reMustNormalizeHostname.test(hostname) ) {
  250. hostname = punycode.toASCII(hostname.toLowerCase());
  251. }
  252. return hostname;
  253. };
  254. /******************************************************************************/
  255. URI.domainFromHostname = function(hostname) {
  256. // Try to skip looking up the PSL database
  257. var entry = domainCache.get(hostname);
  258. if ( entry !== undefined ) {
  259. entry.tstamp = Date.now();
  260. return entry.domain;
  261. }
  262. // Meh.. will have to search it
  263. if ( reIPAddressNaive.test(hostname) === false ) {
  264. return domainCacheAdd(hostname, psl.getDomain(hostname));
  265. }
  266. return domainCacheAdd(hostname, hostname);
  267. };
  268. URI.domain = function() {
  269. return this.domainFromHostname(this.hostname);
  270. };
  271. // It is expected that there is higher-scoped `publicSuffixList` lingering
  272. // somewhere. Cache it. See <https://github.com/gorhill/publicsuffixlist.js>.
  273. var psl = publicSuffixList;
  274. /******************************************************************************/
  275. URI.pathFromURI = function(uri) {
  276. var matches = rePathFromURI.exec(uri);
  277. return matches !== null ? matches[1] : '';
  278. };
  279. /******************************************************************************/
  280. // Trying to alleviate the worries of looking up too often the domain name from
  281. // a hostname. With a cache, uBlock benefits given that it deals with a
  282. // specific set of hostnames within a narrow time span -- in other words, I
  283. // believe probability of cache hit are high in uBlock.
  284. var domainCache = new Map();
  285. var domainCacheCountLowWaterMark = 75;
  286. var domainCacheCountHighWaterMark = 100;
  287. var domainCacheEntryJunkyard = [];
  288. var domainCacheEntryJunkyardMax = domainCacheCountHighWaterMark - domainCacheCountLowWaterMark;
  289. var DomainCacheEntry = function(domain) {
  290. this.init(domain);
  291. };
  292. DomainCacheEntry.prototype.init = function(domain) {
  293. this.domain = domain;
  294. this.tstamp = Date.now();
  295. return this;
  296. };
  297. DomainCacheEntry.prototype.dispose = function() {
  298. this.domain = '';
  299. if ( domainCacheEntryJunkyard.length < domainCacheEntryJunkyardMax ) {
  300. domainCacheEntryJunkyard.push(this);
  301. }
  302. };
  303. var domainCacheEntryFactory = function(domain) {
  304. var entry = domainCacheEntryJunkyard.pop();
  305. if ( entry ) {
  306. return entry.init(domain);
  307. }
  308. return new DomainCacheEntry(domain);
  309. };
  310. var domainCacheAdd = function(hostname, domain) {
  311. var entry = domainCache.get(hostname);
  312. if ( entry !== undefined ) {
  313. entry.tstamp = Date.now();
  314. } else {
  315. domainCache.set(hostname, domainCacheEntryFactory(domain));
  316. if ( domainCache.size === domainCacheCountHighWaterMark ) {
  317. domainCachePrune();
  318. }
  319. }
  320. return domain;
  321. };
  322. var domainCacheEntrySort = function(a, b) {
  323. return domainCache.get(b).tstamp - domainCache.get(a).tstamp;
  324. };
  325. var domainCachePrune = function() {
  326. var hostnames = Array.from(domainCache.keys())
  327. .sort(domainCacheEntrySort)
  328. .slice(domainCacheCountLowWaterMark);
  329. var i = hostnames.length;
  330. var hostname;
  331. while ( i-- ) {
  332. hostname = hostnames[i];
  333. domainCache.get(hostname).dispose();
  334. domainCache.delete(hostname);
  335. }
  336. };
  337. var domainCacheReset = function() {
  338. domainCache.clear();
  339. };
  340. psl.onChanged.addListener(domainCacheReset);
  341. /******************************************************************************/
  342. URI.domainFromURI = function(uri) {
  343. if ( !uri ) {
  344. return '';
  345. }
  346. return this.domainFromHostname(this.hostnameFromURI(uri));
  347. };
  348. /******************************************************************************/
  349. // Normalize the way ηMatrix expects it
  350. URI.normalizedURI = function() {
  351. // Will be removed:
  352. // - port
  353. // - user id/password
  354. // - fragment
  355. return this.assemble(this.normalizeBits);
  356. };
  357. /******************************************************************************/
  358. URI.rootURL = function() {
  359. if ( !this.hostname ) {
  360. return '';
  361. }
  362. return this.assemble(this.schemeBit | this.hostnameBit);
  363. };
  364. /******************************************************************************/
  365. URI.isValidHostname = function(hostname) {
  366. var r;
  367. try {
  368. r = reValidHostname.test(hostname);
  369. }
  370. catch (e) {
  371. return false;
  372. }
  373. return r;
  374. };
  375. /******************************************************************************/
  376. // Return the parent domain. For IP address, there is no parent domain.
  377. URI.parentHostnameFromHostname = function(hostname) {
  378. // `locahost` => ``
  379. // `example.org` => `example.org`
  380. // `www.example.org` => `example.org`
  381. // `tomato.www.example.org` => `example.org`
  382. var domain = this.domainFromHostname(hostname);
  383. // `locahost` === `` => bye
  384. // `example.org` === `example.org` => bye
  385. // `www.example.org` !== `example.org` => stay
  386. // `tomato.www.example.org` !== `example.org` => stay
  387. if ( domain === '' || domain === hostname ) {
  388. return undefined;
  389. }
  390. // Parent is hostname minus first label
  391. return hostname.slice(hostname.indexOf('.') + 1);
  392. };
  393. /******************************************************************************/
  394. // Return all possible parent hostnames which can be derived from `hostname`,
  395. // ordered from direct parent up to domain inclusively.
  396. URI.parentHostnamesFromHostname = function(hostname) {
  397. // TODO: I should create an object which is optimized to receive
  398. // the list of hostnames by making it reusable (junkyard etc.) and which
  399. // has its own element counter property in order to avoid memory
  400. // alloc/dealloc.
  401. var domain = this.domainFromHostname(hostname);
  402. if ( domain === '' || domain === hostname ) {
  403. return [];
  404. }
  405. var nodes = [];
  406. var pos;
  407. for (;;) {
  408. pos = hostname.indexOf('.');
  409. if ( pos < 0 ) {
  410. break;
  411. }
  412. hostname = hostname.slice(pos + 1);
  413. nodes.push(hostname);
  414. if ( hostname === domain ) {
  415. break;
  416. }
  417. }
  418. return nodes;
  419. };
  420. /******************************************************************************/
  421. // Return all possible hostnames which can be derived from `hostname`,
  422. // ordered from self up to domain inclusively.
  423. URI.allHostnamesFromHostname = function(hostname) {
  424. var nodes = this.parentHostnamesFromHostname(hostname);
  425. nodes.unshift(hostname);
  426. return nodes;
  427. };
  428. /******************************************************************************/
  429. URI.toString = function() {
  430. return this.assemble();
  431. };
  432. /******************************************************************************/
  433. // Export
  434. return URI;
  435. /******************************************************************************/
  436. })();
  437. /******************************************************************************/