SiteClassifier.jsm 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  /* This Source Code Form is subject to the terms of the Mozilla Public
   * License, v. 2.0. If a copy of the MPL was not distributed with this
   * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  4. "use strict";
  5. const {RemoteSettings} = ChromeUtils.import("resource://services-settings/remote-settings.js");
  /**
   * Returns whether the passed in query params satisfy every entry of criteria.
   *
   * For each entry, the param named by `key` must be present. In addition:
   *  - if the entry has a truthy `value`, the param value must equal it exactly;
   *  - if the entry has a truthy `prefix`, the param value must start with it.
   * An empty criteria list matches everything.
   *
   * @param {Iterable<{key: string, value?: string, prefix?: string}>} criteria
   *   The param requirements to check.
   * @param {URLSearchParams} params
   *   The query parameters to test; only `get(key)` is called on it.
   * @returns {boolean} true when all entries of criteria are satisfied.
   */
  function _hasParams(criteria, params) {
    for (const { key, value, prefix } of criteria) {
      const actual = params.get(key);
      // get() yields null for a missing key; a missing key can never match.
      if (actual === null) {
        return false;
      }
      // Falsy `value`/`prefix` (absent or "") mean "no constraint".
      if (value && value !== actual) {
        return false;
      }
      if (prefix && !actual.startsWith(prefix)) {
        return false;
      }
    }
    return true;
  }
  /**
   * classifySite
   * Classifies a given URL into a category based on classification data from
   * RemoteSettings. The data from remote settings can match a category by one
   * of the following:
   *  - match the exact URL
   *  - match the hostname or second level domain (sld)
   *  - match query parameter(s), and optionally their values or prefixes
   *  - match both (hostname or sld) and query parameter(s)
   *
   * The data looks like:
   * [{
   *    "type": "hostname-and-params-match",
   *    "criteria": [
   *      {
   *        "url": "https://matchurl.com",
   *        "hostname": "matchhostname.com",
   *        "sld": "secondleveldomain",
   *        "params": [
   *          {
   *            "key": "matchparam",
   *            "value": "matchvalue",
   *            "prefix": "matchPrefix",
   *          },
   *        ],
   *      },
   *    ],
   *    "weight": 300,
   *  },...]
   *
   * Records are tried in descending `weight` order (a missing weight counts
   * as 0) and the `type` of the first record with a matching criteria entry is
   * returned. Within one criteria entry every field that is present must match.
   *
   * @param {string} url
   *   The URL to classify. If it does not parse as given, it is retried with
   *   an "https://" prefix.
   * @param {Function} [RS=RemoteSettings]
   *   Called as RS("sites-classification"); the result must expose an async
   *   get() returning the list of records. Overridable, e.g. for tests.
   * @returns {Promise<string>}
   *   The matching record's `type`, or "other" when the URL cannot be parsed
   *   or nothing matches.
   */
  async function classifySite(url, RS = RemoteSettings) {
    let parsedURL;
    // Try to parse the url, first as given and then with a scheme added.
    for (const candidate of [url, `https://${url}`]) {
      try {
        parsedURL = new URL(candidate);
        break;
      } catch (e) {
        // Deliberately ignored: a candidate that does not parse just means we
        // move on to the next one (or fall through to "other").
      }
    }
    if (!parsedURL) {
      return "other";
    }

    // If we parsed successfully, find a match.
    const hostname = parsedURL.hostname.replace(/^www\./i, "");
    // NOTE(review): this is the first dot-separated label of the hostname
    // (after removing "www."), which is what `sld` criteria are compared to.
    const firstLabel = hostname.split(".")[0];
    const params = parsedURL.searchParams;

    // NOTE: there will be an initial/default local copy of the data in m-c.
    // Therefore, this should never return an empty list [].
    const siteTypes = await RS("sites-classification").get();
    // Sort a copy: sort() works in place and the list belongs to the
    // RemoteSettings client, not to us.
    const sortedSiteTypes = [...siteTypes].sort(
      (x, y) => (y.weight || 0) - (x.weight || 0)
    );

    for (const type of sortedSiteTypes) {
      // A record without a criteria list cannot match anything.
      for (const criteria of type.criteria || []) {
        // `url` criteria are compared with the caller's string exactly as
        // passed in, not with the parsed/normalized URL.
        if (criteria.url && criteria.url !== url) {
          continue;
        }
        if (criteria.hostname && criteria.hostname !== hostname) {
          continue;
        }
        if (criteria.sld && criteria.sld !== firstLabel) {
          continue;
        }
        if (criteria.params && !_hasParams(criteria.params, params)) {
          continue;
        }
        return type.type;
      }
    }
    return "other";
  }
  // Names this module exposes; only classifySite is listed, _hasParams is not.
  // NOTE(review): presumably consumed by the JSM loader (ChromeUtils.import) — confirm.
  const EXPORTED_SYMBOLS = ["classifySite"];