RecipeExecutor.jsm 32 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102
  1. /* This Source Code Form is subject to the terms of the Mozilla Public
  2. * License, v. 2.0. If a copy of the MPL was not distributed with this
  3. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  4. "use strict";
  5. const {tokenize} = ChromeUtils.import("resource://activity-stream/lib/Tokenize.jsm");
  6. /**
  7. * RecipeExecutor is the core feature engineering pipeline for the in-browser
  8. * personalization work. These pipelines are called "recipes". A recipe is an
  9. * array of objects that define a "step" in the recipe. A step is simply an
  10. * object with a field "function" that specifies what is being done in the step
  11. * along with other fields that are semantically defined for that step.
  12. *
  13. * There are two types of recipes "builder" recipes and "combiner" recipes. Builder
  14. * recipes mutate an object until it matches some set of critera. Combiner
  15. * recipes take two objects, (a "left" and a "right"), and specify the steps
  16. * to merge the right object into the left object.
  17. *
  18. * A short nonsense example recipe is:
  19. * [ {"function": "get_url_domain", "path_length": 1, "field": "url", "dest": "url_domain"},
  20. * {"function": "nb_tag", "fields": ["title", "description"]},
  21. * {"function": "conditionally_nmf_tag", "fields": ["title", "description"]} ]
  22. *
  23. * Recipes are sandboxed by the fact that the step functions must be explicitly
  24. * whitelisted. Functions whitelisted for builder recipes are specifed in the
  25. * RecipeExecutor.ITEM_BUILDER_REGISTRY, while combiner functions are whitelisted
  26. * in RecipeExecutor.ITEM_COMBINER_REGISTRY .
  27. */
  28. this.RecipeExecutor = class RecipeExecutor {
  29. constructor(nbTaggers, nmfTaggers) {
  30. this.ITEM_BUILDER_REGISTRY = {
  31. nb_tag: this.naiveBayesTag,
  32. conditionally_nmf_tag: this.conditionallyNmfTag,
  33. accept_item_by_field_value: this.acceptItemByFieldValue,
  34. tokenize_url: this.tokenizeUrl,
  35. get_url_domain: this.getUrlDomain,
  36. tokenize_field: this.tokenizeField,
  37. copy_value: this.copyValue,
  38. keep_top_k: this.keepTopK,
  39. scalar_multiply: this.scalarMultiply,
  40. elementwise_multiply: this.elementwiseMultiply,
  41. vector_multiply: this.vectorMultiply,
  42. scalar_add: this.scalarAdd,
  43. vector_add: this.vectorAdd,
  44. make_boolean: this.makeBoolean,
  45. whitelist_fields: this.whitelistFields,
  46. filter_by_value: this.filterByValue,
  47. l2_normalize: this.l2Normalize,
  48. prob_normalize: this.probNormalize,
  49. set_default: this.setDefault,
  50. lookup_value: this.lookupValue,
  51. copy_to_map: this.copyToMap,
  52. scalar_multiply_tag: this.scalarMultiplyTag,
  53. apply_softmax_tags: this.applySoftmaxTags,
  54. };
  55. this.ITEM_COMBINER_REGISTRY = {
  56. combiner_add: this.combinerAdd,
  57. combiner_max: this.combinerMax,
  58. combiner_collect_values: this.combinerCollectValues,
  59. };
  60. this.nbTaggers = nbTaggers;
  61. this.nmfTaggers = nmfTaggers;
  62. }
  63. /**
  64. * Determines the type of a field. Valid types are:
  65. * string
  66. * number
  67. * array
  68. * map (strings to anything)
  69. */
  70. _typeOf(data) {
  71. let t = typeof(data);
  72. if (t === "object") {
  73. if (data === null) {
  74. return "null";
  75. } if (Array.isArray(data)) {
  76. return "array";
  77. }
  78. return "map";
  79. }
  80. return t;
  81. }
  82. /**
  83. * Returns a scalar, either because it was a constant, or by
  84. * looking it up from the item. Allows for a default value if the lookup
  85. * fails.
  86. */
  87. _lookupScalar(item, k, dfault) {
  88. if (this._typeOf(k) === "number") {
  89. return k;
  90. } else if ((this._typeOf(k) === "string") && (k in item) && (this._typeOf(item[k]) === "number")) {
  91. return item[k];
  92. }
  93. return dfault;
  94. }
  95. /**
  96. * Simply appends all the strings from a set fields together. If the field
  97. * is a list, then the cells of the list are append.
  98. */
  99. _assembleText(item, fields) {
  100. let textArr = [];
  101. for (let field of fields) {
  102. if (field in item) {
  103. let type = this._typeOf(item[field]);
  104. if (type === "string") {
  105. textArr.push(item[field]);
  106. } else if (type === "array") {
  107. for (let ele of item[field]) {
  108. textArr.push(String(ele));
  109. }
  110. } else {
  111. textArr.push(String(item[field]));
  112. }
  113. }
  114. }
  115. return textArr.join(" ");
  116. }
  117. /**
  118. * Runs the naive bayes text taggers over a set of text fields. Stores the
  119. * results in new fields:
  120. * nb_tags: a map of text strings to probabilites
  121. * nb_tokens: the tokenized text that was tagged
  122. *
  123. * Config:
  124. * fields: an array containing a list of fields to concatenate and tag
  125. */
  126. naiveBayesTag(item, config) {
  127. let text = this._assembleText(item, config.fields);
  128. let tokens = tokenize(text);
  129. let tags = {};
  130. let extended_tags = {};
  131. for (let nbTagger of this.nbTaggers) {
  132. let result = nbTagger.tagTokens(tokens);
  133. if ((result.label !== null) && result.confident) {
  134. extended_tags[result.label] = result;
  135. tags[result.label] = Math.exp(result.logProb);
  136. }
  137. }
  138. item.nb_tags = tags;
  139. item.nb_tags_extended = extended_tags;
  140. item.nb_tokens = tokens;
  141. return item;
  142. }
  143. /**
  144. * Selectively runs NMF text taggers depending on which tags were found
  145. * by the naive bayes taggers. Writes the results in into new fields:
  146. * nmf_tags_parent_weights: map of pareent tags to probabilites of those parent tags
  147. * nmf_tags: map of strings to maps of strings to probabilities
  148. * nmf_tags_parent map of child tags to parent tags
  149. *
  150. * Config:
  151. * Not configurable
  152. */
  153. conditionallyNmfTag(item, config) {
  154. let nestedNmfTags = {};
  155. let parentTags = {};
  156. let parentWeights = {};
  157. if (!("nb_tags" in item) || !("nb_tokens" in item)) {
  158. return null;
  159. }
  160. Object.keys(item.nb_tags).forEach(parentTag => {
  161. let nmfTagger = this.nmfTaggers[parentTag];
  162. if (nmfTagger !== undefined) {
  163. nestedNmfTags[parentTag] = {};
  164. parentWeights[parentTag] = item.nb_tags[parentTag];
  165. let nmfTags = nmfTagger.tagTokens(item.nb_tokens);
  166. Object.keys(nmfTags).forEach(nmfTag => {
  167. nestedNmfTags[parentTag][nmfTag] = nmfTags[nmfTag];
  168. parentTags[nmfTag] = parentTag;
  169. });
  170. }
  171. });
  172. item.nmf_tags = nestedNmfTags;
  173. item.nmf_tags_parent = parentTags;
  174. item.nmf_tags_parent_weights = parentWeights;
  175. return item;
  176. }
  177. /**
  178. * Checks a field's value against another value (either from another field
  179. * or a constant). If the test passes, then the item is emitted, otherwise
  180. * the pipeline is aborted.
  181. *
  182. * Config:
  183. * field Field to read the value to test. Left side of operator.
  184. * op one of ==, !=, <, <=, >, >=
  185. * rhsValue Constant value to compare against. Right side of operator.
  186. * rhsField Field to read value to compare against. Right side of operator.
  187. *
  188. * NOTE: rhsValue takes precidence over rhsField.
  189. */
  190. acceptItemByFieldValue(item, config) {
  191. if (!(config.field in item)) {
  192. return null;
  193. }
  194. let rhs = null;
  195. if ("rhsValue" in config) {
  196. rhs = config.rhsValue;
  197. } else if (("rhsField" in config) && (config.rhsField in item)) {
  198. rhs = item[config.rhsField];
  199. }
  200. if (rhs === null) {
  201. return null;
  202. }
  203. // eslint-disable-next-line eqeqeq
  204. if (((config.op === "==") && (item[config.field] == rhs)) ||
  205. // eslint-disable-next-line eqeqeq
  206. ((config.op === "!=") && (item[config.field] != rhs)) ||
  207. ((config.op === "<") && (item[config.field] < rhs)) ||
  208. ((config.op === "<=") && (item[config.field] <= rhs)) ||
  209. ((config.op === ">") && (item[config.field] > rhs)) ||
  210. ((config.op === ">=") && (item[config.field] >= rhs))) {
  211. return item;
  212. }
  213. return null;
  214. }
  215. /**
  216. * Splits a URL into text-like tokens.
  217. *
  218. * Config:
  219. * field Field containing a URL
  220. * dest Field to write the tokens to as an array of strings
  221. *
  222. * NOTE: Any initial 'www' on the hostname is removed.
  223. */
  224. tokenizeUrl(item, config) {
  225. if (!(config.field in item)) {
  226. return null;
  227. }
  228. let url = new URL(item[config.field]);
  229. let domain = url.hostname;
  230. if (domain.startsWith("www.")) {
  231. domain = domain.substring(4);
  232. }
  233. let toks = tokenize(domain);
  234. let pathToks = tokenize(decodeURIComponent(url.pathname.replace(/\+/g, " ")));
  235. for (let tok of pathToks) {
  236. toks.push(tok);
  237. }
  238. for (let pair of url.searchParams.entries()) {
  239. let k = tokenize(decodeURIComponent(pair[0].replace(/\+/g, " ")));
  240. for (let tok of k) {
  241. toks.push(tok);
  242. }
  243. if ((pair[1] !== null) && (pair[1] !== "")) {
  244. let v = tokenize(decodeURIComponent(pair[1].replace(/\+/g, " ")));
  245. for (let tok of v) {
  246. toks.push(tok);
  247. }
  248. }
  249. }
  250. item[config.dest] = toks;
  251. return item;
  252. }
  253. /**
  254. * Gets the hostname (minus any initial "www." along with the left most
  255. * directories on the path.
  256. *
  257. * Config:
  258. * field Field containing the URL
  259. * dest Field to write the array of strings to
  260. * path_length OPTIONAL (DEFAULT: 0) Number of leftmost subdirectories to include
  261. */
  262. getUrlDomain(item, config) {
  263. if (!(config.field in item)) {
  264. return null;
  265. }
  266. let url = new URL(item[config.field]);
  267. let domain = url.hostname.toLocaleLowerCase();
  268. if (domain.startsWith("www.")) {
  269. domain = domain.substring(4);
  270. }
  271. item[config.dest] = domain;
  272. let pathLength = 0;
  273. if ("path_length" in config) {
  274. pathLength = config.path_length;
  275. }
  276. if (pathLength > 0) {
  277. item[config.dest] += url.pathname.toLocaleLowerCase().split("/")
  278. .slice(0, pathLength + 1)
  279. .join("/");
  280. }
  281. return item;
  282. }
  283. /**
  284. * Splits a field into tokens.
  285. * Config:
  286. * field Field containing a string to tokenize
  287. * dest Field to write the array of strings to
  288. */
  289. tokenizeField(item, config) {
  290. if (!(config.field in item)) {
  291. return null;
  292. }
  293. item[config.dest] = tokenize(item[config.field]);
  294. return item;
  295. }
  296. /**
  297. * Deep copy from one field to another.
  298. * Config:
  299. * src Field to read from
  300. * dest Field to write to
  301. */
  302. copyValue(item, config) {
  303. if (!(config.src in item)) {
  304. return null;
  305. }
  306. item[config.dest] = JSON.parse(JSON.stringify(item[config.src]));
  307. return item;
  308. }
  309. /**
  310. * Converts a field containing a map of strings to a map of strings
  311. * to numbers, to a map of strings to numbers containing at most k elements.
  312. * This operation is performed by first, promoting all the subkeys up one
  313. * level, and then taking the top (or bottom) k values.
  314. *
  315. * Config:
  316. * field Points to a map of strings to a map of strings to numbers
  317. * k Maximum number of items to keep
  318. * descending OPTIONAL (DEFAULT: True) Sorts score in descending order
  319. * (i.e. keeps maximum)
  320. */
  321. keepTopK(item, config) {
  322. if (!(config.field in item)) {
  323. return null;
  324. }
  325. let k = this._lookupScalar(item, config.k, 1048576);
  326. let descending = (!("descending" in config) || (config.descending !== false));
  327. // we can't sort by the values in the map, so we have to convert this
  328. // to an array, and then sort.
  329. let sortable = [];
  330. Object.keys(item[config.field]).forEach(outerKey => {
  331. let innerType = this._typeOf(item[config.field][outerKey]);
  332. if (innerType === "map") {
  333. Object.keys(item[config.field][outerKey]).forEach(innerKey => {
  334. sortable.push({key: innerKey, value: item[config.field][outerKey][innerKey]});
  335. });
  336. } else {
  337. sortable.push({key: outerKey, value: item[config.field][outerKey]});
  338. }
  339. });
  340. sortable.sort((a, b) => {
  341. if (descending) {
  342. return b.value - a.value;
  343. }
  344. return a.value - b.value;
  345. });
  346. // now take the top k
  347. let newMap = {};
  348. let i = 0;
  349. for (let pair of sortable) {
  350. if (i >= k) {
  351. break;
  352. }
  353. newMap[pair.key] = pair.value;
  354. i++;
  355. }
  356. item[config.field] = newMap;
  357. return item;
  358. }
  359. /**
  360. * Scalar multiplies a vector by some constant
  361. *
  362. * Config:
  363. * field Points to:
  364. * a map of strings to numbers
  365. * an array of numbers
  366. * a number
  367. * k Either a number, or a string. If it's a number then This
  368. * is the scalar value to multiply by. If it's a string,
  369. * the value in the pointed to field is used.
  370. * default OPTIONAL (DEFAULT: 0), If k is a string, and no numeric
  371. * value is found, then use this value.
  372. */
  373. scalarMultiply(item, config) {
  374. if (!(config.field in item)) {
  375. return null;
  376. }
  377. let k = this._lookupScalar(item, config.k, config.dfault);
  378. let fieldType = this._typeOf(item[config.field]);
  379. if (fieldType === "number") {
  380. item[config.field] *= k;
  381. } else if (fieldType === "array") {
  382. for (let i = 0; i < item[config.field].length; i++) {
  383. item[config.field][i] *= k;
  384. }
  385. } else if (fieldType === "map") {
  386. Object.keys(item[config.field]).forEach(key => {
  387. item[config.field][key] *= k;
  388. });
  389. } else {
  390. return null;
  391. }
  392. return item;
  393. }
  394. /**
  395. * Elementwise multiplies either two maps or two arrays together, storing
  396. * the result in left. If left and right are of the same type, results in an
  397. * error.
  398. *
  399. * Maps are special case. For maps the left must be a nested map such as:
  400. * { k1: { k11: 1, k12: 2}, k2: { k21: 3, k22: 4 } } and right needs to be
  401. * simple map such as: { k1: 5, k2: 6} . The operation is then to mulitply
  402. * every value of every right key, to every value every subkey where the
  403. * parent keys match. Using the previous examples, the result would be:
  404. * { k1: { k11: 5, k12: 10 }, k2: { k21: 18, k22: 24 } } .
  405. *
  406. * Config:
  407. * left
  408. * right
  409. */
  410. elementwiseMultiply(item, config) {
  411. if (!(config.left in item) || !(config.right in item)) {
  412. return null;
  413. }
  414. let leftType = this._typeOf(item[config.left]);
  415. if (leftType !== this._typeOf(item[config.right])) {
  416. return null;
  417. }
  418. if (leftType === "array") {
  419. if (item[config.left].length !== item[config.right].length) {
  420. return null;
  421. }
  422. for (let i = 0; i < item[config.left].length; i++) {
  423. item[config.left][i] *= item[config.right][i];
  424. }
  425. } else if (leftType === "map") {
  426. Object.keys(item[config.left]).forEach(outerKey => {
  427. let r = 0.0;
  428. if (outerKey in item[config.right]) {
  429. r = item[config.right][outerKey];
  430. }
  431. Object.keys(item[config.left][outerKey]).forEach(innerKey => {
  432. item[config.left][outerKey][innerKey] *= r;
  433. });
  434. });
  435. } else if (leftType === "number") {
  436. item[config.left] *= item[config.right];
  437. } else {
  438. return null;
  439. }
  440. return item;
  441. }
  442. /**
  443. * Vector multiplies (i.e. dot products) two vectors and stores the result in
  444. * third field. Both vectors must either by maps, or arrays of numbers with
  445. * the same length.
  446. *
  447. * Config:
  448. * left A field pointing to either a map of strings to numbers,
  449. * or an array of numbers
  450. * right A field pointing to either a map of strings to numbers,
  451. * or an array of numbers
  452. * dest The field to store the dot product.
  453. */
  454. vectorMultiply(item, config) {
  455. if (!(config.left in item) || !(config.right in item)) {
  456. return null;
  457. }
  458. let leftType = this._typeOf(item[config.left]);
  459. if (leftType !== this._typeOf(item[config.right])) {
  460. return null;
  461. }
  462. let destVal = 0.0;
  463. if (leftType === "array") {
  464. if (item[config.left].length !== item[config.right].length) {
  465. return null;
  466. }
  467. for (let i = 0; i < item[config.left].length; i++) {
  468. destVal += item[config.left][i] * item[config.right][i];
  469. }
  470. } else if (leftType === "map") {
  471. Object.keys(item[config.left]).forEach(key => {
  472. if (key in item[config.right]) {
  473. destVal += item[config.left][key] * item[config.right][key];
  474. }
  475. });
  476. } else {
  477. return null;
  478. }
  479. item[config.dest] = destVal;
  480. return item;
  481. }
  482. /**
  483. * Adds a constant value to all elements in the field. Mathematically,
  484. * this is the same as taking a 1-vector, scalar multiplying it by k,
  485. * and then vector adding it to a field.
  486. *
  487. * Config:
  488. * field A field pointing to either a map of strings to numbers,
  489. * or an array of numbers
  490. * k Either a number, or a string. If it's a number then This
  491. * is the scalar value to multiply by. If it's a string,
  492. * the value in the pointed to field is used.
  493. * default OPTIONAL (DEFAULT: 0), If k is a string, and no numeric
  494. * value is found, then use this value.
  495. */
  496. scalarAdd(item, config) {
  497. let k = this._lookupScalar(item, config.k, config.dfault);
  498. if (!(config.field in item)) {
  499. return null;
  500. }
  501. let fieldType = this._typeOf(item[config.field]);
  502. if (fieldType === "array") {
  503. for (let i = 0; i < item[config.field].length; i++) {
  504. item[config.field][i] += k;
  505. }
  506. } else if (fieldType === "map") {
  507. Object.keys(item[config.field]).forEach(key => {
  508. item[config.field][key] += k;
  509. });
  510. } else if (fieldType === "number") {
  511. item[config.field] += k;
  512. } else {
  513. return null;
  514. }
  515. return item;
  516. }
  517. /**
  518. * Adds two vectors together and stores the result in left.
  519. *
  520. * Config:
  521. * left A field pointing to either a map of strings to numbers,
  522. * or an array of numbers
  523. * right A field pointing to either a map of strings to numbers,
  524. * or an array of numbers
  525. */
  526. vectorAdd(item, config) {
  527. if (!(config.left in item)) {
  528. return this.copyValue(item, {src: config.right, dest: config.left});
  529. }
  530. if (!(config.right in item)) {
  531. return null;
  532. }
  533. let leftType = this._typeOf(item[config.left]);
  534. if (leftType !== this._typeOf(item[config.right])) {
  535. return null;
  536. }
  537. if (leftType === "array") {
  538. if (item[config.left].length !== item[config.right].length) {
  539. return null;
  540. }
  541. for (let i = 0; i < item[config.left].length; i++) {
  542. item[config.left][i] += item[config.right][i];
  543. }
  544. return item;
  545. } else if (leftType === "map") {
  546. Object.keys(item[config.right]).forEach(key => {
  547. let v = 0;
  548. if (key in item[config.left]) {
  549. v = item[config.left][key];
  550. }
  551. item[config.left][key] = v + item[config.right][key];
  552. });
  553. return item;
  554. }
  555. return null;
  556. }
  557. /**
  558. * Converts a vector from real values to boolean integers. (i.e. either 1/0
  559. * or 1/-1).
  560. *
  561. * Config:
  562. * field Field containing either a mpa of strings to numbers or
  563. * an array of numbers to convert.
  564. * threshold OPTIONAL (DEFAULT: 0) Values above this will be replaced
  565. * with 1.0. Those below will be converted to 0.
  566. * keep_negative OPTIONAL (DEFAULT: False) If true, values below the
  567. * threshold will be converted to -1 instead of 0.
  568. */
  569. makeBoolean(item, config) {
  570. if (!(config.field in item)) {
  571. return null;
  572. }
  573. let threshold = this._lookupScalar(item, config.threshold, 0.0);
  574. let type = this._typeOf(item[config.field]);
  575. if (type === "array") {
  576. for (let i = 0; i < item[config.field].length; i++) {
  577. if (item[config.field][i] > threshold) {
  578. item[config.field][i] = 1.0;
  579. } else if (config.keep_negative) {
  580. item[config.field][i] = -1.0;
  581. } else {
  582. item[config.field][i] = 0.0;
  583. }
  584. }
  585. } else if (type === "map") {
  586. Object.keys(item[config.field]).forEach(key => {
  587. let value = item[config.field][key];
  588. if (value > threshold) {
  589. item[config.field][key] = 1.0;
  590. } else if (config.keep_negative) {
  591. item[config.field][key] = -1.0;
  592. } else {
  593. item[config.field][key] = 0.0;
  594. }
  595. });
  596. } else if (type === "number") {
  597. let value = item[config.field];
  598. if (value > threshold) {
  599. item[config.field] = 1.0;
  600. } else if (config.keep_negative) {
  601. item[config.field] = -1.0;
  602. } else {
  603. item[config.field] = 0.0;
  604. }
  605. } else {
  606. return null;
  607. }
  608. return item;
  609. }
  610. /**
  611. * Removes all keys from the item except for the ones specified.
  612. *
  613. * fields An array of strings indicating the fields to keep
  614. */
  615. whitelistFields(item, config) {
  616. let newItem = {};
  617. for (let ele of config.fields) {
  618. if (ele in item) {
  619. newItem[ele] = item[ele];
  620. }
  621. }
  622. return newItem;
  623. }
  624. /**
  625. * Removes all keys whose value does not exceed some threshold.
  626. *
  627. * Config:
  628. * field Points to a map of strings to numbers
  629. * threshold Values must exceed this value, otherwise they are removed.
  630. */
  631. filterByValue(item, config) {
  632. if (!(config.field in item)) {
  633. return null;
  634. }
  635. let threshold = this._lookupScalar(item, config.threshold, 0.0);
  636. let filtered = {};
  637. Object.keys(item[config.field]).forEach(key => {
  638. let value = item[config.field][key];
  639. if (value > threshold) {
  640. filtered[key] = value;
  641. }
  642. });
  643. item[config.field] = filtered;
  644. return item;
  645. }
  646. /**
  647. * Rewrites a field so that its values are now L2 normed.
  648. *
  649. * Config:
  650. * field Points to a map of strings to numbers, or an array of numbers
  651. */
  652. l2Normalize(item, config) {
  653. if (!(config.field in item)) {
  654. return null;
  655. }
  656. let data = item[config.field];
  657. let type = this._typeOf(data);
  658. if (type === "array") {
  659. let norm = 0.0;
  660. for (let datum of data) {
  661. norm += datum * datum;
  662. }
  663. norm = Math.sqrt(norm);
  664. if (norm !== 0) {
  665. for (let i = 0; i < data.length; i++) {
  666. data[i] /= norm;
  667. }
  668. }
  669. } else if (type === "map") {
  670. let norm = 0.0;
  671. Object.keys(data).forEach(key => {
  672. norm += data[key] * data[key];
  673. });
  674. norm = Math.sqrt(norm);
  675. if (norm !== 0) {
  676. Object.keys(data).forEach(key => {
  677. data[key] /= norm;
  678. });
  679. }
  680. } else {
  681. return null;
  682. }
  683. item[config.field] = data;
  684. return item;
  685. }
  686. /**
  687. * Rewrites a field so that all of its values sum to 1.0
  688. *
  689. * Config:
  690. * field Points to a map of strings to numbers, or an array of numbers
  691. */
  692. probNormalize(item, config) {
  693. if (!(config.field in item)) {
  694. return null;
  695. }
  696. let data = item[config.field];
  697. let type = this._typeOf(data);
  698. if (type === "array") {
  699. let norm = 0.0;
  700. for (let datum of data) {
  701. norm += datum;
  702. }
  703. if (norm !== 0) {
  704. for (let i = 0; i < data.length; i++) {
  705. data[i] /= norm;
  706. }
  707. }
  708. } else if (type === "map") {
  709. let norm = 0.0;
  710. Object.keys(item[config.field]).forEach(key => {
  711. norm += item[config.field][key];
  712. });
  713. if (norm !== 0) {
  714. Object.keys(item[config.field]).forEach(key => {
  715. item[config.field][key] /= norm;
  716. });
  717. }
  718. } else {
  719. return null;
  720. }
  721. return item;
  722. }
  723. /**
  724. * Stores a value, if it is not already present
  725. *
  726. * Config:
  727. * field field to write to if it is missing
  728. * value value to store in that field
  729. */
  730. setDefault(item, config) {
  731. let val = this._lookupScalar(item, config.value, config.value);
  732. if (!(config.field in item)) {
  733. item[config.field] = val;
  734. }
  735. return item;
  736. }
  737. /**
  738. * Selctively promotes an value from an inner map up to the outer map
  739. *
  740. * Config:
  741. * haystack Points to a map of strings to values
  742. * needle Key inside the map we should promote up
  743. * dest Where we should write the value of haystack[needle]
  744. */
  745. lookupValue(item, config) {
  746. if ((config.haystack in item) && (config.needle in item[config.haystack])) {
  747. item[config.dest] = item[config.haystack][config.needle];
  748. }
  749. return item;
  750. }
  751. /**
  752. * Demotes a field into a map
  753. *
  754. * Config:
  755. * src Field to copy
  756. * dest_map Points to a map
  757. * dest_key Key inside dest_map to copy src to
  758. */
  759. copyToMap(item, config) {
  760. if (config.src in item) {
  761. if (!(config.dest_map in item)) {
  762. item[config.dest_map] = {};
  763. }
  764. item[config.dest_map][config.dest_key] = item[config.src];
  765. }
  766. return item;
  767. }
  768. /**
  769. * Config:
  770. * field Points to a string to number map
  771. * k Scalar to multiply the values by
  772. * log_scale Boolean, if true, then the values will be transformed
  773. * by a logrithm prior to multiplications
  774. */
  775. scalarMultiplyTag(item, config) {
  776. let EPSILON = 0.000001;
  777. if (!(config.field in item)) {
  778. return null;
  779. }
  780. let k = this._lookupScalar(item, config.k, 1);
  781. let type = this._typeOf(item[config.field]);
  782. if (type === "map") {
  783. Object.keys(item[config.field]).forEach(parentKey => {
  784. Object.keys(item[config.field][parentKey]).forEach(key => {
  785. let v = item[config.field][parentKey][key];
  786. if (config.log_scale) {
  787. v = Math.log(v + EPSILON);
  788. }
  789. item[config.field][parentKey][key] = v * k;
  790. });
  791. });
  792. } else {
  793. return null;
  794. }
  795. return item;
  796. }
  797. /**
  798. * Independently applies softmax across all subtags.
  799. *
  800. * Config:
  801. * field Points to a map of strings with values being another map of strings
  802. */
  803. applySoftmaxTags(item, config) {
  804. let type = this._typeOf(item[config.field]);
  805. if (type !== "map") {
  806. return null;
  807. }
  808. let abort = false;
  809. let softmaxSum = {};
  810. Object.keys(item[config.field]).forEach(tag => {
  811. if (this._typeOf(item[config.field][tag]) !== "map") {
  812. abort = true;
  813. return;
  814. }
  815. if (abort) {
  816. return;
  817. }
  818. softmaxSum[tag] = 0;
  819. Object.keys(item[config.field][tag]).forEach(subtag => {
  820. if (this._typeOf(item[config.field][tag][subtag]) !== "number") {
  821. abort = true;
  822. return;
  823. }
  824. let score = item[config.field][tag][subtag];
  825. softmaxSum[tag] += Math.exp(score);
  826. });
  827. });
  828. if (abort) {
  829. return null;
  830. }
  831. Object.keys(item[config.field]).forEach(tag => {
  832. Object.keys(item[config.field][tag]).forEach(subtag => {
  833. item[config.field][tag][subtag] = Math.exp(item[config.field][tag][subtag]) / softmaxSum[tag];
  834. });
  835. });
  836. return item;
  837. }
  838. /**
  839. * Vector adds a field and stores the result in left.
  840. *
  841. * Config:
  842. * field The field to vector add
  843. */
  844. combinerAdd(left, right, config) {
  845. if (!(config.field in right)) {
  846. return left;
  847. }
  848. let type = this._typeOf(right[config.field]);
  849. if (!(config.field in left)) {
  850. if (type === "map") {
  851. left[config.field] = {};
  852. } else if (type === "array") {
  853. left[config.field] = [];
  854. } else if (type === "number") {
  855. left[config.field] = 0;
  856. } else {
  857. return null;
  858. }
  859. }
  860. if (type !== this._typeOf(left[config.field])) {
  861. return null;
  862. }
  863. if (type === "map") {
  864. Object.keys(right[config.field]).forEach(key => {
  865. if (!(key in left[config.field])) {
  866. left[config.field][key] = 0;
  867. }
  868. left[config.field][key] += right[config.field][key];
  869. });
  870. } else if (type === "array") {
  871. for (let i = 0; i < right[config.field].length; i++) {
  872. if (i < left[config.field].length) {
  873. left[config.field][i] += right[config.field][i];
  874. } else {
  875. left[config.field].push(right[config.field][i]);
  876. }
  877. }
  878. } else if (type === "number") {
  879. left[config.field] += right[config.field];
  880. } else {
  881. return null;
  882. }
  883. return left;
  884. }
  885. /**
  886. * Stores the maximum value of the field in left.
  887. *
  888. * Config:
  889. * field The field to vector add
  890. */
  891. combinerMax(left, right, config) {
  892. if (!(config.field in right)) {
  893. return left;
  894. }
  895. let type = this._typeOf(right[config.field]);
  896. if (!(config.field in left)) {
  897. if (type === "map") {
  898. left[config.field] = {};
  899. } else if (type === "array") {
  900. left[config.field] = [];
  901. } else if (type === "number") {
  902. left[config.field] = 0;
  903. } else {
  904. return null;
  905. }
  906. }
  907. if (type !== this._typeOf(left[config.field])) {
  908. return null;
  909. }
  910. if (type === "map") {
  911. Object.keys(right[config.field]).forEach(key => {
  912. if (!(key in left[config.field]) ||
  913. (right[config.field][key] > left[config.field][key])) {
  914. left[config.field][key] = right[config.field][key];
  915. }
  916. });
  917. } else if (type === "array") {
  918. for (let i = 0; i < right[config.field].length; i++) {
  919. if (i < left[config.field].length) {
  920. if (left[config.field][i] < right[config.field][i]) {
  921. left[config.field][i] = right[config.field][i];
  922. }
  923. } else {
  924. left[config.field].push(right[config.field][i]);
  925. }
  926. }
  927. } else if (type === "number") {
  928. if (left[config.field] < right[config.field]) {
  929. left[config.field] = right[config.field];
  930. }
  931. } else {
  932. return null;
  933. }
  934. return left;
  935. }
  936. /**
  937. * Associates a value in right with another value in right. This association
  938. * is then stored in a map in left.
  939. *
  940. * For example: If a sequence of rights is:
  941. * { 'tags': {}, 'url_domain': 'maseratiusa.com/maserati', 'time': 41 }
  942. * { 'tags': {}, 'url_domain': 'mbusa.com/mercedes', 'time': 21 }
  943. * { 'tags': {}, 'url_domain': 'maseratiusa.com/maserati', 'time': 34 }
  944. *
  945. * Then assuming a 'sum' operation, left can build a map that would look like:
  946. * {
  947. * 'maseratiusa.com/maserati': 75,
  948. * 'mbusa.com/mercedes': 21,
  949. * }
  950. *
  951. * Fields:
  952. * left_field field in the left to store / update the map
  953. * right_key_field Field in the right to use as a key
  954. * right_value_field Field in the right to use as a value
  955. * operation One of "sum", "max", "overwrite", "count"
  956. */
  957. combinerCollectValues(left, right, config) {
  958. let op;
  959. if (config.operation === "sum") {
  960. op = (a, b) => a + b;
  961. } else if (config.operation === "max") {
  962. op = (a, b) => ((a > b) ? a : b);
  963. } else if (config.operation === "overwrite") {
  964. op = (a, b) => b;
  965. } else if (config.operation === "count") {
  966. op = (a, b) => a + 1;
  967. } else {
  968. return null;
  969. }
  970. if (!(config.left_field in left)) {
  971. left[config.left_field] = {};
  972. }
  973. if ((!(config.right_key_field in right)) || (!(config.right_value_field in right))) {
  974. return left;
  975. }
  976. let key = right[config.right_key_field];
  977. let rightValue = right[config.right_value_field];
  978. let leftValue = 0.0;
  979. if (key in left[config.left_field]) {
  980. leftValue = left[config.left_field][key];
  981. }
  982. left[config.left_field][key] = op(leftValue, rightValue);
  983. return left;
  984. }
  985. /**
  986. * Executes a recipe. Returns an object on success, or null on failure.
  987. */
  988. executeRecipe(item, recipe) {
  989. let newItem = item;
  990. for (let step of recipe) {
  991. let op = this.ITEM_BUILDER_REGISTRY[step.function];
  992. if (op === undefined) {
  993. return null;
  994. }
  995. newItem = op.call(this, newItem, step);
  996. if (newItem === null) {
  997. break;
  998. }
  999. }
  1000. return newItem;
  1001. }
  1002. /**
  1003. * Executes a recipe. Returns an object on success, or null on failure.
  1004. */
  1005. executeCombinerRecipe(item1, item2, recipe) {
  1006. let newItem1 = item1;
  1007. for (let step of recipe) {
  1008. let op = this.ITEM_COMBINER_REGISTRY[step.function];
  1009. if (op === undefined) {
  1010. return null;
  1011. }
  1012. newItem1 = op.call(this, newItem1, item2, step);
  1013. if (newItem1 === null) {
  1014. break;
  1015. }
  1016. }
  1017. return newItem1;
  1018. }
  1019. };
  1020. const EXPORTED_SYMBOLS = ["RecipeExecutor"];