mapper.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100
  1. /*
  2. * Ceph - scalable distributed file system
  3. *
  4. * Copyright (C) 2015 Intel Corporation All Rights Reserved
  5. *
  6. * This is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License version 2.1, as published by the Free Software
  9. * Foundation. See file COPYING.
  10. *
  11. */
  12. #ifdef __KERNEL__
  13. # include <linux/string.h>
  14. # include <linux/slab.h>
  15. # include <linux/bug.h>
  16. # include <linux/kernel.h>
  17. # include <linux/crush/crush.h>
  18. # include <linux/crush/hash.h>
  19. # include <linux/crush/mapper.h>
  20. #else
  21. # include "crush_compat.h"
  22. # include "crush.h"
  23. # include "hash.h"
  24. # include "mapper.h"
  25. #endif
  26. #include "crush_ln_table.h"
  27. #define dprintk(args...) /* printf(args) */
  28. /*
  29. * Implement the core CRUSH mapping algorithm.
  30. */
  31. /**
  32. * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
  33. * @map: the crush_map
  34. * @ruleset: the storage ruleset id (user defined)
  35. * @type: storage ruleset type (user defined)
  36. * @size: output set size
  37. */
  38. int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
  39. {
  40. __u32 i;
  41. for (i = 0; i < map->max_rules; i++) {
  42. if (map->rules[i] &&
  43. map->rules[i]->mask.ruleset == ruleset &&
  44. map->rules[i]->mask.type == type &&
  45. map->rules[i]->mask.min_size <= size &&
  46. map->rules[i]->mask.max_size >= size)
  47. return i;
  48. }
  49. return -1;
  50. }
  51. /*
  52. * bucket choose methods
  53. *
  54. * For each bucket algorithm, we have a "choose" method that, given a
  55. * crush input @x and replica position (usually, position in output set) @r,
  56. * will produce an item in the bucket.
  57. */
  58. /*
  59. * Choose based on a random permutation of the bucket.
  60. *
  61. * We used to use some prime number arithmetic to do this, but it
  62. * wasn't very random, and had some other bad behaviors. Instead, we
  63. * calculate an actual random permutation of the bucket members.
  64. * Since this is expensive, we optimize for the r=0 case, which
  65. * captures the vast majority of calls.
  66. */
  67. static int bucket_perm_choose(const struct crush_bucket *bucket,
  68. struct crush_work_bucket *work,
  69. int x, int r)
  70. {
  71. unsigned int pr = r % bucket->size;
  72. unsigned int i, s;
  73. /* start a new permutation if @x has changed */
  74. if (work->perm_x != (__u32)x || work->perm_n == 0) {
  75. dprintk("bucket %d new x=%d\n", bucket->id, x);
  76. work->perm_x = x;
  77. /* optimize common r=0 case */
  78. if (pr == 0) {
  79. s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
  80. bucket->size;
  81. work->perm[0] = s;
  82. work->perm_n = 0xffff; /* magic value, see below */
  83. goto out;
  84. }
  85. for (i = 0; i < bucket->size; i++)
  86. work->perm[i] = i;
  87. work->perm_n = 0;
  88. } else if (work->perm_n == 0xffff) {
  89. /* clean up after the r=0 case above */
  90. for (i = 1; i < bucket->size; i++)
  91. work->perm[i] = i;
  92. work->perm[work->perm[0]] = 0;
  93. work->perm_n = 1;
  94. }
  95. /* calculate permutation up to pr */
  96. for (i = 0; i < work->perm_n; i++)
  97. dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
  98. while (work->perm_n <= pr) {
  99. unsigned int p = work->perm_n;
  100. /* no point in swapping the final entry */
  101. if (p < bucket->size - 1) {
  102. i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
  103. (bucket->size - p);
  104. if (i) {
  105. unsigned int t = work->perm[p + i];
  106. work->perm[p + i] = work->perm[p];
  107. work->perm[p] = t;
  108. }
  109. dprintk(" perm_choose swap %d with %d\n", p, p+i);
  110. }
  111. work->perm_n++;
  112. }
  113. for (i = 0; i < bucket->size; i++)
  114. dprintk(" perm_choose %d: %d\n", i, work->perm[i]);
  115. s = work->perm[pr];
  116. out:
  117. dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
  118. bucket->size, x, r, pr, s);
  119. return bucket->items[s];
  120. }
  121. /* uniform */
  122. static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
  123. struct crush_work_bucket *work, int x, int r)
  124. {
  125. return bucket_perm_choose(&bucket->h, work, x, r);
  126. }
  127. /* list */
  128. static int bucket_list_choose(const struct crush_bucket_list *bucket,
  129. int x, int r)
  130. {
  131. int i;
  132. for (i = bucket->h.size-1; i >= 0; i--) {
  133. __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
  134. r, bucket->h.id);
  135. w &= 0xffff;
  136. dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
  137. "sw %x rand %llx",
  138. i, x, r, bucket->h.items[i], bucket->item_weights[i],
  139. bucket->sum_weights[i], w);
  140. w *= bucket->sum_weights[i];
  141. w = w >> 16;
  142. /*dprintk(" scaled %llx\n", w);*/
  143. if (w < bucket->item_weights[i]) {
  144. return bucket->h.items[i];
  145. }
  146. }
  147. dprintk("bad list sums for bucket %d\n", bucket->h.id);
  148. return bucket->h.items[0];
  149. }
  150. /* (binary) tree */
  151. static int height(int n)
  152. {
  153. int h = 0;
  154. while ((n & 1) == 0) {
  155. h++;
  156. n = n >> 1;
  157. }
  158. return h;
  159. }
  160. static int left(int x)
  161. {
  162. int h = height(x);
  163. return x - (1 << (h-1));
  164. }
  165. static int right(int x)
  166. {
  167. int h = height(x);
  168. return x + (1 << (h-1));
  169. }
  170. static int terminal(int x)
  171. {
  172. return x & 1;
  173. }
  174. static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
  175. int x, int r)
  176. {
  177. int n;
  178. __u32 w;
  179. __u64 t;
  180. /* start at root */
  181. n = bucket->num_nodes >> 1;
  182. while (!terminal(n)) {
  183. int l;
  184. /* pick point in [0, w) */
  185. w = bucket->node_weights[n];
  186. t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
  187. bucket->h.id) * (__u64)w;
  188. t = t >> 32;
  189. /* descend to the left or right? */
  190. l = left(n);
  191. if (t < bucket->node_weights[l])
  192. n = l;
  193. else
  194. n = right(n);
  195. }
  196. return bucket->h.items[n >> 1];
  197. }
  198. /* straw */
  199. static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
  200. int x, int r)
  201. {
  202. __u32 i;
  203. int high = 0;
  204. __u64 high_draw = 0;
  205. __u64 draw;
  206. for (i = 0; i < bucket->h.size; i++) {
  207. draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
  208. draw &= 0xffff;
  209. draw *= bucket->straws[i];
  210. if (i == 0 || draw > high_draw) {
  211. high = i;
  212. high_draw = draw;
  213. }
  214. }
  215. return bucket->h.items[high];
  216. }
  217. /* compute 2^44*log2(input+1) */
  218. static __u64 crush_ln(unsigned int xin)
  219. {
  220. unsigned int x = xin;
  221. int iexpon, index1, index2;
  222. __u64 RH, LH, LL, xl64, result;
  223. x++;
  224. /* normalize input */
  225. iexpon = 15;
  226. /*
  227. * figure out number of bits we need to shift and
  228. * do it in one step instead of iteratively
  229. */
  230. if (!(x & 0x18000)) {
  231. int bits = __builtin_clz(x & 0x1FFFF) - 16;
  232. x <<= bits;
  233. iexpon = 15 - bits;
  234. }
  235. index1 = (x >> 8) << 1;
  236. /* RH ~ 2^56/index1 */
  237. RH = __RH_LH_tbl[index1 - 256];
  238. /* LH ~ 2^48 * log2(index1/256) */
  239. LH = __RH_LH_tbl[index1 + 1 - 256];
  240. /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
  241. xl64 = (__s64)x * RH;
  242. xl64 >>= 48;
  243. result = iexpon;
  244. result <<= (12 + 32);
  245. index2 = xl64 & 0xff;
  246. /* LL ~ 2^48*log2(1.0+index2/2^15) */
  247. LL = __LL_tbl[index2];
  248. LH = LH + LL;
  249. LH >>= (48 - 12 - 32);
  250. result += LH;
  251. return result;
  252. }
  253. /*
  254. * straw2
  255. *
  256. * for reference, see:
  257. *
  258. * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
  259. *
  260. */
  261. static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
  262. const struct crush_choose_arg *arg,
  263. int position)
  264. {
  265. if (!arg || !arg->weight_set)
  266. return bucket->item_weights;
  267. if (position >= arg->weight_set_size)
  268. position = arg->weight_set_size - 1;
  269. return arg->weight_set[position].weights;
  270. }
  271. static __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
  272. const struct crush_choose_arg *arg)
  273. {
  274. if (!arg || !arg->ids)
  275. return bucket->h.items;
  276. return arg->ids;
  277. }
  278. static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
  279. int x, int r,
  280. const struct crush_choose_arg *arg,
  281. int position)
  282. {
  283. unsigned int i, high = 0;
  284. unsigned int u;
  285. __s64 ln, draw, high_draw = 0;
  286. __u32 *weights = get_choose_arg_weights(bucket, arg, position);
  287. __s32 *ids = get_choose_arg_ids(bucket, arg);
  288. for (i = 0; i < bucket->h.size; i++) {
  289. dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
  290. if (weights[i]) {
  291. u = crush_hash32_3(bucket->h.hash, x, ids[i], r);
  292. u &= 0xffff;
  293. /*
  294. * for some reason slightly less than 0x10000 produces
  295. * a slightly more accurate distribution... probably a
  296. * rounding effect.
  297. *
  298. * the natural log lookup table maps [0,0xffff]
  299. * (corresponding to real numbers [1/0x10000, 1] to
  300. * [0, 0xffffffffffff] (corresponding to real numbers
  301. * [-11.090355,0]).
  302. */
  303. ln = crush_ln(u) - 0x1000000000000ll;
  304. /*
  305. * divide by 16.16 fixed-point weight. note
  306. * that the ln value is negative, so a larger
  307. * weight means a larger (less negative) value
  308. * for draw.
  309. */
  310. draw = div64_s64(ln, weights[i]);
  311. } else {
  312. draw = S64_MIN;
  313. }
  314. if (i == 0 || draw > high_draw) {
  315. high = i;
  316. high_draw = draw;
  317. }
  318. }
  319. return bucket->h.items[high];
  320. }
  321. static int crush_bucket_choose(const struct crush_bucket *in,
  322. struct crush_work_bucket *work,
  323. int x, int r,
  324. const struct crush_choose_arg *arg,
  325. int position)
  326. {
  327. dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
  328. BUG_ON(in->size == 0);
  329. switch (in->alg) {
  330. case CRUSH_BUCKET_UNIFORM:
  331. return bucket_uniform_choose(
  332. (const struct crush_bucket_uniform *)in,
  333. work, x, r);
  334. case CRUSH_BUCKET_LIST:
  335. return bucket_list_choose((const struct crush_bucket_list *)in,
  336. x, r);
  337. case CRUSH_BUCKET_TREE:
  338. return bucket_tree_choose((const struct crush_bucket_tree *)in,
  339. x, r);
  340. case CRUSH_BUCKET_STRAW:
  341. return bucket_straw_choose(
  342. (const struct crush_bucket_straw *)in,
  343. x, r);
  344. case CRUSH_BUCKET_STRAW2:
  345. return bucket_straw2_choose(
  346. (const struct crush_bucket_straw2 *)in,
  347. x, r, arg, position);
  348. default:
  349. dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
  350. return in->items[0];
  351. }
  352. }
  353. /*
  354. * true if device is marked "out" (failed, fully offloaded)
  355. * of the cluster
  356. */
  357. static int is_out(const struct crush_map *map,
  358. const __u32 *weight, int weight_max,
  359. int item, int x)
  360. {
  361. if (item >= weight_max)
  362. return 1;
  363. if (weight[item] >= 0x10000)
  364. return 0;
  365. if (weight[item] == 0)
  366. return 1;
  367. if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
  368. < weight[item])
  369. return 0;
  370. return 1;
  371. }
  372. /**
  373. * crush_choose_firstn - choose numrep distinct items of given type
  374. * @map: the crush_map
  375. * @bucket: the bucket we are choose an item from
  376. * @x: crush input value
  377. * @numrep: the number of items to choose
  378. * @type: the type of item to choose
  379. * @out: pointer to output vector
  380. * @outpos: our position in that vector
  381. * @out_size: size of the out vector
  382. * @tries: number of attempts to make
  383. * @recurse_tries: number of attempts to have recursive chooseleaf make
  384. * @local_retries: localized retries
  385. * @local_fallback_retries: localized fallback retries
  386. * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
  387. * @stable: stable mode starts rep=0 in the recursive call for all replicas
  388. * @vary_r: pass r to recursive calls
  389. * @out2: second output vector for leaf items (if @recurse_to_leaf)
  390. * @parent_r: r value passed from the parent
  391. */
  392. static int crush_choose_firstn(const struct crush_map *map,
  393. struct crush_work *work,
  394. const struct crush_bucket *bucket,
  395. const __u32 *weight, int weight_max,
  396. int x, int numrep, int type,
  397. int *out, int outpos,
  398. int out_size,
  399. unsigned int tries,
  400. unsigned int recurse_tries,
  401. unsigned int local_retries,
  402. unsigned int local_fallback_retries,
  403. int recurse_to_leaf,
  404. unsigned int vary_r,
  405. unsigned int stable,
  406. int *out2,
  407. int parent_r,
  408. const struct crush_choose_arg *choose_args)
  409. {
  410. int rep;
  411. unsigned int ftotal, flocal;
  412. int retry_descent, retry_bucket, skip_rep;
  413. const struct crush_bucket *in = bucket;
  414. int r;
  415. int i;
  416. int item = 0;
  417. int itemtype;
  418. int collide, reject;
  419. int count = out_size;
  420. dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
  421. recurse_to_leaf ? "_LEAF" : "",
  422. bucket->id, x, outpos, numrep,
  423. tries, recurse_tries, local_retries, local_fallback_retries,
  424. parent_r, stable);
  425. for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
  426. /* keep trying until we get a non-out, non-colliding item */
  427. ftotal = 0;
  428. skip_rep = 0;
  429. do {
  430. retry_descent = 0;
  431. in = bucket; /* initial bucket */
  432. /* choose through intervening buckets */
  433. flocal = 0;
  434. do {
  435. collide = 0;
  436. retry_bucket = 0;
  437. r = rep + parent_r;
  438. /* r' = r + f_total */
  439. r += ftotal;
  440. /* bucket choose */
  441. if (in->size == 0) {
  442. reject = 1;
  443. goto reject;
  444. }
  445. if (local_fallback_retries > 0 &&
  446. flocal >= (in->size>>1) &&
  447. flocal > local_fallback_retries)
  448. item = bucket_perm_choose(
  449. in, work->work[-1-in->id],
  450. x, r);
  451. else
  452. item = crush_bucket_choose(
  453. in, work->work[-1-in->id],
  454. x, r,
  455. (choose_args ?
  456. &choose_args[-1-in->id] : NULL),
  457. outpos);
  458. if (item >= map->max_devices) {
  459. dprintk(" bad item %d\n", item);
  460. skip_rep = 1;
  461. break;
  462. }
  463. /* desired type? */
  464. if (item < 0)
  465. itemtype = map->buckets[-1-item]->type;
  466. else
  467. itemtype = 0;
  468. dprintk(" item %d type %d\n", item, itemtype);
  469. /* keep going? */
  470. if (itemtype != type) {
  471. if (item >= 0 ||
  472. (-1-item) >= map->max_buckets) {
  473. dprintk(" bad item type %d\n", type);
  474. skip_rep = 1;
  475. break;
  476. }
  477. in = map->buckets[-1-item];
  478. retry_bucket = 1;
  479. continue;
  480. }
  481. /* collision? */
  482. for (i = 0; i < outpos; i++) {
  483. if (out[i] == item) {
  484. collide = 1;
  485. break;
  486. }
  487. }
  488. reject = 0;
  489. if (!collide && recurse_to_leaf) {
  490. if (item < 0) {
  491. int sub_r;
  492. if (vary_r)
  493. sub_r = r >> (vary_r-1);
  494. else
  495. sub_r = 0;
  496. if (crush_choose_firstn(
  497. map,
  498. work,
  499. map->buckets[-1-item],
  500. weight, weight_max,
  501. x, stable ? 1 : outpos+1, 0,
  502. out2, outpos, count,
  503. recurse_tries, 0,
  504. local_retries,
  505. local_fallback_retries,
  506. 0,
  507. vary_r,
  508. stable,
  509. NULL,
  510. sub_r,
  511. choose_args) <= outpos)
  512. /* didn't get leaf */
  513. reject = 1;
  514. } else {
  515. /* we already have a leaf! */
  516. out2[outpos] = item;
  517. }
  518. }
  519. if (!reject && !collide) {
  520. /* out? */
  521. if (itemtype == 0)
  522. reject = is_out(map, weight,
  523. weight_max,
  524. item, x);
  525. }
  526. reject:
  527. if (reject || collide) {
  528. ftotal++;
  529. flocal++;
  530. if (collide && flocal <= local_retries)
  531. /* retry locally a few times */
  532. retry_bucket = 1;
  533. else if (local_fallback_retries > 0 &&
  534. flocal <= in->size + local_fallback_retries)
  535. /* exhaustive bucket search */
  536. retry_bucket = 1;
  537. else if (ftotal < tries)
  538. /* then retry descent */
  539. retry_descent = 1;
  540. else
  541. /* else give up */
  542. skip_rep = 1;
  543. dprintk(" reject %d collide %d "
  544. "ftotal %u flocal %u\n",
  545. reject, collide, ftotal,
  546. flocal);
  547. }
  548. } while (retry_bucket);
  549. } while (retry_descent);
  550. if (skip_rep) {
  551. dprintk("skip rep\n");
  552. continue;
  553. }
  554. dprintk("CHOOSE got %d\n", item);
  555. out[outpos] = item;
  556. outpos++;
  557. count--;
  558. #ifndef __KERNEL__
  559. if (map->choose_tries && ftotal <= map->choose_total_tries)
  560. map->choose_tries[ftotal]++;
  561. #endif
  562. }
  563. dprintk("CHOOSE returns %d\n", outpos);
  564. return outpos;
  565. }
  566. /**
  567. * crush_choose_indep: alternative breadth-first positionally stable mapping
  568. *
  569. */
  570. static void crush_choose_indep(const struct crush_map *map,
  571. struct crush_work *work,
  572. const struct crush_bucket *bucket,
  573. const __u32 *weight, int weight_max,
  574. int x, int left, int numrep, int type,
  575. int *out, int outpos,
  576. unsigned int tries,
  577. unsigned int recurse_tries,
  578. int recurse_to_leaf,
  579. int *out2,
  580. int parent_r,
  581. const struct crush_choose_arg *choose_args)
  582. {
  583. const struct crush_bucket *in = bucket;
  584. int endpos = outpos + left;
  585. int rep;
  586. unsigned int ftotal;
  587. int r;
  588. int i;
  589. int item = 0;
  590. int itemtype;
  591. int collide;
  592. dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
  593. bucket->id, x, outpos, numrep);
  594. /* initially my result is undefined */
  595. for (rep = outpos; rep < endpos; rep++) {
  596. out[rep] = CRUSH_ITEM_UNDEF;
  597. if (out2)
  598. out2[rep] = CRUSH_ITEM_UNDEF;
  599. }
  600. for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
  601. #ifdef DEBUG_INDEP
  602. if (out2 && ftotal) {
  603. dprintk("%u %d a: ", ftotal, left);
  604. for (rep = outpos; rep < endpos; rep++) {
  605. dprintk(" %d", out[rep]);
  606. }
  607. dprintk("\n");
  608. dprintk("%u %d b: ", ftotal, left);
  609. for (rep = outpos; rep < endpos; rep++) {
  610. dprintk(" %d", out2[rep]);
  611. }
  612. dprintk("\n");
  613. }
  614. #endif
  615. for (rep = outpos; rep < endpos; rep++) {
  616. if (out[rep] != CRUSH_ITEM_UNDEF)
  617. continue;
  618. in = bucket; /* initial bucket */
  619. /* choose through intervening buckets */
  620. for (;;) {
  621. /* note: we base the choice on the position
  622. * even in the nested call. that means that
  623. * if the first layer chooses the same bucket
  624. * in a different position, we will tend to
  625. * choose a different item in that bucket.
  626. * this will involve more devices in data
  627. * movement and tend to distribute the load.
  628. */
  629. r = rep + parent_r;
  630. /* be careful */
  631. if (in->alg == CRUSH_BUCKET_UNIFORM &&
  632. in->size % numrep == 0)
  633. /* r'=r+(n+1)*f_total */
  634. r += (numrep+1) * ftotal;
  635. else
  636. /* r' = r + n*f_total */
  637. r += numrep * ftotal;
  638. /* bucket choose */
  639. if (in->size == 0) {
  640. dprintk(" empty bucket\n");
  641. break;
  642. }
  643. item = crush_bucket_choose(
  644. in, work->work[-1-in->id],
  645. x, r,
  646. (choose_args ?
  647. &choose_args[-1-in->id] : NULL),
  648. outpos);
  649. if (item >= map->max_devices) {
  650. dprintk(" bad item %d\n", item);
  651. out[rep] = CRUSH_ITEM_NONE;
  652. if (out2)
  653. out2[rep] = CRUSH_ITEM_NONE;
  654. left--;
  655. break;
  656. }
  657. /* desired type? */
  658. if (item < 0)
  659. itemtype = map->buckets[-1-item]->type;
  660. else
  661. itemtype = 0;
  662. dprintk(" item %d type %d\n", item, itemtype);
  663. /* keep going? */
  664. if (itemtype != type) {
  665. if (item >= 0 ||
  666. (-1-item) >= map->max_buckets) {
  667. dprintk(" bad item type %d\n", type);
  668. out[rep] = CRUSH_ITEM_NONE;
  669. if (out2)
  670. out2[rep] =
  671. CRUSH_ITEM_NONE;
  672. left--;
  673. break;
  674. }
  675. in = map->buckets[-1-item];
  676. continue;
  677. }
  678. /* collision? */
  679. collide = 0;
  680. for (i = outpos; i < endpos; i++) {
  681. if (out[i] == item) {
  682. collide = 1;
  683. break;
  684. }
  685. }
  686. if (collide)
  687. break;
  688. if (recurse_to_leaf) {
  689. if (item < 0) {
  690. crush_choose_indep(
  691. map,
  692. work,
  693. map->buckets[-1-item],
  694. weight, weight_max,
  695. x, 1, numrep, 0,
  696. out2, rep,
  697. recurse_tries, 0,
  698. 0, NULL, r,
  699. choose_args);
  700. if (out2[rep] == CRUSH_ITEM_NONE) {
  701. /* placed nothing; no leaf */
  702. break;
  703. }
  704. } else {
  705. /* we already have a leaf! */
  706. out2[rep] = item;
  707. }
  708. }
  709. /* out? */
  710. if (itemtype == 0 &&
  711. is_out(map, weight, weight_max, item, x))
  712. break;
  713. /* yay! */
  714. out[rep] = item;
  715. left--;
  716. break;
  717. }
  718. }
  719. }
  720. for (rep = outpos; rep < endpos; rep++) {
  721. if (out[rep] == CRUSH_ITEM_UNDEF) {
  722. out[rep] = CRUSH_ITEM_NONE;
  723. }
  724. if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
  725. out2[rep] = CRUSH_ITEM_NONE;
  726. }
  727. }
  728. #ifndef __KERNEL__
  729. if (map->choose_tries && ftotal <= map->choose_total_tries)
  730. map->choose_tries[ftotal]++;
  731. #endif
  732. #ifdef DEBUG_INDEP
  733. if (out2) {
  734. dprintk("%u %d a: ", ftotal, left);
  735. for (rep = outpos; rep < endpos; rep++) {
  736. dprintk(" %d", out[rep]);
  737. }
  738. dprintk("\n");
  739. dprintk("%u %d b: ", ftotal, left);
  740. for (rep = outpos; rep < endpos; rep++) {
  741. dprintk(" %d", out2[rep]);
  742. }
  743. dprintk("\n");
  744. }
  745. #endif
  746. }
  747. /*
  748. * This takes a chunk of memory and sets it up to be a shiny new
  749. * working area for a CRUSH placement computation. It must be called
  750. * on any newly allocated memory before passing it in to
  751. * crush_do_rule. It may be used repeatedly after that, so long as the
  752. * map has not changed. If the map /has/ changed, you must make sure
  753. * the working size is no smaller than what was allocated and re-run
  754. * crush_init_workspace.
  755. *
  756. * If you do retain the working space between calls to crush, make it
  757. * thread-local.
  758. */
  759. void crush_init_workspace(const struct crush_map *map, void *v)
  760. {
  761. struct crush_work *w = v;
  762. __s32 b;
  763. /*
  764. * We work by moving through the available space and setting
  765. * values and pointers as we go.
  766. *
  767. * It's a bit like Forth's use of the 'allot' word since we
  768. * set the pointer first and then reserve the space for it to
  769. * point to by incrementing the point.
  770. */
  771. v += sizeof(struct crush_work);
  772. w->work = v;
  773. v += map->max_buckets * sizeof(struct crush_work_bucket *);
  774. for (b = 0; b < map->max_buckets; ++b) {
  775. if (!map->buckets[b])
  776. continue;
  777. w->work[b] = v;
  778. switch (map->buckets[b]->alg) {
  779. default:
  780. v += sizeof(struct crush_work_bucket);
  781. break;
  782. }
  783. w->work[b]->perm_x = 0;
  784. w->work[b]->perm_n = 0;
  785. w->work[b]->perm = v;
  786. v += map->buckets[b]->size * sizeof(__u32);
  787. }
  788. BUG_ON(v - (void *)w != map->working_size);
  789. }
  790. /**
  791. * crush_do_rule - calculate a mapping with the given input and rule
  792. * @map: the crush_map
  793. * @ruleno: the rule id
  794. * @x: hash input
  795. * @result: pointer to result vector
  796. * @result_max: maximum result size
  797. * @weight: weight vector (for map leaves)
  798. * @weight_max: size of weight vector
  799. * @cwin: pointer to at least crush_work_size() bytes of memory
  800. * @choose_args: weights and ids for each known bucket
  801. */
  802. int crush_do_rule(const struct crush_map *map,
  803. int ruleno, int x, int *result, int result_max,
  804. const __u32 *weight, int weight_max,
  805. void *cwin, const struct crush_choose_arg *choose_args)
  806. {
  807. int result_len;
  808. struct crush_work *cw = cwin;
  809. int *a = cwin + map->working_size;
  810. int *b = a + result_max;
  811. int *c = b + result_max;
  812. int *w = a;
  813. int *o = b;
  814. int recurse_to_leaf;
  815. int wsize = 0;
  816. int osize;
  817. int *tmp;
  818. const struct crush_rule *rule;
  819. __u32 step;
  820. int i, j;
  821. int numrep;
  822. int out_size;
  823. /*
  824. * the original choose_total_tries value was off by one (it
  825. * counted "retries" and not "tries"). add one.
  826. */
  827. int choose_tries = map->choose_total_tries + 1;
  828. int choose_leaf_tries = 0;
  829. /*
  830. * the local tries values were counted as "retries", though,
  831. * and need no adjustment
  832. */
  833. int choose_local_retries = map->choose_local_tries;
  834. int choose_local_fallback_retries = map->choose_local_fallback_tries;
  835. int vary_r = map->chooseleaf_vary_r;
  836. int stable = map->chooseleaf_stable;
  837. if ((__u32)ruleno >= map->max_rules) {
  838. dprintk(" bad ruleno %d\n", ruleno);
  839. return 0;
  840. }
  841. rule = map->rules[ruleno];
  842. result_len = 0;
  843. for (step = 0; step < rule->len; step++) {
  844. int firstn = 0;
  845. const struct crush_rule_step *curstep = &rule->steps[step];
  846. switch (curstep->op) {
  847. case CRUSH_RULE_TAKE:
  848. if ((curstep->arg1 >= 0 &&
  849. curstep->arg1 < map->max_devices) ||
  850. (-1-curstep->arg1 >= 0 &&
  851. -1-curstep->arg1 < map->max_buckets &&
  852. map->buckets[-1-curstep->arg1])) {
  853. w[0] = curstep->arg1;
  854. wsize = 1;
  855. } else {
  856. dprintk(" bad take value %d\n", curstep->arg1);
  857. }
  858. break;
  859. case CRUSH_RULE_SET_CHOOSE_TRIES:
  860. if (curstep->arg1 > 0)
  861. choose_tries = curstep->arg1;
  862. break;
  863. case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
  864. if (curstep->arg1 > 0)
  865. choose_leaf_tries = curstep->arg1;
  866. break;
  867. case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
  868. if (curstep->arg1 >= 0)
  869. choose_local_retries = curstep->arg1;
  870. break;
  871. case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
  872. if (curstep->arg1 >= 0)
  873. choose_local_fallback_retries = curstep->arg1;
  874. break;
  875. case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
  876. if (curstep->arg1 >= 0)
  877. vary_r = curstep->arg1;
  878. break;
  879. case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
  880. if (curstep->arg1 >= 0)
  881. stable = curstep->arg1;
  882. break;
  883. case CRUSH_RULE_CHOOSELEAF_FIRSTN:
  884. case CRUSH_RULE_CHOOSE_FIRSTN:
  885. firstn = 1;
  886. /* fall through */
  887. case CRUSH_RULE_CHOOSELEAF_INDEP:
  888. case CRUSH_RULE_CHOOSE_INDEP:
  889. if (wsize == 0)
  890. break;
  891. recurse_to_leaf =
  892. curstep->op ==
  893. CRUSH_RULE_CHOOSELEAF_FIRSTN ||
  894. curstep->op ==
  895. CRUSH_RULE_CHOOSELEAF_INDEP;
  896. /* reset output */
  897. osize = 0;
  898. for (i = 0; i < wsize; i++) {
  899. int bno;
  900. numrep = curstep->arg1;
  901. if (numrep <= 0) {
  902. numrep += result_max;
  903. if (numrep <= 0)
  904. continue;
  905. }
  906. j = 0;
  907. /* make sure bucket id is valid */
  908. bno = -1 - w[i];
  909. if (bno < 0 || bno >= map->max_buckets) {
  910. /* w[i] is probably CRUSH_ITEM_NONE */
  911. dprintk(" bad w[i] %d\n", w[i]);
  912. continue;
  913. }
  914. if (firstn) {
  915. int recurse_tries;
  916. if (choose_leaf_tries)
  917. recurse_tries =
  918. choose_leaf_tries;
  919. else if (map->chooseleaf_descend_once)
  920. recurse_tries = 1;
  921. else
  922. recurse_tries = choose_tries;
  923. osize += crush_choose_firstn(
  924. map,
  925. cw,
  926. map->buckets[bno],
  927. weight, weight_max,
  928. x, numrep,
  929. curstep->arg2,
  930. o+osize, j,
  931. result_max-osize,
  932. choose_tries,
  933. recurse_tries,
  934. choose_local_retries,
  935. choose_local_fallback_retries,
  936. recurse_to_leaf,
  937. vary_r,
  938. stable,
  939. c+osize,
  940. 0,
  941. choose_args);
  942. } else {
  943. out_size = ((numrep < (result_max-osize)) ?
  944. numrep : (result_max-osize));
  945. crush_choose_indep(
  946. map,
  947. cw,
  948. map->buckets[bno],
  949. weight, weight_max,
  950. x, out_size, numrep,
  951. curstep->arg2,
  952. o+osize, j,
  953. choose_tries,
  954. choose_leaf_tries ?
  955. choose_leaf_tries : 1,
  956. recurse_to_leaf,
  957. c+osize,
  958. 0,
  959. choose_args);
  960. osize += out_size;
  961. }
  962. }
  963. if (recurse_to_leaf)
  964. /* copy final _leaf_ values to output set */
  965. memcpy(o, c, osize*sizeof(*o));
  966. /* swap o and w arrays */
  967. tmp = o;
  968. o = w;
  969. w = tmp;
  970. wsize = osize;
  971. break;
  972. case CRUSH_RULE_EMIT:
  973. for (i = 0; i < wsize && result_len < result_max; i++) {
  974. result[result_len] = w[i];
  975. result_len++;
  976. }
  977. wsize = 0;
  978. break;
  979. default:
  980. dprintk(" unknown op %d at step %d\n",
  981. curstep->op, step);
  982. break;
  983. }
  984. }
  985. return result_len;
  986. }