/*
 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern to allow for a compact
 * mapping representation such as dm-stripe.
 */

#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "switch"

/*
 * One region_table_slot_t holds <region_entries_per_slot> region table
 * entries, each of which is <region_table_entry_bits> bits in size.
 */
typedef unsigned long region_table_slot_t;

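/*
 * For example, with two paths each table entry needs only one bit, so on
 * a 64-bit machine one slot packs 64 entries; with up to 16 paths an
 * entry takes 4 bits and one slot holds 16 entries.
 */
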
/*
 * A device with the offset to its start sector.
 */
struct switch_path {
	struct dm_dev *dmdev;
	sector_t start;
};

/*
 * Context block for a dm switch device.
 */
struct switch_ctx {
	struct dm_target *ti;

	unsigned nr_paths;		/* Number of paths in path_list. */

	unsigned region_size;		/* Region size in 512-byte sectors */
	unsigned long nr_regions;	/* Number of regions making up the device */
	signed char region_size_bits;	/* log2 of region_size or -1 */

	unsigned char region_table_entry_bits;	/* Number of bits in one region table entry */
	unsigned char region_entries_per_slot;	/* Number of entries in one region table slot */
	signed char region_entries_per_slot_bits;	/* log2 of region_entries_per_slot or -1 */

	region_table_slot_t *region_table;	/* Region table */

	/*
	 * Array of dm devices to switch between.
	 */
	struct switch_path path_list[0];
};

static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
					   unsigned region_size)
{
	struct switch_ctx *sctx;

	sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
		       GFP_KERNEL);
	if (!sctx)
		return NULL;

	sctx->ti = ti;
	sctx->region_size = region_size;

	ti->private = sctx;

	return sctx;
}

static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
{
	struct switch_ctx *sctx = ti->private;
	sector_t nr_regions = ti->len;
	sector_t nr_slots;

	if (!(sctx->region_size & (sctx->region_size - 1)))
		sctx->region_size_bits = __ffs(sctx->region_size);
	else
		sctx->region_size_bits = -1;

	sctx->region_table_entry_bits = 1;
	while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
	       (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
		sctx->region_table_entry_bits++;

	sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) /
					sctx->region_table_entry_bits;
	if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
		sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
	else
		sctx->region_entries_per_slot_bits = -1;

	if (sector_div(nr_regions, sctx->region_size))
		nr_regions++;

	if (nr_regions >= ULONG_MAX) {
		ti->error = "Region table too large";
		return -EINVAL;
	}
	sctx->nr_regions = nr_regions;

	nr_slots = nr_regions;
	if (sector_div(nr_slots, sctx->region_entries_per_slot))
		nr_slots++;

	if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
		ti->error = "Region table too large";
		return -EINVAL;
	}

	sctx->region_table = vmalloc(array_size(nr_slots,
						sizeof(region_table_slot_t)));
	if (!sctx->region_table) {
		ti->error = "Cannot allocate region table";
		return -ENOMEM;
	}

	return 0;
}

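/*
 * Find the slot index and intra-slot bit offset of a region table entry.
 * As an illustration: with 1-bit entries and 64 entries per 64-bit slot
 * (i.e. two paths on a 64-bit machine), region 100 lives in slot 1 at
 * bit offset 36.
 */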
static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
				unsigned long *region_index, unsigned *bit)
{
	if (sctx->region_entries_per_slot_bits >= 0) {
		*region_index = region_nr >> sctx->region_entries_per_slot_bits;
		*bit = region_nr & (sctx->region_entries_per_slot - 1);
	} else {
		*region_index = region_nr / sctx->region_entries_per_slot;
		*bit = region_nr % sctx->region_entries_per_slot;
	}

	*bit *= sctx->region_table_entry_bits;
}

static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
{
	unsigned long region_index;
	unsigned bit;

	switch_get_position(sctx, region_nr, &region_index, &bit);

	return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
		((1 << sctx->region_table_entry_bits) - 1);
}

/*
 * Find which path to use at a given offset.
 */
static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
	unsigned path_nr;
	sector_t p;

	p = offset;
	if (sctx->region_size_bits >= 0)
		p >>= sctx->region_size_bits;
	else
		sector_div(p, sctx->region_size);

	path_nr = switch_region_table_read(sctx, p);

	/* This can only happen if the processor uses non-atomic stores. */
	if (unlikely(path_nr >= sctx->nr_paths))
		path_nr = 0;

	return path_nr;
}

static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
				      unsigned value)
{
	unsigned long region_index;
	unsigned bit;
	region_table_slot_t pte;

	switch_get_position(sctx, region_nr, &region_index, &bit);

	pte = sctx->region_table[region_index];
	pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
	pte |= (region_table_slot_t)value << bit;
	sctx->region_table[region_index] = pte;
}

/*
 * Fill the region table with an initial round robin pattern.
 */
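/* For example, with three paths regions get paths 0, 1, 2, 0, 1, 2, ... */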
static void initialise_region_table(struct switch_ctx *sctx)
{
	unsigned path_nr = 0;
	unsigned long region_nr;

	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
		switch_region_table_write(sctx, region_nr, path_nr);
		if (++path_nr >= sctx->nr_paths)
			path_nr = 0;
	}
}

static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;
	unsigned long long start;
	int r;

	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
			  &sctx->path_list[sctx->nr_paths].dmdev);
	if (r) {
		ti->error = "Device lookup failed";
		return r;
	}

	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
		ti->error = "Invalid device starting offset";
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
		return -EINVAL;
	}

	sctx->path_list[sctx->nr_paths].start = start;

	sctx->nr_paths++;

	return 0;
}

/*
 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 */
static void switch_dtr(struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;

	while (sctx->nr_paths--)
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);

	vfree(sctx->region_table);
	kfree(sctx);
}

/*
 * Constructor arguments:
 *   <num_paths> <region_size> <num_optional_args> [<optional_args>...]
 *   [<dev_path> <offset>]+
 *
 * Optional args are to allow for future extension: currently this
 * parameter must be 0.
 */
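/*
 * Example (hypothetical device names and sizes): a two-path switch
 * device with 65536-sector regions and no optional arguments could be
 * created with a table line such as:
 *
 *   0 67108864 switch 2 65536 0 /dev/sda1 0 /dev/sdb1 0
 */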
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	static const struct dm_arg _args[] = {
		{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
		{1, UINT_MAX, "Invalid region size"},
		{0, 0, "Invalid number of optional args"},
	};

	struct switch_ctx *sctx;
	struct dm_arg_set as;
	unsigned nr_paths, region_size, nr_optional_args;
	int r;

	as.argc = argc;
	as.argv = argv;

	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
	if (r)
		return -EINVAL;

	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
	if (r)
		return r;

	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
	if (r)
		return r;
	/* parse optional arguments here, if we add any */

	if (as.argc != nr_paths * 2) {
		ti->error = "Incorrect number of path arguments";
		return -EINVAL;
	}

	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
	if (!sctx) {
		ti->error = "Cannot allocate redirection context";
		return -ENOMEM;
	}

	r = dm_set_target_max_io_len(ti, region_size);
	if (r)
		goto error;

	while (as.argc) {
		r = parse_path(&as, ti);
		if (r)
			goto error;
	}

	r = alloc_region_table(ti, nr_paths);
	if (r)
		goto error;

	initialise_region_table(sctx);

	/* For UNMAP, sending the request down any path is sufficient */
	ti->num_discard_bios = 1;

	return 0;

error:
	switch_dtr(ti);

	return r;
}

static int switch_map(struct dm_target *ti, struct bio *bio)
{
	struct switch_ctx *sctx = ti->private;
	sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
	unsigned path_nr = switch_get_path_nr(sctx, offset);

	bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
	bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;

	return DM_MAPIO_REMAPPED;
}

/*
 * We need to parse hex numbers in the message as quickly as possible.
 *
 * This table-based hex parser improves performance: it reduces the time
 * needed to load 1000000 entries compared to a condition-based parser.
 *
 *			table-based parser	condition-based parser
 *	PA-RISC		0.29s			0.31s
 *	Opteron		0.0495s			0.0498s
 */
static const unsigned char hex_table[256] = {
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
	255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};

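/*
 * parse_hex() accepts upper- and lower-case hex digits, consumes
 * characters until the first non-hex byte and leaves *string pointing
 * at that byte: for example, "1a:" yields 0x1a with *string left at
 * ':'. Callers validate the terminator; overflow is not checked.
 */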
static __always_inline unsigned long parse_hex(const char **string)
{
	unsigned char d;
	unsigned long r = 0;

	while ((d = hex_table[(unsigned char)**string]) < 16) {
		r = (r << 4) | d;
		(*string)++;
	}

	return r;
}

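/*
 * set_region_mappings arguments are hex based and, as parsed below, take
 * one of three forms:
 *
 *   <region>:<path>	set region <region> to use path <path>
 *   :<path>		set the next region (current index + 1) to <path>
 *   R<n>,<m>		repeat the last <n> mappings for the next <m>
 *			regions ('R' may be upper or lower case)
 *
 * For example (hypothetical device name):
 *
 *   dmsetup message myswitch 0 set_region_mappings 0:0 :1 :2 R3,9
 *
 * maps regions 0..2 to paths 0, 1, 2 and repeats that cycle over
 * regions 3..11.
 */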
static int process_set_region_mappings(struct switch_ctx *sctx,
				       unsigned argc, char **argv)
{
	unsigned i;
	unsigned long region_index = 0;

	for (i = 1; i < argc; i++) {
		unsigned long path_nr;
		const char *string = argv[i];

		if ((*string & 0xdf) == 'R') {
			unsigned long cycle_length, num_write;

			string++;
			if (unlikely(*string == ',')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
			cycle_length = parse_hex(&string);
			if (unlikely(*string != ',')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
			string++;
			if (unlikely(!*string)) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
			num_write = parse_hex(&string);
			if (unlikely(*string)) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}

			if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
				DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
				       cycle_length - 1, region_index);
				return -EINVAL;
			}
			if (unlikely(region_index + num_write < region_index) ||
			    unlikely(region_index + num_write >= sctx->nr_regions)) {
				DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
				       region_index, num_write, sctx->nr_regions);
				return -EINVAL;
			}

			while (num_write--) {
				region_index++;
				path_nr = switch_region_table_read(sctx, region_index - cycle_length);
				switch_region_table_write(sctx, region_index, path_nr);
			}

			continue;
		}

		if (*string == ':')
			region_index++;
		else {
			region_index = parse_hex(&string);
			if (unlikely(*string != ':')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
		}

		string++;
		if (unlikely(!*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}

		path_nr = parse_hex(&string);
		if (unlikely(*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}
		if (unlikely(region_index >= sctx->nr_regions)) {
			DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
			return -EINVAL;
		}
		if (unlikely(path_nr >= sctx->nr_paths)) {
			DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
			return -EINVAL;
		}

		switch_region_table_write(sctx, region_index, path_nr);
	}

	return 0;
}

/*
 * Messages are processed one at a time.
 *
 * Only set_region_mappings is supported.
 */
static int switch_message(struct dm_target *ti, unsigned argc, char **argv,
			  char *result, unsigned maxlen)
{
	static DEFINE_MUTEX(message_mutex);

	struct switch_ctx *sctx = ti->private;
	int r = -EINVAL;

	mutex_lock(&message_mutex);

	if (!strcasecmp(argv[0], "set_region_mappings"))
		r = process_set_region_mappings(sctx, argc, argv);
	else
		DMWARN("Unrecognised message received.");

	mutex_unlock(&message_mutex);

	return r;
}

static void switch_status(struct dm_target *ti, status_type_t type,
			  unsigned status_flags, char *result, unsigned maxlen)
{
	struct switch_ctx *sctx = ti->private;
	unsigned sz = 0;
	int path_nr;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
			DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
			       (unsigned long long)sctx->path_list[path_nr].start);
		break;
	}
}

/*
 * Switch ioctl:
 *
 * Pass all ioctls through to the path for sector 0.
 */
static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct switch_ctx *sctx = ti->private;
	unsigned path_nr;

	path_nr = switch_get_path_nr(sctx, 0);

	*bdev = sctx->path_list[path_nr].dmdev->bdev;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len + sctx->path_list[path_nr].start !=
	    i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
		return 1;
	return 0;
}

static int switch_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct switch_ctx *sctx = ti->private;
	int path_nr;
	int r;

	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
		r = fn(ti, sctx->path_list[path_nr].dmdev,
		       sctx->path_list[path_nr].start, ti->len, data);
		if (r)
			return r;
	}

	return 0;
}

static struct target_type switch_target = {
	.name = "switch",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = switch_ctr,
	.dtr = switch_dtr,
	.map = switch_map,
	.message = switch_message,
	.status = switch_status,
	.prepare_ioctl = switch_prepare_ioctl,
	.iterate_devices = switch_iterate_devices,
};

static int __init dm_switch_init(void)
{
	int r;

	r = dm_register_target(&switch_target);
	if (r < 0)
		DMERR("dm_register_target() failed %d", r);

	return r;
}

static void __exit dm_switch_exit(void)
{
	dm_unregister_target(&switch_target);
}

module_init(dm_switch_init);
module_exit(dm_switch_exit);

MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
MODULE_LICENSE("GPL");