/*
 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern to allow for a compact
 * mapping representation such as dm-stripe.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "switch"

/*
 * One region_table_slot_t holds <region_entries_per_slot> region table
 * entries each of which is <region_table_entry_bits> in size.
 */
typedef unsigned long region_table_slot_t;
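
/*
 * Worked example (illustrative numbers, not a constraint of the code): on a
 * 64-bit machine with 2 paths each table entry needs 1 bit, so one slot
 * packs 64 region entries; with up to 16 paths each entry needs 4 bits and
 * one slot packs 16 entries.
 */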

/*
 * A device with the offset to its start sector.
 */
struct switch_path {
	struct dm_dev *dmdev;
	sector_t start;
};

/*
 * Context block for a dm switch device.
 */
struct switch_ctx {
	struct dm_target *ti;

	unsigned nr_paths;		/* Number of paths in path_list. */

	unsigned region_size;		/* Region size in 512-byte sectors */
	unsigned long nr_regions;	/* Number of regions making up the device */
	signed char region_size_bits;	/* log2 of region_size or -1 */

	unsigned char region_table_entry_bits;	/* Number of bits in one region table entry */
	unsigned char region_entries_per_slot;	/* Number of entries in one region table slot */
	signed char region_entries_per_slot_bits;	/* log2 of region_entries_per_slot or -1 */

	region_table_slot_t *region_table;	/* Region table */

	/*
	 * Array of dm devices to switch between.
	 */
	struct switch_path path_list[];
};

static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
					   unsigned region_size)
{
	struct switch_ctx *sctx;

	sctx = kzalloc(struct_size(sctx, path_list, nr_paths), GFP_KERNEL);
	if (!sctx)
		return NULL;

	sctx->ti = ti;
	sctx->region_size = region_size;

	ti->private = sctx;

	return sctx;
}

static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
{
	struct switch_ctx *sctx = ti->private;
	sector_t nr_regions = ti->len;
	sector_t nr_slots;

	if (!(sctx->region_size & (sctx->region_size - 1)))
		sctx->region_size_bits = __ffs(sctx->region_size);
	else
		sctx->region_size_bits = -1;

	sctx->region_table_entry_bits = 1;
	while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
	       (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
		sctx->region_table_entry_bits++;

	sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
	if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
		sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
	else
		sctx->region_entries_per_slot_bits = -1;

	if (sector_div(nr_regions, sctx->region_size))
		nr_regions++;

	if (nr_regions >= ULONG_MAX) {
		ti->error = "Region table too large";
		return -EINVAL;
	}
	sctx->nr_regions = nr_regions;

	nr_slots = nr_regions;
	if (sector_div(nr_slots, sctx->region_entries_per_slot))
		nr_slots++;

	if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
		ti->error = "Region table too large";
		return -EINVAL;
	}

	sctx->region_table = vmalloc(array_size(nr_slots,
						sizeof(region_table_slot_t)));
	if (!sctx->region_table) {
		ti->error = "Cannot allocate region table";
		return -ENOMEM;
	}

	return 0;
}
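
/*
 * Sizing note (illustrative arithmetic, not enforced by the code): a 1 TiB
 * target (2^31 sectors) with 128-sector regions has 2^24 regions; with 2
 * paths each entry is 1 bit, so the vmalloc'ed table above occupies about
 * 2 MiB.
 */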

static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
				unsigned long *region_index, unsigned *bit)
{
	if (sctx->region_entries_per_slot_bits >= 0) {
		*region_index = region_nr >> sctx->region_entries_per_slot_bits;
		*bit = region_nr & (sctx->region_entries_per_slot - 1);
	} else {
		*region_index = region_nr / sctx->region_entries_per_slot;
		*bit = region_nr % sctx->region_entries_per_slot;
	}

	*bit *= sctx->region_table_entry_bits;
}

static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
{
	unsigned long region_index;
	unsigned bit;

	switch_get_position(sctx, region_nr, &region_index, &bit);

	return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
		((1 << sctx->region_table_entry_bits) - 1);
}

/*
 * Find which path to use at given offset.
 */
static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
	unsigned path_nr;
	sector_t p;

	p = offset;
	if (sctx->region_size_bits >= 0)
		p >>= sctx->region_size_bits;
	else
		sector_div(p, sctx->region_size);

	path_nr = switch_region_table_read(sctx, p);

	/* This can only happen if the processor uses non-atomic stores. */
	if (unlikely(path_nr >= sctx->nr_paths))
		path_nr = 0;

	return path_nr;
}

static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
				      unsigned value)
{
	unsigned long region_index;
	unsigned bit;
	region_table_slot_t pte;

	switch_get_position(sctx, region_nr, &region_index, &bit);

	pte = sctx->region_table[region_index];
	pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
	pte |= (region_table_slot_t)value << bit;
	sctx->region_table[region_index] = pte;
}

/*
 * Fill the region table with an initial round robin pattern.
 */
static void initialise_region_table(struct switch_ctx *sctx)
{
	unsigned path_nr = 0;
	unsigned long region_nr;

	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
		switch_region_table_write(sctx, region_nr, path_nr);
		if (++path_nr >= sctx->nr_paths)
			path_nr = 0;
	}
}

static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;
	unsigned long long start;
	int r;

	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
			  &sctx->path_list[sctx->nr_paths].dmdev);
	if (r) {
		ti->error = "Device lookup failed";
		return r;
	}

	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
		ti->error = "Invalid device starting offset";
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
		return -EINVAL;
	}

	sctx->path_list[sctx->nr_paths].start = start;

	sctx->nr_paths++;

	return 0;
}

/*
 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 */
static void switch_dtr(struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;

	while (sctx->nr_paths--)
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);

	vfree(sctx->region_table);
	kfree(sctx);
}

/*
 * Constructor arguments:
 *    <num_paths> <region_size> <num_optional_args> [<optional_args>...]
 *    [<dev_path> <offset>]+
 *
 * Optional args are to allow for future extension: currently this
 * parameter must be 0.
 */
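
/*
 * Illustrative example (device names and sizes are examples only): a 2-path
 * switch over /dev/sda and /dev/sdb with 64 KiB (128-sector) regions could
 * be loaded with something like:
 *
 *    dmsetup create mydev --table "0 <dev_size_in_sectors> switch 2 128 0 /dev/sda 0 /dev/sdb 0"
 */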

static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	static const struct dm_arg _args[] = {
		{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
		{1, UINT_MAX, "Invalid region size"},
		{0, 0, "Invalid number of optional args"},
	};

	struct switch_ctx *sctx;
	struct dm_arg_set as;
	unsigned nr_paths, region_size, nr_optional_args;
	int r;

	as.argc = argc;
	as.argv = argv;

	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
	if (r)
		return -EINVAL;

	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
	if (r)
		return r;

	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
	if (r)
		return r;
	/* parse optional arguments here, if we add any */

	if (as.argc != nr_paths * 2) {
		ti->error = "Incorrect number of path arguments";
		return -EINVAL;
	}

	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
	if (!sctx) {
		ti->error = "Cannot allocate redirection context";
		return -ENOMEM;
	}

	r = dm_set_target_max_io_len(ti, region_size);
	if (r)
		goto error;

	while (as.argc) {
		r = parse_path(&as, ti);
		if (r)
			goto error;
	}

	r = alloc_region_table(ti, nr_paths);
	if (r)
		goto error;

	initialise_region_table(sctx);

	/* For UNMAP, sending the request down any path is sufficient */
	ti->num_discard_bios = 1;

	return 0;

error:
	switch_dtr(ti);

	return r;
}

static int switch_map(struct dm_target *ti, struct bio *bio)
{
	struct switch_ctx *sctx = ti->private;
	sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
	unsigned path_nr = switch_get_path_nr(sctx, offset);

	bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
	bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;

	return DM_MAPIO_REMAPPED;
}

/*
 * We need to parse hex numbers in the message as quickly as possible.
 *
 * This table-based hex parser improves performance.
 * It reduces the time needed to load 1000000 entries compared to the
 * condition-based parser.
 *		table-based parser	condition-based parser
 * PA-RISC	0.29s			0.31s
 * Opteron	0.0495s			0.0498s
 */
static const unsigned char hex_table[256] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};

static __always_inline unsigned long parse_hex(const char **string)
{
	unsigned char d;
	unsigned long r = 0;

	while ((d = hex_table[(unsigned char)**string]) < 16) {
		r = (r << 4) | d;
		(*string)++;
	}

	return r;
}
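
/*
 * For example (illustrative input): given the string "1aB,", parse_hex()
 * above returns 0x1ab and leaves *string pointing at the ','; parsing stops
 * at the first non-hex character.
 */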

static int process_set_region_mappings(struct switch_ctx *sctx,
					unsigned argc, char **argv)
{
	unsigned i;
	unsigned long region_index = 0;

	for (i = 1; i < argc; i++) {
		unsigned long path_nr;
		const char *string = argv[i];

		if ((*string & 0xdf) == 'R') {
			unsigned long cycle_length, num_write;

			string++;
			if (unlikely(*string == ',')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
			cycle_length = parse_hex(&string);
			if (unlikely(*string != ',')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
			string++;
			if (unlikely(!*string)) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
			num_write = parse_hex(&string);
			if (unlikely(*string)) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}

			if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
				DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
				       cycle_length - 1, region_index);
				return -EINVAL;
			}
			if (unlikely(region_index + num_write < region_index) ||
			    unlikely(region_index + num_write >= sctx->nr_regions)) {
				DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
				       region_index, num_write, sctx->nr_regions);
				return -EINVAL;
			}

			while (num_write--) {
				region_index++;
				path_nr = switch_region_table_read(sctx, region_index - cycle_length);
				switch_region_table_write(sctx, region_index, path_nr);
			}

			continue;
		}

		if (*string == ':')
			region_index++;
		else {
			region_index = parse_hex(&string);
			if (unlikely(*string != ':')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
		}
		string++;
		if (unlikely(!*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}

		path_nr = parse_hex(&string);
		if (unlikely(*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}
		if (unlikely(region_index >= sctx->nr_regions)) {
			DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
			return -EINVAL;
		}
		if (unlikely(path_nr >= sctx->nr_paths)) {
			DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
			return -EINVAL;
		}

		switch_region_table_write(sctx, region_index, path_nr);
	}

	return 0;
}

/*
 * Messages are processed one-at-a-time.
 *
 * Only set_region_mappings is supported.
 */
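
/*
 * Illustrative usage (the device name "mydev" is an example; all numbers are
 * hexadecimal): each argument is either <region>:<path>, a bare :<path> for
 * the next sequential region, or R<n>,<m>, which repeats the last <n>
 * mappings over the next <m> regions, e.g.:
 *
 *    dmsetup message mydev 0 set_region_mappings 100:1 :2 R2,10
 */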

static int switch_message(struct dm_target *ti, unsigned argc, char **argv,
			  char *result, unsigned maxlen)
{
	static DEFINE_MUTEX(message_mutex);

	struct switch_ctx *sctx = ti->private;
	int r = -EINVAL;

	mutex_lock(&message_mutex);

	if (!strcasecmp(argv[0], "set_region_mappings"))
		r = process_set_region_mappings(sctx, argc, argv);
	else
		DMWARN("Unrecognised message received.");

	mutex_unlock(&message_mutex);

	return r;
}

static void switch_status(struct dm_target *ti, status_type_t type,
			  unsigned status_flags, char *result, unsigned maxlen)
{
	struct switch_ctx *sctx = ti->private;
	unsigned sz = 0;
	int path_nr;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
			DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
			       (unsigned long long)sctx->path_list[path_nr].start);
		break;
	}
}

/*
 * Switch ioctl:
 *
 * Passthrough all ioctls to the path for sector 0
 */
static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct switch_ctx *sctx = ti->private;
	unsigned path_nr;

	path_nr = switch_get_path_nr(sctx, 0);

	*bdev = sctx->path_list[path_nr].dmdev->bdev;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len + sctx->path_list[path_nr].start !=
	    i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
		return 1;
	return 0;
}

static int switch_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct switch_ctx *sctx = ti->private;
	int path_nr;
	int r;

	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
		r = fn(ti, sctx->path_list[path_nr].dmdev,
		       sctx->path_list[path_nr].start, ti->len, data);
		if (r)
			return r;
	}

	return 0;
}

static struct target_type switch_target = {
	.name = "switch",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = switch_ctr,
	.dtr = switch_dtr,
	.map = switch_map,
	.message = switch_message,
	.status = switch_status,
	.prepare_ioctl = switch_prepare_ioctl,
	.iterate_devices = switch_iterate_devices,
};

static int __init dm_switch_init(void)
{
	int r;

	r = dm_register_target(&switch_target);
	if (r < 0)
		DMERR("dm_register_target() failed %d", r);

	return r;
}

static void __exit dm_switch_exit(void)
{
	dm_unregister_target(&switch_target);
}

module_init(dm_switch_init);
module_exit(dm_switch_exit);

MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
MODULE_LICENSE("GPL");