12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790 |
- /*
- * Copyright (C) 2006-2009 Red Hat, Inc.
- *
- * This file is released under the LGPL.
- */
- #include <linux/bio.h>
- #include <linux/slab.h>
- #include <linux/dm-dirty-log.h>
- #include <linux/device-mapper.h>
- #include <linux/dm-log-userspace.h>
- #include "dm-log-userspace-transfer.h"
- #define DM_LOG_USERSPACE_VSN "1.1.0"
- struct flush_entry {
- int type;
- region_t region;
- struct list_head list;
- };
/*
 * Upper bound on the number of mark or clear requests that are sent to
 * userspace in a single grouped message.  The value is, to a degree,
 * arbitrary.  However, there is some basis for the choice in the limits
 * imposed on the size of the data payload by
 * dm-log-userspace-transfer.c: dm_consult_userspace().
 */
#define MAX_FLUSH_GROUP_COUNT 32
- struct log_c {
- struct dm_target *ti;
- uint32_t region_size;
- region_t region_count;
- uint64_t luid;
- char uuid[DM_UUID_LEN];
- char *usr_argv_str;
- uint32_t usr_argc;
- /*
- * in_sync_hint gets set when doing is_remote_recovering. It
- * represents the first region that needs recovery. IOW, the
- * first zero bit of sync_bits. This can be useful for to limit
- * traffic for calls like is_remote_recovering and get_resync_work,
- * but be take care in its use for anything else.
- */
- uint64_t in_sync_hint;
- /*
- * Mark and clear requests are held until a flush is issued
- * so that we can group, and thereby limit, the amount of
- * network traffic between kernel and userspace. The 'flush_lock'
- * is used to protect these lists.
- */
- spinlock_t flush_lock;
- struct list_head mark_list;
- struct list_head clear_list;
- };
- static mempool_t *flush_entry_pool;
- static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
- {
- return kmalloc(sizeof(struct flush_entry), gfp_mask);
- }
/*
 * mempool release callback: frees an element obtained from
 * flush_entry_alloc().  'pool_data' is unused.
 */
static void flush_entry_free(void *element, void *pool_data)
{
	kfree(element);
}
- static int userspace_do_request(struct log_c *lc, const char *uuid,
- int request_type, char *data, size_t data_size,
- char *rdata, size_t *rdata_size)
- {
- int r;
- /*
- * If the server isn't there, -ESRCH is returned,
- * and we must keep trying until the server is
- * restored.
- */
- retry:
- r = dm_consult_userspace(uuid, lc->luid, request_type, data,
- data_size, rdata, rdata_size);
- if (r != -ESRCH)
- return r;
- DMERR(" Userspace log server not found.");
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(2*HZ);
- DMWARN("Attempting to contact userspace log server...");
- r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR,
- lc->usr_argv_str,
- strlen(lc->usr_argv_str) + 1,
- NULL, NULL);
- if (!r)
- break;
- }
- DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
- r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL,
- 0, NULL, NULL);
- if (!r)
- goto retry;
- DMERR("Error trying to resume userspace log: %d", r);
- return -ESRCH;
- }
- static int build_constructor_string(struct dm_target *ti,
- unsigned argc, char **argv,
- char **ctr_str)
- {
- int i, str_size;
- char *str = NULL;
- *ctr_str = NULL;
- for (i = 0, str_size = 0; i < argc; i++)
- str_size += strlen(argv[i]) + 1; /* +1 for space between args */
- str_size += 20; /* Max number of chars in a printed u64 number */
- str = kzalloc(str_size, GFP_KERNEL);
- if (!str) {
- DMWARN("Unable to allocate memory for constructor string");
- return -ENOMEM;
- }
- str_size = sprintf(str, "%llu", (unsigned long long)ti->len);
- for (i = 0; i < argc; i++)
- str_size += sprintf(str + str_size, " %s", argv[i]);
- *ctr_str = str;
- return str_size;
- }
- /*
- * userspace_ctr
- *
- * argv contains:
- * <UUID> <other args>
- * Where 'other args' is the userspace implementation specific log
- * arguments. An example might be:
- * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
- *
- * So, this module will strip off the <UUID> for identification purposes
- * when communicating with userspace about a log; but will pass on everything
- * else.
- */
- static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
- unsigned argc, char **argv)
- {
- int r = 0;
- int str_size;
- char *ctr_str = NULL;
- struct log_c *lc = NULL;
- uint64_t rdata;
- size_t rdata_size = sizeof(rdata);
- if (argc < 3) {
- DMWARN("Too few arguments to userspace dirty log");
- return -EINVAL;
- }
- lc = kmalloc(sizeof(*lc), GFP_KERNEL);
- if (!lc) {
- DMWARN("Unable to allocate userspace log context.");
- return -ENOMEM;
- }
- /* The ptr value is sufficient for local unique id */
- lc->luid = (unsigned long)lc;
- lc->ti = ti;
- if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
- DMWARN("UUID argument too long.");
- kfree(lc);
- return -EINVAL;
- }
- strncpy(lc->uuid, argv[0], DM_UUID_LEN);
- spin_lock_init(&lc->flush_lock);
- INIT_LIST_HEAD(&lc->mark_list);
- INIT_LIST_HEAD(&lc->clear_list);
- str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
- if (str_size < 0) {
- kfree(lc);
- return str_size;
- }
- /* Send table string */
- r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
- ctr_str, str_size, NULL, NULL);
- if (r < 0) {
- if (r == -ESRCH)
- DMERR("Userspace log server not found");
- else
- DMERR("Userspace log server failed to create log");
- goto out;
- }
- /* Since the region size does not change, get it now */
- rdata_size = sizeof(rdata);
- r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE,
- NULL, 0, (char *)&rdata, &rdata_size);
- if (r) {
- DMERR("Failed to get region size of dirty log");
- goto out;
- }
- lc->region_size = (uint32_t)rdata;
- lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
- out:
- if (r) {
- kfree(lc);
- kfree(ctr_str);
- } else {
- lc->usr_argv_str = ctr_str;
- lc->usr_argc = argc;
- log->context = lc;
- }
- return r;
- }
- static void userspace_dtr(struct dm_dirty_log *log)
- {
- struct log_c *lc = log->context;
- (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
- NULL, 0,
- NULL, NULL);
- kfree(lc->usr_argv_str);
- kfree(lc);
- return;
- }
- static int userspace_presuspend(struct dm_dirty_log *log)
- {
- int r;
- struct log_c *lc = log->context;
- r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
- NULL, 0,
- NULL, NULL);
- return r;
- }
- static int userspace_postsuspend(struct dm_dirty_log *log)
- {
- int r;
- struct log_c *lc = log->context;
- r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
- NULL, 0,
- NULL, NULL);
- return r;
- }
- static int userspace_resume(struct dm_dirty_log *log)
- {
- int r;
- struct log_c *lc = log->context;
- lc->in_sync_hint = 0;
- r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
- NULL, 0,
- NULL, NULL);
- return r;
- }
- static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
- {
- struct log_c *lc = log->context;
- return lc->region_size;
- }
- /*
- * userspace_is_clean
- *
- * Check whether a region is clean. If there is any sort of
- * failure when consulting the server, we return not clean.
- *
- * Returns: 1 if clean, 0 otherwise
- */
- static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
- {
- int r;
- uint64_t region64 = (uint64_t)region;
- int64_t is_clean;
- size_t rdata_size;
- struct log_c *lc = log->context;
- rdata_size = sizeof(is_clean);
- r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
- (char *)®ion64, sizeof(region64),
- (char *)&is_clean, &rdata_size);
- return (r) ? 0 : (int)is_clean;
- }
- /*
- * userspace_in_sync
- *
- * Check if the region is in-sync. If there is any sort
- * of failure when consulting the server, we assume that
- * the region is not in sync.
- *
- * If 'can_block' is set, return immediately
- *
- * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
- */
- static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
- int can_block)
- {
- int r;
- uint64_t region64 = region;
- int64_t in_sync;
- size_t rdata_size;
- struct log_c *lc = log->context;
- /*
- * We can never respond directly - even if in_sync_hint is
- * set. This is because another machine could see a device
- * failure and mark the region out-of-sync. If we don't go
- * to userspace to ask, we might think the region is in-sync
- * and allow a read to pick up data that is stale. (This is
- * very unlikely if a device actually fails; but it is very
- * likely if a connection to one device from one machine fails.)
- *
- * There still might be a problem if the mirror caches the region
- * state as in-sync... but then this call would not be made. So,
- * that is a mirror problem.
- */
- if (!can_block)
- return -EWOULDBLOCK;
- rdata_size = sizeof(in_sync);
- r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
- (char *)®ion64, sizeof(region64),
- (char *)&in_sync, &rdata_size);
- return (r) ? 0 : (int)in_sync;
- }
- static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
- {
- int r = 0;
- struct flush_entry *fe;
- list_for_each_entry(fe, flush_list, list) {
- r = userspace_do_request(lc, lc->uuid, fe->type,
- (char *)&fe->region,
- sizeof(fe->region),
- NULL, NULL);
- if (r)
- break;
- }
- return r;
- }
- static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
- {
- int r = 0;
- int count;
- uint32_t type = 0;
- struct flush_entry *fe, *tmp_fe;
- LIST_HEAD(tmp_list);
- uint64_t group[MAX_FLUSH_GROUP_COUNT];
- /*
- * Group process the requests
- */
- while (!list_empty(flush_list)) {
- count = 0;
- list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
- group[count] = fe->region;
- count++;
- list_del(&fe->list);
- list_add(&fe->list, &tmp_list);
- type = fe->type;
- if (count >= MAX_FLUSH_GROUP_COUNT)
- break;
- }
- r = userspace_do_request(lc, lc->uuid, type,
- (char *)(group),
- count * sizeof(uint64_t),
- NULL, NULL);
- if (r) {
- /* Group send failed. Attempt one-by-one. */
- list_splice_init(&tmp_list, flush_list);
- r = flush_one_by_one(lc, flush_list);
- break;
- }
- }
- /*
- * Must collect flush_entrys that were successfully processed
- * as a group so that they will be free'd by the caller.
- */
- list_splice_init(&tmp_list, flush_list);
- return r;
- }
- /*
- * userspace_flush
- *
- * This function is ok to block.
- * The flush happens in two stages. First, it sends all
- * clear/mark requests that are on the list. Then it
- * tells the server to commit them. This gives the
- * server a chance to optimise the commit, instead of
- * doing it for every request.
- *
- * Additionally, we could implement another thread that
- * sends the requests up to the server - reducing the
- * load on flush. Then the flush would have less in
- * the list and be responsible for the finishing commit.
- *
- * Returns: 0 on success, < 0 on failure
- */
- static int userspace_flush(struct dm_dirty_log *log)
- {
- int r = 0;
- unsigned long flags;
- struct log_c *lc = log->context;
- LIST_HEAD(mark_list);
- LIST_HEAD(clear_list);
- struct flush_entry *fe, *tmp_fe;
- spin_lock_irqsave(&lc->flush_lock, flags);
- list_splice_init(&lc->mark_list, &mark_list);
- list_splice_init(&lc->clear_list, &clear_list);
- spin_unlock_irqrestore(&lc->flush_lock, flags);
- if (list_empty(&mark_list) && list_empty(&clear_list))
- return 0;
- r = flush_by_group(lc, &mark_list);
- if (r)
- goto fail;
- r = flush_by_group(lc, &clear_list);
- if (r)
- goto fail;
- r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
- NULL, 0, NULL, NULL);
- fail:
- /*
- * We can safely remove these entries, even if failure.
- * Calling code will receive an error and will know that
- * the log facility has failed.
- */
- list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
- list_del(&fe->list);
- mempool_free(fe, flush_entry_pool);
- }
- list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
- list_del(&fe->list);
- mempool_free(fe, flush_entry_pool);
- }
- if (r)
- dm_table_event(lc->ti->table);
- return r;
- }
- /*
- * userspace_mark_region
- *
- * This function should avoid blocking unless absolutely required.
- * (Memory allocation is valid for blocking.)
- */
- static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
- {
- unsigned long flags;
- struct log_c *lc = log->context;
- struct flush_entry *fe;
- /* Wait for an allocation, but _never_ fail */
- fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
- BUG_ON(!fe);
- spin_lock_irqsave(&lc->flush_lock, flags);
- fe->type = DM_ULOG_MARK_REGION;
- fe->region = region;
- list_add(&fe->list, &lc->mark_list);
- spin_unlock_irqrestore(&lc->flush_lock, flags);
- return;
- }
- /*
- * userspace_clear_region
- *
- * This function must not block.
- * So, the alloc can't block. In the worst case, it is ok to
- * fail. It would simply mean we can't clear the region.
- * Does nothing to current sync context, but does mean
- * the region will be re-sync'ed on a reload of the mirror
- * even though it is in-sync.
- */
- static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
- {
- unsigned long flags;
- struct log_c *lc = log->context;
- struct flush_entry *fe;
- /*
- * If we fail to allocate, we skip the clearing of
- * the region. This doesn't hurt us in any way, except
- * to cause the region to be resync'ed when the
- * device is activated next time.
- */
- fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
- if (!fe) {
- DMERR("Failed to allocate memory to clear region.");
- return;
- }
- spin_lock_irqsave(&lc->flush_lock, flags);
- fe->type = DM_ULOG_CLEAR_REGION;
- fe->region = region;
- list_add(&fe->list, &lc->clear_list);
- spin_unlock_irqrestore(&lc->flush_lock, flags);
- return;
- }
- /*
- * userspace_get_resync_work
- *
- * Get a region that needs recovery. It is valid to return
- * an error for this function.
- *
- * Returns: 1 if region filled, 0 if no work, <0 on error
- */
- static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
- {
- int r;
- size_t rdata_size;
- struct log_c *lc = log->context;
- struct {
- int64_t i; /* 64-bit for mix arch compatibility */
- region_t r;
- } pkg;
- if (lc->in_sync_hint >= lc->region_count)
- return 0;
- rdata_size = sizeof(pkg);
- r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
- NULL, 0,
- (char *)&pkg, &rdata_size);
- *region = pkg.r;
- return (r) ? r : (int)pkg.i;
- }
- /*
- * userspace_set_region_sync
- *
- * Set the sync status of a given region. This function
- * must not fail.
- */
- static void userspace_set_region_sync(struct dm_dirty_log *log,
- region_t region, int in_sync)
- {
- int r;
- struct log_c *lc = log->context;
- struct {
- region_t r;
- int64_t i;
- } pkg;
- pkg.r = region;
- pkg.i = (int64_t)in_sync;
- r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
- (char *)&pkg, sizeof(pkg),
- NULL, NULL);
- /*
- * It would be nice to be able to report failures.
- * However, it is easy emough to detect and resolve.
- */
- return;
- }
- /*
- * userspace_get_sync_count
- *
- * If there is any sort of failure when consulting the server,
- * we assume that the sync count is zero.
- *
- * Returns: sync count on success, 0 on failure
- */
- static region_t userspace_get_sync_count(struct dm_dirty_log *log)
- {
- int r;
- size_t rdata_size;
- uint64_t sync_count;
- struct log_c *lc = log->context;
- rdata_size = sizeof(sync_count);
- r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
- NULL, 0,
- (char *)&sync_count, &rdata_size);
- if (r)
- return 0;
- if (sync_count >= lc->region_count)
- lc->in_sync_hint = lc->region_count;
- return (region_t)sync_count;
- }
- /*
- * userspace_status
- *
- * Returns: amount of space consumed
- */
- static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
- char *result, unsigned maxlen)
- {
- int r = 0;
- char *table_args;
- size_t sz = (size_t)maxlen;
- struct log_c *lc = log->context;
- switch (status_type) {
- case STATUSTYPE_INFO:
- r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
- NULL, 0,
- result, &sz);
- if (r) {
- sz = 0;
- DMEMIT("%s 1 COM_FAILURE", log->type->name);
- }
- break;
- case STATUSTYPE_TABLE:
- sz = 0;
- table_args = strchr(lc->usr_argv_str, ' ');
- BUG_ON(!table_args); /* There will always be a ' ' */
- table_args++;
- DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc,
- lc->uuid, table_args);
- break;
- }
- return (r) ? 0 : (int)sz;
- }
- /*
- * userspace_is_remote_recovering
- *
- * Returns: 1 if region recovering, 0 otherwise
- */
- static int userspace_is_remote_recovering(struct dm_dirty_log *log,
- region_t region)
- {
- int r;
- uint64_t region64 = region;
- struct log_c *lc = log->context;
- static unsigned long long limit;
- struct {
- int64_t is_recovering;
- uint64_t in_sync_hint;
- } pkg;
- size_t rdata_size = sizeof(pkg);
- /*
- * Once the mirror has been reported to be in-sync,
- * it will never again ask for recovery work. So,
- * we can safely say there is not a remote machine
- * recovering if the device is in-sync. (in_sync_hint
- * must be reset at resume time.)
- */
- if (region < lc->in_sync_hint)
- return 0;
- else if (jiffies < limit)
- return 1;
- limit = jiffies + (HZ / 4);
- r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
- (char *)®ion64, sizeof(region64),
- (char *)&pkg, &rdata_size);
- if (r)
- return 1;
- lc->in_sync_hint = pkg.in_sync_hint;
- return (int)pkg.is_recovering;
- }
- static struct dm_dirty_log_type _userspace_type = {
- .name = "userspace",
- .module = THIS_MODULE,
- .ctr = userspace_ctr,
- .dtr = userspace_dtr,
- .presuspend = userspace_presuspend,
- .postsuspend = userspace_postsuspend,
- .resume = userspace_resume,
- .get_region_size = userspace_get_region_size,
- .is_clean = userspace_is_clean,
- .in_sync = userspace_in_sync,
- .flush = userspace_flush,
- .mark_region = userspace_mark_region,
- .clear_region = userspace_clear_region,
- .get_resync_work = userspace_get_resync_work,
- .set_region_sync = userspace_set_region_sync,
- .get_sync_count = userspace_get_sync_count,
- .status = userspace_status,
- .is_remote_recovering = userspace_is_remote_recovering,
- };
- static int __init userspace_dirty_log_init(void)
- {
- int r = 0;
- flush_entry_pool = mempool_create(100, flush_entry_alloc,
- flush_entry_free, NULL);
- if (!flush_entry_pool) {
- DMWARN("Unable to create flush_entry_pool: No memory.");
- return -ENOMEM;
- }
- r = dm_ulog_tfr_init();
- if (r) {
- DMWARN("Unable to initialize userspace log communications");
- mempool_destroy(flush_entry_pool);
- return r;
- }
- r = dm_dirty_log_type_register(&_userspace_type);
- if (r) {
- DMWARN("Couldn't register userspace dirty log type");
- dm_ulog_tfr_exit();
- mempool_destroy(flush_entry_pool);
- return r;
- }
- DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
- return 0;
- }
- static void __exit userspace_dirty_log_exit(void)
- {
- dm_dirty_log_type_unregister(&_userspace_type);
- dm_ulog_tfr_exit();
- mempool_destroy(flush_entry_pool);
- DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
- return;
- }
- module_init(userspace_dirty_log_init);
- module_exit(userspace_dirty_log_exit);
- MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
- MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
- MODULE_LICENSE("GPL");
|