Diffstat (limited to 'xlators/cluster/ec/src')
-rw-r--r--  xlators/cluster/ec/src/ec-common.c | 253
-rw-r--r--  xlators/cluster/ec/src/ec-common.h |   1
-rw-r--r--  xlators/cluster/ec/src/ec-types.h  |   2
-rw-r--r--  xlators/cluster/ec/src/ec.c        |  77
4 files changed, 239 insertions(+), 94 deletions(-)
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index cb627a92c9c..fbc7ac97aa0 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1847,6 +1847,67 @@ gf_boolean_t ec_lock_acquire(ec_lock_link_t *link)
return _gf_true;
}
+static ec_lock_link_t *
+ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock)
+{
+ ec_lock_link_t *timer_link;
+
+ /* If we don't have any timer, there's nothing to cancel. */
+ if (lock->timer == NULL) {
+ return NULL;
+ }
+
+ /* We are trying to access a lock that has an unlock timer active.
+ * This means that the lock must be idle, i.e. no fop can be in the
+ * owner, waiting or frozen lists. It also means that the lock cannot
+ * have been marked as being released (this is done without timers).
+ * There should only be one owner reference, but it's possible that
+ * some fops are being prepared to use this lock. */
+ GF_ASSERT((lock->refs_owners == 1) &&
+ list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+ /* We take the timer_link before cancelling the timer, since a
+ * successful cancellation will destroy it. It must not be NULL
+ * because it references the fop responsible for the delayed unlock
+ * that we are currently trying to cancel. */
+ timer_link = lock->timer->data;
+ GF_ASSERT(timer_link != NULL);
+
+ if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) {
+ /* It's too late to avoid the execution of the timer callback.
+ * Since we need to be sure that the callback has access to all
+ * needed resources, we cannot resume the execution of the
+ * timer fop now. This will be done in the callback. */
+ timer_link = NULL;
+ } else {
+ /* The timer has been cancelled. The fop referenced by
+ * timer_link holds the last reference. The caller is
+ * responsible for releasing it when it is no longer needed. */
+ ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+ }
+
+ /* We have two options here:
+ *
+ * 1. The timer has been successfully cancelled.
+ *
+ * This is the easiest case and we can continue with the currently
+ * acquired lock.
+ *
+ * 2. The timer callback has already fired.
+ *
+ * In this case we were unable to cancel the timer before its
+ * callback started running, but we also know that lock->timer !=
+ * NULL. This means that the timer callback is still trying to
+ * acquire the inode mutex that we currently own. We are safe until
+ * we release it, so we can safely clear lock->timer. This will
+ * cause the timer callback to do nothing once it acquires the
+ * mutex.
+ */
+ lock->timer = NULL;
+
+ return timer_link;
+}
+
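The cancel-or-defer idiom implemented by ec_lock_timer_cancel() generalizes beyond GlusterFS. Below is a minimal standalone sketch of the same pattern, assuming only POSIX threads and a hypothetical timer_cancel() that fails once the callback has started (gf_timer_call_cancel() behaves the same way):

#include <pthread.h>
#include <stddef.h>

struct lock_state {
    pthread_mutex_t mtx; /* protects 'timer'; plays the role of inode->lock */
    void *timer;         /* NULL means "no unlock timer armed" */
};

/* Hypothetical: returns < 0 when the callback has already started. */
extern int timer_cancel(void *timer);

/* Call with st->mtx held. Returns 1 when the caller has taken over the
 * timer's pending work, 0 when the late callback will finish it itself. */
static int cancel_or_defer(struct lock_state *st)
{
    int taken_over = 0;

    if (st->timer != NULL) {
        if (timer_cancel(st->timer) >= 0) {
            taken_over = 1; /* cancelled in time: callback will never run */
        }
        /* Clearing the handle in both cases makes a late callback a no-op:
         * it re-checks this field after acquiring st->mtx. */
        st->timer = NULL;
    }

    return taken_over;
}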
static gf_boolean_t
ec_lock_assign_owner(ec_lock_link_t *link)
{
@@ -1891,61 +1952,7 @@ ec_lock_assign_owner(ec_lock_link_t *link)
* empty. */
GF_ASSERT(list_empty(&lock->frozen));
- if (lock->timer != NULL) {
- /* We are trying to acquire a lock that has an unlock timer active.
- * This means that the lock must be idle, i.e. no fop can be in the
- * owner, waiting or frozen lists. It also means that the lock cannot
- * have been marked as being released (this is done without timers).
- * There should only be one owner reference, but it's possible that
- * some fops are being prepared to use this lock.
- */
- GF_ASSERT ((lock->refs_owners == 1) &&
- list_empty(&lock->owners) && list_empty(&lock->waiting));
-
- /* We take the timer_link before cancelling the timer, since a
- * successful cancellation will destroy it. It must not be NULL
- * because it references the fop responsible for the delayed unlock
- * that we are currently trying to cancel. */
- timer_link = lock->timer->data;
- GF_ASSERT(timer_link != NULL);
-
- if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) < 0) {
- /* It's too late to avoid the execution of the timer callback.
- * Since we need to be sure that the callback has access to all
- * needed resources, we cannot resume the execution of the timer
- * fop now. This will be done in the callback.
- */
- timer_link = NULL;
- } else {
- /* The timer has been cancelled, so we need to release the owner
- * reference that was held by the fop waiting for the timer. This
- * can be the last reference, but we'll immediately increment it
- * for the current fop, so no need to check it.
- */
- lock->refs_owners--;
-
- ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
- }
-
- /* We have two options here:
- *
- * 1. The timer has been successfully cancelled.
- *
- * This is the easiest case and we can continue with the currently
- * acquired lock.
- *
- * 2. The timer callback has already been fired.
- *
- * In this case we have not been able to cancel the timer before
- * the timer callback has been fired, but we also know that
- * lock->timer != NULL. This means that the timer callback is still
- * trying to acquire the inode mutex that we currently own. We are
- * safe until we release it. In this case we can safely clear
- * lock->timer. This will cause that the timer callback does nothing
- * once it acquires the mutex.
- */
- lock->timer = NULL;
- }
+ timer_link = ec_lock_timer_cancel(fop->xl, lock);
if (!list_empty(&lock->owners)) {
/* There are other owners of this lock. We can only take ownership if
@@ -1965,7 +1972,13 @@ ec_lock_assign_owner(ec_lock_link_t *link)
}
list_add_tail(&link->owner_list, &lock->owners);
- lock->refs_owners++;
+
+ /* If timer_link is not NULL, it means that we have inherited the owner
+ * reference assigned to the timer fop. In this case we simply reuse it.
+ * Otherwise we need to increase the number of owners. */
+ if (timer_link == NULL) {
+ lock->refs_owners++;
+ }
assigned = _gf_true;
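The reference accounting in the hunk above is subtle, so here is a standalone sketch of the same bookkeeping with hypothetical names (not the GlusterFS API): every owner fop and every armed timer accounts for one reference, so a new owner that adopts a cancelled timer's reference must not increment the count again.

/* Hypothetical refcounted lock; mirrors the refs_owners bookkeeping. */
struct eager_lock {
    int refs_owners; /* one per owner fop, plus one per armed timer */
};

static void become_owner(struct eager_lock *l, int inherited_from_timer)
{
    if (!inherited_from_timer) {
        l->refs_owners++; /* fresh owner: take a brand new reference */
    }
    /* Otherwise the count already includes the reference held by the
     * timer fop; adopting it avoids the decrement/increment pair that
     * the old code performed. */
}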
@@ -2383,6 +2396,48 @@ ec_unlock_now(ec_lock_link_t *link)
ec_resume(link->fop, 0);
}
+void
+ec_lock_release(ec_t *ec, inode_t *inode)
+{
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
+ ec_lock_link_t *timer_link = NULL;
+
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get(inode, ec->xl);
+ if (ctx == NULL) {
+ goto done;
+ }
+ lock = ctx->inode_lock;
+ if ((lock == NULL) || !lock->acquired || lock->release) {
+ goto done;
+ }
+
+ gf_msg_debug(ec->xl->name, 0,
+ "Releasing inode %p due to lock contention", inode);
+
+ /* The lock is not marked to be released, so the frozen list should be
+ * empty. */
+ GF_ASSERT(list_empty(&lock->frozen));
+
+ timer_link = ec_lock_timer_cancel(ec->xl, lock);
+
+ /* We mark the lock to be released as soon as possible. */
+ lock->release = _gf_true;
+
+done:
+ UNLOCK(&inode->lock);
+
+ /* If we have cancelled the timer, we need to start the unlock of the
+ * inode. If there was a timer but we have been unable to cancel it
+ * because it was just triggered, the timer callback will take care
+ * of releasing the inode. */
+ if (timer_link != NULL) {
+ ec_unlock_now(timer_link);
+ }
+}
+
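ec_lock_release() follows a common locking discipline: decide and set flags while holding the inode mutex, then perform the heavyweight follow-up (ec_unlock_now() here) only after dropping it. A generic pthread sketch of the same shape, with hypothetical names:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct state {
    pthread_mutex_t mtx;
    bool release;  /* analogous to lock->release */
    void *deferred; /* work recovered from a cancelled timer */
};

extern void finish_release(void *work); /* hypothetical follow-up action */

static void request_release(struct state *st)
{
    void *work;

    pthread_mutex_lock(&st->mtx);
    work = st->deferred; /* e.g. the cancelled timer's lock link */
    st->deferred = NULL;
    st->release = true;  /* mark for release as soon as possible */
    pthread_mutex_unlock(&st->mtx);

    /* Run the follow-up without the mutex held: it may block or take
     * other locks, as ec_unlock_now() does. */
    if (work != NULL) {
        finish_release(work);
    }
}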
void ec_unlock_timer_add(ec_lock_link_t *link);
void
@@ -2470,9 +2525,60 @@ void ec_unlock_timer_cbk(void *data)
ec_unlock_timer_del(data);
}
+static gf_boolean_t
+ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop)
+{
+ /* Fops with no locks at this point mean that they are sent as sub-fops
+ * of other higher level fops. In this case we simply assume that the
+ * parent fop will take correct care of the eager lock. */
+ if (fop->lock_count == 0) {
+ return _gf_true;
+ }
+
+ /* We may have more than one lock, but this only happens in the rename
+ * fop, and both locks will reference an inode of the same type (a
+ * directory in this case), so we only need to check the first lock. */
+ if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
+ return ec->eager_lock;
+ }
+
+ return ec->other_eager_lock;
+}
+
+static uint32_t
+ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock)
+{
+ if (lock->loc.inode->ia_type == IA_IFREG) {
+ return ec->eager_lock_timeout;
+ }
+
+ return ec->other_eager_lock_timeout;
+}
+
+static gf_boolean_t
+ec_lock_delay_create(ec_lock_link_t *link)
+{
+ struct timespec delay;
+ ec_fop_data_t *fop = link->fop;
+ ec_lock_t *lock = link->lock;
+
+ delay.tv_sec = ec_eager_lock_timeout(fop->xl->private, lock);
+ delay.tv_nsec = 0;
+ lock->timer = gf_timer_call_after(fop->xl->ctx, delay,
+ ec_unlock_timer_cbk, link);
+ if (lock->timer == NULL) {
+ gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
+ EC_MSG_UNLOCK_DELAY_FAILED,
+ "Unable to delay an unlock");
+
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
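ec_lock_delay_create() arms a one-shot timer whose delay comes from the per-inode-type option. For readers without the GlusterFS tree at hand, a rough standalone analogue using POSIX timers (assumed semantics only; the real code uses gf_timer_call_after(), and this sketch needs -lrt on older glibc):

#include <signal.h>
#include <stdint.h>
#include <time.h>

static void unlock_cbk(union sigval sv)
{
    /* would resume the delayed unlock using sv.sival_ptr (the link) */
}

/* Returns 0 on success, -1 on failure (matching the patch's fallback to
 * an immediate release when the timer cannot be created). */
static int arm_delayed_unlock(void *link, uint32_t seconds, timer_t *out)
{
    struct sigevent ev = {0};
    struct itimerspec its = {0};

    ev.sigev_notify = SIGEV_THREAD;
    ev.sigev_notify_function = unlock_cbk;
    ev.sigev_value.sival_ptr = link;

    if (timer_create(CLOCK_MONOTONIC, &ev, out) != 0) {
        return -1;
    }

    its.it_value.tv_sec = seconds; /* one-shot: it_interval stays zero */
    return timer_settime(*out, 0, &its, NULL);
}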
void ec_unlock_timer_add(ec_lock_link_t *link)
{
- struct timespec delay;
ec_fop_data_t *fop = link->fop;
ec_lock_t *lock = link->lock;
gf_boolean_t now = _gf_false;
@@ -2526,19 +2632,12 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
lock->release);
- delay.tv_sec = 1;
- delay.tv_nsec = 0;
- lock->timer = gf_timer_call_after(fop->xl->ctx, delay,
- ec_unlock_timer_cbk, link);
- if (lock->timer == NULL) {
- gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
- EC_MSG_UNLOCK_DELAY_FAILED,
- "Unable to delay an unlock");
-
+ if (!ec_lock_delay_create(link)) {
/* We are unable to create a new timer. We immediately release
* the lock. */
lock->release = now = _gf_true;
}
+
} else {
ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock,
lock->release);
@@ -2583,26 +2682,6 @@ void ec_flush_size_version(ec_fop_data_t * fop)
ec_update_info(&fop->locks[0]);
}
-static gf_boolean_t
-ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop)
-{
- /* Fops with no locks at this point mean that they are sent as sub-fops
- * of other higher level fops. In this case we simply assume that the
- * parent fop will take correct care of the eager lock. */
- if (fop->lock_count == 0) {
- return _gf_true;
- }
-
- /* We may have more than one lock, but this only happens in the rename
- * fop, and both locks will reference an inode of the same type (a
- * directory in this case), so we only need to check the first lock. */
- if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
- return ec->eager_lock;
- }
-
- return ec->other_eager_lock;
-}
-
static void
ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe,
ec_fop_data_t *fop)
@@ -2708,7 +2787,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
ec = fop->xl->private;
cbk = fop->answer;
- if (ec_use_eager_lock(ec, fop) && cbk != NULL) {
+ if (ec_eager_lock_used(ec, fop) && cbk != NULL) {
if (cbk->xdata != NULL) {
if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT,
&count) == 0) && (count > 1)) {
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index c3e291585ef..99e2f0653be 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -102,6 +102,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,
void ec_lock(ec_fop_data_t * fop);
void ec_lock_reuse(ec_fop_data_t *fop);
void ec_unlock(ec_fop_data_t * fop);
+void ec_lock_release(ec_t *ec, inode_t *inode);
gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
uint64_t *size);
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 23dc434bc42..15b4c77abfe 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -669,6 +669,8 @@ struct _ec {
uint32_t background_heals;
uint32_t heal_wait_qlen;
uint32_t self_heal_window_size; /* max size of read/writes */
+ uint32_t eager_lock_timeout;
+ uint32_t other_eager_lock_timeout;
struct list_head pending_fops;
struct list_head heal_waiting;
struct list_head healing;
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 4c80f1283f1..30b0bdcb29c 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -271,6 +271,11 @@ reconfigure (xlator_t *this, dict_t *options)
bool, failed);
GF_OPTION_RECONF ("other-eager-lock", ec->other_eager_lock, options,
bool, failed);
+ GF_OPTION_RECONF ("eager-lock-timeout", ec->eager_lock_timeout,
+ options, uint32, failed);
+ GF_OPTION_RECONF ("other-eager-lock-timeout",
+ ec->other_eager_lock_timeout, options, uint32,
+ failed);
GF_OPTION_RECONF ("background-heals", background_heals, options,
uint32, failed);
GF_OPTION_RECONF ("heal-wait-qlength", heal_wait_qlen, options,
@@ -453,6 +458,43 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
}
}
+static gf_boolean_t
+ec_upcall(ec_t *ec, struct gf_upcall *upcall)
+{
+ struct gf_upcall_cache_invalidation *ci = NULL;
+ struct gf_upcall_inodelk_contention *lc = NULL;
+ inode_t *inode;
+
+ switch (upcall->event_type) {
+ case GF_UPCALL_CACHE_INVALIDATION:
+ ci = upcall->data;
+ ci->flags |= UP_INVAL_ATTR;
+ return _gf_true;
+
+ case GF_UPCALL_INODELK_CONTENTION:
+ lc = upcall->data;
+ if (strcmp(lc->domain, ec->xl->name) != 0) {
+ /* The lock is not owned by EC, ignore it. */
+ return _gf_true;
+ }
+ inode = inode_find(((xlator_t *)ec->xl->graph->top)->itable,
+ upcall->gfid);
+ /* If the inode is not found, it has most likely already been
+ * released and destroyed while the contention notification was
+ * in flight, so we can safely ignore it.
+ */
+ if (inode != NULL) {
+ ec_lock_release(ec, inode);
+ inode_unref(inode);
+ }
+
+ return _gf_false;
+
+ default:
+ return _gf_true;
+ }
+}
+
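One detail worth calling out in ec_upcall(): inode_find() takes a reference on the inode it returns, so every successful lookup must be balanced with inode_unref(), exactly as the hunk does. The same logic, extracted into a hypothetical helper for illustration:

/* Sketch of the lookup/unref discipline for gfid-based upcalls. */
static void handle_contention(ec_t *ec, inode_table_t *itable, uuid_t gfid)
{
    inode_t *inode = inode_find(itable, gfid);

    if (inode != NULL) {
        ec_lock_release(ec, inode); /* flag the eager lock for release */
        inode_unref(inode);         /* drop the ref taken by inode_find */
    }
}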
int32_t
ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
{
@@ -464,19 +506,13 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
dict_t *output = NULL;
gf_boolean_t propagate = _gf_true;
int32_t orig_event = event;
- struct gf_upcall *up_data = NULL;
- struct gf_upcall_cache_invalidation *up_ci = NULL;
uintptr_t mask = 0;
gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p",
event, data, data2);
if (event == GF_EVENT_UPCALL) {
- up_data = (struct gf_upcall *)data;
- if (up_data->event_type == GF_UPCALL_CACHE_INVALIDATION) {
- up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
- up_ci->flags |= UP_INVAL_ATTR;
- }
+ propagate = ec_upcall(ec, data);
goto done;
}
@@ -664,6 +700,10 @@ init (xlator_t *this)
GF_OPTION_INIT ("iam-self-heal-daemon", ec->shd.iamshd, bool, failed);
GF_OPTION_INIT ("eager-lock", ec->eager_lock, bool, failed);
GF_OPTION_INIT ("other-eager-lock", ec->other_eager_lock, bool, failed);
+ GF_OPTION_INIT ("eager-lock-timeout", ec->eager_lock_timeout, uint32,
+ failed);
+ GF_OPTION_INIT ("other-eager-lock-timeout", ec->other_eager_lock_timeout,
+ uint32, failed);
GF_OPTION_INIT ("background-heals", ec->background_heals, uint32, failed);
GF_OPTION_INIT ("heal-wait-qlength", ec->heal_wait_qlen, uint32, failed);
GF_OPTION_INIT ("self-heal-window-size", ec->self_heal_window_size, uint32,
@@ -1456,6 +1496,29 @@ struct volume_options options[] =
.description = "It's equivalent to the eager-lock option but for non "
"regular files."
},
+ { .key = {"eager-lock-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 60,
+ .default_value = "1",
+ .op_version = { GD_OP_VERSION_4_0_0 },
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = { "disperse", "locks", "timeout" },
+ .description = "Maximum time (in seconds) that a lock on an inode is "
+ "kept held if no new operations on the inode are "
+ "received."
+ },
+ { .key = {"other-eager-lock-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 60,
+ .default_value = "1",
+ .op_version = { GD_OP_VERSION_4_0_0 },
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = { "disperse", "locks", "timeout" },
+ .description = "It's equivalent ot eager-lock-timeout option but for "
+ "non regular files."
+ },
{ .key = {"background-heals"},
.type = GF_OPTION_TYPE_INT,
.min = 0,/*Disabling background heals*/