diff options
author | Pranith Kumar K <pkarampu@redhat.com> | 2015-05-13 16:57:49 +0530 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2015-05-28 04:12:06 -0700 |
commit | 3a57ca8ee29ea8e3d3c5bbf28a56a821bfa99d99 (patch) | |
tree | 7e919238192422d4bb1f8a86a950013b286b41b3 | |
parent | 2b8fdde926532014f19d850b1321a4c7046dc001 (diff) |
cluster/ec: Fix all EIO errors in EC
Backport of http://review.gluster.org/10770
Backport of http://review.gluster.org/10806
Backport of http://review.gluster.org/10787
Backport of http://review.gluster.org/10868
Backport of http://review.gluster.com/10852
- When a blocking lock is requested, the lock request succeeds even when
ec->fragment number of locks are acquired successfully in non-blocking locking
phase. This will lead to fop succeeding only on the bricks where the locks are
acquired, leading to the necessity of self-heals. To prevent these unnecessary
self-heals, if the remaining locks fail with EAGAIN in non-blocking lock phase
try blocking locking phase instead.
- Handle lookup failures while op in progress
- cluster/ec: Correctly cleanup delayed locks
When a delayed lock is pending, a graph switch doesn't correctly
terminate it. This means that the update of version and size xattrs
is lost, causing EIO errors. This patch handles GF_EVENT_PARENT_DOWN
event to correctly finish pending updates before completing the
graph switch.
- Fix use after free crash
ec_heal creates ec_fop_data but doesn't run ec_manager. ec_fop_data_allocate
adds this fop to ec->pending_fops, because ec_manager is not run on this heal
fop it is never removed from ec->pending_fops. When it is accessed after free
it leads to crash. It is better to not to add HEAL fops to ec->pending_fops
because we don't want graph switch to hang the mount because of a BIG
file/directory heal.
- Forced unlock when lock contention is detected
EC uses an eager lock mechanism to optimize multiple read/write
requests on the same entry or inode. This increases performance
but can have adverse results when other clients try to access the
same entry/inode. To solve this, this patch adds a functionality
to detect when this happens and force an earlier release to not
block other clients.
The method consists of requesting GF_GLUSTERFS_INODELK_COUNT and
GF_GLUSTERFS_ENTRYLK_COUNT for all fops that take a lock. When this
count is greater than one, the lock is marked to be released. All
fops already waiting for this lock will be executed normally before
releasing the lock, but new requests that also require it will be
blocked and restarted after the lock has been released and reacquired
again.
Another problem was that some operations did correctly lock the
parent of an entry when needed, but got the size and version xattrs
from the entry instead of the parent.
This patch solves this problem by binding all queries of size and
version to each lock and replacing all entrylk calls by inodelk ones
to remove concurrent updates on directory metadata. This also allows
rename to correctly update source and destination directories.
BUG: 1225279
Change-Id: I02a6084b138dd38e018a462347cd9ce38610c7ef
Reviewed-on: http://review.gluster.org/10926
Tested-by: NetBSD Build System
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
-rwxr-xr-x | run-tests.sh | 6 | ||||
-rw-r--r-- | tests/bugs/disperse/bug-1188145.t | 50 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-combine.c | 35 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 1509 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-common.h | 36 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-data.c | 82 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-data.h | 84 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-dir-read.c | 8 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-dir-write.c | 96 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-fops.h | 3 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-generic.c | 86 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-heal.c | 103 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-helpers.c | 38 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-helpers.h | 7 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-inode-read.c | 51 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-inode-write.c | 159 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-locks.c | 13 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec.c | 97 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec.h | 5 |
19 files changed, 1397 insertions, 1071 deletions
diff --git a/run-tests.sh b/run-tests.sh index 0bfa5c6230c..ef6568c30b0 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -201,13 +201,7 @@ function is_bad_test () ./tests/basic/ec/quota.t \ ./tests/bugs/distribute/bug-1161156.t \ ./tests/basic/tier/tier.t \ - ./tests/basic/ec/ec-4-1.t \ - ./tests/basic/ec/ec.t \ ./tests/basic/quota-nfs.t \ - ./tests/basic/ec/ec-6-2.t \ - ./tests/basic/ec/ec-3-1.t \ - ./tests/basic/ec/ec-5-1.t \ - ./tests/basic/ec/ec-12-4.t \ ./tests/bugs/quota/bug-1035576.t \ ./tests/bugs/glusterfs/bug-867253.t \ ./tests/bugs/glusterd/bug-974007.t \ diff --git a/tests/bugs/disperse/bug-1188145.t b/tests/bugs/disperse/bug-1188145.t new file mode 100644 index 00000000000..aa3a59bc62f --- /dev/null +++ b/tests/bugs/disperse/bug-1188145.t @@ -0,0 +1,50 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +function create_dirs() +{ + local stop=$1 + local idx + local res + + res=0 + idx=1 + while [[ -f ${stop} ]]; do + mkdir $M0/${idx} + if [[ $? -ne 0 ]]; then + res=1 + break; + fi + idx=$(( idx + 1 )) + done + + return ${res} +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5} +EXPECT "Created" volinfo_field $V0 'Status' +TEST $CLI volume start $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $V0 'Status' +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0 + +name=`mktemp -t ${0##*/}.XXXXXX` +create_dirs ${name} & +pid=$! 
+ +sleep 2 +TEST $CLI volume set $V0 uss on +sleep 5 +TEST $CLI volume set $V0 uss off +sleep 5 + +TEST rm -f ${name} +TEST wait ${pid} + +cleanup diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c index 9d4a18999f1..4617a0430f1 100644 --- a/xlators/cluster/ec/src/ec-combine.c +++ b/xlators/cluster/ec/src/ec-combine.c @@ -171,8 +171,10 @@ void ec_iatt_rebuild(ec_t * ec, struct iatt * iatt, int32_t count, gf_boolean_t ec_xattr_match (dict_t *dict, char *key, data_t *value, void *arg) { - if (fnmatch(GF_XATTR_STIME_PATTERN, key, 0) == 0) + if ((fnmatch(GF_XATTR_STIME_PATTERN, key, 0) == 0) || + (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0)) { return _gf_false; + } return _gf_true; } @@ -185,6 +187,8 @@ ec_value_ignore (char *key) (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0) || (strcmp(key, GF_XATTR_LOCKINFO_KEY) == 0) || (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) || + (strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) || + (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0) || (strncmp(key, GF_XATTR_CLRLK_CMD, strlen (GF_XATTR_CLRLK_CMD)) == 0) || (strncmp(key, EC_QUOTA_PREFIX, strlen(EC_QUOTA_PREFIX)) == 0) || @@ -225,15 +229,9 @@ int32_t ec_dict_list(data_t ** list, int32_t * count, ec_cbk_data_t * cbk, dict = (which == EC_COMBINE_XDATA) ? 
ans->xdata : ans->dict; list[i] = dict_get(dict, key); - if (list[i] == NULL) - { - gf_log(cbk->fop->xl->name, GF_LOG_ERROR, "Unexpected missing " - "dictionary entry"); - - return 0; + if (list[i] != NULL) { + i++; } - - i++; } *count = i; @@ -471,11 +469,6 @@ int32_t ec_dict_data_max32(ec_cbk_data_t *cbk, int32_t which, char *key) return -1; } - if (num <= 1) - { - return 0; - } - max = data_to_uint32(data[0]); for (i = 1; i < num; i++) { @@ -507,10 +500,6 @@ int32_t ec_dict_data_max64(ec_cbk_data_t *cbk, int32_t which, char *key) return -1; } - if (num <= 1) { - return 0; - } - max = data_to_uint64(data[0]); for (i = 1; i < num; i++) { tmp = data_to_uint64(data[i]); @@ -630,6 +619,10 @@ int32_t ec_dict_data_combine(dict_t * dict, char * key, data_t * value, { return ec_dict_data_max32(data->cbk, data->which, key); } + if ((strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) || + (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0)) { + return ec_dict_data_max32(data->cbk, data->which, key); + } if (strcmp(key, QUOTA_SIZE_KEY) == 0) { return ec_dict_data_quota(data->cbk, data->which, key); @@ -831,6 +824,8 @@ void ec_combine (ec_cbk_data_t *newcbk, ec_combine_f combine) LOCK(&fop->lock); + fop->received |= newcbk->mask; + item = fop->cbk_list.prev; list_for_each_entry(cbk, &fop->cbk_list, list) { @@ -868,7 +863,9 @@ void ec_combine (ec_cbk_data_t *newcbk, ec_combine_f combine) } cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); - needed = fop->minimum - cbk->count - fop->winds + 1; + if ((fop->mask ^ fop->remaining) == fop->received) { + needed = fop->minimum - cbk->count; + } UNLOCK(&fop->lock); diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 393d9142797..ba81fc7313f 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -368,24 +368,34 @@ int32_t ec_child_select(ec_fop_data_t * fop) uintptr_t mask = 0; int32_t first = 0, num = 0; + ec_fop_cleanup(fop); + fop->mask &= ec->node_mask; mask = 
ec->xl_up; if (fop->parent == NULL) { - if (fop->loc[0].inode != NULL) { + if ((fop->flags & EC_FLAG_UPDATE_LOC_PARENT) && fop->loc[0].parent) + mask &= ec_inode_good(fop->loc[0].parent, fop->xl); + + if ((fop->flags & EC_FLAG_UPDATE_LOC_INODE) && fop->loc[0].inode) { mask &= ec_inode_good(fop->loc[0].inode, fop->xl); } - if (fop->loc[1].inode != NULL) { + + if ((fop->flags & EC_FLAG_UPDATE_LOC_INODE) && fop->loc[1].inode) { mask &= ec_inode_good(fop->loc[1].inode, fop->xl); } - if (fop->fd != NULL) { - if (fop->fd->inode != NULL) { + + if (fop->fd) { + if ((fop->flags & EC_FLAG_UPDATE_FD_INODE) && fop->fd->inode) { mask &= ec_inode_good(fop->fd->inode, fop->xl); } - mask &= ec_fd_good(fop->fd, fop->xl); + if (fop->flags & fop->flags & EC_FLAG_UPDATE_FD) { + mask &= ec_fd_good(fop->fd, fop->xl); + } } } + if ((fop->mask & ~mask) != 0) { gf_log(fop->xl->name, GF_LOG_WARNING, "Executing operation with " @@ -420,6 +430,7 @@ int32_t ec_child_select(ec_fop_data_t * fop) /*Unconditionally wind on healing subvolumes*/ fop->mask |= fop->healing; fop->remaining = fop->mask; + fop->received = 0; ec_trace("SELECT", fop, ""); @@ -585,7 +596,7 @@ void ec_dispatch_min(ec_fop_data_t * fop) } } -ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc) +ec_lock_t *ec_lock_allocate(xlator_t *xl, loc_t *loc) { ec_t * ec = xl->private; ec_lock_t * lock; @@ -602,9 +613,9 @@ ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc) lock = mem_get0(ec->lock_pool); if (lock != NULL) { - lock->kind = kind; lock->good_mask = -1ULL; INIT_LIST_HEAD(&lock->waiting); + INIT_LIST_HEAD(&lock->frozen); if (ec_loc_from_loc(xl, &lock->loc, loc) != 0) { mem_put(lock); @@ -618,6 +629,9 @@ ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc) void ec_lock_destroy(ec_lock_t * lock) { loc_wipe(&lock->loc); + if (lock->fd != NULL) { + fd_unref(lock->fd); + } mem_put(lock); } @@ -627,166 +641,96 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2) 
return gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid); } -ec_lock_link_t *ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, - int32_t update) +void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags, + loc_t *base) { - ec_lock_t *new_lock, *tmp; - ec_lock_link_t *link = NULL; - int32_t tmp_update; + ec_lock_link_t *link; - new_lock = lock; + /* This check is only prepared for up to 2 locks per fop. If more locks + * are needed this must be changed. */ if ((fop->lock_count > 0) && - (ec_lock_compare(fop->locks[0].lock, new_lock) > 0)) - { - tmp = fop->locks[0].lock; - fop->locks[0].lock = new_lock; - new_lock = tmp; - - tmp_update = fop->locks_update; - fop->locks_update = update; - update = tmp_update; - } - fop->locks[fop->lock_count].lock = new_lock; - fop->locks[fop->lock_count].fop = fop; - - fop->locks_update |= update << fop->lock_count; - - fop->lock_count++; - - if (lock->timer != NULL) { - link = lock->timer->data; - ec_trace("UNLOCK_CANCELLED", link->fop, "lock=%p", lock); - gf_timer_call_cancel(fop->xl->ctx, lock->timer); - lock->timer = NULL; + (ec_lock_compare(fop->locks[0].lock, lock) < 0)) { + fop->first_lock = fop->lock_count; } else { - lock->refs++; - } - - return link; -} - -void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update) -{ - ec_lock_t * lock = NULL; - ec_inode_t * ctx = NULL; - ec_lock_link_t *link = NULL; - loc_t tmp; - - if ((fop->parent != NULL) || (fop->error != 0)) - { - return; - } - - /* update is only 0 for 'opendir', which needs to lock the entry pointed - * by loc instead of its parent. - */ - if (update) - { - if (ec_loc_parent(fop->xl, loc, &tmp) != 0) { - ec_fop_set_error(fop, EIO); - - return; - } - - /* If there's another lock, make sure that it's not the same. Otherwise - * do not insert it. - * - * This can only happen on renames where source and target names are - * in the same directory. 
*/ - if ((fop->lock_count > 0) && - (fop->locks[0].lock->loc.inode == tmp.inode)) { - goto wipe; + /* When the first lock is added to the current fop, request lock + * counts from locks xlator to be able to determine if there is + * contention and release the lock sooner. */ + if (fop->xdata == NULL) { + fop->xdata = dict_new(); + if (fop->xdata == NULL) { + ec_fop_set_error(fop, ENOMEM); + return; + } } - } else { - if (ec_loc_from_loc(fop->xl, &tmp, loc) != 0) { - ec_fop_set_error(fop, EIO); - + if (dict_set_str(fop->xdata, GLUSTERFS_INODELK_DOM_COUNT, + fop->xl->name) != 0) { + ec_fop_set_error(fop, ENOMEM); return; } } - LOCK(&tmp.inode->lock); - - ctx = __ec_inode_get(tmp.inode, fop->xl); - if (ctx == NULL) - { - __ec_fop_set_error(fop, EIO); - - goto unlock; - } - - if (ctx->entry_lock != NULL) - { - lock = ctx->entry_lock; - ec_trace("LOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s" - "Lock already acquired", - lock, tmp.inode, tmp.path); + link = &fop->locks[fop->lock_count++]; - goto insert; - } + link->lock = lock; + link->fop = fop; + link->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0; + link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0; + link->base = base; - lock = ec_lock_allocate(fop->xl, EC_LOCK_ENTRY, &tmp); - if (lock == NULL) - { - __ec_fop_set_error(fop, EIO); - - goto unlock; - } - - ec_trace("LOCK_CREATE", fop, "lock=%p", lock); - - lock->type = ENTRYLK_WRLCK; - - lock->plock = &ctx->entry_lock; - ctx->entry_lock = lock; - -insert: - link = ec_lock_insert(fop, lock, update); - -unlock: - UNLOCK(&tmp.inode->lock); - -wipe: - loc_wipe(&tmp); - - if (link != NULL) { - ec_resume(link->fop, 0); - } + lock->refs++; + lock->inserted++; } -void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update) +void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, + uint32_t flags, loc_t *base) { - ec_lock_link_t *link = NULL; - ec_lock_t * lock; - ec_inode_t * ctx; + ec_lock_t *lock = NULL; + ec_inode_t *ctx; - if 
((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL)) - { + if ((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL)) { return; } LOCK(&loc->inode->lock); ctx = __ec_inode_get(loc->inode, fop->xl); - if (ctx == NULL) - { + if (ctx == NULL) { __ec_fop_set_error(fop, EIO); goto unlock; } - if (ctx->inode_lock != NULL) - { + if (ctx->inode_lock != NULL) { lock = ctx->inode_lock; + + /* If there's another lock, make sure that it's not the same. Otherwise + * do not insert it. + * + * This can only happen on renames where source and target names are + * in the same directory. */ + if ((fop->lock_count > 0) && (fop->locks[0].lock == lock)) { + /* Combine data/meta updates */ + fop->locks[0].update[EC_DATA_TXN] |= (flags & EC_UPDATE_DATA) != 0; + fop->locks[0].update[EC_METADATA_TXN] |= + (flags & EC_UPDATE_META) != 0; + + /* Only one base inode is allowed per fop, so there shouldn't be + * overwrites here. */ + if (base != NULL) { + fop->locks[0].base = base; + } + + goto update_query; + } + ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p. 
Lock already " "acquired", lock, loc->inode); goto insert; } - lock = ec_lock_allocate(fop->xl, EC_LOCK_INODE, loc); - if (lock == NULL) - { + lock = ec_lock_allocate(fop->xl, loc); + if (lock == NULL) { __ec_fop_set_error(fop, EIO); goto unlock; @@ -797,154 +741,78 @@ void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update) lock->flock.l_type = F_WRLCK; lock->flock.l_whence = SEEK_SET; - lock->plock = &ctx->inode_lock; + lock->ctx = ctx; ctx->inode_lock = lock; insert: - link = ec_lock_insert(fop, lock, update); - + ec_lock_insert(fop, lock, flags, base); +update_query: + lock->query |= (flags & EC_QUERY_INFO) != 0; unlock: UNLOCK(&loc->inode->lock); +} - if (link != NULL) { - ec_resume(link->fop, 0); - } +void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags) +{ + ec_lock_prepare_inode_internal(fop, loc, flags, NULL); } -void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update) +void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, + uint32_t flags) { - loc_t loc; + loc_t tmp, *base = NULL; - if ((fop->parent != NULL) || (fop->error != 0)) - { + if (fop->error != 0) { return; } - if (ec_loc_from_fd(fop->xl, &loc, fd) == 0) - { - ec_lock_prepare_inode(fop, &loc, update); - - loc_wipe(&loc); - } - else - { + if (ec_loc_parent(fop->xl, loc, &tmp) != 0) { ec_fop_set_error(fop, EIO); - } -} - -int32_t ec_locked(call_frame_t * frame, void * cookie, xlator_t * this, - int32_t op_ret, int32_t op_errno, dict_t * xdata) -{ - ec_fop_data_t * fop = cookie; - ec_lock_t * lock = NULL; - - if (op_ret >= 0) - { - lock = fop->data; - lock->mask = fop->good; - lock->acquired = 1; - fop->parent->mask &= fop->good; - fop->parent->locked++; - - ec_trace("LOCKED", fop->parent, "lock=%p", lock); - - ec_lock(fop->parent); + return; } - else - { - gf_log(this->name, GF_LOG_WARNING, "Failed to complete preop lock"); + + if ((flags & EC_INODE_SIZE) != 0) { + base = loc; + flags ^= EC_INODE_SIZE; } - return 0; + 
ec_lock_prepare_inode_internal(fop, &tmp, flags, base); + + loc_wipe(&tmp); } -void ec_lock(ec_fop_data_t * fop) +void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags) { - ec_lock_t * lock; - - while (fop->locked < fop->lock_count) - { - lock = fop->locks[fop->locked].lock; - - LOCK(&lock->loc.inode->lock); - - if (lock->owner != NULL) - { - ec_trace("LOCK_WAIT", fop, "lock=%p", lock); - - list_add_tail(&fop->locks[fop->locked].wait_list, &lock->waiting); - - ec_sleep(fop); - - UNLOCK(&lock->loc.inode->lock); - - break; - } - lock->owner = fop; - - UNLOCK(&lock->loc.inode->lock); - - if (!lock->acquired) - { - ec_owner_set(fop->frame, lock); - - if (lock->kind == EC_LOCK_ENTRY) - { - ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p, path=%s", - lock, lock->loc.inode, lock->loc.path); - - ec_entrylk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, - lock, fop->xl->name, &lock->loc, NULL, - ENTRYLK_LOCK, lock->type, NULL); - } - else - { - ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock, - lock->loc.inode); + loc_t loc; - ec_inodelk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, - lock, fop->xl->name, &lock->loc, F_SETLKW, - &lock->flock, NULL); - } + if (fop->error != 0) { + return; + } - break; - } + if (ec_loc_from_fd(fop->xl, &loc, fd) != 0) { + ec_fop_set_error(fop, EIO); - ec_trace("LOCK_REUSE", fop, "lock=%p", lock); + return; + } - if (lock->have_size) - { - fop->pre_size = fop->post_size = lock->size; - fop->have_size = 1; - } - fop->mask &= lock->good_mask; + ec_lock_prepare_inode_internal(fop, &loc, flags, NULL); - fop->locked++; - } + loc_wipe(&loc); } gf_boolean_t -ec_config_check (ec_fop_data_t *fop, dict_t *xdata) +ec_config_check (ec_fop_data_t *fop, ec_config_t *config) { ec_t *ec; - if (ec_dict_del_config(xdata, EC_XATTR_CONFIG, &fop->config) < 0) { - gf_log(fop->xl->name, GF_LOG_ERROR, "Failed to get a valid " - "config"); - - ec_fop_set_error(fop, EIO); - - return _gf_false; - } - ec = fop->xl->private; - if 
((fop->config.version != EC_CONFIG_VERSION) || - (fop->config.algorithm != EC_CONFIG_ALGORITHM) || - (fop->config.gf_word_size != EC_GF_BITS) || - (fop->config.bricks != ec->nodes) || - (fop->config.redundancy != ec->redundancy) || - (fop->config.chunk_size != EC_METHOD_CHUNK_SIZE)) { + if ((config->version != EC_CONFIG_VERSION) || + (config->algorithm != EC_CONFIG_ALGORITHM) || + (config->gf_word_size != EC_GF_BITS) || + (config->bricks != ec->nodes) || + (config->redundancy != ec->redundancy) || + (config->chunk_size != EC_METHOD_CHUNK_SIZE)) { uint32_t data_bricks; /* This combination of version/algorithm requires the following @@ -957,271 +825,201 @@ ec_config_check (ec_fop_data_t *fop, dict_t *xdata) chunk_size (in bits) must be a multiple of gf_word_size * (bricks - redundancy) */ - data_bricks = fop->config.bricks - fop->config.redundancy; - if ((fop->config.redundancy < 1) || - (fop->config.redundancy * 2 >= fop->config.bricks) || - !ec_is_power_of_2(fop->config.gf_word_size) || - ((fop->config.chunk_size * 8) % (fop->config.gf_word_size * - data_bricks) != 0)) { + data_bricks = config->bricks - config->redundancy; + if ((config->redundancy < 1) || + (config->redundancy * 2 >= config->bricks) || + !ec_is_power_of_2(config->gf_word_size) || + ((config->chunk_size * 8) % (config->gf_word_size * data_bricks) + != 0)) { gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid or corrupted config"); } else { gf_log(fop->xl->name, GF_LOG_ERROR, "Unsupported config " "(V=%u, A=%u, W=%u, " "N=%u, R=%u, S=%u)", - fop->config.version, fop->config.algorithm, - fop->config.gf_word_size, fop->config.bricks, - fop->config.redundancy, fop->config.chunk_size); + config->version, config->algorithm, + config->gf_word_size, config->bricks, + config->redundancy, config->chunk_size); } - ec_fop_set_error(fop, EIO); - return _gf_false; } return _gf_true; } -int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie, - xlator_t * this, int32_t op_ret, - int32_t op_errno, inode_t * 
inode, - struct iatt * buf, dict_t * xdata, - struct iatt * postparent) +int32_t +ec_prepare_update_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - ec_fop_data_t * fop = cookie; - ec_inode_t * ctx; + ec_fop_data_t *fop = cookie, *parent; + ec_lock_link_t *link = fop->data; ec_lock_t *lock = NULL; + ec_inode_t *ctx; - if (op_ret >= 0) - { - if ((buf->ia_type == IA_IFREG) && !ec_config_check(fop, xdata)) { - return 0; - } + lock = link->lock; + parent = link->fop; + ctx = lock->ctx; - LOCK(&inode->lock); + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "Failed to get size and version (error %d: %s)", op_errno, + strerror (op_errno)); - ctx = __ec_inode_get(inode, this); - if (ctx != NULL) { - if (ctx->inode_lock != NULL) { - lock = ctx->inode_lock; - lock->version[0] = fop->answer->version[0]; - lock->version[1] = fop->answer->version[1]; + goto out; + } - if (buf->ia_type == IA_IFREG) { - lock->have_size = 1; - lock->size = buf->ia_size; - } - } - if (ctx->entry_lock != NULL) { - lock = ctx->entry_lock; - lock->version[0] = fop->answer->version[0]; - lock->version[1] = fop->answer->version[1]; - } - } + op_errno = EIO; - UNLOCK(&inode->lock); + LOCK(&lock->loc.inode->lock); - if (lock != NULL) - { - // Only update parent mask if the lookup has been made with - // inode locked. 
- fop->parent->mask &= fop->good; - } + if (ec_dict_del_array(dict, EC_XATTR_VERSION, ctx->pre_version, + EC_VERSION_SIZE) != 0) { + gf_log(this->name, GF_LOG_ERROR, "Unable to get version xattr"); - if (buf->ia_type == IA_IFREG) { - fop->parent->pre_size = fop->parent->post_size = buf->ia_size; - fop->parent->have_size = 1; - } - } - else - { - gf_log(this->name, GF_LOG_WARNING, "Failed to get size and version " - "(error %d)", op_errno); - ec_fop_set_error(fop, op_errno); + goto unlock; } + ctx->post_version[0] += ctx->pre_version[0]; + ctx->post_version[1] += ctx->pre_version[1]; - return 0; -} + ctx->have_version = _gf_true; -gf_boolean_t -ec_is_data_fop (glusterfs_fop_t fop) -{ - switch (fop) { - case GF_FOP_WRITE: - case GF_FOP_TRUNCATE: - case GF_FOP_FTRUNCATE: - case GF_FOP_FALLOCATE: - case GF_FOP_DISCARD: - case GF_FOP_ZEROFILL: - return _gf_true; - default: - return _gf_false; - } - return _gf_false; -} + if (ec_dict_del_array(dict, EC_XATTR_DIRTY, ctx->pre_dirty, + EC_VERSION_SIZE) == 0) { + ctx->post_dirty[0] += ctx->pre_dirty[0]; + ctx->post_dirty[1] += ctx->pre_dirty[1]; -gf_boolean_t -ec_is_metadata_fop (glusterfs_fop_t fop) -{ - switch (fop) { - case GF_FOP_SETATTR: - case GF_FOP_FSETATTR: - case GF_FOP_SETXATTR: - case GF_FOP_FSETXATTR: - case GF_FOP_REMOVEXATTR: - case GF_FOP_FREMOVEXATTR: - return _gf_true; - default: - return _gf_false; - } - return _gf_false; -} + ctx->have_dirty = _gf_true; + } -int32_t -ec_prepare_update_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) -{ - ec_fop_data_t *fop = cookie, *parent; - ec_lock_t *lock = NULL; - uint64_t size = 0; - uint64_t version[EC_VERSION_SIZE] = {0, 0}; + if (lock->loc.inode->ia_type == IA_IFREG) { + if (ec_dict_del_number(dict, EC_XATTR_SIZE, &ctx->pre_size) != 0) { + gf_log(this->name, GF_LOG_ERROR, "Unable to get size xattr"); - if (op_ret >= 0) { - parent = fop->parent; - while ((parent != NULL) && 
(parent->locks[0].lock == NULL)) { - parent = parent->parent; - } - if (parent == NULL) { - return 0; + goto unlock; } + ctx->post_size = ctx->pre_size; - lock = parent->locks[0].lock; - if (ec_is_metadata_fop (fop->parent->id)) - lock->is_dirty[EC_METADATA_TXN] = _gf_true; - else - lock->is_dirty[EC_DATA_TXN] = _gf_true; - - if (lock->loc.inode->ia_type == IA_IFREG) { - if (!ec_config_check(fop, dict) || - (ec_dict_del_number(dict, EC_XATTR_SIZE, &size) != 0)) { - ec_fop_set_error(fop, EIO); - return 0; - } - } + ctx->have_size = _gf_true; - if (ec_dict_del_array(dict, EC_XATTR_VERSION, version, - EC_VERSION_SIZE) != 0) { - ec_fop_set_error(fop, EIO); - return 0; + if ((ec_dict_del_config(dict, EC_XATTR_CONFIG, &ctx->config) != 0) || + !ec_config_check(parent, &ctx->config)) { + gf_log(this->name, GF_LOG_ERROR, "Unable to get config xattr"); + + goto unlock; } - LOCK(&lock->loc.inode->lock); + ctx->have_config = _gf_true; + } - if (lock->loc.inode->ia_type == IA_IFREG) { - lock->size = size; - fop->parent->pre_size = fop->parent->post_size = size; - fop->parent->have_size = lock->have_size = 1; - } - lock->version[0] = version[0]; - lock->version[1] = version[1]; + ctx->have_info = _gf_true; - UNLOCK(&lock->loc.inode->lock); + op_errno = 0; + +unlock: + UNLOCK(&lock->loc.inode->lock); +out: + if (op_errno == 0) { + parent->mask &= fop->good; - fop->parent->mask &= fop->good; /*As of now only data healing marks bricks as healing*/ - if (ec_is_data_fop (fop->parent->id)) - fop->parent->healing |= fop->healing; + if (ec_is_data_fop (parent->id)) { + parent->healing |= fop->healing; + } } else { - gf_log(this->name, GF_LOG_WARNING, - "Failed to get size and version (error %d: %s)", op_errno, - strerror (op_errno)); - ec_fop_set_error(fop, op_errno); + ec_fop_set_error(parent, op_errno); } return 0; } -void ec_get_size_version(ec_fop_data_t * fop) +void ec_get_size_version(ec_lock_link_t *link) { loc_t loc; - dict_t * xdata; + ec_lock_t *lock; + ec_inode_t *ctx; + 
ec_fop_data_t *fop; + dict_t *dict = NULL; uid_t uid; gid_t gid; int32_t error = ENOMEM; uint64_t allzero[EC_VERSION_SIZE] = {0, 0}; - if (fop->have_size) - { + lock = link->lock; + ctx = lock->ctx; + + /* If ec metadata has already been retrieved, do not try again. */ + if (ctx->have_info) { return; } - if ((fop->parent != NULL) && fop->parent->have_size) - { - fop->pre_size = fop->parent->pre_size; - fop->post_size = fop->parent->post_size; - - fop->have_size = 1; - + /* Determine if there's something we need to retrieve for the current + * operation. */ + if (!lock->query && (lock->loc.inode->ia_type != IA_IFREG)) { return; } + + fop = link->fop; + uid = fop->frame->root->uid; gid = fop->frame->root->gid; - fop->frame->root->uid = 0; - fop->frame->root->gid = 0; - memset(&loc, 0, sizeof(loc)); - xdata = dict_new(); - if (xdata == NULL) - { + dict = dict_new(); + if (dict == NULL) { goto out; } - if ((ec_dict_set_array(xdata, EC_XATTR_VERSION, - allzero, EC_VERSION_SIZE) != 0) || - (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) || - (ec_dict_set_number(xdata, EC_XATTR_CONFIG, 0) != 0) || - (ec_dict_set_array(xdata, EC_XATTR_DIRTY, allzero, - EC_VERSION_SIZE) != 0)) - { + + /* Once we know that an xattrop will be needed, we try to get all available + * information in a single call. 
*/ + if ((ec_dict_set_array(dict, EC_XATTR_VERSION, allzero, + EC_VERSION_SIZE) != 0) || + (ec_dict_set_array(dict, EC_XATTR_DIRTY, allzero, + EC_VERSION_SIZE) != 0)) { goto out; } - error = EIO; + if (lock->loc.inode->ia_type == IA_IFREG) { + if ((ec_dict_set_number(dict, EC_XATTR_SIZE, 0) != 0) || + (ec_dict_set_number(dict, EC_XATTR_CONFIG, 0) != 0)) { + goto out; + } + } - if (!fop->use_fd) - { - if (ec_loc_from_loc(fop->xl, &loc, &fop->loc[0]) != 0) - { + fop->frame->root->uid = 0; + fop->frame->root->gid = 0; + + /* For normal fops, ec_[f]xattrop() must succeed on at least + * EC_MINIMUM_MIN bricks, however when this is called as part of a + * self-heal operation the mask of target bricks (fop->mask) could + * contain less than EC_MINIMUM_MIN bricks, causing the lookup to + * always fail. Thus we always use the same minimum used for the main + * fop. + */ + if (lock->fd == NULL) { + if (ec_loc_from_loc(fop->xl, &loc, &lock->loc) != 0) { goto out; } - if (gf_uuid_is_null(loc.pargfid)) - { - if (loc.parent != NULL) - { + if (gf_uuid_is_null(loc.pargfid)) { + if (loc.parent != NULL) { inode_unref(loc.parent); loc.parent = NULL; } GF_FREE((char *)loc.path); - loc.path = NULL; - loc.name = NULL; + loc.path = NULL; + loc.name = NULL; } - /* For normal fops, ec_lookup() must succeed on at least EC_MINIMUM_MIN - * bricks, however when this is called as part of a self-heal operation - * the mask of target bricks (fop->mask) could contain less than - * EC_MINIMUM_MIN bricks, causing the lookup to always fail. Thus we - * always use the same minimum used for the main fop. 
- */ - ec_lookup(fop->frame, fop->xl, fop->mask, fop->minimum, - ec_get_size_version_set, NULL, &loc, xdata); + + ec_xattrop (fop->frame, fop->xl, fop->mask, fop->minimum, + ec_prepare_update_cbk, link, &loc, + GF_XATTROP_ADD_ARRAY64, dict, NULL); } else { - if (ec_loc_from_fd(fop->xl, &loc, fop->fd) != 0) { - goto out; - } ec_fxattrop(fop->frame, fop->xl, fop->mask, fop->minimum, - ec_prepare_update_cbk, NULL, fop->fd, - GF_XATTROP_ADD_ARRAY64, xdata, NULL); + ec_prepare_update_cbk, link, lock->fd, + GF_XATTROP_ADD_ARRAY64, dict, NULL); } + error = 0; out: @@ -1230,9 +1028,8 @@ out: loc_wipe(&loc); - if (xdata != NULL) - { - dict_unref(xdata); + if (dict != NULL) { + dict_unref(dict); } if (error != 0) { @@ -1240,147 +1037,408 @@ out: } } -void ec_prepare_update(ec_fop_data_t *fop) +gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, + uint64_t *size) { - loc_t loc; - dict_t *xdata; - ec_fop_data_t *tmp; - ec_lock_t *lock; - ec_t *ec; - uid_t uid; - gid_t gid; - uint64_t version[2] = {0, 0}; - uint64_t dirty[2] = {0, 0}; - int32_t error = ENOMEM; + ec_inode_t *ctx; + gf_boolean_t found = _gf_false; - tmp = fop; - while ((tmp != NULL) && (tmp->locks[0].lock == NULL)) { - tmp = tmp->parent; - } - if ((tmp != NULL) && - (tmp->locks[0].lock->is_dirty[0] || tmp->locks[0].lock->is_dirty[1])) { - lock = tmp->locks[0].lock; + LOCK(&inode->lock); - fop->pre_size = fop->post_size = lock->size; - fop->have_size = 1; + ctx = __ec_inode_get(inode, fop->xl); + if (ctx == NULL) { + goto unlock; + } - return; + if (ctx->have_size) { + *size = ctx->post_size; + found = _gf_true; } - uid = fop->frame->root->uid; - gid = fop->frame->root->gid; - fop->frame->root->uid = 0; - fop->frame->root->gid = 0; +unlock: + UNLOCK(&inode->lock); - memset(&loc, 0, sizeof(loc)); + return found; +} - ec = fop->xl->private; - if (ec_bits_count (fop->mask) >= ec->fragments) { - /* It is changing data only if the update happens on at least - * fragment number of bricks. 
Otherwise it probably is healing*/ - if (ec_is_metadata_fop (fop->id)) - dirty[EC_METADATA_TXN] = 1; - else - dirty[EC_DATA_TXN] = 1; +gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, + uint64_t size) +{ + ec_inode_t *ctx; + gf_boolean_t found = _gf_false; + + LOCK(&inode->lock); + + ctx = __ec_inode_get(inode, fop->xl); + if (ctx == NULL) { + goto unlock; } - xdata = dict_new(); - if (xdata == NULL) { - goto out; + /* Normal fops always have ctx->have_size set. However self-heal calls this + * to prepare the inode, so ctx->have_size will be false. In this case we + * prepare both pre_size and post_size, and set have_size and have_info to + * true. */ + if (!ctx->have_size) { + ctx->pre_size = size; + ctx->have_size = ctx->have_info = _gf_true; } - if ((ec_dict_set_array(xdata, EC_XATTR_VERSION, - version, EC_VERSION_SIZE) != 0) || - (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) || - (ec_dict_set_number(xdata, EC_XATTR_CONFIG, 0) != 0) || - (ec_dict_set_array(xdata, EC_XATTR_DIRTY, dirty, - EC_VERSION_SIZE) != 0)) { - goto out; + ctx->post_size = size; + + found = _gf_true; + +unlock: + UNLOCK(&inode->lock); + + return found; +} + +void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode) +{ + ec_inode_t *ctx; + + LOCK(&inode->lock); + + ctx = __ec_inode_get(inode, fop->xl); + if (ctx == NULL) { + goto unlock; } - error = EIO; + ctx->have_info = _gf_false; + ctx->have_config = _gf_false; + ctx->have_version = _gf_false; + ctx->have_size = _gf_false; + ctx->have_dirty = _gf_false; - if (!fop->use_fd) { - if (ec_loc_from_loc(fop->xl, &loc, &fop->loc[0]) != 0) { - goto out; - } + memset(&ctx->config, 0, sizeof(ctx->config)); + memset(ctx->pre_version, 0, sizeof(ctx->pre_version)); + memset(ctx->post_version, 0, sizeof(ctx->post_version)); + ctx->pre_size = ctx->post_size = 0; + memset(ctx->pre_dirty, 0, sizeof(ctx->pre_dirty)); + memset(ctx->post_dirty, 0, sizeof(ctx->post_dirty)); + +unlock: + UNLOCK(&inode->lock); +} - 
ec_xattrop(fop->frame, fop->xl, fop->mask, fop->minimum, - ec_prepare_update_cbk, NULL, &loc, GF_XATTROP_ADD_ARRAY64, - xdata, NULL); +int32_t ec_get_real_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + ec_fop_data_t *fop = cookie; + ec_lock_link_t *link; + + if (op_ret >= 0) { + link = fop->data; + if (ec_dict_del_number(xdata, EC_XATTR_SIZE, &link->size) != 0) { + gf_log(this->name, GF_LOG_WARNING, + "Unable to determine real file size"); + } } else { - ec_fxattrop(fop->frame, fop->xl, fop->mask, fop->minimum, - ec_prepare_update_cbk, NULL, fop->fd, - GF_XATTROP_ADD_ARRAY64, xdata, NULL); + /* Prevent failure of parent fop. */ + fop->error = 0; } - error = 0; + return 0; +} -out: +/* This function is used to get the trusted.ec.size xattr from a file when + * no lock is needed on the inode. This is only required to maintan iatt + * structs on fops that manipulate directory entries but do not operate + * directly on the inode, like link, rename, ... + * + * Any error processing this request is ignored. In the worst case, an invalid + * or not up to date value in the iatt could cause some cache invalidation. + */ +void ec_get_real_size(ec_lock_link_t *link) +{ + ec_fop_data_t *fop; + dict_t *xdata; - fop->frame->root->uid = uid; - fop->frame->root->gid = gid; + if (link->base == NULL) { + return; + } - loc_wipe(&loc); + if (link->base->inode->ia_type != IA_IFREG) { + return; + } + fop = link->fop; + + if (ec_get_inode_size(fop, link->base->inode, &link->size)) { + return; + } + + xdata = dict_new(); + if (xdata == NULL) { + return; + } + if (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) { + goto out; + } + + /* Send a simple lookup. A single answer is considered ok since this value + * is only used to return an iatt struct related to an inode that is not + * locked and have not suffered any operation. 
*/ + ec_lookup(fop->frame, fop->xl, fop->mask, 1, ec_get_real_size_cbk, link, + link->base, xdata); + +out: if (xdata != NULL) { dict_unref(xdata); } +} - if (error != 0) { - ec_fop_set_error(fop, error); +void ec_lock_acquired(ec_lock_link_t *link) +{ + ec_lock_t *lock; + ec_fop_data_t *fop; + + lock = link->lock; + fop = link->fop; + + ec_trace("LOCKED", link->fop, "lock=%p", lock); + + /* If the fop has an fd available, attach it to the lock structure to be + * able to do fxattrop calls instead of xattrop. It's safe to change this + * here because no xattrop using the fd can start concurrently at this + * point. */ + if (fop->use_fd) { + if (lock->fd != NULL) { + fd_unref(lock->fd); + } + lock->fd = fd_ref(fop->fd); } + lock->acquired = _gf_true; + + fop->mask &= lock->good_mask; + + fop->locked++; + + ec_get_size_version(link); + ec_get_real_size(link); } -int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +int32_t ec_locked(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { ec_fop_data_t *fop = cookie; + ec_lock_link_t *link = NULL; + ec_lock_t *lock = NULL; - if (op_ret < 0) { - gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)", - ec_fop_name(fop->parent->id)); + if (op_ret >= 0) { + link = fop->data; + lock = link->lock; + lock->mask = lock->good_mask = fop->good; + + ec_lock_acquired(link); + ec_lock(fop->parent); } else { - ec_trace("UNLOCKED", fop->parent, "lock=%p", fop->data); + gf_log(this->name, GF_LOG_WARNING, "Failed to complete preop lock"); } return 0; } -void ec_unlock_lock(ec_fop_data_t *fop, ec_lock_t *lock) +gf_boolean_t ec_lock_acquire(ec_lock_link_t *link) { - if ((lock->mask != 0) && lock->acquired) { + ec_lock_t *lock; + ec_fop_data_t *fop; + + lock = link->lock; + fop = link->fop; + if (!lock->acquired) { ec_owner_set(fop->frame, lock); - switch (lock->kind) { - case EC_LOCK_ENTRY: - 
ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s", lock, - lock->loc.inode, lock->loc.path); + ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock, + lock->loc.inode); + + lock->flock.l_type = F_WRLCK; + ec_inodelk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, + link, fop->xl->name, &lock->loc, F_SETLKW, &lock->flock, + NULL); + + return _gf_false; + } + + ec_trace("LOCK_REUSE", fop, "lock=%p", lock); + + ec_lock_acquired(link); + + return _gf_true; +} + +void ec_lock(ec_fop_data_t *fop) +{ + ec_lock_link_t *link; + ec_lock_link_t *timer_link = NULL; + ec_lock_t *lock; + + /* There is a chance that ec_resume is called on fop even before ec_sleep. + * Which can result in refs == 0 for fop leading to use after free in this + * function when it calls ec_sleep so do ec_sleep at start and end of this + * function.*/ + ec_sleep (fop); + while (fop->locked < fop->lock_count) { + /* Since there are only up to 2 locks per fop, this xor will change + * the order of the locks if fop->first_lock is 1. */ + link = &fop->locks[fop->locked ^ fop->first_lock]; + lock = link->lock; + + timer_link = NULL; + + LOCK(&lock->loc.inode->lock); + GF_ASSERT (lock->inserted > 0); + lock->inserted--; + + if (lock->timer != NULL) { + GF_ASSERT (lock->release == _gf_false); + timer_link = lock->timer->data; + ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); + gf_timer_call_cancel(fop->xl->ctx, lock->timer); + lock->timer = NULL; + + lock->refs--; + /* There should remain at least 1 ref, the current one. 
*/ + GF_ASSERT(lock->refs > 0); + } + + GF_ASSERT(list_empty(&link->wait_list)); + + if ((lock->owner != NULL) || lock->release) { + if (lock->release) { + ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock); - ec_entrylk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, - ec_unlocked, lock, fop->xl->name, &lock->loc, NULL, - ENTRYLK_UNLOCK, lock->type, NULL); + list_add_tail(&link->wait_list, &lock->frozen); + + /* The lock is frozen, so we move the current reference to + * refs_frozen. After that, there should remain at least one + * ref belonging to the lock that is processing the release. */ + lock->refs--; + GF_ASSERT(lock->refs > 0); + lock->refs_frozen++; + } else { + ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock); + + list_add_tail(&link->wait_list, &lock->waiting); + } + + UNLOCK(&lock->loc.inode->lock); + + ec_sleep(fop); break; + } - case EC_LOCK_INODE: - lock->flock.l_type = F_UNLCK; - ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock, - lock->loc.inode); + lock->owner = fop; - ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, - ec_unlocked, lock, fop->xl->name, &lock->loc, F_SETLK, - &lock->flock, NULL); + UNLOCK(&lock->loc.inode->lock); + if (!ec_lock_acquire(link)) { break; + } - default: - gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock type"); + if (timer_link != NULL) { + ec_resume(timer_link->fop, 0); + timer_link = NULL; } } + ec_resume (fop, 0); + + if (timer_link != NULL) { + ec_resume(timer_link->fop, 0); + } +} + +void +ec_lock_unfreeze(ec_lock_link_t *link) +{ + ec_lock_t *lock; - ec_trace("LOCK_DESTROY", fop, "lock=%p", lock); + lock = link->lock; + + LOCK(&lock->loc.inode->lock); + + lock->acquired = _gf_false; + lock->release = _gf_false; + + lock->refs--; + GF_ASSERT (lock->refs == lock->inserted); + + GF_ASSERT(list_empty(&lock->waiting) && (lock->owner == NULL)); + + list_splice_init(&lock->frozen, &lock->waiting); + lock->refs += lock->refs_frozen; + lock->refs_frozen = 0; + + if (!list_empty(&lock->waiting)) { 
+ link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list); + list_del_init(&link->wait_list); + + lock->owner = link->fop; + + UNLOCK(&lock->loc.inode->lock); + + ec_trace("LOCK_UNFREEZE", link->fop, "lock=%p", lock); + + if (ec_lock_acquire(link)) { + ec_lock(link->fop); + } + ec_resume(link->fop, 0); + } else if (lock->refs == 0) { + ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock); + + lock->ctx->inode_lock = NULL; + + UNLOCK(&lock->loc.inode->lock); - ec_lock_destroy(lock); + ec_lock_destroy(lock); + } else { + UNLOCK(&lock->loc.inode->lock); + } +} + +int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + ec_lock_link_t *link = fop->data; + + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)", + ec_fop_name(link->fop->id)); + } else { + ec_trace("UNLOCKED", link->fop, "lock=%p", link->lock); + } + + ec_lock_unfreeze(link); + + return 0; +} + +void ec_unlock_lock(ec_lock_link_t *link) +{ + ec_lock_t *lock; + ec_fop_data_t *fop; + + lock = link->lock; + fop = link->fop; + + ec_clear_inode_info(fop, lock->loc.inode); + + if ((lock->mask != 0) && lock->acquired) { + ec_owner_set(fop->frame, lock); + + lock->flock.l_type = F_UNLCK; + ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock, + lock->loc.inode); + + ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, + ec_unlocked, link, fop->xl->name, &lock->loc, F_SETLK, + &lock->flock, NULL); + } else { + ec_lock_unfreeze(link); + } } int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie, @@ -1388,111 +1446,128 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie, int32_t op_errno, dict_t * xattr, dict_t * xdata) { - ec_fop_data_t * fop = cookie; + ec_fop_data_t *fop = cookie; + ec_lock_link_t *link; + ec_lock_t *lock; + ec_inode_t *ctx; - if (op_ret < 0) - { + if (op_ret < 0) { gf_log(fop->xl->name, 
GF_LOG_ERROR, "Failed to update version and " "size (error %d)", op_errno); - } - else - { + } else { fop->parent->mask &= fop->good; + link = fop->data; + lock = link->lock; + ctx = lock->ctx; + + if (ec_dict_del_array(xattr, EC_XATTR_VERSION, ctx->post_version, + EC_VERSION_SIZE) == 0) { + ctx->pre_version[0] = ctx->post_version[0]; + ctx->pre_version[1] = ctx->post_version[1]; + + ctx->have_version = _gf_true; + } + if (ec_dict_del_number(xattr, EC_XATTR_SIZE, &ctx->post_size) == 0) { + ctx->pre_size = ctx->post_size; + + ctx->have_size = _gf_true; + } + if (ec_dict_del_array(xattr, EC_XATTR_DIRTY, ctx->post_dirty, + EC_VERSION_SIZE) == 0) { + ctx->pre_dirty[0] = ctx->post_dirty[0]; + ctx->pre_dirty[1] = ctx->post_dirty[1]; + + ctx->have_dirty = _gf_true; + } + if ((ec_dict_del_config(xdata, EC_XATTR_CONFIG, &ctx->config) == 0) && + ec_config_check(fop->parent, &ctx->config)) { + ctx->have_config = _gf_true; + } + + ctx->have_info = _gf_true; } - if (fop->data != NULL) { - ec_unlock_lock(fop->parent, fop->data); + if ((fop->parent->id != GF_FOP_FLUSH) && + (fop->parent->id != GF_FOP_FSYNC) && + (fop->parent->id != GF_FOP_FSYNCDIR)) { + ec_unlock_lock(fop->data); } return 0; } -uint64_t -ec_get_dirty_value (ec_t *ec, uintptr_t fop_mask, uint64_t version_delta, - gf_boolean_t dirty) -{ - uint64_t dirty_val = 0; - - if (version_delta) { - if (~fop_mask & ec->node_mask) { - /* fop didn't succeed on all subvols so 'dirty' xattr - * shouldn't be cleared */ - if (!dirty) - dirty_val = 1; - } else { - /* fop succeed on all subvols so 'dirty' xattr - * should be cleared */ - if (dirty) - dirty_val = -1; - } - } - return dirty_val; -} - void -ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version[2], - uint64_t size, gf_boolean_t dirty[2], ec_lock_t *lock) +ec_update_size_version(ec_lock_link_t *link, uint64_t *version, + uint64_t size, uint64_t *dirty) { - ec_t *ec = fop->xl->private; + ec_fop_data_t *fop; + ec_lock_t *lock; + ec_inode_t *ctx; dict_t * 
dict; uid_t uid; gid_t gid; - uint64_t dirty_values[2] = {0}; - int i = 0; - if (fop->parent != NULL) - { - fop->parent->post_size = fop->post_size; + fop = link->fop; - return; - } - - ec_trace("UPDATE", fop, "version=%ld, size=%ld, dirty=%u", version, size, - dirty); + ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld", + version[0], version[1], size, dirty[0], dirty[1]); dict = dict_new(); - if (dict == NULL) - { + if (dict == NULL) { goto out; } - if (version[0] != 0 || version[1] != 0) { + lock = link->lock; + ctx = lock->ctx; + + /* If we don't have version information or it has been modified, we + * update it. */ + if (!ctx->have_version || (version[0] != 0) || (version[1] != 0)) { if (ec_dict_set_array(dict, EC_XATTR_VERSION, version, EC_VERSION_SIZE) != 0) { goto out; } } + if (size != 0) { + /* If size has been changed, we should already know the previous size + * of the file. */ + GF_ASSERT(ctx->have_size); + if (ec_dict_set_number(dict, EC_XATTR_SIZE, size) != 0) { goto out; } } - for (i = 0; i < sizeof (dirty_values)/sizeof (dirty_values[0]); i++) { - dirty_values[i] = ec_get_dirty_value (ec, fop->mask, version[i], - dirty[i]); - } - - if (dirty_values[0] || dirty_values[1]) { - if (ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty_values, + /* If we don't have dirty information or it has been modified, we update + * it. */ + if (!ctx->have_dirty || (dirty[0] != 0) || (dirty[1] != 0)) { + if (ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE) != 0) { goto out; } } + /* If config information is not know, we request it now. */ + if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) { + /* A failure requesting this xattr is ignored because it's not + * absolutely required right now. 
*/ + ec_dict_set_number(dict, EC_XATTR_CONFIG, 0); + } + uid = fop->frame->root->uid; gid = fop->frame->root->gid; fop->frame->root->uid = 0; fop->frame->root->gid = 0; - if (fop->use_fd) { - ec_fxattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, - ec_update_size_version_done, lock, fop->fd, + if (link->lock->fd == NULL) { + ec_xattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, + ec_update_size_version_done, link, &link->lock->loc, GF_XATTROP_ADD_ARRAY64, dict, NULL); } else { - ec_xattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, - ec_update_size_version_done, lock, loc, + ec_fxattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, + ec_update_size_version_done, link, link->lock->fd, GF_XATTROP_ADD_ARRAY64, dict, NULL); } @@ -1504,8 +1579,7 @@ ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version[2], return; out: - if (dict != NULL) - { + if (dict != NULL) { dict_unref(dict); } @@ -1514,46 +1588,99 @@ out: gf_log(fop->xl->name, GF_LOG_ERROR, "Unable to update version and size"); } -void ec_unlock_now(ec_fop_data_t *fop, ec_lock_t *lock) +gf_boolean_t +ec_update_info(ec_lock_link_t *link) { - ec_trace("UNLOCK_NOW", fop, "lock=%p", lock); + ec_lock_t *lock; + ec_inode_t *ctx; + uint64_t version[2]; + uint64_t dirty[2]; + uint64_t size; - if ((lock->version_delta[0] != 0) || (lock->version_delta[1] != 0) || - lock->is_dirty[0] || lock->is_dirty[1]) { - ec_update_size_version(fop, &lock->loc, lock->version_delta, - lock->size_delta, lock->is_dirty, lock); - } else { - ec_unlock_lock(fop, lock); + lock = link->lock; + ctx = lock->ctx; + + /* pre_version[*] will be 0 if have_version is false */ + version[0] = ctx->post_version[0] - ctx->pre_version[0]; + version[1] = ctx->post_version[1] - ctx->pre_version[1]; + + size = ctx->post_size - ctx->pre_size; + + /* pre_dirty[*] will be 0 if have_dirty is false */ + dirty[0] = ctx->post_dirty[0] - ctx->pre_dirty[0]; + dirty[1] = ctx->post_dirty[1] - ctx->pre_dirty[1]; + + if ((version[0] != 0) 
|| (version[1] != 0) || + (dirty[0] != 0) || (dirty[1] != 0)) { + ec_update_size_version(link, version, size, dirty); + + return _gf_true; } - ec_resume(fop, 0); + return _gf_false; } -void ec_unlock_timer_cbk(void *data) +void +ec_unlock_now(ec_lock_link_t *link) { - ec_lock_link_t *link = data; - ec_lock_t *lock = link->lock; - ec_fop_data_t *fop = NULL; + ec_trace("UNLOCK_NOW", link->fop, "lock=%p", link->lock); - LOCK(&lock->loc.inode->lock); + if (!ec_update_info(link)) { + ec_unlock_lock(link); + } - if (lock->timer != NULL) { - fop = link->fop; + ec_resume(link->fop, 0); +} - ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock); +void +ec_unlock_timer_del(ec_lock_link_t *link) +{ + int32_t before = 0; + ec_lock_t *lock; + inode_t *inode; + gf_boolean_t now = _gf_false; + + lock = link->lock; + + /* A race condition can happen if timer expires, calls this function + * and the lock is released (lock->loc is wiped) but the fop is not + * fully completed yet (it's still on the list of pending fops). In + * this case, this function can also be called if ec_unlock_force() is + * called. 
*/ + inode = lock->loc.inode; + if (inode == NULL) { + return; + } - GF_ASSERT(lock->refs == 1); + LOCK(&inode->lock); - gf_timer_call_cancel(fop->xl->ctx, lock->timer); - lock->timer = NULL; - *lock->plock = NULL; - } + if (lock->timer != NULL) { + ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock); - UNLOCK(&lock->loc.inode->lock); + gf_timer_call_cancel(link->fop->xl->ctx, lock->timer); + lock->timer = NULL; - if (fop != NULL) { - ec_unlock_now(fop, lock); - } + lock->release = now = _gf_true; + + before = lock->refs + lock->refs_frozen; + list_splice_init(&lock->waiting, &lock->frozen); + lock->refs_frozen += lock->refs - lock->inserted - 1; + lock->refs = 1 + lock->inserted; + /* We moved around the locks, so total number of locks shouldn't + * change by this operation*/ + GF_ASSERT (before == (lock->refs + lock->refs_frozen)); + } + + UNLOCK(&inode->lock); + + if (now) { + ec_unlock_now(link); + } +} + +void ec_unlock_timer_cbk(void *data) +{ + ec_unlock_timer_del(data); } void ec_unlock_timer_add(ec_lock_link_t *link) @@ -1561,28 +1688,28 @@ void ec_unlock_timer_add(ec_lock_link_t *link) struct timespec delay; ec_fop_data_t *fop = link->fop; ec_lock_t *lock = link->lock; - int32_t refs = 1; + gf_boolean_t now = _gf_false; LOCK(&lock->loc.inode->lock); GF_ASSERT(lock->timer == NULL); - if (lock->refs != 1) { + if ((lock->refs - lock->inserted) > 1) { ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock); lock->refs--; UNLOCK(&lock->loc.inode->lock); } else if (lock->acquired) { - delay.tv_sec = 1; - delay.tv_nsec = 0; + ec_t *ec = fop->xl->private; ec_sleep(fop); - /* If healing is needed, do not delay lock release to let self-heal - * start working as soon as possible. */ - if (!ec_fop_needs_heal(fop)) { - ec_trace("UNLOCK_DELAY", fop, "lock=%p", lock); + /* If healing is needed, the lock needs to be released due to + * contention, or ec is shutting down, do not delay lock release. 
*/ + if (!lock->release && !ec_fop_needs_heal(fop) && !ec->shutdown) { + ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock, + lock->release); delay.tv_sec = 1; delay.tv_nsec = 0; @@ -1592,26 +1719,25 @@ void ec_unlock_timer_add(ec_lock_link_t *link) gf_log(fop->xl->name, GF_LOG_WARNING, "Unable to delay an " "unlock"); - *lock->plock = NULL; - refs = 0; + lock->release = now = _gf_true; } } else { - ec_trace("UNLOCK_FORCE", fop, "lock=%p", lock); - *lock->plock = NULL; - refs = 0; + ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock, + lock->release); + lock->release = now = _gf_true; } UNLOCK(&lock->loc.inode->lock); - if (refs == 0) { - ec_unlock_now(fop, lock); + if (now) { + ec_unlock_now(link); } } else { - *lock->plock = NULL; + lock->release = _gf_true; UNLOCK(&lock->loc.inode->lock); - ec_lock_destroy(lock); + ec_lock_unfreeze(link); } } @@ -1624,52 +1750,60 @@ void ec_unlock(ec_fop_data_t *fop) } } -void ec_flush_size_version(ec_fop_data_t * fop) +void +ec_unlock_force(ec_fop_data_t *fop) { - ec_lock_t * lock; - uint64_t version[2], delta; - gf_boolean_t dirty[2] = {_gf_false, _gf_false}; + int32_t i; - GF_ASSERT(fop->lock_count == 1); + for (i = 0; i < fop->lock_count; i++) { + ec_trace("UNLOCK_FORCED", fop, "lock=%p", &fop->locks[i]); - lock = fop->locks[0].lock; - - LOCK(&lock->loc.inode->lock); - - GF_ASSERT(lock->owner == fop); - - version[0] = lock->version_delta[0]; - version[1] = lock->version_delta[1]; - dirty[0] = lock->is_dirty[0]; - dirty[1] = lock->is_dirty[1]; - delta = lock->size_delta; - lock->version_delta[0] = 0; - lock->version_delta[1] = 0; - lock->size_delta = 0; - lock->is_dirty[0] = _gf_false; - lock->is_dirty[1] = _gf_false; + ec_unlock_timer_del(&fop->locks[i]); + } +} - UNLOCK(&lock->loc.inode->lock); +void ec_flush_size_version(ec_fop_data_t *fop) +{ + GF_ASSERT(fop->lock_count == 1); - if (version[0] > 0 || version[1] > 0 || dirty[0] || dirty[1]) - { - ec_update_size_version(fop, &lock->loc, version, delta, dirty, 
- NULL); - } + ec_update_info(&fop->locks[0]); } void ec_lock_reuse(ec_fop_data_t *fop) { - ec_fop_data_t * wait_fop; - ec_lock_t * lock; - ec_lock_link_t * link; - int32_t i; + ec_t *ec; + ec_cbk_data_t *cbk; + ec_lock_t *lock; + ec_lock_link_t *link; + ec_inode_t *ctx; + int32_t i, count; + gf_boolean_t release = _gf_false; + + cbk = fop->answer; + if (cbk != NULL) { + if (cbk->xdata != NULL) { + if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT, + &count) == 0) && (count > 1)) { + release = _gf_true; + } + if (release) { + gf_log(fop->xl->name, GF_LOG_DEBUG, + "Lock contention detected"); + } + } + } else { + /* If we haven't get an answer with enough quorum, we always release + * the lock. */ + release = _gf_true; + } + + ec = fop->xl->private; for (i = 0; i < fop->lock_count; i++) { - wait_fop = NULL; - - lock = fop->locks[i].lock; + link = &fop->locks[i]; + lock = link->lock; + ctx = lock->ctx; LOCK(&lock->loc.inode->lock); @@ -1677,47 +1811,42 @@ void ec_lock_reuse(ec_fop_data_t *fop) GF_ASSERT(lock->owner == fop); lock->owner = NULL; + lock->release |= release; - if (((fop->locks_update >> i) & 1) != 0) { - if (fop->error == 0) - { - if (ec_is_metadata_fop (fop->id)) { - lock->version_delta[1]++; - } else { - lock->version_delta[0]++; - } - lock->size_delta += fop->post_size - fop->pre_size; - if (fop->have_size) { - lock->size = fop->post_size; - lock->have_size = 1; + if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) { + if (link->update[0]) { + ctx->post_version[0]++; + if (ec->node_mask & ~fop->mask) { + ctx->post_dirty[0]++; + } + } + if (link->update[1]) { + ctx->post_version[1]++; + if (ec->node_mask & ~fop->mask) { + ctx->post_dirty[1]++; } } } lock->good_mask &= fop->mask; + link = NULL; if (!list_empty(&lock->waiting)) { link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list); list_del_init(&link->wait_list); - wait_fop = link->fop; - - if (lock->kind == EC_LOCK_INODE) - { - wait_fop->pre_size = wait_fop->post_size = 
fop->post_size; - wait_fop->have_size = fop->have_size; - } - wait_fop->mask &= fop->mask; + lock->owner = link->fop; } UNLOCK(&lock->loc.inode->lock); - if (wait_fop != NULL) + if (link != NULL) { - ec_lock(wait_fop); - - ec_resume(wait_fop, 0); + if (ec_lock_acquire(link)) { + ec_lock(link->fop); + } + ec_resume(link->fop, 0); } } } diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index 9e0aaa0f079..c0db0218699 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -20,6 +20,9 @@ typedef enum { EC_METADATA_TXN } ec_txn_t; +#define EC_FOP_HEAL -1 +#define EC_FOP_FHEAL -2 + #define EC_CONFIG_VERSION 0 #define EC_CONFIG_ALGORITHM 0 @@ -35,19 +38,20 @@ typedef enum { #define EC_MINIMUM_MIN -2 #define EC_MINIMUM_ALL -3 -#define EC_LOCK_ENTRY 0 -#define EC_LOCK_INODE 1 +#define EC_UPDATE_DATA 1 +#define EC_UPDATE_META 2 +#define EC_QUERY_INFO 4 +#define EC_INODE_SIZE 8 #define EC_STATE_START 0 #define EC_STATE_END 0 #define EC_STATE_INIT 1 #define EC_STATE_LOCK 2 -#define EC_STATE_GET_SIZE_AND_VERSION 3 -#define EC_STATE_DISPATCH 4 -#define EC_STATE_PREPARE_ANSWER 5 -#define EC_STATE_REPORT 6 -#define EC_STATE_LOCK_REUSE 7 -#define EC_STATE_UNLOCK 8 +#define EC_STATE_DISPATCH 3 +#define EC_STATE_PREPARE_ANSWER 4 +#define EC_STATE_REPORT 5 +#define EC_STATE_LOCK_REUSE 6 +#define EC_STATE_UNLOCK 7 #define EC_STATE_DELAYED_START 100 @@ -81,15 +85,21 @@ void ec_update_bad(ec_fop_data_t * fop, uintptr_t good); void ec_fop_set_error(ec_fop_data_t * fop, int32_t error); -void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update); -void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update); -void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update); +void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags); +void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, + uint32_t flags); +void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t 
*fd, uint32_t flags); void ec_lock(ec_fop_data_t * fop); void ec_lock_reuse(ec_fop_data_t *fop); void ec_unlock(ec_fop_data_t * fop); +void ec_unlock_force(ec_fop_data_t *fop); + +gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, + uint64_t *size); +gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, + uint64_t size); +void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode); -void ec_get_size_version(ec_fop_data_t * fop); -void ec_prepare_update(ec_fop_data_t *fop); void ec_flush_size_version(ec_fop_data_t * fop); void ec_dispatch_all(ec_fop_data_t * fop); diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c index fb47aea90a8..047ccd5ff31 100644 --- a/xlators/cluster/ec/src/ec-data.c +++ b/xlators/cluster/ec/src/ec-data.c @@ -96,6 +96,19 @@ void ec_cbk_data_destroy(ec_cbk_data_t * cbk) mem_put(cbk); } +/* PARENT_DOWN will be notified to children only after these fops are complete + * when graph switch happens. We do not want graph switch to be waiting on + * heal to complete as healing big file/directory could take a while. Which + * will lead to hang on the mount. 
+ */ +static inline gf_boolean_t +ec_needs_graceful_completion (ec_fop_data_t *fop) +{ + if ((fop->id != EC_FOP_HEAL) && (fop->id != EC_FOP_FHEAL)) + return _gf_true; + return _gf_false; +} + ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this, int32_t id, uint32_t flags, uintptr_t target, int32_t minimum, @@ -114,6 +127,12 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this, return NULL; } + INIT_LIST_HEAD(&fop->cbk_list); + INIT_LIST_HEAD(&fop->answer_list); + INIT_LIST_HEAD(&fop->pending_list); + INIT_LIST_HEAD(&fop->locks[0].wait_list); + INIT_LIST_HEAD(&fop->locks[1].wait_list); + fop->xl = this; fop->req_frame = frame; @@ -148,9 +167,6 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this, fop->minimum = minimum; fop->mask = target; - INIT_LIST_HEAD(&fop->cbk_list); - INIT_LIST_HEAD(&fop->answer_list); - fop->wind = wind; fop->handler = handler; fop->cbks = cbks; @@ -165,17 +181,20 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this, parent = frame->local; if (parent != NULL) { - LOCK(&parent->lock); - - parent->jobs++; - parent->refs++; - - UNLOCK(&parent->lock); + ec_sleep(parent); } fop->parent = parent; } + if (ec_needs_graceful_completion (fop)) { + LOCK(&ec->lock); + + list_add_tail(&fop->pending_list, &ec->pending_fops); + + UNLOCK(&ec->lock); + } + return fop; } @@ -190,10 +209,41 @@ void ec_fop_data_acquire(ec_fop_data_t * fop) UNLOCK(&fop->lock); } +static void +ec_handle_last_pending_fop_completion (ec_fop_data_t *fop, gf_boolean_t *notify) +{ + ec_t *ec = fop->xl->private; + + if (!list_empty (&fop->pending_list)) { + LOCK(&ec->lock); + { + list_del_init (&fop->pending_list); + *notify = list_empty (&ec->pending_fops); + } + UNLOCK(&ec->lock); + } +} + +void +ec_fop_cleanup(ec_fop_data_t *fop) +{ + ec_cbk_data_t *cbk, *tmp; + + list_for_each_entry_safe(cbk, tmp, &fop->answer_list, answer_list) { + list_del_init(&cbk->answer_list); + + 
ec_cbk_data_destroy(cbk); + } + INIT_LIST_HEAD(&fop->cbk_list); + + fop->answer = NULL; +} + void ec_fop_data_release(ec_fop_data_t * fop) { - ec_cbk_data_t * cbk, * tmp; + ec_t *ec = NULL; int32_t refs; + gf_boolean_t notify = _gf_false; LOCK(&fop->lock); @@ -238,13 +288,13 @@ void ec_fop_data_release(ec_fop_data_t * fop) ec_resume_parent(fop, fop->error); - list_for_each_entry_safe(cbk, tmp, &fop->answer_list, answer_list) - { - list_del_init(&cbk->answer_list); - - ec_cbk_data_destroy(cbk); - } + ec_fop_cleanup(fop); + ec = fop->xl->private; + ec_handle_last_pending_fop_completion (fop, ¬ify); mem_put(fop); + if (notify) { + ec_pending_fops_completed(ec); + } } } diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index 9e5c92dd5b8..8204cf087de 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -67,10 +67,20 @@ struct _ec_fd struct _ec_inode { uintptr_t bad; - ec_lock_t *entry_lock; ec_lock_t *inode_lock; + gf_boolean_t have_info; + gf_boolean_t have_config; + gf_boolean_t have_version; + gf_boolean_t have_size; + gf_boolean_t have_dirty; + ec_config_t config; + uint64_t pre_version[2]; + uint64_t post_version[2]; + uint64_t pre_size; + uint64_t post_size; + uint64_t pre_dirty[2]; + uint64_t post_dirty[2]; struct list_head heal; - }; typedef int32_t (* fop_heal_cbk_t)(call_frame_t *, void * cookie, xlator_t *, @@ -80,7 +90,6 @@ typedef int32_t (* fop_fheal_cbk_t)(call_frame_t *, void * cookie, xlator_t *, int32_t, int32_t, uintptr_t, uintptr_t, uintptr_t, dict_t *); - union _ec_cbk { fop_access_cbk_t access; @@ -132,21 +141,21 @@ union _ec_cbk struct _ec_lock { - ec_lock_t **plock; + ec_inode_t *ctx; gf_timer_t *timer; - struct list_head waiting; + struct list_head waiting; /* Queue of requests being serviced. */ + struct list_head frozen; /* Queue of requests that will be serviced in + the next unlock/lock cycle. 
*/ uintptr_t mask; uintptr_t good_mask; - int32_t kind; int32_t refs; - int32_t acquired; - int32_t have_size; - uint64_t size; - uint64_t size_delta; - uint64_t version[2]; - uint64_t version_delta[2]; - gf_boolean_t is_dirty[2]; + int32_t refs_frozen; + int32_t inserted; + gf_boolean_t acquired; + gf_boolean_t release; + gf_boolean_t query; ec_fop_data_t *owner; + fd_t *fd; loc_t loc; union { @@ -157,9 +166,12 @@ struct _ec_lock struct _ec_lock_link { - ec_lock_t * lock; - ec_fop_data_t * fop; - struct list_head wait_list; + ec_lock_t *lock; + ec_fop_data_t *fop; + struct list_head wait_list; + gf_boolean_t update[2]; + loc_t *base; + uint64_t size; }; struct _ec_fop_data @@ -172,22 +184,19 @@ struct _ec_fop_data int32_t winds; int32_t jobs; int32_t error; - ec_fop_data_t * parent; - xlator_t * xl; - call_frame_t * req_frame; // frame of the calling xlator - call_frame_t * frame; // frame used by this fop - struct list_head cbk_list; // sorted list of groups of answers - struct list_head answer_list; // list of answers - ec_cbk_data_t * answer; // accepted answer + ec_fop_data_t *parent; + xlator_t *xl; + call_frame_t *req_frame; /* frame of the calling xlator */ + call_frame_t *frame; /* frame used by this fop */ + struct list_head cbk_list; /* sorted list of groups of answers */ + struct list_head answer_list; /* list of answers */ + struct list_head pending_list; /* member of ec_t.pending_fops */ + ec_cbk_data_t *answer; /* accepted answer */ int32_t lock_count; int32_t locked; ec_lock_link_t locks[2]; - int32_t locks_update; - int32_t have_size; - uint64_t pre_size; - uint64_t post_size; + int32_t first_lock; gf_lock_t lock; - ec_config_t config; uint32_t flags; uint32_t first; @@ -196,6 +205,7 @@ struct _ec_fop_data if fop->minimum number of subvolumes succeed which are not healing*/ uintptr_t remaining; + uintptr_t received; /* Mask of responses */ uintptr_t good; uintptr_t bad; @@ -203,7 +213,7 @@ struct _ec_fop_data ec_handler_f handler; ec_resume_f 
resume; ec_cbk_t cbks; - void * data; + void *data; ec_heal_t *heal; uint64_t user_size; @@ -211,8 +221,8 @@ struct _ec_fop_data int32_t use_fd; - dict_t * xdata; - dict_t * dict; + dict_t *xdata; + dict_t *dict; int32_t int32; uint32_t uint32; uint64_t size; @@ -222,14 +232,14 @@ struct _ec_fop_data entrylk_type entrylk_type; gf_xattrop_flags_t xattrop_flags; dev_t dev; - inode_t * inode; - fd_t * fd; + inode_t *inode; + fd_t *fd; struct iatt iatt; - char * str[2]; + char *str[2]; loc_t loc[2]; struct gf_flock flock; - struct iovec * vector; - struct iobref * buffers; + struct iovec *vector; + struct iobref *buffers; }; struct _ec_cbk_data @@ -299,4 +309,6 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this, void ec_fop_data_acquire(ec_fop_data_t * fop); void ec_fop_data_release(ec_fop_data_t * fop); +void ec_fop_cleanup(ec_fop_data_t *fop); + #endif /* __EC_DATA_H__ */ diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c index ffc3ed5a7cd..354c63d3683 100644 --- a/xlators/cluster/ec/src/ec-dir-read.c +++ b/xlators/cluster/ec/src/ec-dir-read.c @@ -128,14 +128,9 @@ int32_t ec_manager_opendir(ec_fop_data_t * fop, int32_t state) /* Fall through */ case EC_STATE_LOCK: - ec_lock_prepare_entry(fop, &fop->loc[0], 0); + ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO); ec_lock(fop); - return EC_STATE_GET_SIZE_AND_VERSION; - - case EC_STATE_GET_SIZE_AND_VERSION: - ec_get_size_version(fop); - return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: @@ -195,7 +190,6 @@ int32_t ec_manager_opendir(ec_fop_data_t * fop, int32_t state) case -EC_STATE_INIT: case -EC_STATE_LOCK: - case -EC_STATE_GET_SIZE_AND_VERSION: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index ffc96bf4351..ce09138fb7a 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -98,11 
+98,10 @@ void ec_wind_create(ec_t * ec, ec_fop_data_t * fop, int32_t idx) int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state) { - - - ec_t * ec; - ec_cbk_data_t * cbk; - ec_fd_t * ctx; + ec_config_t config; + ec_t *ec; + ec_cbk_data_t *cbk; + ec_fd_t *ctx; uint64_t version[2] = {0, 0}; switch (state) @@ -137,16 +136,15 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state) ec = fop->xl->private; - fop->config.version = EC_CONFIG_VERSION; - fop->config.algorithm = EC_CONFIG_ALGORITHM; - fop->config.gf_word_size = EC_GF_BITS; - fop->config.bricks = ec->nodes; - fop->config.redundancy = ec->redundancy; - fop->config.chunk_size = EC_METHOD_CHUNK_SIZE; + config.version = EC_CONFIG_VERSION; + config.algorithm = EC_CONFIG_ALGORITHM; + config.gf_word_size = EC_GF_BITS; + config.bricks = ec->nodes; + config.redundancy = ec->redundancy; + config.chunk_size = EC_METHOD_CHUNK_SIZE; if (ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, - &fop->config) < 0) - { + &config) < 0) { fop->error = EIO; return EC_STATE_REPORT; @@ -172,7 +170,8 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state) /* Fall through */ case EC_STATE_LOCK: - ec_lock_prepare_entry(fop, &fop->loc[0], 1); + ec_lock_prepare_parent_inode(fop, &fop->loc[0], + EC_UPDATE_DATA | EC_UPDATE_META); ec_lock(fop); return EC_STATE_DISPATCH; @@ -376,17 +375,11 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state) { case EC_STATE_INIT: case EC_STATE_LOCK: - // Parent entry of fop->loc[0] should be locked, but I don't - // receive enough information to do it (fop->loc[0].parent is - // NULL). 
- ec_lock_prepare_entry(fop, &fop->loc[1], 1); + ec_lock_prepare_parent_inode(fop, &fop->loc[1], EC_UPDATE_DATA | + EC_UPDATE_META | + EC_INODE_SIZE); ec_lock(fop); - return EC_STATE_GET_SIZE_AND_VERSION; - - case EC_STATE_GET_SIZE_AND_VERSION: - ec_get_size_version(fop); - return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: @@ -410,7 +403,7 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state) ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count); if (cbk->iatt[0].ia_type == IA_IFREG) { - cbk->iatt[0].ia_size = fop->pre_size; + cbk->iatt[0].ia_size = fop->locks[0].size; } if (ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, @@ -446,7 +439,6 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state) case -EC_STATE_INIT: case -EC_STATE_LOCK: - case -EC_STATE_GET_SIZE_AND_VERSION: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: @@ -589,7 +581,8 @@ int32_t ec_manager_mkdir(ec_fop_data_t * fop, int32_t state) /* Fall through */ case EC_STATE_LOCK: - ec_lock_prepare_entry(fop, &fop->loc[0], 1); + ec_lock_prepare_parent_inode(fop, &fop->loc[0], + EC_UPDATE_DATA | EC_UPDATE_META); ec_lock(fop); return EC_STATE_DISPATCH; @@ -764,6 +757,7 @@ void ec_wind_mknod(ec_t * ec, ec_fop_data_t * fop, int32_t idx) int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state) { + ec_config_t config; ec_t *ec; ec_cbk_data_t * cbk; uint64_t version[2] = {0, 0}; @@ -783,15 +777,15 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state) ec = fop->xl->private; - fop->config.version = EC_CONFIG_VERSION; - fop->config.algorithm = EC_CONFIG_ALGORITHM; - fop->config.gf_word_size = EC_GF_BITS; - fop->config.bricks = ec->nodes; - fop->config.redundancy = ec->redundancy; - fop->config.chunk_size = EC_METHOD_CHUNK_SIZE; + config.version = EC_CONFIG_VERSION; + config.algorithm = EC_CONFIG_ALGORITHM; + config.gf_word_size = EC_GF_BITS; + config.bricks = ec->nodes; + config.redundancy = ec->redundancy; + config.chunk_size = EC_METHOD_CHUNK_SIZE; 
if (ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, - &fop->config) < 0) { + &config) < 0) { fop->error = EIO; return EC_STATE_REPORT; @@ -814,7 +808,8 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state) /* Fall through */ case EC_STATE_LOCK: - ec_lock_prepare_entry(fop, &fop->loc[0], 1); + ec_lock_prepare_parent_inode(fop, &fop->loc[0], + EC_UPDATE_DATA | EC_UPDATE_META); ec_lock(fop); return EC_STATE_DISPATCH; @@ -997,15 +992,13 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state) { case EC_STATE_INIT: case EC_STATE_LOCK: - ec_lock_prepare_entry(fop, &fop->loc[0], 1); - ec_lock_prepare_entry(fop, &fop->loc[1], 1); + ec_lock_prepare_parent_inode(fop, &fop->loc[0], EC_UPDATE_DATA | + EC_UPDATE_META | + EC_INODE_SIZE); + ec_lock_prepare_parent_inode(fop, &fop->loc[1], + EC_UPDATE_DATA | EC_UPDATE_META); ec_lock(fop); - return EC_STATE_GET_SIZE_AND_VERSION; - - case EC_STATE_GET_SIZE_AND_VERSION: - ec_get_size_version(fop); - return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: @@ -1034,9 +1027,8 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state) ec_iatt_rebuild(fop->xl->private, cbk->iatt, 5, cbk->count); - if (cbk->iatt[0].ia_type == IA_IFREG) - { - cbk->iatt[0].ia_size = fop->pre_size; + if (cbk->iatt[0].ia_type == IA_IFREG) { + cbk->iatt[0].ia_size = fop->locks[0].size; } } } @@ -1064,7 +1056,6 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state) case -EC_STATE_INIT: case -EC_STATE_LOCK: - case -EC_STATE_GET_SIZE_AND_VERSION: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: @@ -1191,7 +1182,8 @@ int32_t ec_manager_rmdir(ec_fop_data_t * fop, int32_t state) { case EC_STATE_INIT: case EC_STATE_LOCK: - ec_lock_prepare_entry(fop, &fop->loc[0], 1); + ec_lock_prepare_parent_inode(fop, &fop->loc[0], + EC_UPDATE_DATA | EC_UPDATE_META); ec_lock(fop); return EC_STATE_DISPATCH; @@ -1361,7 +1353,8 @@ int32_t ec_manager_symlink(ec_fop_data_t * fop, int32_t state) { case EC_STATE_INIT: case 
EC_STATE_LOCK: - ec_lock_prepare_entry(fop, &fop->loc[0], 1); + ec_lock_prepare_parent_inode(fop, &fop->loc[0], + EC_UPDATE_DATA | EC_UPDATE_META); ec_lock(fop); return EC_STATE_DISPATCH; @@ -1552,14 +1545,10 @@ int32_t ec_manager_unlink(ec_fop_data_t * fop, int32_t state) { case EC_STATE_INIT: case EC_STATE_LOCK: - ec_lock_prepare_entry(fop, &fop->loc[0], 1); + ec_lock_prepare_parent_inode(fop, &fop->loc[0], + EC_UPDATE_DATA | EC_UPDATE_META); ec_lock(fop); - return EC_STATE_GET_SIZE_AND_VERSION; - - case EC_STATE_GET_SIZE_AND_VERSION: - ec_get_size_version(fop); - return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: @@ -1607,7 +1596,6 @@ int32_t ec_manager_unlink(ec_fop_data_t * fop, int32_t state) case -EC_STATE_INIT: case -EC_STATE_LOCK: - case -EC_STATE_GET_SIZE_AND_VERSION: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h index d6b9770f720..7661077cca3 100644 --- a/xlators/cluster/ec/src/ec-fops.h +++ b/xlators/cluster/ec/src/ec-fops.h @@ -16,9 +16,6 @@ #include "ec-data.h" #include "ec-common.h" -#define EC_FOP_HEAL -1 -#define EC_FOP_FHEAL -2 - void ec_access(call_frame_t * frame, xlator_t * this, uintptr_t target, int32_t minimum, fop_access_cbk_t func, void *data, loc_t * loc, int32_t mask, dict_t * xdata); diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c index d957bf6533d..f50c7a70560 100644 --- a/xlators/cluster/ec/src/ec-generic.c +++ b/xlators/cluster/ec/src/ec-generic.c @@ -320,15 +320,10 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state) { case EC_STATE_INIT: case EC_STATE_LOCK: - ec_lock_prepare_fd(fop, fop->fd, 0); + ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO); ec_lock(fop); - return EC_STATE_GET_SIZE_AND_VERSION; - - case EC_STATE_GET_SIZE_AND_VERSION: - ec_get_size_version(fop); - - return EC_STATE_DISPATCH; + return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: 
ec_flush_size_version(fop); @@ -361,8 +356,10 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state) ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); - cbk->iatt[0].ia_size = fop->pre_size; - cbk->iatt[1].ia_size = fop->post_size; + /* This shouldn't fail because we have the inode locked. */ + GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, + &cbk->iatt[0].ia_size)); + cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; } } else @@ -388,7 +385,6 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state) case -EC_STATE_INIT: case -EC_STATE_LOCK: - case -EC_STATE_GET_SIZE_AND_VERSION: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: @@ -705,7 +701,6 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk) { ec_cbk_data_t * ans = NULL; ec_inode_t * ctx = NULL; - ec_lock_t * lock = NULL; data_t * data = NULL; uint8_t * buff = NULL; uint64_t size = 0; @@ -729,14 +724,14 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk) LOCK(&cbk->inode->lock); ctx = __ec_inode_get(cbk->inode, fop->xl); - if ((ctx != NULL) && (ctx->inode_lock != NULL)) + if (ctx != NULL) { - lock = ctx->inode_lock; - cbk->version[0] = lock->version[0]; - cbk->version[1] = lock->version[1]; - if (lock->have_size) - { - size = lock->size; + if (ctx->have_version) { + cbk->version[0] = ctx->post_version[0]; + cbk->version[1] = ctx->post_version[1]; + } + if (ctx->have_size) { + size = ctx->post_size; have_size = 1; } } @@ -964,9 +959,20 @@ int32_t ec_manager_lookup(ec_fop_data_t * fop, int32_t state) return EC_STATE_PREPARE_ANSWER; case EC_STATE_PREPARE_ANSWER: + /* + * Lookup happens without any lock, so there is a chance that it + * will have answers before modification happened and after + * modification happened in the same response. 
So choose the next + * best answer when the answers don't match for EC_MINIMUM_MIN + */ + + if (!fop->answer && !list_empty(&fop->cbk_list)) { + fop->answer = list_entry (fop->cbk_list.next, ec_cbk_data_t, + list); + } + cbk = fop->answer; - if (cbk != NULL) - { + if (cbk != NULL) { if (!ec_dict_combine(cbk, EC_COMBINE_XDATA)) { if (cbk->op_ret >= 0) @@ -986,9 +992,7 @@ int32_t ec_manager_lookup(ec_fop_data_t * fop, int32_t state) ec_lookup_rebuild(fop->xl->private, fop, cbk); } - } - else - { + } else { ec_fop_set_error(fop, EIO); } @@ -1295,8 +1299,8 @@ out: /* FOP: xattrop */ -int32_t ec_combine_xattrop(ec_fop_data_t * fop, ec_cbk_data_t * dst, - ec_cbk_data_t * src) +int32_t ec_combine_xattrop(ec_fop_data_t *fop, ec_cbk_data_t *dst, + ec_cbk_data_t *src) { if (!ec_dict_compare(dst->dict, src->dict)) { @@ -1316,9 +1320,9 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { ec_fop_data_t *fop = NULL; ec_cbk_data_t *cbk = NULL; + data_t *data; + uint64_t *version; int32_t idx = (int32_t)(uintptr_t)cookie; - uint64_t version = 0; - uint64_t *version_xattr = 0; VALIDATE_OR_GOTO (this, out); GF_VALIDATE_OR_GOTO (this->name, frame, out); @@ -1338,12 +1342,19 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret >= 0) { cbk->dict = dict_ref (xattr); - if (dict_get_bin (xattr, EC_XATTR_VERSION, - (void **)&version_xattr) == 0) { - version = ntoh64(version_xattr[0]); - if ((version >> EC_SELFHEAL_BIT) & 1) - fop->healing |= (1ULL<<idx); + data = dict_get(cbk->dict, EC_XATTR_VERSION); + if ((data != NULL) && (data->len >= sizeof(uint64_t))) { + version = (uint64_t *)data->data; + + if (((ntoh64(version[0]) >> EC_SELFHEAL_BIT) & 1) != 0) { + LOCK(&fop->lock); + + fop->healing |= 1ULL << idx; + + UNLOCK(&fop->lock); + } } + ec_dict_del_array (xattr, EC_XATTR_DIRTY, cbk->dirty, EC_VERSION_SIZE); } @@ -1377,13 +1388,10 @@ int32_t ec_manager_xattrop(ec_fop_data_t * fop, int32_t state) { case EC_STATE_INIT: case EC_STATE_LOCK: - if 
(fop->fd == NULL) - { - ec_lock_prepare_inode(fop, &fop->loc[0], 1); - } - else - { - ec_lock_prepare_fd(fop, fop->fd, 1); + if (fop->fd == NULL) { + ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META); + } else { + ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META); } ec_lock(fop); diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index ceddfeb6ac7..80725e5a9fa 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -119,9 +119,8 @@ void ec_heal_lookup_resume(ec_fop_data_t * fop) heal->version[0] = cbk->version[0]; heal->version[1] = cbk->version[1]; heal->raw_size = cbk->size; - heal->fop->pre_size = cbk->iatt[0].ia_size; - heal->fop->post_size = cbk->iatt[0].ia_size; - heal->fop->have_size = 1; + + GF_ASSERT(ec_set_inode_size(fop, cbk->inode, cbk->size)); if (ec_loc_update(heal->xl, &heal->loc, cbk->inode, &cbk->iatt[0]) != 0) @@ -532,12 +531,7 @@ ec_heal_init (ec_fop_data_t * fop) gf_log("ec", GF_LOG_INFO, "Healing '%s', gfid %s", heal->loc.path, uuid_utoa(heal->loc.gfid)); } else { - LOCK(&fop->lock); - - fop->jobs++; - fop->refs++; - - UNLOCK(&fop->lock); + ec_sleep(fop); } list_add_tail(&heal->list, &ctx->heal); @@ -552,25 +546,8 @@ out: return error; } -void ec_heal_entrylk(ec_heal_t * heal, entrylk_cmd cmd) -{ - loc_t loc; - - if (ec_loc_parent(heal->xl, &heal->loc, &loc) != 0) { - gf_log("ec", GF_LOG_NOTICE, "ec_loc_parent() failed"); - ec_fop_set_error(heal->fop, EIO); - - return; - } - - ec_entrylk(heal->fop->frame, heal->xl, -1, EC_MINIMUM_ALL, NULL, NULL, - heal->xl->name, &loc, NULL, cmd, ENTRYLK_WRLCK, NULL); - - loc_wipe(&loc); -} - -void ec_heal_inodelk(ec_heal_t * heal, int32_t type, int32_t use_fd, - off_t offset, size_t size) +void ec_heal_lock(ec_heal_t *heal, int32_t type, fd_t *fd, loc_t *loc, + off_t offset, size_t size) { struct gf_flock flock; @@ -581,20 +558,47 @@ void ec_heal_inodelk(ec_heal_t * heal, int32_t type, int32_t use_fd, flock.l_pid = 0; flock.l_owner.len = 0; - 
if (use_fd) + /* Remove inode size information before unlocking it. */ + if ((type == F_UNLCK) && (heal->loc.inode != NULL)) { + ec_clear_inode_info(heal->fop, heal->loc.inode); + } + + if (fd != NULL) { ec_finodelk(heal->fop->frame, heal->xl, heal->fop->mask, - EC_MINIMUM_ALL, NULL, NULL, heal->xl->name, heal->fd, + EC_MINIMUM_ALL, NULL, NULL, heal->xl->name, fd, F_SETLKW, &flock, NULL); } else { ec_inodelk(heal->fop->frame, heal->xl, heal->fop->mask, EC_MINIMUM_ALL, - NULL, NULL, heal->xl->name, &heal->loc, F_SETLKW, &flock, + NULL, NULL, heal->xl->name, loc, F_SETLKW, &flock, NULL); } } +void ec_heal_entrylk(ec_heal_t *heal, int32_t type) +{ + loc_t loc; + + if (ec_loc_parent(heal->xl, &heal->loc, &loc) != 0) { + ec_fop_set_error(heal->fop, EIO); + + return; + } + + ec_heal_lock(heal, type, NULL, &loc, 0, 0); + + loc_wipe(&loc); +} + +void ec_heal_inodelk(ec_heal_t *heal, int32_t type, int32_t use_fd, + off_t offset, size_t size) +{ + ec_heal_lock(heal, type, use_fd ? heal->fd : NULL, &heal->loc, offset, + size); +} + void ec_heal_lookup(ec_heal_t *heal, uintptr_t mask) { dict_t * xdata; @@ -1302,13 +1306,10 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state) case EC_STATE_DISPATCH: if (heal->done != 0) { - gf_log("ec", GF_LOG_NOTICE, "heal already done"); return EC_STATE_HEAL_DISPATCH; } - gf_log("ec", GF_LOG_NOTICE, "heal before entrylk"); - ec_heal_entrylk(heal, ENTRYLK_LOCK); - gf_log("ec", GF_LOG_NOTICE, "heal after entrylk"); + ec_heal_entrylk(heal, F_WRLCK); return EC_STATE_HEAL_ENTRY_LOOKUP; @@ -1336,7 +1337,7 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state) /* Only heal data/metadata if enough information is supplied. 
*/ if (gf_uuid_is_null(heal->loc.gfid)) { - ec_heal_entrylk(heal, ENTRYLK_UNLOCK); + ec_heal_entrylk(heal, F_UNLCK); return EC_STATE_HEAL_DISPATCH; } @@ -1392,7 +1393,7 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state) case -EC_STATE_HEAL_UNLOCK_ENTRY: case EC_STATE_HEAL_UNLOCK_ENTRY: if (heal->nameheal) - ec_heal_entrylk(heal, ENTRYLK_UNLOCK); + ec_heal_entrylk(heal, F_UNLCK); heal->bad = ec_heal_needs_data_rebuild(heal); if (heal->bad != 0) @@ -2562,9 +2563,9 @@ ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name, EC_REPLIES_ALLOC (replies, ec->nodes); output = alloca0 (ec->nodes); locked_on = alloca0 (ec->nodes); - ret = cluster_entrylk (ec->xl_list, participants, ec->nodes, replies, + ret = cluster_inodelk (ec->xl_list, participants, ec->nodes, replies, locked_on, frame, ec->xl, ec->xl->name, parent, - NULL); + 0, 0); { if (ret <= ec->fragments) { gf_log (ec->xl->name, GF_LOG_DEBUG, "%s/%s: Skipping " @@ -2578,8 +2579,8 @@ ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name, ret = __ec_heal_name (frame, ec, parent, name, participants); } unlock: - cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output, - frame, ec->xl, ec->xl->name, parent, NULL); + cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output, + frame, ec->xl, ec->xl->name, parent, 0, 0); out: cluster_replies_wipe (replies, ec->nodes); loc_wipe (&loc); @@ -2663,9 +2664,9 @@ __ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode, dirty = alloca0 (ec->nodes * sizeof (*dirty)); EC_REPLIES_ALLOC (replies, ec->nodes); - ret = cluster_entrylk (ec->xl_list, heal_on, ec->nodes, replies, + ret = cluster_inodelk (ec->xl_list, heal_on, ec->nodes, replies, locked_on, frame, ec->xl, ec->xl->name, inode, - NULL); + 0, 0); { if (ret <= ec->fragments) { gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal " @@ -2680,8 +2681,8 @@ __ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode, source = ret; } unlock: - 
cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output, - frame, ec->xl, ec->xl->name, inode, NULL); + cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output, + frame, ec->xl, ec->xl->name, inode, 0, 0); if (ret < 0) goto out; @@ -2728,9 +2729,9 @@ ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode, sprintf (selfheal_domain, "%s:self-heal", ec->xl->name); ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes); /*If other processes are already doing the heal, don't block*/ - ret = cluster_entrylk (ec->xl_list, up_subvols, ec->nodes, replies, + ret = cluster_inodelk (ec->xl_list, up_subvols, ec->nodes, replies, locked_on, frame, ec->xl, selfheal_domain, inode, - NULL); + 0, 0); { if (ret <= ec->fragments) { gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal " @@ -2743,8 +2744,8 @@ ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode, sources, healed_sinks); } unlock: - cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output, - frame, ec->xl, selfheal_domain, inode, NULL); + cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output, + frame, ec->xl, selfheal_domain, inode, 0, 0); cluster_replies_wipe (replies, ec->nodes); return ret; } @@ -3086,8 +3087,8 @@ ec_heal_block (call_frame_t *frame, xlator_t *this, uintptr_t target, if (fop == NULL) goto out; - fop->pre_size = fop->post_size = heal->total_size; - fop->have_size = 1; + GF_ASSERT(ec_set_inode_size(fop, heal->fd->inode, heal->total_size)); + error = 0; out: diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index 8ce3087d5a6..48251c84bac 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -738,3 +738,41 @@ ec_filter_internal_xattrs (dict_t *xattr) dict_foreach_match (xattr, ec_is_internal_xattr, NULL, dict_remove_foreach_fn, NULL); } + +gf_boolean_t +ec_is_data_fop (glusterfs_fop_t fop) +{ + switch (fop) { + case GF_FOP_WRITE: + case GF_FOP_TRUNCATE: + 
case GF_FOP_FTRUNCATE: + case GF_FOP_FALLOCATE: + case GF_FOP_DISCARD: + case GF_FOP_ZEROFILL: + return _gf_true; + default: + return _gf_false; + } + return _gf_false; +} +/* +gf_boolean_t +ec_is_metadata_fop (int32_t lock_kind, glusterfs_fop_t fop) +{ + if (lock_kind == EC_LOCK_ENTRY) { + return _gf_false; + } + + switch (fop) { + case GF_FOP_SETATTR: + case GF_FOP_FSETATTR: + case GF_FOP_SETXATTR: + case GF_FOP_FSETXATTR: + case GF_FOP_REMOVEXATTR: + case GF_FOP_FREMOVEXATTR: + return _gf_true; + default: + return _gf_false; + } + return _gf_false; +}*/ diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h index df4495138fe..14243df54f3 100644 --- a/xlators/cluster/ec/src/ec-helpers.h +++ b/xlators/cluster/ec/src/ec-helpers.h @@ -59,4 +59,11 @@ ec_is_internal_xattr (dict_t *dict, char *key, data_t *value, void *data); void ec_filter_internal_xattrs (dict_t *xattr); + +gf_boolean_t +ec_is_data_fop (glusterfs_fop_t fop); +/* +gf_boolean_t +ec_is_metadata_fop (glusterfs_fop_t fop); +*/ #endif /* __EC_HELPERS_H__ */ diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index 7372c0a0599..853d914148b 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -267,9 +267,9 @@ int32_t ec_manager_getxattr(ec_fop_data_t * fop, int32_t state) (strncmp(fop->str[0], GF_XATTR_CLRLK_CMD, strlen(GF_XATTR_CLRLK_CMD)) != 0)) { if (fop->fd == NULL) { - ec_lock_prepare_inode(fop, &fop->loc[0], 0); + ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO); } else { - ec_lock_prepare_fd(fop, fop->fd, 0); + ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO); } ec_lock(fop); } @@ -1094,12 +1094,12 @@ int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk) size_t fsize = 0, size = 0, max = 0; int32_t i = 0; - if (cbk->op_ret < 0) - { + if (cbk->op_ret < 0) { goto out; } - cbk->iatt[0].ia_size = fop->pre_size; + /* This shouldn't fail because we have the 
inode locked. */ + GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &cbk->iatt[0].ia_size)); if (cbk->op_ret > 0) { @@ -1331,15 +1331,10 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state) /* Fall through */ case EC_STATE_LOCK: - ec_lock_prepare_fd(fop, fop->fd, 0); + ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO); ec_lock(fop); - return EC_STATE_GET_SIZE_AND_VERSION; - - case EC_STATE_GET_SIZE_AND_VERSION: - ec_get_size_version(fop); - - return EC_STATE_DISPATCH; + return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: ec_dispatch_min(fop); @@ -1396,7 +1391,6 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state) case -EC_STATE_INIT: case -EC_STATE_LOCK: - case -EC_STATE_GET_SIZE_AND_VERSION: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: @@ -1580,22 +1574,14 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state) { case EC_STATE_INIT: case EC_STATE_LOCK: - if (fop->fd == NULL) - { - ec_lock_prepare_inode(fop, &fop->loc[0], 0); - } - else - { - ec_lock_prepare_fd(fop, fop->fd, 0); + if (fop->fd == NULL) { + ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO); + } else { + ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO); } ec_lock(fop); - return EC_STATE_GET_SIZE_AND_VERSION; - - case EC_STATE_GET_SIZE_AND_VERSION: - ec_get_size_version(fop); - - return EC_STATE_DISPATCH; + return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: ec_dispatch_all(fop); @@ -1614,16 +1600,16 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state) cbk->op_errno = EIO; } } - if (cbk->op_ret < 0) - { + if (cbk->op_ret < 0) { ec_fop_set_error(fop, cbk->op_errno); - } - else - { + } else if (cbk->iatt[0].ia_type == IA_IFREG) { ec_iatt_rebuild(fop->xl->private, cbk->iatt, 1, cbk->count); - cbk->iatt[0].ia_size = fop->pre_size; + /* This shouldn't fail because we have the inode locked. 
*/ + GF_ASSERT(ec_get_inode_size(fop, + fop->locks[0].lock->loc.inode, + &cbk->iatt[0].ia_size)); } } else @@ -1659,7 +1645,6 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state) case -EC_STATE_INIT: case -EC_STATE_LOCK: - case -EC_STATE_GET_SIZE_AND_VERSION: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index 6b485a26fbc..368b3ae5edf 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -123,11 +123,13 @@ ec_manager_xattr (ec_fop_data_t *fop, int32_t state) switch (state) { case EC_STATE_INIT: case EC_STATE_LOCK: - if (fop->fd == NULL) - ec_lock_prepare_inode(fop, &fop->loc[0], 1); - else - ec_lock_prepare_fd(fop, fop->fd, 1); - + if (fop->fd == NULL) { + ec_lock_prepare_inode(fop, &fop->loc[0], + EC_UPDATE_META | EC_QUERY_INFO); + } else { + ec_lock_prepare_fd(fop, fop->fd, + EC_UPDATE_META | EC_QUERY_INFO); + } ec_lock(fop); return EC_STATE_DISPATCH; @@ -378,21 +380,15 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state) { case EC_STATE_INIT: case EC_STATE_LOCK: - if (fop->fd == NULL) - { - ec_lock_prepare_inode(fop, &fop->loc[0], 1); - } - else - { - ec_lock_prepare_fd(fop, fop->fd, 1); + if (fop->fd == NULL) { + ec_lock_prepare_inode(fop, &fop->loc[0], + EC_UPDATE_META | EC_QUERY_INFO); + } else { + ec_lock_prepare_fd(fop, fop->fd, + EC_UPDATE_META | EC_QUERY_INFO); } ec_lock(fop); - return EC_STATE_GET_SIZE_AND_VERSION; - - case EC_STATE_GET_SIZE_AND_VERSION: - ec_get_size_version(fop); - return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: @@ -412,17 +408,17 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state) cbk->op_errno = EIO; } } - if (cbk->op_ret < 0) - { + if (cbk->op_ret < 0) { ec_fop_set_error(fop, cbk->op_errno); - } - else - { + } else if (cbk->iatt[0].ia_type == IA_IFREG) { ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); - 
cbk->iatt[0].ia_size = fop->pre_size; - cbk->iatt[1].ia_size = fop->pre_size; + /* This shouldn't fail because we have the inode locked. */ + GF_ASSERT(ec_get_inode_size(fop, + fop->locks[0].lock->loc.inode, + &cbk->iatt[0].ia_size)); + cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; } } else @@ -462,7 +458,6 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state) case -EC_STATE_INIT: case -EC_STATE_LOCK: - case -EC_STATE_GET_SIZE_AND_VERSION: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: @@ -992,21 +987,17 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state) /* Fall through */ case EC_STATE_LOCK: - if (fop->id == GF_FOP_TRUNCATE) - { - ec_lock_prepare_inode(fop, &fop->loc[0], 1); - } - else - { - ec_lock_prepare_fd(fop, fop->fd, 1); + if (fop->id == GF_FOP_TRUNCATE) { + ec_lock_prepare_inode(fop, &fop->loc[0], + EC_UPDATE_DATA | EC_UPDATE_META | + EC_QUERY_INFO); + } else { + ec_lock_prepare_fd(fop, fop->fd, + EC_UPDATE_DATA | EC_UPDATE_META | + EC_QUERY_INFO); } ec_lock(fop); - return EC_STATE_GET_SIZE_AND_VERSION; - - case EC_STATE_GET_SIZE_AND_VERSION: - ec_prepare_update(fop); - return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: @@ -1035,14 +1026,18 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state) ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); - cbk->iatt[0].ia_size = fop->pre_size; + /* This shouldn't fail because we have the inode locked. */ + GF_ASSERT(ec_get_inode_size(fop, + fop->locks[0].lock->loc.inode, + &cbk->iatt[0].ia_size)); cbk->iatt[1].ia_size = fop->user_size; - fop->post_size = fop->user_size; - if ((fop->pre_size > fop->post_size) && - (fop->user_size != fop->offset)) - { - if (!ec_truncate_clean(fop)) - { + /* This shouldn't fail because we have the inode locked. 
*/ + GF_ASSERT(ec_set_inode_size(fop, + fop->locks[0].lock->loc.inode, + fop->user_size)); + if ((cbk->iatt[0].ia_size > cbk->iatt[1].ia_size) && + (fop->user_size != fop->offset)) { + if (!ec_truncate_clean(fop)) { ec_fop_set_error(fop, EIO); } } @@ -1085,7 +1080,6 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state) case -EC_STATE_INIT: case -EC_STATE_LOCK: - case -EC_STATE_GET_SIZE_AND_VERSION: case -EC_STATE_DISPATCH: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: @@ -1355,9 +1349,13 @@ void ec_writev_start(ec_fop_data_t *fop) ec_fd_t *ctx; fd_t *fd; size_t tail; + uint64_t current; uid_t uid; gid_t gid; + /* This shouldn't fail because we have the inode locked. */ + GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, ¤t)); + fd = fd_anonymous(fop->fd->inode); if (fd == NULL) { ec_fop_set_error(fop, EIO); @@ -1373,7 +1371,7 @@ void ec_writev_start(ec_fop_data_t *fop) ctx = ec_fd_get(fop->fd, fop->xl); if (ctx != NULL) { if ((ctx->flags & O_APPEND) != 0) { - fop->offset = fop->pre_size; + fop->offset = current; } } @@ -1404,22 +1402,17 @@ void ec_writev_start(ec_fop_data_t *fop) iobref_unref(fop->buffers); fop->buffers = iobref; - if (fop->head > 0) - { + if (fop->head > 0) { ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head, NULL, fd, ec->stripe_size, fop->offset, 0, NULL); } tail = fop->size - fop->user_size - fop->head; - if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) - { - if (fop->pre_size > fop->offset + fop->head + fop->user_size) - { + if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) { + if (current > fop->offset + fop->head + fop->user_size) { ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_tail, NULL, fd, ec->stripe_size, fop->offset + fop->size - ec->stripe_size, 0, NULL); - } - else - { + } else { memset(fop->vector[0].iov_base + fop->size - tail, 0, tail); } } @@ -1530,14 +1523,11 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state) { 
case EC_STATE_INIT: case EC_STATE_LOCK: - ec_lock_prepare_fd(fop, fop->fd, 1); + ec_lock_prepare_fd(fop, fop->fd, + EC_UPDATE_DATA | EC_UPDATE_META | + EC_QUERY_INFO); ec_lock(fop); - return EC_STATE_GET_SIZE_AND_VERSION; - - case EC_STATE_GET_SIZE_AND_VERSION: - ec_prepare_update(fop); - return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: @@ -1574,27 +1564,34 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state) ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count); + /* This shouldn't fail because we have the inode locked. */ + GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, + &cbk->iatt[0].ia_size)); + cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; size = fop->offset + fop->head + fop->user_size; - if (size > fop->pre_size) - { - fop->post_size = size; - } - - cbk->iatt[0].ia_size = fop->pre_size; - cbk->iatt[1].ia_size = fop->post_size; - - cbk->op_ret *= ec->fragments; - if (cbk->op_ret < fop->head) - { - cbk->op_ret = 0; - } - else - { - cbk->op_ret -= fop->head; + if (size > cbk->iatt[0].ia_size) { + /* Only update inode size if this is a top level fop. + * Otherwise this is an internal write and the top + * level fop should take care of the real inode size. + */ + if (fop->parent == NULL) { + /* This shouldn't fail because we have the inode + * locked. 
*/ + GF_ASSERT(ec_set_inode_size(fop, fop->fd->inode, + size)); + } + cbk->iatt[1].ia_size = size; } - if (cbk->op_ret > fop->user_size) - { - cbk->op_ret = fop->user_size; + if (fop->error == 0) { + cbk->op_ret *= ec->fragments; + if (cbk->op_ret < fop->head) { + cbk->op_ret = 0; + } else { + cbk->op_ret -= fop->head; + } + if (cbk->op_ret > fop->user_size) { + cbk->op_ret = fop->user_size; + } } } } @@ -1621,8 +1618,8 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state) case -EC_STATE_INIT: case -EC_STATE_LOCK: - case -EC_STATE_GET_SIZE_AND_VERSION: case -EC_STATE_DISPATCH: + case -EC_STATE_DELAYED_START: case -EC_STATE_PREPARE_ANSWER: case -EC_STATE_REPORT: GF_ASSERT(fop->error != 0); diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c index 10572037932..22b6fa4d6e5 100644 --- a/xlators/cluster/ec/src/ec-locks.c +++ b/xlators/cluster/ec/src/ec-locks.c @@ -37,13 +37,22 @@ int32_t ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask) locked |= ans->mask; cbk = ans; } else { - notlocked |= ans->mask; + if (ans->op_errno == EAGAIN) { + switch (fop->uint32) { + case EC_LOCK_MODE_NONE: + case EC_LOCK_MODE_ALL: + /* Goal is to treat non-blocking lock as failure + * even if there is a signle EAGAIN*/ + notlocked |= ans->mask; + break; + } + } } } if (error == -1) { if (ec_bits_count(locked | notlocked) >= ec->fragments) { - if (ec_bits_count (locked) >= ec->fragments) { + if (notlocked == 0) { if (fop->answer == NULL) { fop->answer = cbk; } diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 3dd04299541..4028aa4d2bb 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -278,6 +278,7 @@ ec_notify_cbk (void *data) { ec_t *ec = data; glusterfs_event_t event = GF_EVENT_MAXVAL; + gf_boolean_t propagate = _gf_false; LOCK(&ec->lock); { @@ -309,10 +310,14 @@ ec_notify_cbk (void *data) /* CHILD_DOWN should not come here as no grace period is given * for notifying CHILD_DOWN. 
*/ - default_notify (ec->xl, event, NULL); + propagate = _gf_true; } unlock: UNLOCK(&ec->lock); + + if (propagate) { + default_notify (ec->xl, event, NULL); + } } void @@ -360,6 +365,49 @@ ec_handle_down (xlator_t *this, ec_t *ec, int32_t idx) } } +gf_boolean_t +ec_force_unlocks(ec_t *ec) +{ + struct list_head list; + ec_fop_data_t *fop; + + if (list_empty(&ec->pending_fops)) { + return _gf_true; + } + + INIT_LIST_HEAD(&list); + + /* All pending fops when GF_EVENT_PARENT_DOWN is received should only + * be fops waiting for a delayed unlock. However the unlock can + * generate new fops. We don't want to trverse these new fops while + * forcing unlocks, so we move all fops to a temporal list. To process + * them without interferences.*/ + list_splice_init(&ec->pending_fops, &list); + + while (!list_empty(&list)) { + fop = list_entry(list.next, ec_fop_data_t, pending_list); + list_move_tail(&fop->pending_list, &ec->pending_fops); + + UNLOCK(&ec->lock); + + ec_unlock_force(fop); + + LOCK(&ec->lock); + } + + ec->shutdown = _gf_true; + + return list_empty(&ec->pending_fops); +} + +void +ec_pending_fops_completed(ec_t *ec) +{ + if (ec->shutdown) { + default_notify(ec->xl, GF_EVENT_PARENT_DOWN, NULL); + } +} + int32_t ec_notify (xlator_t *this, int32_t event, void *data, void *data2) { @@ -367,14 +415,16 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2) int32_t idx = 0; int32_t error = 0; glusterfs_event_t old_event = GF_EVENT_MAXVAL; - glusterfs_event_t new_event = GF_EVENT_MAXVAL; dict_t *input = NULL; dict_t *output = NULL; + gf_boolean_t propagate = _gf_true; + + gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %p", + event, data, data2); if (event == GF_EVENT_TRANSLATOR_OP) { if (!ec->up) { error = -1; - goto out; } else { input = data; output = data2; @@ -400,13 +450,14 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2) */ ec_launch_notify_timer (this, ec); goto unlock; + } else if (event == GF_EVENT_PARENT_DOWN) { + /* If 
there aren't pending fops running after we have waken up + * them, we immediately propagate the notification. */ + propagate = ec_force_unlocks(ec); + goto unlock; } - gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %d", - event, data, idx); - if (idx < ec->nodes) { /* CHILD_* events */ - old_event = ec_get_event_from_state (ec); if (event == GF_EVENT_CHILD_UP) { @@ -415,28 +466,30 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2) ec_handle_down (this, ec, idx); } - new_event = ec_get_event_from_state (ec); + event = ec_get_event_from_state (ec); - if (new_event == GF_EVENT_CHILD_UP && !ec->up) { + if (event == GF_EVENT_CHILD_UP && !ec->up) { ec_up (this, ec); - } else if (new_event == GF_EVENT_CHILD_DOWN && ec->up) { + } else if (event == GF_EVENT_CHILD_DOWN && ec->up) { ec_down (this, ec); } - if ((new_event == old_event) && (new_event != GF_EVENT_MAXVAL)) - new_event = GF_EVENT_CHILD_MODIFIED; - - event = GF_EVENT_MAXVAL;/* Take care of notifying inside lock */ - if (new_event != GF_EVENT_MAXVAL) - error = default_notify (this, new_event, data); + if (event != GF_EVENT_MAXVAL) { + if (event == old_event) { + event = GF_EVENT_CHILD_MODIFIED; + } + } else { + propagate = _gf_false; + } } - unlock: - UNLOCK (&ec->lock); +unlock: + UNLOCK (&ec->lock); - if (event != GF_EVENT_MAXVAL) - return default_notify (this, event, data); + if (propagate) { + error = default_notify (this, event, data); + } out: - return error; + return error; } int32_t @@ -478,6 +531,8 @@ init (xlator_t *this) ec->xl = this; LOCK_INIT(&ec->lock); + INIT_LIST_HEAD(&ec->pending_fops); + ec->fop_pool = mem_pool_new(ec_fop_data_t, 1024); ec->cbk_pool = mem_pool_new(ec_cbk_data_t, 4096); ec->lock_pool = mem_pool_new(ec_lock_t, 1024); diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index b8f3e288197..fdedb89ec18 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -44,6 +44,8 @@ struct _ec xlator_t ** xl_list; gf_lock_t lock; 
gf_timer_t * timer; + gf_boolean_t shutdown; + struct list_head pending_fops; struct mem_pool * fop_pool; struct mem_pool * cbk_pool; struct mem_pool * lock_pool; @@ -51,4 +53,7 @@ struct _ec char vol_uuid[UUID_SIZE + 1]; dict_t *leaf_to_subvolid; }; + +void ec_pending_fops_completed(ec_t *ec); + #endif /* __EC_H__ */ |