summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--xlators/cluster/ec/src/ec-common.c191
-rw-r--r--xlators/cluster/ec/src/ec-common.h10
-rw-r--r--xlators/cluster/ec/src/ec-dir-read.c6
-rw-r--r--xlators/cluster/ec/src/ec-generic.c12
-rw-r--r--xlators/cluster/ec/src/ec-inode-read.c22
-rw-r--r--xlators/cluster/ec/src/ec-inode-write.c124
-rw-r--r--xlators/cluster/ec/src/ec-types.h8
-rw-r--r--xlators/cluster/ec/src/ec.c51
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c6
9 files changed, 288 insertions, 142 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 9b9cde8cbc5..f3463ba4489 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -26,6 +26,40 @@
EC_FLAG_WAITING_DATA_DIRTY |\
EC_FLAG_WAITING_METADATA_DIRTY)
+off_t
+ec_range_end_get (off_t fl_start, size_t fl_size)
+{
+ off_t fl_end = 0;
+ switch (fl_size) {
+ case 0:
+ return fl_start;
+ case LLONG_MAX: /*Infinity*/
+ return LLONG_MAX;
+ default:
+ fl_end = fl_start + fl_size - 1;
+ if (fl_end < 0) /*over-flow*/
+ return LLONG_MAX;
+ else
+ return fl_end;
+ }
+}
+
+static gf_boolean_t
+ec_is_range_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2)
+{
+ return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start));
+}
+
+static gf_boolean_t
+ec_lock_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2)
+{
+ if ((l1->fop->flags & EC_FLAG_LOCK_SHARED) &&
+ (l2->fop->flags & EC_FLAG_LOCK_SHARED))
+ return _gf_false;
+
+ return ec_is_range_conflict (l1, l2);
+}
+
uint32_t
ec_select_first_by_read_policy (ec_t *ec, ec_fop_data_t *fop)
{
@@ -729,7 +763,7 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2)
}
void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
- loc_t *base)
+ loc_t *base, off_t fl_start, size_t fl_size)
{
ec_lock_link_t *link;
@@ -763,12 +797,15 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
link->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0;
link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;
link->base = base;
+ link->fl_start = fl_start;
+ link->fl_end = ec_range_end_get (fl_start, fl_size);
lock->refs_pending++;
}
void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
- uint32_t flags, loc_t *base)
+ uint32_t flags, loc_t *base,
+ off_t fl_start, size_t fl_size)
{
ec_lock_t *lock = NULL;
ec_inode_t *ctx;
@@ -829,16 +866,17 @@ void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
ctx->inode_lock = lock;
insert:
- ec_lock_insert(fop, lock, flags, base);
+ ec_lock_insert(fop, lock, flags, base, fl_start, fl_size);
update_query:
lock->query |= (flags & EC_QUERY_INFO) != 0;
unlock:
UNLOCK(&loc->inode->lock);
}
-void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags)
+void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
+ off_t fl_start, size_t fl_size)
{
- ec_lock_prepare_inode_internal(fop, loc, flags, NULL);
+ ec_lock_prepare_inode_internal(fop, loc, flags, NULL, fl_start, fl_size);
}
void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
@@ -864,12 +902,13 @@ void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
base = NULL;
}
- ec_lock_prepare_inode_internal(fop, &tmp, flags, base);
+ ec_lock_prepare_inode_internal(fop, &tmp, flags, base, 0, LLONG_MAX);
loc_wipe(&tmp);
}
-void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags)
+void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,
+ off_t fl_start, size_t fl_size)
{
loc_t loc;
int32_t err;
@@ -885,7 +924,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags)
return;
}
- ec_lock_prepare_inode_internal(fop, &loc, flags, NULL);
+ ec_lock_prepare_inode_internal(fop, &loc, flags, NULL, fl_start, fl_size);
loc_wipe(&loc);
}
@@ -1319,17 +1358,16 @@ out:
}
}
-gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
- uint64_t *size)
+gf_boolean_t
+__ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t *size)
{
ec_inode_t *ctx;
gf_boolean_t found = _gf_false;
- LOCK(&inode->lock);
-
ctx = __ec_inode_get(inode, fop->xl);
if (ctx == NULL) {
- goto unlock;
+ goto out;
}
if (ctx->have_size) {
@@ -1337,23 +1375,35 @@ gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
found = _gf_true;
}
-unlock:
+out:
+ return found;
+}
+
+gf_boolean_t
+ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t *size)
+{
+ gf_boolean_t found = _gf_false;
+
+ LOCK(&inode->lock);
+ {
+ found = __ec_get_inode_size (fop, inode, size);
+ }
UNLOCK(&inode->lock);
return found;
}
-gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
- uint64_t size)
+gf_boolean_t
+__ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t size)
{
ec_inode_t *ctx;
gf_boolean_t found = _gf_false;
- LOCK(&inode->lock);
-
ctx = __ec_inode_get(inode, fop->xl);
if (ctx == NULL) {
- goto unlock;
+ goto out;
}
/* Normal fops always have ctx->have_size set. However self-heal calls this
@@ -1368,8 +1418,21 @@ gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
found = _gf_true;
-unlock:
- UNLOCK(&inode->lock);
+out:
+ return found;
+}
+
+gf_boolean_t
+ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t size)
+{
+ gf_boolean_t found = _gf_false;
+
+ LOCK (&inode->lock);
+ {
+ found = __ec_set_inode_size (fop, inode, size);
+ }
+ UNLOCK (&inode->lock);
return found;
}
@@ -1476,34 +1539,47 @@ ec_lock_update_fd(ec_lock_t *lock, ec_fop_data_t *fop)
}
}
+static gf_boolean_t
+ec_link_has_lock_conflict (ec_lock_link_t *link, struct list_head *owners)
+{
+ ec_lock_link_t *owner_link = NULL;
+ ec_t *ec = link->fop->xl->private;
+
+ if (!ec->parallel_writes)
+ return _gf_true;
+
+ list_for_each_entry (owner_link, owners, owner_list) {
+ if (ec_lock_conflict (owner_link, link))
+ return _gf_true;
+ }
+ return _gf_false;
+}
+
static void
ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list)
{
ec_fop_data_t *fop;
ec_lock_link_t *link;
- gf_boolean_t exclusive = _gf_false;
+ gf_boolean_t conflict = _gf_false;
- while (!exclusive && !list_empty(&lock->waiting)) {
+ while (!conflict && !list_empty(&lock->waiting)) {
link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list);
fop = link->fop;
/* If lock is not acquired, at most one fop can be assigned as owner.
* The following fops will need to wait in the lock->waiting queue
* until the lock has been fully acquired. */
- exclusive = !lock->acquired;
+ conflict = !lock->acquired;
/* If the fop is not shareable, only this fop can be assigned as owner.
* Other fops will need to wait until this one finishes. */
- if ((fop->flags & EC_FLAG_LOCK_SHARED) == 0) {
- exclusive = _gf_true;
-
- /* Avoid other requests to be assigned as owners. */
- lock->exclusive = 1;
+ if (ec_link_has_lock_conflict (link, &lock->owners)) {
+ conflict = _gf_true;
}
/* If only one fop is allowed, it can be assigned as the owner of the
* lock only if there weren't any other owner. */
- if (exclusive && !list_empty(&lock->owners)) {
+ if (conflict && !list_empty(&lock->owners)) {
break;
}
@@ -1570,9 +1646,7 @@ void ec_lock_acquired(ec_lock_link_t *link)
lock->acquired = _gf_true;
ec_lock_update_fd(lock, fop);
- if ((fop->flags & EC_FLAG_LOCK_SHARED) != 0) {
- ec_lock_wake_shared(lock, &list);
- }
+ ec_lock_wake_shared(lock, &list);
UNLOCK(&lock->loc.inode->lock);
@@ -1683,11 +1757,11 @@ ec_lock_assign_owner(ec_lock_link_t *link)
/* We are trying to acquire a lock that has an unlock timer active.
* This means that the lock must be idle, i.e. no fop can be in the
* owner, waiting or frozen lists. It also means that the lock cannot
- * have been marked as being released (this is done without timers)
- * and it must not be exclusive. There should only be one owner
- * reference, but it's possible that some fops are being prepared to
- * use this lock. */
- GF_ASSERT ((lock->exclusive == 0) && (lock->refs_owners == 1) &&
+ * have been marked as being released (this is done without timers).
+ * There should only be one owner reference, but it's possible that
+ * some fops are being prepared to use this lock.
+ */
+ GF_ASSERT ((lock->refs_owners == 1) &&
list_empty(&lock->owners) && list_empty(&lock->waiting));
/* We take the timer_link before cancelling the timer, since a
@@ -1735,13 +1809,15 @@ ec_lock_assign_owner(ec_lock_link_t *link)
lock->timer = NULL;
}
- lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
-
if (!list_empty(&lock->owners)) {
/* There are other owners of this lock. We can only take ownership if
- * the lock is already acquired and can be shared. Otherwise we need
- * to wait. */
- if (!lock->acquired || (lock->exclusive != 0)) {
+ * the lock is already acquired and doesn't have conflict with existing
+ * owners, or waiters(to prevent starvation).
+ * Otherwise we need to wait.
+ */
+ if (!lock->acquired ||
+ ec_link_has_lock_conflict (link, &lock->owners) ||
+ ec_link_has_lock_conflict (link, &lock->waiting)) {
ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock);
list_add_tail(&link->wait_list, &lock->waiting);
@@ -1819,10 +1895,7 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
}
ec_lock_update_good(lock, fop);
- lock->exclusive -= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
- if (list_empty(&lock->owners)) {
- ec_lock_wake_shared(lock, &list);
- }
+ ec_lock_wake_shared(lock, &list);
UNLOCK(&lock->loc.inode->lock);
@@ -1876,11 +1949,11 @@ ec_lock_unfreeze(ec_lock_link_t *link)
lock->acquired = _gf_false;
/* We are unfreezing a lock. This means that the lock has already been
- * released. In this state it shouldn't be exclusive nor have a pending
- * timer nor have any owner, and the waiting list should be empty. Only
- * the frozen list can contain some fop. */
- GF_ASSERT((lock->exclusive == 0) && (lock->timer == NULL) &&
- list_empty(&lock->waiting) && list_empty(&lock->owners));
+ * released. In this state it shouldn't have a pending timer nor have any
+ * owner, and the waiting list should be empty. Only the frozen list can
+ * contain some fop. */
+ GF_ASSERT((lock->timer == NULL) && list_empty(&lock->waiting) &&
+ list_empty(&lock->owners));
/* We move all frozen fops to the waiting list. */
list_splice_init(&lock->frozen, &lock->waiting);
@@ -2013,7 +2086,7 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
ec_fop_data_t *fop;
ec_lock_t *lock;
ec_inode_t *ctx;
- dict_t * dict;
+ dict_t *dict = NULL;
uintptr_t update_on = 0;
int32_t err = -ENOMEM;
@@ -2203,12 +2276,12 @@ ec_unlock_timer_del(ec_lock_link_t *link)
ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);
/* The unlock timer has expired without anyone cancelling it.
- * This means that it shouldn't have any owner, and the
- * waiting and frozen lists should be empty. It shouldn't have
- * been marked as release nor be exclusive either. It must have
- * only one owner reference, but there can be fops being
- * prepared though. */
- GF_ASSERT(!lock->release && (lock->exclusive == 0) &&
+ * This means that it shouldn't have any owner, and the waiting
+ * and frozen lists should be empty. It must have only one
+ * owner reference, but there can be fops being prepared
+ * though.
+ * */
+ GF_ASSERT(!lock->release &&
(lock->refs_owners == 1) &&
list_empty(&lock->owners) &&
list_empty(&lock->waiting) &&
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index f5d62269acf..2744c6372a1 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -91,18 +91,24 @@ ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro);
gf_boolean_t
ec_cbk_set_error(ec_cbk_data_t *cbk, int32_t error, gf_boolean_t ro);
-void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags);
+void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
+ off_t fl_start, size_t fl_size);
void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
uint32_t flags);
-void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags);
+void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,
+ off_t fl_start, size_t fl_size);
void ec_lock(ec_fop_data_t * fop);
void ec_lock_reuse(ec_fop_data_t *fop);
void ec_unlock(ec_fop_data_t * fop);
gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
uint64_t *size);
+gf_boolean_t __ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t *size);
gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
uint64_t size);
+gf_boolean_t __ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t size);
void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode);
void ec_flush_size_version(ec_fop_data_t * fop);
diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c
index 4fe82e3c0b6..48afe54460f 100644
--- a/xlators/cluster/ec/src/ec-dir-read.c
+++ b/xlators/cluster/ec/src/ec-dir-read.c
@@ -141,7 +141,8 @@ int32_t ec_manager_opendir(ec_fop_data_t * fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+ LLONG_MAX);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -432,7 +433,8 @@ int32_t ec_manager_readdir(ec_fop_data_t * fop, int32_t state)
}
fop->mask &= 1ULL << idx;
} else {
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
+ LLONG_MAX);
ec_lock(fop);
}
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index ddb90ce39cc..a5f986e74f4 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -85,7 +85,7 @@ int32_t ec_manager_flush(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_fd(fop, fop->fd, 0);
+ ec_lock_prepare_fd(fop, fop->fd, 0, 0, LLONG_MAX);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -300,7 +300,7 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, LLONG_MAX);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -501,7 +501,7 @@ int32_t ec_manager_fsyncdir(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_fd(fop, fop->fd, 0);
+ ec_lock_prepare_fd(fop, fop->fd, 0, 0, LLONG_MAX);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -1220,9 +1220,11 @@ int32_t ec_manager_xattrop(ec_fop_data_t * fop, int32_t state)
case EC_STATE_INIT:
case EC_STATE_LOCK:
if (fop->fd == NULL) {
- ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META);
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META, 0,
+ LLONG_MAX);
} else {
- ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META);
+ ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META, 0,
+ LLONG_MAX);
}
ec_lock(fop);
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
index 829f47f76aa..33fd7f549bb 100644
--- a/xlators/cluster/ec/src/ec-inode-read.c
+++ b/xlators/cluster/ec/src/ec-inode-read.c
@@ -72,7 +72,8 @@ ec_manager_access(ec_fop_data_t *fop, int32_t state)
switch (state) {
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO);
+ ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO, 0,
+ LLONG_MAX);
ec_lock (fop);
return EC_STATE_DISPATCH;
@@ -311,9 +312,11 @@ int32_t ec_manager_getxattr(ec_fop_data_t * fop, int32_t state)
(strncmp(fop->str[0], GF_XATTR_CLRLK_CMD,
strlen(GF_XATTR_CLRLK_CMD)) != 0)) {
if (fop->fd == NULL) {
- ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO,
+ 0, LLONG_MAX);
} else {
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
+ LLONG_MAX);
}
ec_lock(fop);
}
@@ -1029,7 +1032,8 @@ int32_t ec_manager_readlink(ec_fop_data_t * fop, int32_t state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
- ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO);
+ ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO, 0,
+ LLONG_MAX);
ec_lock (fop);
return EC_STATE_DISPATCH;
@@ -1364,7 +1368,8 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset,
+ fop->size);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -1568,7 +1573,7 @@ int32_t ec_manager_seek(ec_fop_data_t *fop, int32_t state)
/* Fall through */
case EC_STATE_LOCK:
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset, LLONG_MAX);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -1788,9 +1793,10 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)
case EC_STATE_INIT:
case EC_STATE_LOCK:
if (fop->fd == NULL) {
- ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO);
+ ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+ LLONG_MAX);
} else {
- ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
+ ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, LLONG_MAX);
}
ec_lock(fop);
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
index 3ed9b2a1ba4..e6a67cf67bc 100644
--- a/xlators/cluster/ec/src/ec-inode-write.c
+++ b/xlators/cluster/ec/src/ec-inode-write.c
@@ -127,10 +127,12 @@ ec_manager_xattr (ec_fop_data_t *fop, int32_t state)
case EC_STATE_LOCK:
if (fop->fd == NULL) {
ec_lock_prepare_inode(fop, &fop->loc[0],
- EC_UPDATE_META | EC_QUERY_INFO);
+ EC_UPDATE_META | EC_QUERY_INFO,
+ 0, LLONG_MAX);
} else {
ec_lock_prepare_fd(fop, fop->fd,
- EC_UPDATE_META | EC_QUERY_INFO);
+ EC_UPDATE_META | EC_QUERY_INFO,
+ 0, LLONG_MAX);
}
ec_lock(fop);
@@ -369,10 +371,11 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)
case EC_STATE_LOCK:
if (fop->fd == NULL) {
ec_lock_prepare_inode(fop, &fop->loc[0],
- EC_UPDATE_META | EC_QUERY_INFO);
+ EC_UPDATE_META | EC_QUERY_INFO,
+ 0, LLONG_MAX);
} else {
- ec_lock_prepare_fd(fop, fop->fd,
- EC_UPDATE_META | EC_QUERY_INFO);
+ ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO,
+ 0, LLONG_MAX);
}
ec_lock(fop);
@@ -879,8 +882,8 @@ int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
case EC_STATE_LOCK:
ec_lock_prepare_fd(fop, fop->fd,
- EC_UPDATE_DATA | EC_UPDATE_META |
- EC_QUERY_INFO);
+ EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
+ fop->offset, fop->size);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -898,24 +901,28 @@ int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
cbk->count);
/* This shouldn't fail because we have the inode locked. */
- GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
- &cbk->iatt[0].ia_size));
+ LOCK(&fop->locks[0].lock->loc.inode->lock);
+ {
+ GF_ASSERT(__ec_get_inode_size(fop,
+ fop->locks[0].lock->loc.inode,
+ &cbk->iatt[0].ia_size));
- /*If mode has FALLOC_FL_KEEP_SIZE keep the size */
- if (fop->int32 & FALLOC_FL_KEEP_SIZE) {
- cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
- } else if (fop->user_size > cbk->iatt[0].ia_size) {
- cbk->iatt[1].ia_size = fop->user_size;
-
- /* This shouldn't fail because we have the inode
- * locked. */
- GF_ASSERT(ec_set_inode_size(fop,
- fop->locks[0].lock->loc.inode,
- cbk->iatt[1].ia_size));
- } else {
- cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+ /*If mode has FALLOC_FL_KEEP_SIZE keep the size */
+ if (fop->int32 & FALLOC_FL_KEEP_SIZE) {
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+ } else if (fop->user_size > cbk->iatt[0].ia_size) {
+ cbk->iatt[1].ia_size = fop->user_size;
+
+ /* This shouldn't fail because we have the inode
+ * locked. */
+ GF_ASSERT(__ec_set_inode_size(fop,
+ fop->locks[0].lock->loc.inode,
+ cbk->iatt[1].ia_size));
+ } else {
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+ }
}
-
+ UNLOCK(&fop->locks[0].lock->loc.inode->lock);
}
return EC_STATE_REPORT;
@@ -1155,11 +1162,11 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
if (fop->id == GF_FOP_TRUNCATE) {
ec_lock_prepare_inode(fop, &fop->loc[0],
EC_UPDATE_DATA | EC_UPDATE_META |
- EC_QUERY_INFO);
+ EC_QUERY_INFO, fop->offset, LLONG_MAX);
} else {
ec_lock_prepare_fd(fop, fop->fd,
EC_UPDATE_DATA | EC_UPDATE_META |
- EC_QUERY_INFO);
+ EC_QUERY_INFO, fop->offset, LLONG_MAX);
}
ec_lock(fop);
@@ -1179,6 +1186,9 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
cbk->count);
/* This shouldn't fail because we have the inode locked. */
+ /* Inode size doesn't need to be updated under locks, because
+ * conflicting operations won't be in-flight
+ */
GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
&cbk->iatt[0].ia_size));
cbk->iatt[1].ia_size = fop->user_size;
@@ -1582,6 +1592,9 @@ void ec_writev_start(ec_fop_data_t *fop)
ctx = ec_fd_get(fop->fd, fop->xl);
if (ctx != NULL) {
if ((ctx->flags & O_APPEND) != 0) {
+ /* Appending writes take full locks so size won't change because
+ * of any parallel operations
+ */
fop->offset = current;
}
}
@@ -1601,6 +1614,10 @@ void ec_writev_start(ec_fop_data_t *fop)
}
tail = fop->size - fop->user_size - fop->head;
if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
+ /* Current locking scheme will make sure the 'current' below will
+ * never decrease while the fop is in progress, so the checks will
+ * work as expected
+ */
if (current > fop->offset + fop->head + fop->user_size) {
if (ec_make_internal_fop_xdata (&xdata)) {
err = -ENOMEM;
@@ -1678,14 +1695,32 @@ ec_writev_encode(ec_fop_data_t *fop)
int32_t ec_manager_writev(ec_fop_data_t *fop, int32_t state)
{
ec_cbk_data_t *cbk;
+ ec_fd_t *ctx = NULL;
+ ec_t *ec = fop->xl->private;
+ off_t fl_start = 0;
+ size_t fl_size = LLONG_MAX;
switch (state)
{
case EC_STATE_INIT:
case EC_STATE_LOCK:
+ ctx = ec_fd_get(fop->fd, fop->xl);
+ if (ctx != NULL) {
+ if ((ctx->flags & O_APPEND) == 0) {
+ off_t user_size = 0;
+ off_t head = 0;
+
+ fl_start = fop->offset;
+ user_size = iov_length(fop->vector, fop->int32);
+ head = ec_adjust_offset_down(ec, &fl_start,
+ _gf_true);
+ fl_size = user_size + head;
+ ec_adjust_size_up(ec, &fl_size, _gf_true);
+ }
+ }
ec_lock_prepare_fd(fop, fop->fd,
EC_UPDATE_DATA | EC_UPDATE_META |
- EC_QUERY_INFO);
+ EC_QUERY_INFO, fl_start, fl_size);
ec_lock(fop);
return EC_STATE_DISPATCH;
@@ -1717,23 +1752,28 @@ int32_t ec_manager_writev(ec_fop_data_t *fop, int32_t state)
cbk->count);
/* This shouldn't fail because we have the inode locked. */
- GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode,
- &cbk->iatt[0].ia_size));
- cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
- size = fop->offset + fop->head + fop->user_size;
- if (size > cbk->iatt[0].ia_size) {
- /* Only update inode size if this is a top level fop.
- * Otherwise this is an internal write and the top
- * level fop should take care of the real inode size.
- */
- if (fop->parent == NULL) {
- /* This shouldn't fail because we have the inode
- * locked. */
- GF_ASSERT(ec_set_inode_size(fop, fop->fd->inode,
- size));
- }
- cbk->iatt[1].ia_size = size;
+ LOCK(&fop->fd->inode->lock);
+ {
+ GF_ASSERT(__ec_get_inode_size(fop, fop->fd->inode,
+ &cbk->iatt[0].ia_size));
+ cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+ size = fop->offset + fop->head + fop->user_size;
+ if (size > cbk->iatt[0].ia_size) {
+ /* Only update inode size if this is a top level fop.
+ * Otherwise this is an internal write and the top
+ * level fop should take care of the real inode size.
+ */
+ if (fop->parent == NULL) {
+ /* This shouldn't fail because we have the inode
+ * locked. */
+ GF_ASSERT(__ec_set_inode_size(fop,
+ fop->fd->inode, size));
+ }
+ cbk->iatt[1].ia_size = size;
+ }
}
+ UNLOCK(&fop->fd->inode->lock);
+
if (fop->error == 0) {
cbk->op_ret *= ec->fragments;
if (cbk->op_ret < fop->head) {
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 94ac911238e..9aed7dc1d27 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -220,8 +220,8 @@ struct _ec_lock {
struct list_head owners;
/* List of fops waiting to be an owner of the lock. Fops are added to this
- * list when the current owner has an incompatible access (shared vs
- * exclusive) or the lock is not acquired yet. */
+ * list when the current owner has an incompatible access (conflicting lock)
+ * or the lock is not acquired yet. */
struct list_head waiting;
/* List of fops that will wait until the next unlock/lock cycle. This
@@ -230,7 +230,6 @@ struct _ec_lock {
* after the lock is reacquired. */
struct list_head frozen;
- int32_t exclusive;
uintptr_t mask;
uintptr_t good_mask;
uintptr_t healing;
@@ -260,6 +259,8 @@ struct _ec_lock_link {
loc_t *base;
uint64_t size;
uint32_t waiting_flags;
+ off_t fl_start;
+ off_t fl_end;
};
struct _ec_fop_data {
@@ -573,6 +574,7 @@ struct _ec {
gf_boolean_t shutdown;
gf_boolean_t eager_lock;
gf_boolean_t optimistic_changelog;
+ gf_boolean_t parallel_writes;
uint32_t background_heals;
uint32_t heal_wait_qlen;
uint32_t self_heal_window_size; /* max size of read/writes */
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index c32f4ef21dd..856d60c00c9 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -295,6 +295,8 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("optimistic-change-log", ec->optimistic_changelog,
options, bool, failed);
+ GF_OPTION_RECONF ("parallel-writes", ec->parallel_writes,
+ options, bool, failed);
ret = 0;
if (ec_assign_read_policy (ec, read_policy)) {
ret = -1;
@@ -665,6 +667,7 @@ init (xlator_t *this)
GF_OPTION_INIT ("shd-max-threads", ec->shd.max_threads, uint32, failed);
GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);
+ GF_OPTION_INIT ("parallel-writes", ec->parallel_writes, bool, failed);
this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);
if (!this->itable)
@@ -1466,28 +1469,34 @@ struct volume_options options[] =
"galois field computations."
},
{ .key = {"self-heal-window-size"},
- .type = GF_OPTION_TYPE_INT,
- .min = 1,
- .max = 1024,
- .default_value = "1",
- .description = "Maximum number blocks(128KB) per file for which "
- "self-heal process would be applied simultaneously."
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 1024,
+ .default_value = "1",
+ .description = "Maximum number blocks(128KB) per file for which "
+ "self-heal process would be applied simultaneously."
},
- { .key = {"optimistic-change-log"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- .description = "Set/Unset dirty flag for every update fop at the start"
- "of the fop. If OFF, this option impacts performance of"
- "entry operations or metadata operations as it will"
- "set dirty flag at the start and unset it at the end of"
- "ALL update fop. If ON and all the bricks are good,"
- "dirty flag will be set at the start only for file fops"
- "For metadata and entry fops dirty flag will not be set"
- "at the start, if all the bricks are good. This does"
- "not impact performance for metadata operations and"
- "entry operation but has a very small window to miss"
- "marking entry as dirty in case it is required to be"
- "healed"
+ { .key = {"optimistic-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Set/Unset dirty flag for every update fop at the start"
+ "of the fop. If OFF, this option impacts performance of"
+ "entry operations or metadata operations as it will"
+ "set dirty flag at the start and unset it at the end of"
+ "ALL update fop. If ON and all the bricks are good,"
+ "dirty flag will be set at the start only for file fops"
+ "For metadata and entry fops dirty flag will not be set"
+ "at the start, if all the bricks are good. This does"
+ "not impact performance for metadata operations and"
+ "entry operation but has a very small window to miss"
+ "marking entry as dirty in case it is required to be"
+ "healed"
+ },
+ { .key = {"parallel-writes"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "This controls if writes can be wound in parallel as long"
+ "as it doesn't modify same stripes"
},
{ .key = {NULL} }
};
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 87d478bf11d..a6941c205aa 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3559,6 +3559,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.type = NO_DOC,
.op_version = GD_OP_VERSION_4_0_0,
},
+ { .key = "disperse.parallel-writes",
+ .voltype = "cluster/disperse",
+ .type = NO_DOC,
+ .op_version = GD_OP_VERSION_3_13_0,
+ .flags = VOLOPT_FLAG_CLIENT_OPT
+ },
{ .key = NULL
}
};