/* Copyright (c) 2012-2014 DataLab, s.l. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser General Public License, version 3 or any later version (LGPLv3 or later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ #include "byte-order.h" #include "hashfn.h" #include "ec-mem-types.h" #include "ec-types.h" #include "ec-helpers.h" #include "ec-combine.h" #include "ec-common.h" #include "ec-fops.h" #include "ec-method.h" #include "ec.h" #include "ec-messages.h" #define EC_INVALID_INDEX UINT32_MAX #define EC_XATTROP_ALL_WAITING_FLAGS (EC_FLAG_WAITING_XATTROP |\ EC_FLAG_WAITING_DATA_DIRTY |\ EC_FLAG_WAITING_METADATA_DIRTY) off_t ec_range_end_get (off_t fl_start, size_t fl_size) { off_t fl_end = 0; switch (fl_size) { case 0: return fl_start; case LLONG_MAX: /*Infinity*/ return LLONG_MAX; default: fl_end = fl_start + fl_size - 1; if (fl_end < 0) /*over-flow*/ return LLONG_MAX; else return fl_end; } } static gf_boolean_t ec_is_range_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2) { return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start)); } static gf_boolean_t ec_lock_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2) { if ((l1->fop->flags & EC_FLAG_LOCK_SHARED) && (l2->fop->flags & EC_FLAG_LOCK_SHARED)) return _gf_false; return ec_is_range_conflict (l1, l2); } uint32_t ec_select_first_by_read_policy (ec_t *ec, ec_fop_data_t *fop) { if (ec->read_policy == EC_ROUND_ROBIN) { return ec->idx; } else if (ec->read_policy == EC_GFID_HASH) { if (fop->use_fd) { return SuperFastHash((char *)fop->fd->inode->gfid, sizeof(fop->fd->inode->gfid)) % ec->nodes; } else { if (gf_uuid_is_null (fop->loc[0].gfid)) loc_gfid (&fop->loc[0], fop->loc[0].gfid); return SuperFastHash((char *)fop->loc[0].gfid, sizeof(fop->loc[0].gfid)) % ec->nodes; } } return 0; } static gf_boolean_t ec_child_valid(ec_t * ec, ec_fop_data_t * fop, uint32_t idx) { return (idx < ec->nodes) && (((fop->remaining >> idx) & 1) == 1); } static uint32_t ec_child_next(ec_t * ec, ec_fop_data_t * fop, uint32_t idx) { while (!ec_child_valid(ec, fop, idx)) { if (++idx >= ec->nodes) { idx = 0; } if (idx == fop->first) { return EC_INVALID_INDEX; } } return idx; } int32_t ec_heal_report(call_frame_t * frame, void * cookie, xlator_t * this, int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good, uintptr_t bad, dict_t * xdata) { if (op_ret < 0) { gf_msg (this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL, "Heal failed"); } else { if ((mask & ~good) != 0) { gf_msg (this->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_SUCCESS, "Heal succeeded on %d/%d " "subvolumes", gf_bits_count(mask & ~(good | bad)), gf_bits_count(mask & ~good)); } } return 0; } static uintptr_t ec_fop_needs_name_heal (ec_fop_data_t *fop) { ec_t *ec = NULL; ec_cbk_data_t *cbk = NULL; ec_cbk_data_t *enoent_cbk = NULL; ec = fop->xl->private; if (fop->id != GF_FOP_LOOKUP) return 0; if (!fop->loc[0].name || strlen (fop->loc[0].name) == 0) return 0; list_for_each_entry(cbk, &fop->cbk_list, list) { if (cbk->op_ret < 0 && cbk->op_errno == ENOENT) { enoent_cbk = cbk; break; } } if (!enoent_cbk) return 0; return ec->xl_up & ~enoent_cbk->mask; } int32_t ec_fop_needs_heal(ec_fop_data_t *fop) { ec_t *ec = fop->xl->private; if (fop->lock_count == 0) { /* * if fop->lock_count is zero that means it saw version mismatch * without any locks so it can't be trusted. If we launch a heal * based on this it will lead to INODELKs which will affect I/O * performance. 
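 * For example, a lookup that races with an ongoing write can temporarily
 * observe different versions on different bricks even though none of them
 * is actually bad, so a heal triggered from that observation alone would be
 * wasted work.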
Considering self-heal-daemon and operations on * the inode from client which take locks can still trigger the * heal we can choose to not attempt a heal when fop->lock_count * is zero. */ return 0; } return (ec->xl_up & ~(fop->remaining | fop->good)) != 0; } void ec_check_status(ec_fop_data_t * fop) { ec_t * ec = fop->xl->private; int32_t partial = 0; char str1[32], str2[32], str3[32], str4[32], str5[32]; if (!ec_fop_needs_name_heal (fop) && !ec_fop_needs_heal(fop)) { return; } if (fop->answer && fop->answer->op_ret >= 0) { if ((fop->id == GF_FOP_LOOKUP) || (fop->id == GF_FOP_STAT) || (fop->id == GF_FOP_FSTAT)) { partial = fop->answer->iatt[0].ia_type == IA_IFDIR; } else if (fop->id == GF_FOP_OPENDIR) { partial = 1; } } gf_msg (fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS, "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " "remaining=%s, good=%s, bad=%s)", gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), ec_bin(str4, sizeof(str4), fop->good, ec->nodes), ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), ec->nodes)); if (fop->use_fd) { if (fop->fd != NULL) { ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, fop->fd, partial, NULL); } } else { ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, &fop->loc[0], partial, NULL); if (fop->loc[1].inode != NULL) { ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, &fop->loc[1], partial, NULL); } } } void ec_update_good(ec_fop_data_t *fop, uintptr_t good) { fop->good = good; /* Fops that are executed only on one brick do not have enough information * to decide if healing is needed or not. */ if ((fop->expected != 1) && (fop->parent == NULL)) { ec_check_status(fop); } } void ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop) { /* Fops that are executed only on one brick do not have enough information * to update the global mask of good bricks. */ if (fop->expected == 1) { return; } /* When updating the good mask of the lock, we only take into consideration * those bits corresponding to the bricks where the fop has been executed. * Bad bricks are removed from good_mask, but once marked as bad it's never * set to good until the lock is released and reacquired */ lock->good_mask &= fop->good | fop->remaining; } void __ec_fop_set_error(ec_fop_data_t * fop, int32_t error) { if ((error != 0) && (fop->error == 0)) { fop->error = error; } } void ec_fop_set_error(ec_fop_data_t * fop, int32_t error) { LOCK(&fop->lock); __ec_fop_set_error(fop, error); UNLOCK(&fop->lock); } gf_boolean_t ec_cbk_set_error(ec_cbk_data_t *cbk, int32_t error, gf_boolean_t ro) { if ((error != 0) && (cbk->op_ret >= 0)) { /* If cbk->op_errno was 0, it means that the fop succeeded and this * error has happened while processing the answer. If the operation was * read-only, there's no problem (i.e. we simply return the generated * error code). However if it caused a modification, we must return EIO * to indicate that the operation has been partially executed. */ cbk->op_errno = ro ? 
error : EIO; cbk->op_ret = -1; ec_fop_set_error(cbk->fop, cbk->op_errno); } return (cbk->op_ret < 0); } ec_cbk_data_t * ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro) { ec_cbk_data_t *cbk; int32_t err; cbk = fop->answer; if (cbk == NULL) { ec_fop_set_error(fop, EIO); return NULL; } if (cbk->op_ret < 0) { ec_fop_set_error(fop, cbk->op_errno); } err = ec_dict_combine(cbk, EC_COMBINE_XDATA); if (ec_cbk_set_error(cbk, -err, ro)) { return NULL; } return cbk; } void ec_sleep(ec_fop_data_t *fop) { LOCK(&fop->lock); GF_ASSERT (fop->refs > 0); fop->refs++; fop->jobs++; UNLOCK(&fop->lock); } int32_t ec_check_complete(ec_fop_data_t * fop, ec_resume_f resume) { int32_t error = -1; LOCK(&fop->lock); GF_ASSERT(fop->resume == NULL); if (--fop->jobs != 0) { ec_trace("WAIT", fop, "resume=%p", resume); fop->resume = resume; } else { error = fop->error; fop->error = 0; } UNLOCK(&fop->lock); return error; } void ec_resume(ec_fop_data_t * fop, int32_t error) { ec_resume_f resume = NULL; LOCK(&fop->lock); __ec_fop_set_error(fop, error); if (--fop->jobs == 0) { resume = fop->resume; fop->resume = NULL; if (resume != NULL) { ec_trace("RESUME", fop, "error=%d", error); if (fop->error != 0) { error = fop->error; } fop->error = 0; } } UNLOCK(&fop->lock); if (resume != NULL) { resume(fop, error); } ec_fop_data_release(fop); } void ec_resume_parent(ec_fop_data_t * fop, int32_t error) { ec_fop_data_t * parent; parent = fop->parent; if (parent != NULL) { ec_trace("RESUME_PARENT", fop, "error=%u", error); fop->parent = NULL; ec_resume(parent, error); } } gf_boolean_t ec_is_recoverable_error (int32_t op_errno) { switch (op_errno) { case ENOTCONN: case ESTALE: case ENOENT: case EBADFD:/*Opened fd but brick is disconnected*/ case EIO:/*Backend-fs crash like XFS/ext4 etc*/ return _gf_true; } return _gf_false; } void ec_complete(ec_fop_data_t * fop) { ec_cbk_data_t * cbk = NULL; int32_t resume = 0, update = 0; int healing_count = 0; LOCK(&fop->lock); ec_trace("COMPLETE", fop, ""); if (--fop->winds == 0) { if (fop->answer == NULL) { if (!list_empty(&fop->cbk_list)) { cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); healing_count = gf_bits_count (cbk->mask & fop->healing); /* fop shouldn't be treated as success if it is not * successful on at least fop->minimum good copies*/ if ((cbk->count - healing_count) >= fop->minimum) { fop->answer = cbk; update = 1; } } resume = 1; } } UNLOCK(&fop->lock); /* ec_update_good() locks inode->lock. This may cause deadlocks with fop->lock when used in another order. Since ec_update_good() will not be called more than once for each fop, it can be called from outside the fop->lock locked region. 
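 * This is also why the 'update' and 'resume' flags are only recorded while
 * holding fop->lock and acted upon after it has been released.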
*/ if (update) { ec_update_good(fop, cbk->mask); } if (resume) { ec_resume(fop, 0); } ec_fop_data_release(fop); } /* There could be already granted locks sitting on the bricks, unlock for which * must be wound at all costs*/ static gf_boolean_t ec_must_wind (ec_fop_data_t *fop) { if ((fop->id == GF_FOP_INODELK) || (fop->id == GF_FOP_FINODELK) || (fop->id == GF_FOP_LK)) { if (fop->flock.l_type == F_UNLCK) return _gf_true; } else if ((fop->id == GF_FOP_ENTRYLK) || (fop->id == GF_FOP_FENTRYLK)) { if (fop->entrylk_cmd == ENTRYLK_UNLOCK) return _gf_true; } return _gf_false; } static gf_boolean_t ec_internal_op (ec_fop_data_t *fop) { if (ec_must_wind (fop)) return _gf_true; if (fop->id == GF_FOP_XATTROP) return _gf_true; if (fop->id == GF_FOP_FXATTROP) return _gf_true; return _gf_false; } int32_t ec_child_select(ec_fop_data_t * fop) { ec_t * ec = fop->xl->private; int32_t first = 0, num = 0; ec_fop_cleanup(fop); fop->mask &= ec->node_mask; /* Wind the fop on same subvols as parent for any internal extra fops like * head/tail read in case of writev fop. Unlocks shouldn't do this because * unlock should go on all subvols where lock is performed*/ if (fop->parent && !ec_internal_op (fop)) { fop->mask &= (fop->parent->mask & ~fop->parent->healing); } if ((fop->mask & ~ec->xl_up) != 0) { gf_msg (fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_EXEC_UNAVAIL, "Executing operation with " "some subvolumes unavailable " "(%lX)", fop->mask & ~ec->xl_up); fop->mask &= ec->xl_up; } switch (fop->minimum) { case EC_MINIMUM_ALL: fop->minimum = gf_bits_count(fop->mask); if (fop->minimum >= ec->fragments) { break; } case EC_MINIMUM_MIN: fop->minimum = ec->fragments; break; case EC_MINIMUM_ONE: fop->minimum = 1; } if (ec->read_policy == EC_ROUND_ROBIN) { first = ec->idx; if (++first >= ec->nodes) { first = 0; } ec->idx = first; } num = gf_bits_count(fop->mask); /*Unconditionally wind on healing subvolumes*/ fop->mask |= fop->healing; fop->remaining = fop->mask; fop->received = 0; ec_trace("SELECT", fop, ""); if ((num < fop->minimum) && (num < ec->fragments)) { gf_msg (ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT, "Insufficient available children " "for this request (have %d, need " "%d)", num, fop->minimum); return 0; } ec_sleep(fop); return 1; } void ec_dispatch_next(ec_fop_data_t * fop, uint32_t idx) { uint32_t i = EC_INVALID_INDEX; ec_t * ec = fop->xl->private; LOCK(&fop->lock); i = ec_child_next(ec, fop, idx); if (i < EC_METHOD_MAX_NODES) { idx = i; fop->remaining ^= 1ULL << idx; ec_trace("EXECUTE", fop, "idx=%d", idx); fop->winds++; fop->refs++; } UNLOCK(&fop->lock); if (i < EC_METHOD_MAX_NODES) { fop->wind(ec, fop, idx); } } void ec_dispatch_mask(ec_fop_data_t * fop, uintptr_t mask) { ec_t * ec = fop->xl->private; int32_t count, idx; count = gf_bits_count(mask); LOCK(&fop->lock); ec_trace("EXECUTE", fop, "mask=%lX", mask); fop->remaining ^= mask; fop->winds += count; fop->refs += count; UNLOCK(&fop->lock); idx = 0; while (mask != 0) { if ((mask & 1) != 0) { fop->wind(ec, fop, idx); } idx++; mask >>= 1; } } void ec_dispatch_start(ec_fop_data_t * fop) { fop->answer = NULL; fop->good = 0; INIT_LIST_HEAD(&fop->cbk_list); if (fop->lock_count > 0) { ec_owner_copy(fop->frame, &fop->req_frame->root->lk_owner); } } void ec_dispatch_one(ec_fop_data_t * fop) { ec_dispatch_start(fop); if (ec_child_select(fop)) { fop->expected = 1; fop->first = ec_select_first_by_read_policy (fop->xl->private, fop); ec_dispatch_next(fop, fop->first); } } gf_boolean_t ec_dispatch_one_retry(ec_fop_data_t *fop, ec_cbk_data_t **cbk) 
{ ec_cbk_data_t *tmp; tmp = ec_fop_prepare_answer(fop, _gf_true); if (cbk != NULL) { *cbk = tmp; } if ((tmp != NULL) && (tmp->op_ret < 0) && ec_is_recoverable_error (tmp->op_errno)) { GF_ASSERT (fop->mask & (1ULL << tmp->idx)); fop->mask ^= (1ULL << tmp->idx); if (fop->mask) { return _gf_true; } } return _gf_false; } void ec_dispatch_inc(ec_fop_data_t * fop) { ec_dispatch_start(fop); if (ec_child_select(fop)) { fop->expected = gf_bits_count(fop->remaining); fop->first = 0; ec_dispatch_next(fop, 0); } } void ec_dispatch_all (ec_fop_data_t *fop) { ec_dispatch_start(fop); if (ec_child_select(fop)) { fop->expected = gf_bits_count(fop->remaining); fop->first = 0; ec_dispatch_mask(fop, fop->remaining); } } void ec_dispatch_min(ec_fop_data_t * fop) { ec_t * ec = fop->xl->private; uintptr_t mask; uint32_t idx; int32_t count; ec_dispatch_start(fop); if (ec_child_select(fop)) { fop->expected = count = ec->fragments; fop->first = ec_select_first_by_read_policy (fop->xl->private, fop); idx = fop->first - 1; mask = 0; while (count-- > 0) { idx = ec_child_next(ec, fop, idx + 1); if (idx < EC_METHOD_MAX_NODES) mask |= 1ULL << idx; } ec_dispatch_mask(fop, mask); } } ec_lock_t *ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc) { ec_t *ec = fop->xl->private; ec_lock_t * lock; int32_t err; if ((loc->inode == NULL) || (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid))) { gf_msg (fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_INODE, "Trying to lock based on an invalid " "inode"); __ec_fop_set_error(fop, EINVAL); return NULL; } lock = mem_get0(ec->lock_pool); if (lock != NULL) { lock->good_mask = -1ULL; INIT_LIST_HEAD(&lock->owners); INIT_LIST_HEAD(&lock->waiting); INIT_LIST_HEAD(&lock->frozen); err = ec_loc_from_loc(fop->xl, &lock->loc, loc); if (err != 0) { mem_put(lock); lock = NULL; __ec_fop_set_error(fop, -err); } } return lock; } void ec_lock_destroy(ec_lock_t * lock) { loc_wipe(&lock->loc); if (lock->fd != NULL) { fd_unref(lock->fd); } mem_put(lock); } int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2) { return gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid); } void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags, loc_t *base, off_t fl_start, size_t fl_size) { ec_lock_link_t *link; /* This check is only prepared for up to 2 locks per fop. If more locks * are needed this must be changed. */ if ((fop->lock_count > 0) && (ec_lock_compare(fop->locks[0].lock, lock) < 0)) { fop->first_lock = fop->lock_count; } else { /* When the first lock is added to the current fop, request lock * counts from locks xlator to be able to determine if there is * contention and release the lock sooner. 
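 * The count is requested by setting GLUSTERFS_INODELK_DOM_COUNT in xdata
 * below; the value returned in GLUSTERFS_INODELK_COUNT is later checked in
 * ec_lock_reuse() to decide whether the eager lock must be given up early.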
*/ if (fop->xdata == NULL) { fop->xdata = dict_new(); if (fop->xdata == NULL) { ec_fop_set_error(fop, ENOMEM); return; } } if (dict_set_str(fop->xdata, GLUSTERFS_INODELK_DOM_COUNT, fop->xl->name) != 0) { ec_fop_set_error(fop, ENOMEM); return; } } link = &fop->locks[fop->lock_count++]; link->lock = lock; link->fop = fop; link->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0; link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0; link->base = base; link->fl_start = fl_start; link->fl_end = ec_range_end_get (fl_start, fl_size); lock->refs_pending++; } void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, uint32_t flags, loc_t *base, off_t fl_start, size_t fl_size) { ec_lock_t *lock = NULL; ec_inode_t *ctx; if ((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL)) { return; } LOCK(&loc->inode->lock); ctx = __ec_inode_get(loc->inode, fop->xl); if (ctx == NULL) { __ec_fop_set_error(fop, ENOMEM); goto unlock; } if (ctx->inode_lock != NULL) { lock = ctx->inode_lock; /* If there's another lock, make sure that it's not the same. Otherwise * do not insert it. * * This can only happen on renames where source and target names are * in the same directory. */ if ((fop->lock_count > 0) && (fop->locks[0].lock == lock)) { /* Combine data/meta updates */ fop->locks[0].update[EC_DATA_TXN] |= (flags & EC_UPDATE_DATA) != 0; fop->locks[0].update[EC_METADATA_TXN] |= (flags & EC_UPDATE_META) != 0; /* Only one base inode is allowed per fop, so there shouldn't be * overwrites here. */ if (base != NULL) { fop->locks[0].base = base; } goto update_query; } ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p. Lock already " "acquired", lock, loc->inode); goto insert; } lock = ec_lock_allocate(fop, loc); if (lock == NULL) { goto unlock; } ec_trace("LOCK_CREATE", fop, "lock=%p", lock); lock->flock.l_type = F_WRLCK; lock->flock.l_whence = SEEK_SET; lock->ctx = ctx; ctx->inode_lock = lock; insert: ec_lock_insert(fop, lock, flags, base, fl_start, fl_size); update_query: lock->query |= (flags & EC_QUERY_INFO) != 0; unlock: UNLOCK(&loc->inode->lock); } void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags, off_t fl_start, size_t fl_size) { ec_lock_prepare_inode_internal(fop, loc, flags, NULL, fl_start, fl_size); } void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base, uint32_t flags) { loc_t tmp; int32_t err; if (fop->error != 0) { return; } err = ec_loc_parent(fop->xl, loc, &tmp); if (err != 0) { ec_fop_set_error(fop, -err); return; } if ((flags & EC_INODE_SIZE) != 0) { flags ^= EC_INODE_SIZE; } else { base = NULL; } ec_lock_prepare_inode_internal(fop, &tmp, flags, base, 0, LLONG_MAX); loc_wipe(&tmp); } void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, off_t fl_start, size_t fl_size) { loc_t loc; int32_t err; if (fop->error != 0) { return; } err = ec_loc_from_fd(fop->xl, &loc, fd); if (err != 0) { ec_fop_set_error(fop, -err); return; } ec_lock_prepare_inode_internal(fop, &loc, flags, NULL, fl_start, fl_size); loc_wipe(&loc); } gf_boolean_t ec_config_check (xlator_t *xl, ec_config_t *config) { ec_t *ec; ec = xl->private; if ((config->version != EC_CONFIG_VERSION) || (config->algorithm != EC_CONFIG_ALGORITHM) || (config->gf_word_size != EC_GF_BITS) || (config->bricks != ec->nodes) || (config->redundancy != ec->redundancy) || (config->chunk_size != EC_METHOD_CHUNK_SIZE)) { uint32_t data_bricks; /* This combination of version/algorithm requires the following values. 
Incorrect values for these fields are a sign of corruption: redundancy > 0 redundancy * 2 < bricks gf_word_size must be a power of 2 chunk_size (in bits) must be a multiple of gf_word_size * (bricks - redundancy) */ data_bricks = config->bricks - config->redundancy; if ((config->redundancy < 1) || (config->redundancy * 2 >= config->bricks) || !ec_is_power_of_2(config->gf_word_size) || ((config->chunk_size * 8) % (config->gf_word_size * data_bricks) != 0)) { gf_msg (xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_CONFIG, "Invalid or corrupted config"); } else { gf_msg (xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_CONFIG, "Unsupported config " "(V=%u, A=%u, W=%u, " "N=%u, R=%u, S=%u)", config->version, config->algorithm, config->gf_word_size, config->bricks, config->redundancy, config->chunk_size); } return _gf_false; } return _gf_true; } gf_boolean_t ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty) { gf_boolean_t set_dirty = _gf_false; if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) { if (!link->optimistic_changelog) dirty[EC_DATA_TXN] = 1; } if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) { if (!link->optimistic_changelog) dirty[EC_METADATA_TXN] = 1; } if (dirty[EC_METADATA_TXN] || dirty[EC_DATA_TXN]) { set_dirty = _gf_true; } return set_dirty; } int32_t ec_prepare_update_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { struct list_head list; ec_fop_data_t *fop = cookie, *parent, *tmp; ec_lock_link_t *parent_link = fop->data; ec_lock_link_t *link = NULL; ec_lock_t *lock = NULL; ec_inode_t *ctx; gf_boolean_t release = _gf_false; uint64_t waiting_flags = 0; uint64_t dirty[EC_VERSION_SIZE] = {0, 0}; lock = parent_link->lock; parent = parent_link->fop; ctx = lock->ctx; INIT_LIST_HEAD(&list); waiting_flags = parent_link->waiting_flags & EC_XATTROP_ALL_WAITING_FLAGS; LOCK(&lock->loc.inode->lock); list_for_each_entry(link, &lock->owners, owner_list) { if ((link->waiting_flags & waiting_flags) != 0) { link->waiting_flags ^= (link->waiting_flags & waiting_flags); if ((link->waiting_flags & EC_XATTROP_ALL_WAITING_FLAGS) == 0) list_add_tail(&link->fop->cbk_list, &list); } } if (op_ret < 0) { gf_msg (this->name, GF_LOG_WARNING, op_errno, EC_MSG_SIZE_VERS_GET_FAIL, "Failed to get size and version"); goto unlock; } if (waiting_flags & EC_FLAG_WAITING_XATTROP) { op_errno = -ec_dict_del_array(dict, EC_XATTR_VERSION, ctx->pre_version, EC_VERSION_SIZE); if (op_errno != 0) { gf_msg (this->name, GF_LOG_ERROR, op_errno, EC_MSG_VER_XATTR_GET_FAIL, "Unable to get version xattr"); goto unlock; } ctx->post_version[0] += ctx->pre_version[0]; ctx->post_version[1] += ctx->pre_version[1]; ctx->have_version = _gf_true; if (lock->loc.inode->ia_type == IA_IFREG || lock->loc.inode->ia_type == IA_INVAL) { op_errno = -ec_dict_del_number(dict, EC_XATTR_SIZE, &ctx->pre_size); if (op_errno != 0) { if (lock->loc.inode->ia_type == IA_IFREG) { gf_msg (this->name, GF_LOG_ERROR, op_errno, EC_MSG_SIZE_XATTR_GET_FAIL, "Unable to get size xattr"); goto unlock; } } else { ctx->post_size = ctx->pre_size; ctx->have_size = _gf_true; } op_errno = -ec_dict_del_config(dict, EC_XATTR_CONFIG, &ctx->config); if (op_errno != 0) { if ((lock->loc.inode->ia_type == IA_IFREG) || (op_errno != ENODATA)) { gf_msg (this->name, GF_LOG_ERROR, op_errno, EC_MSG_CONFIG_XATTR_GET_FAIL, "Unable to get config xattr"); goto unlock; } } else { if (!ec_config_check(parent->xl, &ctx->config)) { gf_msg (this->name, GF_LOG_ERROR, EINVAL, 
EC_MSG_CONFIG_XATTR_INVALID, "Invalid config xattr"); op_errno = EINVAL; goto unlock; } ctx->have_config = _gf_true; } } ctx->have_info = _gf_true; } ec_set_dirty_flag (fop->data, ctx, dirty); if (dirty[EC_METADATA_TXN] && (waiting_flags & EC_FLAG_WAITING_METADATA_DIRTY)) { GF_ASSERT (!ctx->dirty[EC_METADATA_TXN]); ctx->dirty[EC_METADATA_TXN] = 1; } if (dirty[EC_DATA_TXN] && (waiting_flags & EC_FLAG_WAITING_DATA_DIRTY)) { GF_ASSERT (!ctx->dirty[EC_DATA_TXN]); ctx->dirty[EC_DATA_TXN] = 1; } op_errno = 0; unlock: lock->waiting_flags ^= waiting_flags; if (op_errno == 0) { /* If the fop fails on any of the good bricks, it is important to mark * it dirty and update versions right away if dirty was not set before. */ if (lock->good_mask & ~(fop->good | fop->remaining)) { release = _gf_true; } if (parent_link->update[0] && !parent_link->dirty[0]) { lock->release |= release; } if (parent_link->update[1] && !parent_link->dirty[1]) { lock->release |= release; } /* We don't allow the main fop to be executed on bricks that have not * succeeded the initial xattrop. */ ec_lock_update_good (lock, fop); /*As of now only data healing marks bricks as healing*/ lock->healing |= fop->healing; } UNLOCK(&lock->loc.inode->lock); while (!list_empty(&list)) { tmp = list_entry(list.next, ec_fop_data_t, cbk_list); list_del_init(&tmp->cbk_list); if (op_errno == 0) { tmp->mask &= fop->good; /*As of now only data healing marks bricks as healing*/ if (ec_is_data_fop (tmp->id)) { tmp->healing |= fop->healing; } } ec_resume(tmp, op_errno); } return 0; } static uint64_t ec_set_xattrop_flags_and_params (ec_lock_t *lock, ec_lock_link_t *link, uint64_t *dirty) { uint64_t oldflags = 0; uint64_t newflags = 0; ec_inode_t *ctx = lock->ctx; oldflags = lock->waiting_flags & EC_XATTROP_ALL_WAITING_FLAGS; if (lock->query && !ctx->have_info) { lock->waiting_flags |= EC_FLAG_WAITING_XATTROP; link->waiting_flags |= EC_FLAG_WAITING_XATTROP; } if (dirty[EC_DATA_TXN]) { if (oldflags & EC_FLAG_WAITING_DATA_DIRTY) { dirty[EC_DATA_TXN] = 0; } else { lock->waiting_flags |= EC_FLAG_WAITING_DATA_DIRTY; } link->waiting_flags |= EC_FLAG_WAITING_DATA_DIRTY; } if (dirty[EC_METADATA_TXN]) { if (oldflags & EC_FLAG_WAITING_METADATA_DIRTY) { dirty[EC_METADATA_TXN] = 0; } else { lock->waiting_flags |= EC_FLAG_WAITING_METADATA_DIRTY; } link->waiting_flags |= EC_FLAG_WAITING_METADATA_DIRTY; } newflags = lock->waiting_flags & EC_XATTROP_ALL_WAITING_FLAGS; return oldflags ^ newflags; } void ec_get_size_version(ec_lock_link_t *link) { loc_t loc; ec_lock_t *lock; ec_inode_t *ctx; ec_fop_data_t *fop; dict_t *dict = NULL; dict_t *xdata = NULL; ec_t *ec = NULL; int32_t error = 0; gf_boolean_t set_dirty = _gf_false; uint64_t allzero[EC_VERSION_SIZE] = {0, 0}; uint64_t dirty[EC_VERSION_SIZE] = {0, 0}; lock = link->lock; ctx = lock->ctx; fop = link->fop; ec = fop->xl->private; uint64_t changed_flags = 0; if (ec->optimistic_changelog && !(ec->node_mask & ~link->lock->good_mask) && !ec_is_data_fop (fop->id)) link->optimistic_changelog = _gf_true; set_dirty = ec_set_dirty_flag (link, ctx, dirty); /* If ec metadata has already been retrieved, do not try again. */ if (ctx->have_info && (!set_dirty)) { if (ec_is_data_fop (fop->id)) { fop->healing |= lock->healing; } return; } /* Determine if there's something we need to retrieve for the current * operation. 
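 * That can be the version, size and config xattrs (all fetched with a single
 * xattrop below) and/or setting the dirty counters before the fop modifies
 * anything.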
*/ if (!set_dirty && !lock->query && (lock->loc.inode->ia_type != IA_IFREG) && (lock->loc.inode->ia_type != IA_INVAL)) { return; } memset(&loc, 0, sizeof(loc)); LOCK(&lock->loc.inode->lock); changed_flags = ec_set_xattrop_flags_and_params (lock, link, dirty); if (link->waiting_flags) { /* This fop needs to wait until all its flags are cleared which * potentially can be cleared by other xattrops that are already * wound*/ ec_sleep(fop); } else { GF_ASSERT (!changed_flags); } UNLOCK(&lock->loc.inode->lock); if (!changed_flags) goto out; dict = dict_new(); if (dict == NULL) { error = -ENOMEM; goto out; } if (changed_flags & EC_FLAG_WAITING_XATTROP) { /* Once we know that an xattrop will be needed, * we try to get all available information in a * single call. */ error = ec_dict_set_array(dict, EC_XATTR_VERSION, allzero, EC_VERSION_SIZE); if (error != 0) { goto out; } if (lock->loc.inode->ia_type == IA_IFREG || lock->loc.inode->ia_type == IA_INVAL) { error = ec_dict_set_number(dict, EC_XATTR_SIZE, 0); if (error == 0) { error = ec_dict_set_number(dict, EC_XATTR_CONFIG, 0); } if (error != 0) { goto out; } xdata = dict_new(); if (xdata == NULL || dict_set_int32 (xdata, GF_GET_SIZE, 1)) { error = -ENOMEM; goto out; } } } if (memcmp (allzero, dirty, sizeof (allzero))) { error = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE); if (error != 0) { goto out; } } fop->frame->root->uid = 0; fop->frame->root->gid = 0; /* For normal fops, ec_[f]xattrop() must succeed on at least * EC_MINIMUM_MIN bricks, however when this is called as part of a * self-heal operation the mask of target bricks (fop->mask) could * contain less than EC_MINIMUM_MIN bricks, causing the xattrop to * always fail. Thus we always use the same minimum used for the main * fop. */ if (lock->fd == NULL) { error = ec_loc_from_loc(fop->xl, &loc, &lock->loc); if (error != 0) { goto out; } if (gf_uuid_is_null(loc.pargfid)) { if (loc.parent != NULL) { inode_unref(loc.parent); loc.parent = NULL; } GF_FREE((char *)loc.path); loc.path = NULL; loc.name = NULL; } ec_xattrop (fop->frame, fop->xl, fop->mask, fop->minimum, ec_prepare_update_cbk, link, &loc, GF_XATTROP_ADD_ARRAY64, dict, xdata); } else { ec_fxattrop(fop->frame, fop->xl, fop->mask, fop->minimum, ec_prepare_update_cbk, link, lock->fd, GF_XATTROP_ADD_ARRAY64, dict, xdata); } error = 0; out: fop->frame->root->uid = fop->uid; fop->frame->root->gid = fop->gid; loc_wipe(&loc); if (dict != NULL) { dict_unref(dict); } if (xdata != NULL) { dict_unref(xdata); } if (error != 0) { ec_fop_set_error(fop, -error); } } gf_boolean_t __ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size) { ec_inode_t *ctx; gf_boolean_t found = _gf_false; ctx = __ec_inode_get(inode, fop->xl); if (ctx == NULL) { goto out; } if (ctx->have_size) { *size = ctx->post_size; found = _gf_true; } out: return found; } gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size) { gf_boolean_t found = _gf_false; LOCK(&inode->lock); { found = __ec_get_inode_size (fop, inode, size); } UNLOCK(&inode->lock); return found; } gf_boolean_t __ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size) { ec_inode_t *ctx; gf_boolean_t found = _gf_false; ctx = __ec_inode_get(inode, fop->xl); if (ctx == NULL) { goto out; } /* Normal fops always have ctx->have_size set. However self-heal calls this * to prepare the inode, so ctx->have_size will be false. In this case we * prepare both pre_size and post_size, and set have_size and have_info to * true. 
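 * When have_size is already set, only post_size is updated, so the delta
 * (post_size - pre_size) computed later in ec_update_info() stays consistent.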
*/ if (!ctx->have_size) { ctx->pre_size = size; ctx->have_size = ctx->have_info = _gf_true; } ctx->post_size = size; found = _gf_true; out: return found; } gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size) { gf_boolean_t found = _gf_false; LOCK (&inode->lock); { found = __ec_set_inode_size (fop, inode, size); } UNLOCK (&inode->lock); return found; } static void ec_release_stripe_cache (ec_inode_t *ctx) { ec_stripe_list_t *stripe_cache = NULL; ec_stripe_t *stripe = NULL; stripe_cache = &ctx->stripe_cache; while (!list_empty (&stripe_cache->lru)) { stripe = list_first_entry (&stripe_cache->lru, ec_stripe_t, lru); list_del (&stripe->lru); GF_FREE (stripe); } stripe_cache->count = 0; stripe_cache->max = 0; } void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode) { ec_inode_t *ctx; LOCK(&inode->lock); ctx = __ec_inode_get(inode, fop->xl); if (ctx == NULL) { goto unlock; } ec_release_stripe_cache (ctx); ctx->have_info = _gf_false; ctx->have_config = _gf_false; ctx->have_version = _gf_false; ctx->have_size = _gf_false; memset(&ctx->config, 0, sizeof(ctx->config)); memset(ctx->pre_version, 0, sizeof(ctx->pre_version)); memset(ctx->post_version, 0, sizeof(ctx->post_version)); ctx->pre_size = ctx->post_size = 0; memset(ctx->dirty, 0, sizeof(ctx->dirty)); unlock: UNLOCK(&inode->lock); } int32_t ec_get_real_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, struct iatt *postparent) { ec_fop_data_t *fop = cookie; ec_lock_link_t *link; if (op_ret >= 0) { link = fop->data; link->size = buf->ia_size; } else { /* Prevent failure of parent fop. */ fop->error = 0; } return 0; } /* This function is used to get the trusted.ec.size xattr from a file when * no lock is needed on the inode. This is only required to maintan iatt * structs on fops that manipulate directory entries but do not operate * directly on the inode, like link, rename, ... * * Any error processing this request is ignored. In the worst case, an invalid * or not up to date value in the iatt could cause some cache invalidation. */ void ec_get_real_size(ec_lock_link_t *link) { ec_fop_data_t *fop; dict_t *xdata; if (link->base == NULL || link->base->inode == NULL) { return; } if (link->base->inode->ia_type != IA_IFREG) { return; } fop = link->fop; if (ec_get_inode_size(fop, link->base->inode, &link->size)) { return; } xdata = dict_new(); if (xdata == NULL) { return; } if (ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) { goto out; } /* Send a simple lookup. A single answer is considered ok since this value * is only used to return an iatt struct related to an inode that is not * locked and have not suffered any operation. */ ec_lookup(fop->frame, fop->xl, fop->mask, 1, ec_get_real_size_cbk, link, link->base, xdata); out: if (xdata != NULL) { dict_unref(xdata); } } static void ec_lock_update_fd(ec_lock_t *lock, ec_fop_data_t *fop) { /* If the fop has an fd available, attach it to the lock structure to be * able to do fxattrop calls instead of xattrop. 
*/ if (fop->use_fd && (lock->fd == NULL)) { lock->fd = __fd_ref(fop->fd); } } static gf_boolean_t ec_link_has_lock_conflict (ec_lock_link_t *link, struct list_head *owners) { ec_lock_link_t *owner_link = NULL; ec_t *ec = link->fop->xl->private; if (!ec->parallel_writes) return _gf_true; list_for_each_entry (owner_link, owners, owner_list) { if (ec_lock_conflict (owner_link, link)) return _gf_true; } return _gf_false; } static void ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list) { ec_fop_data_t *fop; ec_lock_link_t *link; gf_boolean_t conflict = _gf_false; while (!conflict && !list_empty(&lock->waiting)) { link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list); fop = link->fop; /* If lock is not acquired, at most one fop can be assigned as owner. * The following fops will need to wait in the lock->waiting queue * until the lock has been fully acquired. */ conflict = !lock->acquired; /* If the fop is not shareable, only this fop can be assigned as owner. * Other fops will need to wait until this one finishes. */ if (ec_link_has_lock_conflict (link, &lock->owners)) { conflict = _gf_true; } /* If only one fop is allowed, it can be assigned as the owner of the * lock only if there weren't any other owner. */ if (conflict && !list_empty(&lock->owners)) { break; } list_move_tail(&link->wait_list, list); list_add_tail(&link->owner_list, &lock->owners); lock->refs_owners++; ec_lock_update_fd(lock, fop); } } static void ec_lock_apply(ec_lock_link_t *link) { ec_fop_data_t *fop = link->fop; fop->mask &= link->lock->good_mask; fop->locked++; ec_get_size_version(link); ec_get_real_size(link); } gf_boolean_t ec_lock_acquire(ec_lock_link_t *link); static void ec_lock_resume_shared(struct list_head *list) { ec_lock_link_t *link; while (!list_empty(list)) { link = list_entry(list->next, ec_lock_link_t, wait_list); list_del_init(&link->wait_list); if (link->lock->acquired) { ec_lock_apply(link); ec_lock(link->fop); } else { GF_ASSERT(list_empty(list)); ec_lock_acquire(link); } ec_resume(link->fop, 0); } } void ec_lock_acquired(ec_lock_link_t *link) { struct list_head list; ec_lock_t *lock; ec_fop_data_t *fop; lock = link->lock; fop = link->fop; ec_trace("LOCKED", fop, "lock=%p", lock); INIT_LIST_HEAD(&list); LOCK(&lock->loc.inode->lock); lock->acquired = _gf_true; ec_lock_update_fd(lock, fop); ec_lock_wake_shared(lock, &list); UNLOCK(&lock->loc.inode->lock); ec_lock_apply(link); ec_lock_resume_shared(&list); } int32_t ec_locked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { ec_fop_data_t *fop = cookie; ec_lock_link_t *link = NULL; ec_lock_t *lock = NULL; if (op_ret >= 0) { link = fop->data; lock = link->lock; lock->mask = lock->good_mask = fop->good; lock->healing = 0; ec_lock_acquired(link); ec_lock(fop->parent); } else { gf_msg (this->name, GF_LOG_WARNING, op_errno, EC_MSG_PREOP_LOCK_FAILED, "Failed to complete preop lock"); } return 0; } gf_boolean_t ec_lock_acquire(ec_lock_link_t *link) { ec_lock_t *lock; ec_fop_data_t *fop; gf_lkowner_t lk_owner; lock = link->lock; fop = link->fop; if (!lock->acquired) { set_lk_owner_from_ptr(&lk_owner, lock); ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock, lock->loc.inode); lock->flock.l_type = F_WRLCK; ec_inodelk(fop->frame, fop->xl, &lk_owner, -1, EC_MINIMUM_ALL, ec_locked, link, fop->xl->name, &lock->loc, F_SETLKW, &lock->flock, NULL); return _gf_false; } ec_trace("LOCK_REUSE", fop, "lock=%p", lock); ec_lock_acquired(link); return _gf_true; } static gf_boolean_t 
ec_lock_assign_owner(ec_lock_link_t *link)
{
    ec_fop_data_t *fop;
    ec_lock_t *lock;
    ec_lock_link_t *timer_link = NULL;
    gf_boolean_t assigned = _gf_false;

    /* The link cannot be in any list because we have just finished preparing
     * it. */
    GF_ASSERT(list_empty(&link->wait_list));

    fop = link->fop;
    lock = link->lock;

    LOCK(&lock->loc.inode->lock);

    /* Since the link has just been prepared but it's not active yet, the
     * refs_pending must be one at least (the ref owned by this link). */
    GF_ASSERT (lock->refs_pending > 0);
    /* The link is not pending any more. It will be assigned to the owner,
     * waiting or frozen list. */
    lock->refs_pending--;

    if (lock->release) {
        ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock);

        /* When lock->release is set, we'll unlock the lock as soon as
         * possible, meaning that we won't use a timer. */
        GF_ASSERT(lock->timer == NULL);

        /* The lock is marked to be released. We can still have owners and fops
         * in the waiting list if they have been added before the lock has been
         * marked to be released. However new fops are put into the frozen list
         * to wait for the next unlock/lock cycle. */
        list_add_tail(&link->wait_list, &lock->frozen);

        goto unlock;
    }

    /* The lock is not marked to be released, so the frozen list should be
     * empty. */
    GF_ASSERT(list_empty(&lock->frozen));

    if (lock->timer != NULL) {
        /* We are trying to acquire a lock that has an unlock timer active.
         * This means that the lock must be idle, i.e. no fop can be in the
         * owner, waiting or frozen lists. It also means that the lock cannot
         * have been marked as being released (this is done without timers).
         * There should only be one owner reference, but it's possible that
         * some fops are being prepared to use this lock. */
        GF_ASSERT ((lock->refs_owners == 1) &&
                   list_empty(&lock->owners) && list_empty(&lock->waiting));

        /* We take the timer_link before cancelling the timer, since a
         * successful cancellation will destroy it. It must not be NULL
         * because it references the fop responsible for the delayed unlock
         * that we are currently trying to cancel. */
        timer_link = lock->timer->data;
        GF_ASSERT(timer_link != NULL);

        if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) < 0) {
            /* It's too late to avoid the execution of the timer callback.
             * Since we need to be sure that the callback has access to all
             * needed resources, we cannot resume the execution of the timer
             * fop now. This will be done in the callback. */
            timer_link = NULL;
        } else {
            /* The timer has been cancelled, so we need to release the owner
             * reference that was held by the fop waiting for the timer. This
             * can be the last reference, but we'll immediately increment it
             * for the current fop, so no need to check it. */
            lock->refs_owners--;

            ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
        }

        /* We have two options here:
         *
         * 1. The timer has been successfully cancelled.
         *
         *    This is the easiest case and we can continue with the currently
         *    acquired lock.
         *
         * 2. The timer callback has already been fired.
         *
         *    In this case we have not been able to cancel the timer before
         *    the timer callback has been fired, but we also know that
         *    lock->timer != NULL. This means that the timer callback is still
         *    trying to acquire the inode mutex that we currently own. We are
         *    safe until we release it. In this case we can safely clear
         *    lock->timer. This will cause the timer callback to do nothing
         *    once it acquires the mutex. */
        lock->timer = NULL;
    }

    if (!list_empty(&lock->owners)) {
        /* There are other owners of this lock.
We can only take ownership if * the lock is already acquired and doesn't have conflict with existing * owners, or waiters(to prevent starvation). * Otherwise we need to wait. */ if (!lock->acquired || ec_link_has_lock_conflict (link, &lock->owners) || ec_link_has_lock_conflict (link, &lock->waiting)) { ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock); list_add_tail(&link->wait_list, &lock->waiting); goto unlock; } } list_add_tail(&link->owner_list, &lock->owners); lock->refs_owners++; assigned = _gf_true; unlock: if (!assigned) { /* We have not been able to take ownership of this lock. The fop must * be put to sleep. */ ec_sleep(fop); } UNLOCK(&lock->loc.inode->lock); /* If we have cancelled the timer, we need to resume the fop that was * waiting for it. */ if (timer_link != NULL) { ec_resume(timer_link->fop, 0); } return assigned; } static void ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk, gf_boolean_t release) { struct list_head list; ec_lock_t *lock = link->lock; ec_fop_data_t *fop = link->fop; ec_inode_t *ctx = lock->ctx; INIT_LIST_HEAD(&list); LOCK(&lock->loc.inode->lock); ec_trace("LOCK_DONE", fop, "lock=%p", lock); /* Current link must belong to the owner list of the lock. We don't * decrement lock->refs_owners here because the inode mutex is released * before ec_unlock() is called and we need to know when the last owner * unlocks the lock to do proper cleanup. lock->refs_owners is used for * this task. */ GF_ASSERT((lock->refs_owners > 0) && !list_empty(&link->owner_list)); list_del_init(&link->owner_list); lock->release |= release; if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) { if (link->update[0]) { ctx->post_version[0]++; } if (link->update[1]) { ctx->post_version[1]++; } /* If the fop fails on any of the good bricks, it is important to mark * it dirty and update versions right away. */ if (link->update[0] || link->update[1]) { if (lock->good_mask & ~(fop->good | fop->remaining)) { lock->release = _gf_true; } } } if (fop->healing) { lock->healing = fop->healing & (fop->good | fop->remaining); } ec_lock_update_good(lock, fop); ec_lock_wake_shared(lock, &list); UNLOCK(&lock->loc.inode->lock); ec_lock_resume_shared(&list); } void ec_lock(ec_fop_data_t *fop) { ec_lock_link_t *link; /* There is a chance that ec_resume is called on fop even before ec_sleep. * Which can result in refs == 0 for fop leading to use after free in this * function when it calls ec_sleep so do ec_sleep at start and ec_resume at * the end of this function.*/ ec_sleep (fop); while (fop->locked < fop->lock_count) { /* Since there are only up to 2 locks per fop, this xor will change * the order of the locks if fop->first_lock is 1. */ link = &fop->locks[fop->locked ^ fop->first_lock]; if (!ec_lock_assign_owner(link) || !ec_lock_acquire(link)) { break; } } ec_resume(fop, 0); } void ec_lock_unfreeze(ec_lock_link_t *link) { struct list_head list; ec_lock_t *lock; gf_boolean_t destroy = _gf_false; lock = link->lock; INIT_LIST_HEAD(&list); LOCK(&lock->loc.inode->lock); /* The lock must be marked to be released here, since we have just released * it and any attempt to assign it to more fops must have added them to the * frozen list. We can only have one active reference here: the one that * is processing this unfreeze. */ GF_ASSERT(lock->release && (lock->refs_owners == 1)); lock->release = _gf_false; lock->refs_owners = 0; lock->acquired = _gf_false; /* We are unfreezing a lock. This means that the lock has already been * released. 
     * In this state it shouldn't have a pending timer nor have any
     * owner, and the waiting list should be empty. Only the frozen list
     * can contain some fop. */
    GF_ASSERT((lock->timer == NULL) && list_empty(&lock->waiting) &&
              list_empty(&lock->owners));

    /* We move all frozen fops to the waiting list. */
    list_splice_init(&lock->frozen, &lock->waiting);

    /* If no fop is waiting and there are no prepared fops using this lock,
     * we can finally dispose of it. */
    destroy = list_empty(&lock->waiting) && (lock->refs_pending == 0);
    if (destroy) {
        ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock);

        lock->ctx->inode_lock = NULL;
    } else {
        ec_trace("LOCK_UNFREEZE", link->fop, "lock=%p", lock);

        ec_lock_wake_shared(lock, &list);
    }

    UNLOCK(&lock->loc.inode->lock);

    ec_lock_resume_shared(&list);

    if (destroy) {
        ec_lock_destroy(lock);
    }
}

int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
    ec_fop_data_t *fop = cookie;
    ec_lock_link_t *link = fop->data;

    if (op_ret < 0) {
        gf_msg (this->name, GF_LOG_WARNING, op_errno,
                EC_MSG_UNLOCK_FAILED,
                "entry/inode unlocking failed (%s)",
                ec_fop_name(link->fop->id));
    } else {
        ec_trace("UNLOCKED", link->fop, "lock=%p", link->lock);
    }

    ec_lock_unfreeze(link);

    return 0;
}

void ec_unlock_lock(ec_lock_link_t *link)
{
    ec_lock_t *lock;
    ec_fop_data_t *fop;
    gf_lkowner_t lk_owner;

    lock = link->lock;
    fop = link->fop;

    lock->unlock_now = _gf_false;
    ec_clear_inode_info(fop, lock->loc.inode);

    if ((lock->mask != 0) && lock->acquired) {
        set_lk_owner_from_ptr(&lk_owner, lock);
        lock->flock.l_type = F_UNLCK;
        ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock,
                 lock->loc.inode);

        ec_inodelk(fop->frame, fop->xl, &lk_owner, lock->mask,
                   EC_MINIMUM_ONE, ec_unlocked, link, fop->xl->name,
                   &lock->loc, F_SETLK, &lock->flock, NULL);
    } else {
        ec_lock_unfreeze(link);
    }
}

int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,
                                    xlator_t * this, int32_t op_ret,
                                    int32_t op_errno, dict_t * xattr,
                                    dict_t * xdata)
{
    ec_fop_data_t *fop = cookie;
    ec_lock_link_t *link;
    ec_lock_t *lock;
    ec_inode_t *ctx;

    link = fop->data;
    lock = link->lock;
    ctx = lock->ctx;

    if (op_ret < 0) {
        gf_msg(fop->xl->name, fop_log_level (fop->id, op_errno), op_errno,
               EC_MSG_SIZE_VERS_UPDATE_FAIL,
               "Failed to update version and size");
    } else {
        fop->parent->good &= fop->good;

        ec_lock_update_good(lock, fop);

        if (ec_dict_del_array(xattr, EC_XATTR_VERSION, ctx->post_version,
                              EC_VERSION_SIZE) == 0) {
            ctx->pre_version[0] = ctx->post_version[0];
            ctx->pre_version[1] = ctx->post_version[1];

            ctx->have_version = _gf_true;
        }
        if (ec_dict_del_number(xattr, EC_XATTR_SIZE, &ctx->post_size) == 0) {
            ctx->pre_size = ctx->post_size;

            ctx->have_size = _gf_true;
        }
        if ((ec_dict_del_config(xdata, EC_XATTR_CONFIG, &ctx->config) == 0) &&
            ec_config_check(fop->xl, &ctx->config)) {
            ctx->have_config = _gf_true;
        }

        ctx->have_info = _gf_true;
    }

    /* If this callback is being processed for a normal fop (i.e. not as part
     * of the final unlock request), we are still holding the lock, which
     * guarantees that lock->unlock_now cannot be modified concurrently.
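     * ec_unlock_now() sets unlock_now to true before calling
     * ec_update_info(), so when this callback runs for the final size/version
     * update it must complete the release by calling ec_unlock_lock() below.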
*/ if (lock->unlock_now) { ec_unlock_lock(fop->data); } return 0; } void ec_update_size_version(ec_lock_link_t *link, uint64_t *version, uint64_t size, uint64_t *dirty) { ec_fop_data_t *fop; ec_lock_t *lock; ec_inode_t *ctx; dict_t *dict = NULL; uintptr_t update_on = 0; int32_t err = -ENOMEM; fop = link->fop; lock = link->lock; ctx = lock->ctx; ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld", version[0], version[1], size, dirty[0], dirty[1]); dict = dict_new(); if (dict == NULL) { goto out; } /* If we don't have version information or it has been modified, we * update it. */ if (!ctx->have_version || (version[0] != 0) || (version[1] != 0)) { err = ec_dict_set_array(dict, EC_XATTR_VERSION, version, EC_VERSION_SIZE); if (err != 0) { goto out; } } if (size != 0) { /* If size has been changed, we should already * know the previous size of the file. */ GF_ASSERT(ctx->have_size); err = ec_dict_set_number(dict, EC_XATTR_SIZE, size); if (err != 0) { goto out; } } if (dirty[0] || dirty[1]) { err = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE); if (err != 0) { goto out; } } /* If config information is not known, we request it now. */ if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) { /* A failure requesting this xattr is ignored because it's not * absolutely required right now. */ (void)ec_dict_set_number(dict, EC_XATTR_CONFIG, 0); } fop->frame->root->uid = 0; fop->frame->root->gid = 0; update_on = lock->good_mask | lock->healing; if (link->lock->fd == NULL) { ec_xattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN, ec_update_size_version_done, link, &link->lock->loc, GF_XATTROP_ADD_ARRAY64, dict, NULL); } else { ec_fxattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN, ec_update_size_version_done, link, link->lock->fd, GF_XATTROP_ADD_ARRAY64, dict, NULL); } fop->frame->root->uid = fop->uid; fop->frame->root->gid = fop->gid; dict_unref(dict); return; out: if (dict != NULL) { dict_unref(dict); } ec_fop_set_error(fop, -err); gf_msg (fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL, "Unable to update version and size"); if (lock->unlock_now) { ec_unlock_lock(fop->data); } } gf_boolean_t ec_update_info(ec_lock_link_t *link) { ec_lock_t *lock; ec_inode_t *ctx; uint64_t version[2] = {0, 0}; uint64_t dirty[2] = {0, 0}; uint64_t size; ec_t *ec = NULL; lock = link->lock; ctx = lock->ctx; ec = link->fop->xl->private; /* pre_version[*] will be 0 if have_version is false */ version[0] = ctx->post_version[0] - ctx->pre_version[0]; version[1] = ctx->post_version[1] - ctx->pre_version[1]; size = ctx->post_size - ctx->pre_size; /* If we set the dirty flag for update fop, we have to unset it. * If fop has failed on some bricks, leave the dirty as marked. 
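     * Clearing is done by adding -1 through the same GF_XATTROP_ADD_ARRAY64
     * operation used to set it, which undoes the +1 added when the
     * transaction started.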
     */
    if (lock->unlock_now) {
        /* Ensure that nodes are up while doing final
         * metadata update. */
        if (!(ec->node_mask & ~lock->good_mask) &&
            !(ec->node_mask & ~ec->xl_up)) {
            if (ctx->dirty[0] != 0) {
                dirty[0] = -1;
            }
            if (ctx->dirty[1] != 0) {
                dirty[1] = -1;
            }
        } else {
            link->optimistic_changelog = _gf_false;
            ec_set_dirty_flag (link, ctx, dirty);
        }
        memset(ctx->dirty, 0, sizeof(ctx->dirty));
    }

    if ((version[0] != 0) || (version[1] != 0) ||
        (dirty[0] != 0) || (dirty[1] != 0)) {
        ec_update_size_version(link, version, size, dirty);
        return _gf_true;
    }

    return _gf_false;
}

void ec_unlock_now(ec_lock_link_t *link)
{
    ec_lock_t *lock;
    lock = link->lock;

    ec_trace("UNLOCK_NOW", link->fop, "lock=%p", link->lock);
    /* At this point, lock is not being used by any fop and
     * can not be reused by any fop as it is going to be released.
     * lock->unlock_now can not be modified at any other place. */
    lock->unlock_now = _gf_true;

    if (!ec_update_info(link)) {
        ec_unlock_lock(link);
    }

    ec_resume(link->fop, 0);
}

void ec_unlock_timer_add(ec_lock_link_t *link);

void ec_unlock_timer_del(ec_lock_link_t *link)
{
    ec_lock_t *lock;
    inode_t *inode;
    gf_boolean_t now = _gf_false;

    /* If we are here, it means that the timer has expired before having
     * been cancelled. This guarantees that 'link' is still valid because
     * the fop that contains it must be pending (if timer cancellation in
     * ec_lock_assign_owner() fails, the fop is left sleeping).
     *
     * At the same time, the fop still has a reference to the lock, so
     * it must also be valid. */
    lock = link->lock;

    /* 'lock' must have a valid inode since it can only be destroyed
     * when the lock itself is destroyed, but we have a reference to the
     * lock to avoid this. */
    inode = lock->loc.inode;

    LOCK(&inode->lock);

    if (lock->timer != NULL) {
        ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);

        /* The unlock timer has expired without anyone cancelling it.
         * This means that it shouldn't have any owner, and the waiting
         * and frozen lists should be empty. It must have only one
         * owner reference, but there can be fops being prepared
         * though. */
        GF_ASSERT(!lock->release && (lock->refs_owners == 1) &&
                  list_empty(&lock->owners) && list_empty(&lock->waiting) &&
                  list_empty(&lock->frozen));

        gf_timer_call_cancel(link->fop->xl->ctx, lock->timer);
        lock->timer = NULL;

        /* Any fop being processed from now on will need to wait
         * until the next unlock/lock cycle. */
        lock->release = now = _gf_true;
    }

    UNLOCK(&inode->lock);

    if (now) {
        ec_unlock_now(link);
    } else {
        /* The timer has been cancelled just after firing it but before
         * getting here. This means that another fop has used the lock
         * and everything should be handled as if this callback had
         * not been executed. However we still have an owner
         * reference.
         *
         * We need to release our reference. If this is not the last
         * reference (the most common case because another fop has
         * taken another ref) we only need to decrement the counter.
         * Otherwise we have been delayed enough so that the other fop
         * has had time to acquire the reference, do its operation and
         * release it. At the time of releasing it, the fop found
         * that the ref counter was > 1 (our reference), so the delayed
         * unlock timer wasn't started. We need to start it again if we
         * are the last reference.
         *
         * ec_unlock_timer_add() handles both cases. */
        ec_unlock_timer_add(link);

        /* We need to resume the fop that was waiting for the delayed
         * unlock.
*/ ec_resume(link->fop, 0); } } void ec_unlock_timer_cbk(void *data) { ec_unlock_timer_del(data); } void ec_unlock_timer_add(ec_lock_link_t *link) { struct timespec delay; ec_fop_data_t *fop = link->fop; ec_lock_t *lock = link->lock; gf_boolean_t now = _gf_false; LOCK(&lock->loc.inode->lock); /* We are trying to unlock the lock. We can have multiple scenarios here, * but all of them need to have lock->timer == NULL: * * 1. There are other owners currently running that can call ec_unlock(). * * None of them can have started the timer until the last one. But this * call should be the consequence of this lastest one. * * 2. There are fops in the waiting or frozen lists. * * These fops cannot call ec_unlock(). So we should be here. * * We must reach here with at least one owner reference. */ GF_ASSERT((lock->timer == NULL) && (lock->refs_owners > 0)); /* If the fop detects that a heal is needed, we mark the lock to be * released as soon as possible. */ lock->release |= ec_fop_needs_heal(fop); if (lock->refs_owners > 1) { ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock); /* If there are other owners we cannot do anything else with the lock. * Note that the current fop has already been removed from the owners * list in ec_lock_reuse(). */ lock->refs_owners--; UNLOCK(&lock->loc.inode->lock); } else if (lock->acquired) { /* There are no other owners and the lock is acquired. If there were * fops waiting, at least one of them should have been promoted to an * owner, so the waiting list should be empty. */ GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting)); ec_t *ec = fop->xl->private; /* If everything goes as expected this fop will be put to sleep until * the timer callback is executed. */ ec_sleep(fop); /* If the lock needs to be released, or ec is shutting down, do not * delay lock release. */ if (!lock->release && !ec->shutdown) { ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock, lock->release); delay.tv_sec = 1; delay.tv_nsec = 0; lock->timer = gf_timer_call_after(fop->xl->ctx, delay, ec_unlock_timer_cbk, link); if (lock->timer == NULL) { gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM, EC_MSG_UNLOCK_DELAY_FAILED, "Unable to delay an unlock"); /* We are unable to create a new timer. We immediately release * the lock. */ lock->release = now = _gf_true; } } else { ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock, lock->release); lock->release = now = _gf_true; } UNLOCK(&lock->loc.inode->lock); if (now) { ec_unlock_now(link); } } else { /* There are no owners and the lock is not acquired. This can only * happen if a lock attempt has failed and we get to the unlock step * of the fop. As in the previous case, the waiting list must be * empty. */ GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting)); /* We need to mark the lock to be released to correctly handle fops * that may get in after we release the inode mutex but before * ec_lock_unfreeze() is processed. */ lock->release = _gf_true; UNLOCK(&lock->loc.inode->lock); ec_lock_unfreeze(link); } } void ec_unlock(ec_fop_data_t *fop) { int32_t i; for (i = 0; i < fop->lock_count; i++) { ec_unlock_timer_add(&fop->locks[i]); } } void ec_flush_size_version(ec_fop_data_t * fop) { GF_ASSERT(fop->lock_count == 1); ec_update_info(&fop->locks[0]); } static gf_boolean_t ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop) { /* Fops with no locks at this point mean that they are sent as sub-fops * of other higher level fops. In this case we simply assume that the * parent fop will take correct care of the eager lock. 
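 * Otherwise, locks on regular files are governed by ec->eager_lock while
 * locks on any other inode type use ec->other_eager_lock, so the two
 * behaviours can be tuned independently.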
*/ if (fop->lock_count == 0) { return _gf_true; } /* We may have more than one lock, but this only happens in the rename * fop, and both locks will reference an inode of the same type (a * directory in this case), so we only need to check the first lock. */ if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) { return ec->eager_lock; } return ec->other_eager_lock; } static void ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe, ec_fop_data_t *fop) { off_t base; /* On write fops, we only update existing fragments if the write has * succeeded. Otherwise, we remove them from the cache. */ if ((fop->id == GF_FOP_WRITE) && (fop->answer != NULL) && (fop->answer->op_ret >= 0)) { base = stripe->frag_offset - fop->frag_range.first; base *= ec->fragments; /* We check if the stripe offset falls inside the real region * modified by the write fop (a write request is allowed, * though uncommon, to write less bytes than requested). The * current write fop implementation doesn't allow partial * writes of fragments, so if there's no error, we are sure * that a full stripe has been completely modified or not * touched at all. The value of op_ret may not be a multiple * of the stripe size because it depends on the requested * size by the user, so we update the stripe if the write has * modified at least one byte (meaning ec has written the full * stripe). */ if (base < fop->answer->op_ret + fop->head) { memcpy(stripe->data, fop->vector[0].iov_base + base, ec->stripe_size); list_move_tail(&stripe->lru, &stripe_cache->lru); GF_ATOMIC_INC(ec->stats.stripe_cache.updates); } } else { stripe->frag_offset = -1; list_move (&stripe->lru, &stripe_cache->lru); GF_ATOMIC_INC(ec->stats.stripe_cache.invals); } } static void ec_update_cached_stripes (ec_fop_data_t *fop) { uint64_t first; uint64_t last; ec_stripe_t *stripe = NULL; ec_inode_t *ctx = NULL; ec_stripe_list_t *stripe_cache = NULL; inode_t *inode = NULL; struct list_head *temp; struct list_head sentinel; first = fop->frag_range.first; /* 'last' represents the first stripe not touched by the operation */ last = fop->frag_range.last; /* If there are no modified stripes, we don't need to do anything * else. */ if (last <= first) { return; } if (!fop->use_fd) { inode = fop->loc[0].inode; } else { inode = fop->fd->inode; } LOCK(&inode->lock); ctx = __ec_inode_get (inode, fop->xl); if (ctx == NULL) { goto out; } stripe_cache = &ctx->stripe_cache; /* Since we'll be moving elements of the list to the tail, we might * end in an infinite loop. To avoid it, we insert a sentinel element * into the list, so that it will be used to detect when we have * traversed all existing elements once. 
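 * (ec_update_stripe() moves updated stripes to the tail of the LRU and
 * invalidated ones to the head, so without the sentinel a tail-moved entry
 * could be visited again within the same traversal.)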
*/ list_add_tail(&sentinel, &stripe_cache->lru); temp = stripe_cache->lru.next; while (temp != &sentinel) { stripe = list_entry(temp, ec_stripe_t, lru); temp = temp->next; if ((first <= stripe->frag_offset) && (stripe->frag_offset < last)) { ec_update_stripe (fop->xl->private, stripe_cache, stripe, fop); } } list_del(&sentinel); out: UNLOCK(&inode->lock); } void ec_lock_reuse(ec_fop_data_t *fop) { ec_cbk_data_t *cbk; ec_t *ec = NULL; int32_t i, count; gf_boolean_t release = _gf_false; ec = fop->xl->private; cbk = fop->answer; if (ec_use_eager_lock(ec, fop) && cbk != NULL) { if (cbk->xdata != NULL) { if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT, &count) == 0) && (count > 1)) { release = _gf_true; } if (release) { gf_msg_debug (fop->xl->name, 0, "Lock contention detected"); } } } else { /* If eager lock is disabled or if we haven't get * an answer with enough quorum, we always release * the lock. */ release = _gf_true; } ec_update_cached_stripes (fop); for (i = 0; i < fop->lock_count; i++) { ec_lock_next_owner(&fop->locks[i], cbk, release); } } void __ec_manager(ec_fop_data_t * fop, int32_t error) { ec_t *ec = fop->xl->private; do { ec_trace("MANAGER", fop, "error=%d", error); if (!ec_must_wind (fop)) { if (ec->xl_up_count < ec->fragments) { error = ENOTCONN; } } if (error != 0) { fop->error = error; fop->state = -fop->state; } if ((fop->state == EC_STATE_END) || (fop->state == -EC_STATE_END)) { ec_fop_data_release(fop); break; } /* At each state, fop must not be used anywhere else and there * shouldn't be any pending subfop going on. */ GF_ASSERT(fop->jobs == 0); /* While the manager is running we need to avoid that subfops launched * from it could finish and call ec_resume() before the fop->handler * has completed. This could lead to the same manager being executed * by two threads concurrently. ec_check_complete() will take care of * this reference. */ fop->jobs = 1; fop->state = fop->handler(fop, fop->state); GF_ASSERT (fop->state >= 0); error = ec_check_complete(fop, __ec_manager); } while (error >= 0); } void ec_manager(ec_fop_data_t * fop, int32_t error) { GF_ASSERT(fop->jobs == 0); GF_ASSERT(fop->winds == 0); GF_ASSERT(fop->error == 0); if (fop->state == EC_STATE_START) { fop->state = EC_STATE_INIT; } __ec_manager(fop, error); }