/* Copyright (c) 2008-2012 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser General Public License, version 3 or any later version (LGPLv3 or later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ #include "glusterfs.h" #include "xlator.h" #include "dht-common.h" #include "dht-messages.h" #include "glusterfs-acl.h" #define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path) do { \ layout->list[i].start = srt; \ layout->list[i].stop = srt + chunk - 1; \ layout->list[i].commit_hash = layout->commit_hash; \ \ gf_msg_trace (this->name, 0, \ "gave fix: %u - %u, with commit-hash %u" \ " on %s for %s", \ layout->list[i].start, \ layout->list[i].stop, \ layout->list[i].commit_hash, \ layout->list[i].xlator->name, path); \ } while (0) #define DHT_RESET_LAYOUT_RANGE(layout) do { \ int cnt = 0; \ for (cnt = 0; cnt < layout->cnt; cnt++ ) { \ layout->list[cnt].start = 0; \ layout->list[cnt].stop = 0; \ } \ } while (0) int dht_selfheal_layout_lock (call_frame_t *frame, dht_layout_t *layout, gf_boolean_t newdir, dht_selfheal_layout_t healer, dht_need_heal_t should_heal); static uint32_t dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n) { if (o >= old->cnt || n >= new->cnt) return 0; if (old->list[o].err > 0 || new->list[n].err > 0) return 0; if (old->list[o].start == old->list[o].stop) { return 0; } if (new->list[n].start == new->list[n].stop) { return 0; } if ((old->list[o].start > new->list[n].stop) || (old->list[o].stop < new->list[n].start)) return 0; return min (old->list[o].stop, new->list[n].stop) - max (old->list[o].start, new->list[n].start) + 1; } int dht_selfheal_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { DHT_STACK_DESTROY (frame); return 0; } int dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret, int invoke_cbk) { dht_local_t *local = NULL, *lock_local = NULL; call_frame_t *lock_frame = NULL; int lock_count = 0; local = frame->local; lock_count = dht_lock_count (local->lock.locks, local->lock.lk_count); if (lock_count == 0) goto done; lock_frame = copy_frame (frame); if (lock_frame == NULL) { goto done; } lock_local = dht_local_init (lock_frame, &local->loc, NULL, lock_frame->root->op); if (lock_local == NULL) { goto done; } lock_local->lock.locks = local->lock.locks; lock_local->lock.lk_count = local->lock.lk_count; local->lock.locks = NULL; local->lock.lk_count = 0; dht_unlock_inodelk (lock_frame, lock_local->lock.locks, lock_local->lock.lk_count, dht_selfheal_unlock_cbk); lock_frame = NULL; done: if (!invoke_cbk) local->selfheal.dir_cbk (frame, NULL, frame->this, ret, local->op_errno, NULL); if (lock_frame != NULL) { DHT_STACK_DESTROY (lock_frame); } return 0; } int dht_refresh_layout_done (call_frame_t *frame) { int ret = -1; dht_layout_t *refreshed = NULL, *heal = NULL; dht_local_t *local = NULL; dht_need_heal_t should_heal = NULL; dht_selfheal_layout_t healer = NULL; local = frame->local; refreshed = local->selfheal.refreshed_layout; heal = local->selfheal.layout; healer = local->selfheal.healer; should_heal = local->selfheal.should_heal; ret = dht_layout_sort (refreshed); if (ret == -1) { gf_msg (frame->this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SORT_FAILED, "sorting the layout failed"); goto err; } if (should_heal (frame, &heal, &refreshed)) { healer (frame, &local->loc, heal); } else { local->selfheal.layout = NULL; local->selfheal.refreshed_layout = NULL; local->selfheal.layout = refreshed; dht_layout_unref (frame->this, heal); dht_selfheal_dir_finish (frame, frame->this, 0, 0); } return 0; err: dht_selfheal_dir_finish (frame, frame->this, -1, 0); return 0; } int dht_refresh_layout_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr, struct iatt *postparent) { dht_local_t *local = NULL; int this_call_cnt = 0; call_frame_t *prev = NULL; dht_layout_t *layout = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, err); GF_VALIDATE_OR_GOTO ("dht", frame->local, err); GF_VALIDATE_OR_GOTO ("dht", this->private, err); local = frame->local; prev = cookie; layout = local->selfheal.refreshed_layout; LOCK (&frame->lock); { op_ret = dht_layout_merge (this, layout, prev->this, op_ret, op_errno, xattr); if (op_ret == -1) { local->op_errno = op_errno; gf_msg_debug (this->name, op_errno, "lookup of %s on %s returned error", local->loc.path, prev->this->name); goto unlock; } local->op_ret = 0; } unlock: UNLOCK (&frame->lock); this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { if (local->op_ret == 0) { local->refresh_layout_done (frame); } else { goto err; } } return 0; err: local->refresh_layout_unlock (frame, this, -1, 0); dht_selfheal_dir_finish (frame, this, -1, 0); return 0; } int dht_refresh_layout (call_frame_t *frame) { int call_cnt = 0; int i = 0, ret = -1; dht_conf_t *conf = NULL; dht_local_t *local = NULL; xlator_t *this = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", frame->local, out); this = frame->this; conf = this->private; local = frame->local; call_cnt = conf->subvolume_cnt; local->call_cnt = call_cnt; local->op_ret = -1; if (local->selfheal.refreshed_layout) { dht_layout_unref (this, local->selfheal.refreshed_layout); local->selfheal.refreshed_layout = NULL; } local->selfheal.refreshed_layout = dht_layout_new (this, conf->subvolume_cnt); if (!local->selfheal.refreshed_layout) { goto out; } if (local->xattr != NULL) { dict_del (local->xattr, conf->xattr_name); } if (local->xattr_req == NULL) { local->xattr_req = dict_new (); if (local->xattr_req == NULL) { goto out; } } if (dict_get (local->xattr_req, conf->xattr_name) == 0) { ret = dict_set_uint32 (local->xattr_req, conf->xattr_name, 4 * 4); if (ret) gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, "%s: Failed to set dictionary value:key = %s", local->loc.path, conf->xattr_name); } for (i = 0; i < call_cnt; i++) { STACK_WIND (frame, dht_refresh_layout_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->lookup, &local->loc, local->xattr_req); } return 0; out: local->refresh_layout_unlock (frame, this, -1, 0); dht_selfheal_dir_finish (frame, this, -1, 0); return 0; } int32_t dht_selfheal_layout_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { dht_local_t *local = NULL; local = frame->local; if (!local) { goto err; } if (op_ret < 0) { goto err; } local->refresh_layout_unlock = dht_selfheal_dir_finish; local->refresh_layout_done = dht_refresh_layout_done; dht_refresh_layout (frame); return 0; err: dht_selfheal_dir_finish (frame, this, -1, 0); return 0; } gf_boolean_t dht_should_heal_layout (call_frame_t *frame, dht_layout_t **heal, dht_layout_t **ondisk) { gf_boolean_t fixit = _gf_true; dht_local_t *local = NULL; int ret = -1, heal_missing_dirs = 0; local = frame->local; if ((heal == NULL) || (*heal == NULL) || (ondisk == NULL) || (*ondisk == NULL)) goto out; ret = dht_layout_anomalies (frame->this, &local->loc, *ondisk, &local->selfheal.hole_cnt, &local->selfheal.overlaps_cnt, NULL, &local->selfheal.down, &local->selfheal.misc, NULL); if (ret < 0) goto out; /* Directories might've been created as part of this self-heal. We've to * sync non-layout xattrs and set range 0-0 on new directories */ heal_missing_dirs = local->selfheal.force_mkdir ? local->selfheal.force_mkdir : dht_layout_missing_dirs (*heal); if ((local->selfheal.hole_cnt == 0) && (local->selfheal.overlaps_cnt == 0) && heal_missing_dirs) { dht_layout_t *tmp = NULL; /* Just added a brick and need to set 0-0 range on this brick. * But ondisk layout is well-formed. So, swap layouts "heal" and * "ondisk". Now "ondisk" layout will be used for healing * xattrs. If there are any non-participating subvols in * "ondisk" layout, dht_selfheal_dir_xattr_persubvol will set * 0-0 and non-layout xattrs. This way we won't end up in * "corrupting" already set and well-formed "ondisk" layout. */ tmp = *heal; *heal = *ondisk; *ondisk = tmp; /* Current selfheal code, heals non-layout xattrs only after * an add-brick. In fact non-layout xattrs are considered as * secondary citizens which are healed only if layout xattrs * need to be healed. This is wrong, since for eg., quota can be * set when layout is well-formed, but a node is down. Also, * just for healing non-layout xattrs, we don't need locking. * This issue is _NOT FIXED_ by this patch. */ } fixit = (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt || heal_missing_dirs); out: return fixit; } int dht_layout_span (dht_layout_t *layout) { int i = 0, count = 0; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err) continue; if (layout->list[i].start != layout->list[i].stop) count++; } return count; } int dht_decommissioned_bricks_in_layout (xlator_t *this, dht_layout_t *layout) { dht_conf_t *conf = NULL; int count = 0, i = 0, j = 0; if ((this == NULL) || (layout == NULL)) goto out; conf = this->private; for (i = 0; i < layout->cnt; i++) { for (j = 0; j < conf->subvolume_cnt; j++) { if (conf->decommissioned_bricks[j] && conf->decommissioned_bricks[j] == layout->list[i].xlator) { count++; } } } out: return count; } dht_distribution_type_t dht_distribution_type (xlator_t *this, dht_layout_t *layout) { dht_distribution_type_t type = GF_DHT_EQUAL_DISTRIBUTION; int i = 0; uint32_t start_range = 0, range = 0, diff = 0; if ((this == NULL) || (layout == NULL) || (layout->cnt < 1)) { goto out; } for (i = 0; i < layout->cnt; i++) { if (start_range == 0) { start_range = layout->list[i].stop - layout->list[i].start; continue; } range = layout->list[i].stop - layout->list[i].start; diff = abs (range - start_range); if ((range != 0) && (diff > layout->cnt)) { type = GF_DHT_WEIGHTED_DISTRIBUTION; break; } } out: return type; } gf_boolean_t dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem, dht_layout_t **ondisk) { gf_boolean_t fixit = _gf_true; dht_local_t *local = NULL; int layout_span = 0; int decommissioned_bricks = 0; int ret = 0; dht_conf_t *conf = NULL; dht_distribution_type_t inmem_dist_type = 0; dht_distribution_type_t ondisk_dist_type = 0; conf = frame->this->private; local = frame->local; if ((inmem == NULL) || (*inmem == NULL) || (ondisk == NULL) || (*ondisk == NULL)) goto out; ret = dht_layout_anomalies (frame->this, &local->loc, *ondisk, &local->selfheal.hole_cnt, &local->selfheal.overlaps_cnt, NULL, &local->selfheal.down, &local->selfheal.misc, NULL); if (ret < 0) { fixit = _gf_false; goto out; } if (local->selfheal.down || local->selfheal.misc) { fixit = _gf_false; goto out; } if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt) goto out; /* If commit hashes are being updated, let it through */ if ((*inmem)->commit_hash != (*ondisk)->commit_hash) goto out; layout_span = dht_layout_span (*ondisk); decommissioned_bricks = dht_decommissioned_bricks_in_layout (frame->this, *ondisk); inmem_dist_type = dht_distribution_type (frame->this, *inmem); ondisk_dist_type = dht_distribution_type (frame->this, *ondisk); if ((decommissioned_bricks == 0) && (layout_span == (conf->subvolume_cnt - conf->decommission_subvols_cnt)) && (inmem_dist_type == ondisk_dist_type)) fixit = _gf_false; out: return fixit; } int dht_selfheal_layout_lock (call_frame_t *frame, dht_layout_t *layout, gf_boolean_t newdir, dht_selfheal_layout_t healer, dht_need_heal_t should_heal) { dht_local_t *local = NULL; int count = 1, ret = -1, i = 0; dht_lock_t **lk_array = NULL; dht_conf_t *conf = NULL; dht_layout_t *tmp = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err); local = frame->local; conf = frame->this->private; local->selfheal.healer = healer; local->selfheal.should_heal = should_heal; tmp = local->selfheal.layout; local->selfheal.layout = dht_layout_ref (frame->this, layout); dht_layout_unref (frame->this, tmp); if (!newdir) { count = conf->subvolume_cnt; lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_char); if (lk_array == NULL) goto err; for (i = 0; i < count; i++) { lk_array[i] = dht_lock_new (frame->this, conf->subvolumes[i], &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN); if (lk_array[i] == NULL) goto err; } } else { count = 1; lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_char); if (lk_array == NULL) goto err; lk_array[0] = dht_lock_new (frame->this, local->hashed_subvol, &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN); if (lk_array[0] == NULL) goto err; } local->lock.locks = lk_array; local->lock.lk_count = count; ret = dht_blocking_inodelk (frame, lk_array, count, FAIL_ON_ANY_ERROR, dht_selfheal_layout_lock_cbk); if (ret < 0) { local->lock.locks = NULL; local->lock.lk_count = 0; goto err; } return 0; err: if (lk_array != NULL) { dht_lock_array_free (lk_array, count); GF_FREE (lk_array); } return -1; } int dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; xlator_t *subvol = NULL; int i = 0; dht_layout_t *layout = NULL; int err = 0; int this_call_cnt = 0; local = frame->local; layout = local->selfheal.layout; prev = cookie; subvol = prev->this; if (op_ret == 0) err = 0; else err = op_errno; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].xlator == subvol) { layout->list[i].err = err; break; } } this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { dht_selfheal_dir_finish (frame, this, 0, 0); } return 0; } int dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, dht_layout_t *layout, int i, xlator_t *req_subvol) { xlator_t *subvol = NULL; dict_t *xattr = NULL; int ret = 0; xlator_t *this = NULL; int32_t *disk_layout = NULL; dht_local_t *local = NULL; dht_conf_t *conf = NULL; data_t *data = NULL; char gfid[GF_UUID_BUF_SIZE] = {0}; local = frame->local; if (req_subvol) subvol = req_subvol; else subvol = layout->list[i].xlator; this = frame->this; GF_VALIDATE_OR_GOTO ("", this, err); GF_VALIDATE_OR_GOTO (this->name, layout, err); GF_VALIDATE_OR_GOTO (this->name, local, err); GF_VALIDATE_OR_GOTO (this->name, subvol, err); VALIDATE_OR_GOTO (this->private, err); conf = this->private; xattr = get_new_dict (); if (!xattr) { goto err; } gf_uuid_unparse(loc->inode->gfid, gfid); ret = dht_disk_layout_extract (this, layout, i, &disk_layout); if (ret == -1) { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "Directory self heal xattr failed:" " %s: (subvol %s) Failed to extract disk layout," " gfid = %s", loc->path, subvol->name, gfid); goto err; } ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4); if (ret == -1) { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "Directory self heal xattr failed:" "%s: (subvol %s) Failed to set xattr dictionary," " gfid = %s", loc->path, subvol->name, gfid); GF_FREE (disk_layout); goto err; } disk_layout = NULL; gf_msg_trace (this->name, 0, "setting hash range %u - %u (type %d) on subvolume %s" " for %s", layout->list[i].start, layout->list[i].stop, layout->type, subvol->name, loc->path); dict_ref (xattr); if (local->xattr) { data = dict_get (local->xattr, QUOTA_LIMIT_KEY); if (data) { ret = dict_add (xattr, QUOTA_LIMIT_KEY, data); if (ret) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "%s: Failed to set dictionary value:" " key = %s", loc->path, QUOTA_LIMIT_KEY); } } } if (!gf_uuid_is_null (local->gfid)) gf_uuid_copy (loc->gfid, local->gfid); STACK_WIND (frame, dht_selfheal_dir_xattr_cbk, subvol, subvol->fops->setxattr, loc, xattr, 0, NULL); dict_unref (xattr); return 0; err: if (xattr) dict_destroy (xattr); GF_FREE (disk_layout); dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, -1, ENOMEM, NULL); return 0; } int dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { dht_local_t *local = NULL; int i = 0; int count = 0; xlator_t *this = NULL; dht_conf_t *conf = NULL; dht_layout_t *dummy = NULL; local = frame->local; this = frame->this; conf = this->private; gf_msg_debug (this->name, 0, "%s: Writing the new range for all subvolumes", loc->path); local->call_cnt = count = conf->subvolume_cnt; dht_log_new_layout_for_dir_selfheal (this, loc, layout); for (i = 0; i < layout->cnt; i++) { dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); if (--count == 0) goto out; } /* if we are here, subvolcount > layout_count. subvols-per-directory * option might be set here. We need to clear out layout from the * non-participating subvolumes, else it will result in overlaps */ dummy = dht_layout_new (this, 1); if (!dummy) goto out; dummy->commit_hash = layout->commit_hash; for (i = 0; i < conf->subvolume_cnt; i++) { if (_gf_false == dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, conf->subvolumes[i]); if (--count == 0) break; } } dht_layout_unref (this, dummy); out: return 0; } int dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { dht_local_t *local = NULL; int missing_xattr = 0; int i = 0; xlator_t *this = NULL; dht_conf_t *conf = NULL; dht_layout_t *dummy = NULL; local = frame->local; this = frame->this; conf = this->private; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err != -1 || !layout->list[i].stop) { /* err != -1 would mean xattr present on the directory * or the directory is non existent. * !layout->list[i].stop would mean layout absent */ continue; } missing_xattr++; } /* Also account for subvolumes with no-layout. Used for zero'ing out * the layouts and for setting quota key's if present */ for (i = 0; i < conf->subvolume_cnt; i++) { if (_gf_false == dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { missing_xattr++; } } gf_msg_trace (this->name, 0, "%d subvolumes missing xattr for %s", missing_xattr, loc->path); if (missing_xattr == 0) { dht_selfheal_dir_finish (frame, this, 0, 0); return 0; } local->call_cnt = missing_xattr; dht_log_new_layout_for_dir_selfheal (this, loc, layout); for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err != -1 || !layout->list[i].stop) continue; dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); if (--missing_xattr == 0) break; } dummy = dht_layout_new (this, 1); if (!dummy) goto out; for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) { if (_gf_false == dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, conf->subvolumes[i]); missing_xattr--; } } dht_layout_unref (this, dummy); out: return 0; } gf_boolean_t dht_is_subvol_part_of_layout (dht_layout_t *layout, xlator_t *xlator) { int i = 0; gf_boolean_t ret = _gf_false; for (i = 0; i < layout->cnt; i++) { if (!strcmp (layout->list[i].xlator->name, xlator->name)) { ret = _gf_true; break; } } return ret; } int dht_layout_index_from_conf (dht_layout_t *layout, xlator_t *xlator) { int i = -1; int j = 0; for (j = 0; j < layout->cnt; j++) { if (!strcmp (layout->list[j].xlator->name, xlator->name)) { i = j; break; } } return i; } static int dht_selfheal_dir_xattr_for_nameless_lookup (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { dht_local_t *local = NULL; int missing_xattr = 0; int i = 0; xlator_t *this = NULL; dht_conf_t *conf = NULL; dht_layout_t *dummy = NULL; int j = 0; local = frame->local; this = frame->this; conf = this->private; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err != -1 || !layout->list[i].stop) { /* err != -1 would mean xattr present on the directory or the directory is non existent. !layout->list[i].stop would mean layout absent */ continue; } missing_xattr++; } /* Also account for subvolumes with no-layout. Used for zero'ing out the layouts and for setting quota key's if present */ /* Send where either the subvol is not part of layout, * or it is part of the layout but error is non-zero but error * is not equal to -1 or ENOENT. */ for (i = 0; i < conf->subvolume_cnt; i++) { if (dht_is_subvol_part_of_layout (layout, conf->subvolumes[i]) == _gf_false) { missing_xattr++; continue; } j = dht_layout_index_from_conf (layout, conf->subvolumes[i]); if ((j != -1) && (layout->list[j].err != -1) && (layout->list[j].err != 0) && (layout->list[j].err != ENOENT)) { missing_xattr++; } } gf_msg_trace (this->name, 0, "%d subvolumes missing xattr for %s", missing_xattr, loc->path); if (missing_xattr == 0) { dht_selfheal_dir_finish (frame, this, 0, 0); return 0; } local->call_cnt = missing_xattr; dht_log_new_layout_for_dir_selfheal (this, loc, layout); for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err != -1 || !layout->list[i].stop) continue; dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); if (--missing_xattr == 0) break; } dummy = dht_layout_new (this, 1); if (!dummy) goto out; for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) { if (dht_is_subvol_part_of_layout (layout, conf->subvolumes[i]) == _gf_false) { dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, conf->subvolumes[i]); missing_xattr--; continue; } j = dht_layout_index_from_conf (layout, conf->subvolumes[i]); if ((j != -1) && (layout->list[j].err != -1) && (layout->list[j].err != ENOENT) && (layout->list[j].err != 0)) { dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, conf->subvolumes[i]); missing_xattr--; } } dht_layout_unref (this, dummy); out: return 0; } int dht_selfheal_dir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *statpre, struct iatt *statpost, dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; int this_call_cnt = 0, ret = -1; local = frame->local; layout = local->selfheal.layout; this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { ret = dht_selfheal_layout_lock (frame, layout, _gf_false, dht_selfheal_dir_xattr, dht_should_heal_layout); if (ret < 0) { dht_selfheal_dir_finish (frame, this, -1, 0); } } return 0; } int dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf, int32_t valid, dht_layout_t *layout) { int missing_attr = 0; int i = 0, ret = -1; dht_local_t *local = NULL; xlator_t *this = NULL; local = frame->local; this = frame->this; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err == -1) missing_attr++; } if (missing_attr == 0) { ret = dht_selfheal_layout_lock (frame, layout, _gf_false, dht_selfheal_dir_xattr, dht_should_heal_layout); if (ret < 0) { dht_selfheal_dir_finish (frame, this, -1, 0); } return 0; } if (!gf_uuid_is_null (local->gfid)) gf_uuid_copy (loc->gfid, local->gfid); local->call_cnt = missing_attr; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err == -1) { gf_msg_trace (this->name, 0, "%s: setattr on subvol %s, gfid = %s", loc->path, layout->list[i].xlator->name, uuid_utoa(loc->gfid)); STACK_WIND (frame, dht_selfheal_dir_setattr_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->setattr, loc, stbuf, valid, NULL); } } return 0; } int dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; call_frame_t *prev = NULL; xlator_t *subvol = NULL; int i = 0, ret = -1; int this_call_cnt = 0; char gfid[GF_UUID_BUF_SIZE] = {0}; local = frame->local; layout = local->selfheal.layout; prev = cookie; subvol = prev->this; if ((op_ret == 0) || ((op_ret == -1) && (op_errno == EEXIST))) { for (i = 0; i < layout->cnt; i++) { if (layout->list[i].xlator == subvol) { layout->list[i].err = -1; break; } } } if (op_ret) { gf_uuid_unparse(local->loc.gfid, gfid); gf_msg (this->name, ((op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_WARNING), op_errno, DHT_MSG_DIR_SELFHEAL_FAILED, "Directory selfheal failed: path = %s, gfid = %s", local->loc.path, gfid ); goto out; } dht_iatt_merge (this, &local->preparent, preparent, prev->this); dht_iatt_merge (this, &local->postparent, postparent, prev->this); ret = 0; out: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { dht_selfheal_dir_finish (frame, this, ret, -1); dht_selfheal_dir_setattr (frame, &local->loc, &local->stbuf, 0xffffff, layout); } return 0; } void dht_selfheal_dir_mkdir_setacl (dict_t *xattr, dict_t *dict) { data_t *acl_default = NULL; data_t *acl_access = NULL; xlator_t *this = NULL; int ret = -1; GF_ASSERT (xattr); GF_ASSERT (dict); this = THIS; GF_ASSERT (this); acl_default = dict_get (xattr, POSIX_ACL_DEFAULT_XATTR); if (!acl_default) { gf_msg_debug (this->name, 0, "ACL_DEFAULT xattr not present"); goto cont; } ret = dict_set (dict, POSIX_ACL_DEFAULT_XATTR, acl_default); if (ret) gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, "Failed to set dictionary value.key = %s", POSIX_ACL_DEFAULT_XATTR); cont: acl_access = dict_get (xattr, POSIX_ACL_ACCESS_XATTR); if (!acl_access) { gf_msg_debug (this->name, 0, "ACL_ACCESS xattr not present"); goto out; } ret = dict_set (dict, POSIX_ACL_ACCESS_XATTR, acl_access); if (ret) gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, "Failed to set dictionary value.key = %s", POSIX_ACL_ACCESS_XATTR); out: return; } int dht_selfheal_dir_mkdir_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { dht_local_t *local = NULL; int i = 0; int ret = -1; dict_t *dict = NULL; dht_layout_t *layout = NULL; loc_t *loc = NULL; VALIDATE_OR_GOTO (this->private, err); local = frame->local; layout = local->layout; loc = &local->loc; if (op_ret < 0) { gf_msg (this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, "acquiring inodelk failed for %s", loc->path); local->op_ret = -1; local->op_errno = (op_errno == EAGAIN) ? EBUSY : op_errno; goto err; } if (!gf_uuid_is_null (local->gfid)) { dict = dict_new (); if (!dict) return -1; ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16); if (ret) gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, "%s: Failed to set dictionary value:" " key = gfid-req", loc->path); } else if (local->params) { /* Send the dictionary from higher layers directly */ dict = dict_ref (local->params); } /* Set acls */ if (local->xattr && dict) dht_selfheal_dir_mkdir_setacl (local->xattr, dict); if (!dict) gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, "dict is NULL, need to make sure gfids are same"); /* We don't have to do a lookup here again: 1) Parallel rmdir would had removed the directory and locking would have anyway failed with an ESTALE on all subvols. Hence selfheal will never create the directory. 2) Parallel lookup creating directory does not have to be mutually exclusive for the mkdir phase of lookup selfheal. */ for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err == ENOENT || local->selfheal.force_mkdir) { gf_msg_debug (this->name, 0, "Creating directory %s on subvol %s", loc->path, layout->list[i].xlator->name); STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->mkdir, loc, st_mode_from_ia (local->stbuf.ia_prot, local->stbuf.ia_type), 0, dict); } } if (dict) dict_unref (dict); return 0; err: dht_selfheal_dir_finish (frame, this, -1, 0); return 0; } int dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, dht_layout_t *layout, int force) { int missing_dirs = 0; int i = 0; int ret = -1; int count = 1; dht_local_t *local = NULL; dht_conf_t *conf = NULL; xlator_t *this = NULL; dht_lock_t **lk_array = NULL; local = frame->local; this = frame->this; conf = this->private; local->selfheal.force_mkdir = force ? _gf_true : _gf_false; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err == ENOENT || force) missing_dirs++; } if (missing_dirs == 0) { dht_selfheal_dir_setattr (frame, loc, &local->stbuf, 0xffffffff, layout); return 0; } local->call_cnt = missing_dirs; count = conf->subvolume_cnt; /* Locking on all subvols in the mkdir phase of lookup selfheal is is done to synchronize with rmdir/rename. */ lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_char); if (lk_array == NULL) goto err; for (i = 0; i < count; i++) { lk_array[i] = dht_lock_new (frame->this, conf->subvolumes[i], &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN); if (lk_array[i] == NULL) goto err; } local->lock.locks = lk_array; local->lock.lk_count = count; ret = dht_blocking_inodelk (frame, lk_array, count, IGNORE_ENOENT_ESTALE, dht_selfheal_dir_mkdir_lock_cbk); if (ret < 0) { local->lock.locks = NULL; local->lock.lk_count = 0; goto err; } return 0; err: if (lk_array != NULL) { dht_lock_array_free (lk_array, count); GF_FREE (lk_array); } return -1; } int dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc, dht_layout_t *layout) { int start = 0; uint32_t hashval = 0; int ret = 0; const char *str = NULL; dht_conf_t *conf = NULL; char buf[UUID_CANONICAL_FORM_LEN + 1] = {0, }; conf = this->private; if (conf->randomize_by_gfid) { str = uuid_utoa_r (loc->gfid, buf); } else { str = loc->path; } ret = dht_hash_compute (this, layout->type, str, &hashval); if (ret == 0) { start = (hashval % layout->cnt); } return start; } static int dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) { int i = 0; int j = 0; int err = 0; int count = 0; dht_conf_t *conf = NULL; /* Gets in use only for replace-brick, remove-brick */ conf = this->private; for (i = 0; i < layout->cnt; i++) { for (j = 0; j < conf->subvolume_cnt; j++) { if (conf->decommissioned_bricks[j] && conf->decommissioned_bricks[j] == layout->list[i].xlator) { layout->list[i].err = EINVAL; break; } } } for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; if (err == -1 || err == 0 || err == ENOENT) { /* Take this with a pinch of salt. The behaviour seems * to be slightly different when this function is * invoked from mkdir codepath. For eg., err == 0 in * mkdir codepath means directory created but xattr * is not set yet. */ /* Setting list[i].err = -1 is an indication for dht_selfheal_layout_new_directory() to assign a range. We set it to -1 based on any one of the three criteria: - err == -1 already, which means directory existed but layout was not set on it. - err == 0, which means directory exists and has an old layout piece which will be overwritten now. - err == ENOENT, which means directory does not exist (possibly racing with mkdir or finishing half done mkdir). The missing directory will be attempted to be recreated. */ count++; if (!err) layout->list[i].err = -1; } } /* no subvolume has enough space, but can't stop directory creation */ if (!count || !new_layout) { for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; if (err == ENOSPC) { layout->list[i].err = -1; count++; } } } /* if layout->spread_cnt is set, check if it is <= available * subvolumes (down brick and decommissioned bricks are considered * un-availbale). Else return count (available up bricks) */ count = ((layout->spread_cnt && (layout->spread_cnt <= count)) ? layout->spread_cnt : ((count) ? count : 1)); return count; } void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, dht_layout_t *new_layout); void dht_layout_entry_swap (dht_layout_t *layout, int i, int j); void dht_layout_range_swap (dht_layout_t *layout, int i, int j); /* * It's a bit icky using local variables in a macro, but it makes the rest * of the code a lot clearer. */ #define OV_ENTRY(x,y) table[x*new->cnt+y] void dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc, dht_layout_t *new, dht_layout_t *old) { int i = 0; int j = 0; uint32_t curr_overlap = 0; uint32_t max_overlap = 0; int max_overlap_idx = -1; uint32_t overlap = 0; uint32_t *table = NULL; dht_layout_sort_volname (old); /* Now both old_layout->list[] and new_layout->list[] are match the same xlators/subvolumes. i.e, old_layout->[i] and new_layout->[i] are referring to the same subvolumes */ /* Build a table of overlaps between new[i] and old[j]. */ table = alloca(sizeof(overlap)*old->cnt*new->cnt); if (!table) { return; } memset(table,0,sizeof(overlap)*old->cnt*new->cnt); for (i = 0; i < new->cnt; ++i) { for (j = 0; j < old->cnt; ++j) { OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i); } } for (i = 0; i < new->cnt; i++) { if (new->list[i].err > 0) { /* Subvol might be marked for decommission with EINVAL, or some other serious error marked with positive errno. */ continue; } max_overlap = 0; max_overlap_idx = i; for (j = (i + 1); j < new->cnt; ++j) { if (new->list[j].err > 0) { /* Subvol might be marked for decommission with EINVAL, or some other serious error marked with positive errno. */ continue; } /* Calculate the overlap now. */ curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j); /* Calculate the overlap after the proposed swap. */ overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i); /* Are we better than status quo? */ if (overlap > curr_overlap) { overlap -= curr_overlap; /* Are we better than the previous choice? */ if (overlap > max_overlap) { max_overlap = overlap; max_overlap_idx = j; } } } if (max_overlap_idx != i) { dht_layout_range_swap (new, i, max_overlap_idx); /* Need to swap the table values too. */ for (j = 0; j < old->cnt; ++j) { overlap = OV_ENTRY(i,j); OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j); OV_ENTRY(max_overlap_idx,j) = overlap; } } } } dht_layout_t * dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { int i = 0; xlator_t *this = NULL; dht_layout_t *new_layout = NULL; dht_conf_t *priv = NULL; dht_local_t *local = NULL; uint32_t subvol_down = 0; int ret = 0; this = frame->this; priv = this->private; local = frame->local; if (layout->type == DHT_HASH_TYPE_DM_USER) { gf_msg_debug (THIS->name, 0, "leaving %s alone", loc->path); goto done; } new_layout = dht_layout_new (this, priv->subvolume_cnt); if (!new_layout) goto done; /* If a subvolume is down, do not re-write the layout. */ ret = dht_layout_anomalies (this, loc, layout, NULL, NULL, NULL, &subvol_down, NULL, NULL); if (subvol_down || (ret == -1)) { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_FIX_FAILED, "Layout fix failed: %u subvolume(s) are down" ". Skipping fix layout.", subvol_down); GF_FREE (new_layout); return NULL; } for (i = 0; i < new_layout->cnt; i++) { if (layout->list[i].err != ENOSPC) new_layout->list[i].err = layout->list[i].err; else new_layout->list[i].err = -1; new_layout->list[i].xlator = layout->list[i].xlator; } new_layout->commit_hash = layout->commit_hash; if (priv->du_stats) { for (i = 0; i < priv->subvolume_cnt; ++i) { gf_msg (this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, "subvolume %d (%s): %u chunks", i, priv->subvolumes[i]->name, priv->du_stats[i].chunks); } } else { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_DISK_USAGE_STATUS, "no du stats ?!?"); } /* First give it a layout as though it is a new directory. This ensures rotation to kick in */ dht_layout_sort_volname (new_layout); dht_selfheal_layout_new_directory (frame, loc, new_layout); /* Now selectively re-assign ranges only when it helps */ dht_selfheal_layout_maximize_overlap (frame, loc, new_layout, layout); done: if (new_layout) { /* Now that the new layout has all the proper layout, change the inode context */ dht_layout_set (this, loc->inode, new_layout); /* Make sure the extra 'ref' for existing layout is removed */ dht_layout_unref (this, local->layout); local->layout = new_layout; } return local->layout; } /* * Having to call this 2x for each entry in the layout is pretty horrible, but * that's what all of this layout-sorting nonsense gets us. */ uint32_t dht_get_chunks_from_xl (xlator_t *parent, xlator_t *child) { dht_conf_t *priv = parent->private; xlator_list_t *trav; uint32_t index = 0; if (!priv->du_stats) { return 0; } for (trav = parent->children; trav; trav = trav->next) { if (trav->xlator == child) { return priv->du_stats[index].chunks; } ++index; } return 0; } void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { xlator_t *this = NULL; uint32_t chunk = 0; int i = 0; uint32_t start = 0; int bricks_to_use = 0; int err = 0; int start_subvol = 0; uint32_t curr_size; uint32_t total_size = 0; int real_i; dht_conf_t *priv; gf_boolean_t weight_by_size; int bricks_used = 0; this = frame->this; priv = this->private; weight_by_size = priv->do_weighting; bricks_to_use = dht_get_layout_count (this, layout, 1); GF_ASSERT (bricks_to_use > 0); bricks_used = 0; for (i = 0; i < layout->cnt; ++i) { err = layout->list[i].err; if ((err != -1) && (err != ENOENT)) { continue; } curr_size = dht_get_chunks_from_xl (this, layout->list[i].xlator); if (!curr_size) { weight_by_size = _gf_false; break; } total_size += curr_size; if (++bricks_used >= bricks_to_use) { break; } } if (weight_by_size && total_size) { /* We know total_size is not zero. */ chunk = ((unsigned long) 0xffffffff) / total_size; gf_msg_debug (this->name, 0, "chunk size = 0xffffffff / %u = 0x%x", total_size, chunk); } else { weight_by_size = _gf_false; chunk = ((unsigned long) 0xffffffff) / bricks_to_use; } start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); /* clear out the range, as we are re-computing here */ DHT_RESET_LAYOUT_RANGE (layout); /* * OK, what's this "real_i" stuff about? This used to be two loops - * from start_subvol to layout->cnt-1, then from 0 to start_subvol-1. * That way is practically an open invitation to bugs when only one * of the loops is updated. Using real_i and modulo operators to make * it one loop avoids this problem. Remember, folks: it's everyone's * responsibility to help stamp out copy/paste abuse. */ bricks_used = 0; for (real_i = 0; real_i < layout->cnt; real_i++) { i = (real_i + start_subvol) % layout->cnt; err = layout->list[i].err; if ((err != -1) && (err != ENOENT)) { continue; } if (weight_by_size) { curr_size = dht_get_chunks_from_xl (this, layout->list[i].xlator); if (!curr_size) { continue; } } else { curr_size = 1; } gf_msg_debug (this->name, 0, "assigning range size 0x%x to %s", chunk * curr_size, layout->list[i].xlator->name); DHT_SET_LAYOUT_RANGE(layout, i, start, chunk * curr_size, loc->path); if (++bricks_used >= bricks_to_use) { layout->list[i].stop = 0xffffffff; goto done; } start += (chunk * curr_size); } done: return; } int dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { dht_local_t *local = NULL; uint32_t holes = 0; int ret = -1; int i = -1; uint32_t overlaps = 0; local = frame->local; holes = local->selfheal.hole_cnt; overlaps = local->selfheal.overlaps_cnt; if (holes || overlaps) { /* If the layout has anomolies which would change the hash * ranges, then we need to reset the commit_hash for this * directory, as the layout would change and things may not * be in place as expected */ layout->commit_hash = DHT_LAYOUT_HASH_INVALID; dht_selfheal_layout_new_directory (frame, loc, layout); ret = 0; } for (i = 0; i < layout->cnt; i++) { /* directory not present */ if (layout->list[i].err == ENOENT) { ret = 0; break; } } /* TODO: give a fix to these non-virgins */ return ret; } int dht_selfheal_new_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, dht_layout_t *layout) { dht_local_t *local = NULL; int ret = 0; local = frame->local; local->selfheal.dir_cbk = dir_cbk; local->selfheal.layout = dht_layout_ref (frame->this, layout); dht_layout_sort_volname (layout); dht_selfheal_layout_new_directory (frame, &local->loc, layout); ret = dht_selfheal_layout_lock (frame, layout, _gf_true, dht_selfheal_dir_xattr, dht_should_heal_layout); if (ret < 0) { dir_cbk (frame, NULL, frame->this, -1, ENOMEM, NULL); } return 0; } int dht_fix_directory_layout (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, dht_layout_t *layout) { dht_local_t *local = NULL; dht_layout_t *tmp_layout = NULL; int ret = 0; local = frame->local; local->selfheal.dir_cbk = dir_cbk; local->selfheal.layout = dht_layout_ref (frame->this, layout); /* No layout sorting required here */ tmp_layout = dht_fix_layout_of_directory (frame, &local->loc, layout); if (!tmp_layout) { return -1; } ret = dht_selfheal_layout_lock (frame, tmp_layout, _gf_false, dht_fix_dir_xattr, dht_should_fix_layout); return ret; } int dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, loc_t *loc, dht_layout_t *layout) { dht_local_t *local = NULL; uint32_t down = 0; uint32_t misc = 0; int ret = 0; xlator_t *this = NULL; char gfid[GF_UUID_BUF_SIZE] = {0}; local = frame->local; this = frame->this; gf_uuid_unparse(loc->gfid, gfid); dht_layout_anomalies (this, loc, layout, &local->selfheal.hole_cnt, &local->selfheal.overlaps_cnt, NULL, &local->selfheal.down, &local->selfheal.misc, NULL); down = local->selfheal.down; misc = local->selfheal.misc; local->selfheal.dir_cbk = dir_cbk; local->selfheal.layout = dht_layout_ref (this, layout); if (down) { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_FAILED, "Directory selfheal failed: %d subvolumes down." "Not fixing. path = %s, gfid = %s", down, loc->path, gfid); ret = 0; goto sorry_no_fix; } if (misc) { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_FAILED, "Directory selfheal failed : %d subvolumes " "have unrecoverable errors. path = %s, gfid = %s", misc, loc->path, gfid); ret = 0; goto sorry_no_fix; } dht_layout_sort_volname (layout); ret = dht_selfheal_dir_getafix (frame, loc, layout); if (ret == -1) { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_FAILED, "Directory selfheal failed: " "Unable to form layout for directory %s", loc->path); goto sorry_no_fix; } dht_selfheal_dir_mkdir (frame, loc, layout, 0); return 0; sorry_no_fix: /* TODO: need to put appropriate local->op_errno */ dht_selfheal_dir_finish (frame, this, ret, 0); return 0; } int dht_selfheal_directory_for_nameless_lookup (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, loc_t *loc, dht_layout_t *layout) { dht_local_t *local = NULL; uint32_t down = 0; uint32_t misc = 0; int ret = 0; xlator_t *this = NULL; local = frame->local; this = frame->this; dht_layout_anomalies (this, loc, layout, &local->selfheal.hole_cnt, &local->selfheal.overlaps_cnt, NULL, &local->selfheal.down, &local->selfheal.misc, NULL); down = local->selfheal.down; misc = local->selfheal.misc; local->selfheal.dir_cbk = dir_cbk; local->selfheal.layout = dht_layout_ref (this, layout); if (down) { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_DOWN_ERROR, "%d subvolumes down -- not fixing", down); ret = 0; goto sorry_no_fix; } if (misc) { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_ERROR, "%d subvolumes have unrecoverable errors", misc); ret = 0; goto sorry_no_fix; } dht_layout_sort_volname (layout); ret = dht_selfheal_dir_getafix (frame, loc, layout); if (ret == -1) { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_FORM_FAILED, "not able to form layout for the directory"); goto sorry_no_fix; } ret = dht_selfheal_layout_lock (frame, layout, _gf_false, dht_selfheal_dir_xattr_for_nameless_lookup, dht_should_heal_layout); if (ret < 0) { goto sorry_no_fix; } return 0; sorry_no_fix: /* TODO: need to put appropriate local->op_errno */ dht_selfheal_dir_finish (frame, this, ret, 0); return 0; } int dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, loc_t *loc, dht_layout_t *layout) { int ret = 0; dht_local_t *local = NULL; local = frame->local; local->selfheal.dir_cbk = dir_cbk; local->selfheal.layout = dht_layout_ref (frame->this, layout); ret = dht_selfheal_dir_mkdir (frame, loc, layout, 1); return ret; } int dht_dir_attr_heal (void *data) { call_frame_t *frame = NULL; dht_local_t *local = NULL; xlator_t *subvol = NULL; xlator_t *this = NULL; dht_conf_t *conf = NULL; int call_cnt = 0; int ret = -1; int i = 0; char gfid[GF_UUID_BUF_SIZE] = {0}; GF_VALIDATE_OR_GOTO ("dht", data, out); frame = data; local = frame->local; this = frame->this; GF_VALIDATE_OR_GOTO ("dht", this, out); GF_VALIDATE_OR_GOTO ("dht", local, out); conf = this->private; GF_VALIDATE_OR_GOTO ("dht", conf, out); call_cnt = conf->subvolume_cnt; for (i = 0; i < call_cnt; i++) { subvol = conf->subvolumes[i]; if (!subvol || (subvol == dht_first_up_subvol (this))) continue; ret = syncop_setattr (subvol, &local->loc, &local->stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL, NULL, NULL, NULL); if (ret) { gf_uuid_unparse(local->loc.gfid, gfid); gf_msg ("dht", GF_LOG_ERROR, -ret, DHT_MSG_DIR_ATTR_HEAL_FAILED, "Directory attr heal failed. Failed to set" " uid/gid on path %s on subvol %s, gfid = %s ", local->loc.path, subvol->name, gfid); } } out: return 0; } int dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data) { DHT_STACK_DESTROY (sync_frame); return 0; } /* EXIT: dht_update_commit_hash_for_layout */ int dht_update_commit_hash_for_layout_done (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { dht_local_t *local = NULL; local = frame->local; /* preserve oldest error */ if (op_ret && !local->op_ret) { local->op_ret = op_ret; local->op_errno = op_errno; } DHT_STACK_UNWIND (setxattr, frame, local->op_ret, local->op_errno, NULL); return 0; } int dht_update_commit_hash_for_layout_unlock (call_frame_t *frame, xlator_t *this) { dht_local_t *local = NULL; int ret = 0; local = frame->local; ret = dht_unlock_inodelk (frame, local->lock.locks, local->lock.lk_count, dht_update_commit_hash_for_layout_done); if (ret < 0) { /* preserve oldest error, just ... */ if (!local->op_ret) { local->op_errno = errno; local->op_ret = -1; } gf_msg (this->name, GF_LOG_WARNING, errno, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "Winding unlock failed: stale locks left on brick" " %s", local->loc.path); dht_update_commit_hash_for_layout_done (frame, NULL, this, 0, 0, NULL); } return 0; } int dht_update_commit_hash_for_layout_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; local = frame->local; LOCK (&frame->lock); /* store first failure, just because */ if (op_ret && !local->op_ret) { local->op_ret = op_ret; local->op_errno = op_errno; } UNLOCK (&frame->lock); this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { dht_update_commit_hash_for_layout_unlock (frame, this); } return 0; } int dht_update_commit_hash_for_layout_resume (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { dht_local_t *local = NULL; int count = 1, ret = -1, i = 0, j = 0; dht_conf_t *conf = NULL; dht_layout_t *layout = NULL; int32_t *disk_layout = NULL; dict_t **xattr = NULL; local = frame->local; conf = frame->this->private; count = conf->local_subvols_cnt; layout = local->layout; if (op_ret < 0) { goto err_done; } /* We precreate the xattr list as we cannot change call count post the * first wind as we may never continue from there. So we finish prep * work before winding the setxattrs */ xattr = GF_CALLOC (count, sizeof (*xattr), gf_common_mt_char); if (!xattr) { local->op_errno = errno; gf_msg (this->name, GF_LOG_WARNING, errno, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "Directory commit hash update failed:" " %s: Allocation failed", local->loc.path); goto err; } for (i = 0; i < count; i++) { /* find the layout index for the subvolume */ ret = dht_layout_index_for_subvol (layout, conf->local_subvols[i]); if (ret < 0) { local->op_errno = ENOENT; gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "Directory commit hash update failed:" " %s: (subvol %s) Failed to find disk layout", local->loc.path, conf->local_subvols[i]->name); goto err; } j = ret; /* update the commit hash for the layout */ layout->list[j].commit_hash = layout->commit_hash; /* extract the current layout */ ret = dht_disk_layout_extract (this, layout, j, &disk_layout); if (ret == -1) { local->op_errno = errno; gf_msg (this->name, GF_LOG_WARNING, errno, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "Directory commit hash update failed:" " %s: (subvol %s) Failed to extract disk" " layout", local->loc.path, conf->local_subvols[i]->name); goto err; } xattr[i] = get_new_dict (); if (!xattr[i]) { local->op_errno = errno; gf_msg (this->name, GF_LOG_WARNING, errno, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "Directory commit hash update failed:" " %s: Allocation failed", local->loc.path); goto err; } ret = dict_set_bin (xattr[i], conf->xattr_name, disk_layout, 4 * 4); if (ret != 0) { local->op_errno = ENOMEM; gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "Directory self heal xattr failed:" "%s: (subvol %s) Failed to set xattr" " dictionary,", local->loc.path, conf->local_subvols[i]->name); GF_FREE (disk_layout); goto err; } disk_layout = NULL; gf_msg_trace (this->name, 0, "setting commit hash %u on subvolume %s" " for %s", layout->list[j].commit_hash, conf->local_subvols[i]->name, local->loc.path); } /* wind the setting of the commit hash across the local subvols */ local->call_cnt = count; local->op_ret = 0; local->op_errno = 0; for (i = 0; i < count; i++) { dict_ref (xattr[i]); STACK_WIND (frame, dht_update_commit_hash_for_layout_cbk, conf->local_subvols[i], conf->local_subvols[i]->fops->setxattr, &local->loc, xattr[i], 0, NULL); dict_unref (xattr[i]); } return 0; err: if (xattr) { for (i = 0; i < count; i++) { if (xattr[i]) dict_destroy (xattr[i]); } GF_FREE (xattr); } GF_FREE (disk_layout); local->op_ret = -1; dht_update_commit_hash_for_layout_unlock (frame, this); return 0; err_done: local->op_ret = -1; dht_update_commit_hash_for_layout_done (frame, NULL, this, 0, 0, NULL); return 0; } /* ENTER: dht_update_commit_hash_for_layout (see EXIT above) * This function is invoked from rebalance only. * As a result, the check here is simple enough to see if defrag is present * in the conf, as other data would be populated appropriately if so. * If ever this was to be used in other code paths, checks would need to * change. * * Functional details: * - Lock the inodes on the subvols that we want the commit hash updated * - Update each layout with the inode layout, modified to take in the new * commit hash. * - Unlock and return. */ int dht_update_commit_hash_for_layout (call_frame_t *frame) { dht_local_t *local = NULL; int count = 1, ret = -1, i = 0; dht_lock_t **lk_array = NULL; dht_conf_t *conf = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err); local = frame->local; conf = frame->this->private; if (!conf->defrag) goto err; count = conf->local_subvols_cnt; lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_char); if (lk_array == NULL) goto err; for (i = 0; i < count; i++) { lk_array[i] = dht_lock_new (frame->this, conf->local_subvols[i], &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN); if (lk_array[i] == NULL) goto err; } local->lock.locks = lk_array; local->lock.lk_count = count; ret = dht_blocking_inodelk (frame, lk_array, count, FAIL_ON_ANY_ERROR, dht_update_commit_hash_for_layout_resume); if (ret < 0) { local->lock.locks = NULL; local->lock.lk_count = 0; goto err; } return 0; err: if (lk_array != NULL) { dht_lock_array_free (lk_array, count); GF_FREE (lk_array); } return -1; }