/* Copyright (c) 2015 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser General Public License, version 3 or any later version (LGPLv3 or later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ #include #include "shard.h" #include "shard-mem-types.h" #include "byte-order.h" #include "defaults.h" #include "statedump.h" static gf_boolean_t __is_shard_dir(uuid_t gfid) { shard_priv_t *priv = THIS->private; if (gf_uuid_compare(gfid, priv->dot_shard_gfid) == 0) return _gf_true; return _gf_false; } static gf_boolean_t __is_gsyncd_on_shard_dir(call_frame_t *frame, loc_t *loc) { if (frame->root->pid == GF_CLIENT_PID_GSYNCD && (__is_shard_dir(loc->pargfid) || (loc->parent && __is_shard_dir(loc->parent->gfid)))) return _gf_true; return _gf_false; } void shard_make_block_bname(int block_num, uuid_t gfid, char *buf, size_t len) { char gfid_str[GF_UUID_BUF_SIZE] = { 0, }; gf_uuid_unparse(gfid, gfid_str); snprintf(buf, len, "%s.%d", gfid_str, block_num); } void shard_make_block_abspath(int block_num, uuid_t gfid, char *filepath, size_t len) { char gfid_str[GF_UUID_BUF_SIZE] = { 0, }; gf_uuid_unparse(gfid, gfid_str); snprintf(filepath, len, "/%s/%s.%d", GF_SHARD_DIR, gfid_str, block_num); } int __shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx) { int ret = -1; uint64_t ctx_uint = 0; shard_inode_ctx_t *ctx_p = NULL; ret = __inode_ctx_get(inode, this, &ctx_uint); if (ret == 0) { *ctx = (shard_inode_ctx_t *)ctx_uint; return ret; } ctx_p = GF_CALLOC(1, sizeof(*ctx_p), gf_shard_mt_inode_ctx_t); if (!ctx_p) return ret; INIT_LIST_HEAD(&ctx_p->ilist); INIT_LIST_HEAD(&ctx_p->to_fsync_list); ret = __inode_ctx_set(inode, this, (uint64_t *)&ctx_p); if (ret < 0) { GF_FREE(ctx_p); return ret; } *ctx = ctx_p; return ret; } int shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx) { int ret = 0; LOCK(&inode->lock); { 
ret = __shard_inode_ctx_get(inode, this, ctx); } UNLOCK(&inode->lock); return ret; } int __shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, uint64_t block_size, int32_t valid) { int ret = -1; shard_inode_ctx_t *ctx = NULL; ret = __shard_inode_ctx_get(inode, this, &ctx); if (ret) return ret; if (valid & SHARD_MASK_BLOCK_SIZE) ctx->block_size = block_size; if (valid & SHARD_MASK_PROT) ctx->stat.ia_prot = stbuf->ia_prot; if (valid & SHARD_MASK_NLINK) ctx->stat.ia_nlink = stbuf->ia_nlink; if (valid & SHARD_MASK_UID) ctx->stat.ia_uid = stbuf->ia_uid; if (valid & SHARD_MASK_GID) ctx->stat.ia_gid = stbuf->ia_gid; if (valid & SHARD_MASK_SIZE) ctx->stat.ia_size = stbuf->ia_size; if (valid & SHARD_MASK_BLOCKS) ctx->stat.ia_blocks = stbuf->ia_blocks; if (valid & SHARD_MASK_TIMES) { SHARD_TIME_UPDATE(ctx->stat.ia_mtime, ctx->stat.ia_mtime_nsec, stbuf->ia_mtime, stbuf->ia_mtime_nsec); SHARD_TIME_UPDATE(ctx->stat.ia_ctime, ctx->stat.ia_ctime_nsec, stbuf->ia_ctime, stbuf->ia_ctime_nsec); SHARD_TIME_UPDATE(ctx->stat.ia_atime, ctx->stat.ia_atime_nsec, stbuf->ia_atime, stbuf->ia_atime_nsec); } if (valid & SHARD_MASK_OTHERS) { ctx->stat.ia_ino = stbuf->ia_ino; gf_uuid_copy(ctx->stat.ia_gfid, stbuf->ia_gfid); ctx->stat.ia_dev = stbuf->ia_dev; ctx->stat.ia_type = stbuf->ia_type; ctx->stat.ia_rdev = stbuf->ia_rdev; ctx->stat.ia_blksize = stbuf->ia_blksize; } if (valid & SHARD_MASK_REFRESH_RESET) ctx->refresh = _gf_false; return 0; } int shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, uint64_t block_size, int32_t valid) { int ret = -1; LOCK(&inode->lock); { ret = __shard_inode_ctx_set(inode, this, stbuf, block_size, valid); } UNLOCK(&inode->lock); return ret; } int __shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) { int ret = -1; shard_inode_ctx_t *ctx = NULL; ret = __shard_inode_ctx_get(inode, this, &ctx); if (ret) return ret; ctx->refresh = _gf_true; return 0; } int shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t 
*this) { int ret = -1; LOCK(&inode->lock); { ret = __shard_inode_ctx_set_refresh_flag(inode, this); } UNLOCK(&inode->lock); return ret; } int __shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) { int ret = -1; shard_inode_ctx_t *ctx = NULL; ret = __shard_inode_ctx_get(inode, this, &ctx); if (ret) return ret; ctx->refreshed = _gf_true; return 0; } int shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) { int ret = -1; LOCK(&inode->lock); { ret = __shard_inode_ctx_mark_dir_refreshed(inode, this); } UNLOCK(&inode->lock); return ret; } int __shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, inode_t *shard_inode) { int ret = -1; shard_inode_ctx_t *base_ictx = NULL; shard_inode_ctx_t *shard_ictx = NULL; ret = __shard_inode_ctx_get(base_inode, this, &base_ictx); if (ret) return ret; ret = __shard_inode_ctx_get(shard_inode, this, &shard_ictx); if (ret) return ret; if (shard_ictx->fsync_needed) { shard_ictx->fsync_needed++; return 1; } list_add_tail(&shard_ictx->to_fsync_list, &base_ictx->to_fsync_list); shard_ictx->inode = shard_inode; shard_ictx->fsync_needed++; base_ictx->fsync_count++; shard_ictx->base_inode = base_inode; return 0; } int shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, inode_t *shard_inode) { int ret = -1; /* This ref acts as a refkeepr on the base inode. We * need to keep this inode alive as it holds the head * of the to_fsync_list. */ inode_ref(base_inode); inode_ref(shard_inode); LOCK(&base_inode->lock); LOCK(&shard_inode->lock); { ret = __shard_inode_ctx_add_to_fsync_list(base_inode, this, shard_inode); } UNLOCK(&shard_inode->lock); UNLOCK(&base_inode->lock); /* Unref the base inode corresponding to the ref above, if the shard is * found to be already part of the fsync list. 
*/ if (ret != 0) { inode_unref(base_inode); inode_unref(shard_inode); } return ret; } gf_boolean_t __shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) { int ret = -1; shard_inode_ctx_t *ctx = NULL; ret = __shard_inode_ctx_get(inode, this, &ctx); /* If inode ctx get fails, better to err on the side of caution and * try again? Unless the failure is due to mem-allocation. */ if (ret) return _gf_true; return !ctx->refreshed; } gf_boolean_t shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) { gf_boolean_t flag = _gf_false; LOCK(&inode->lock); { flag = __shard_inode_ctx_needs_lookup(inode, this); } UNLOCK(&inode->lock); return flag; } int __shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf) { int ret = -1; shard_inode_ctx_t *ctx = NULL; ret = __shard_inode_ctx_get(inode, this, &ctx); if (ret) return ret; if ((stbuf->ia_size != ctx->stat.ia_size) || (stbuf->ia_blocks != ctx->stat.ia_blocks)) ctx->refresh = _gf_true; return 0; } int shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf) { int ret = -1; LOCK(&inode->lock); { ret = __shard_inode_ctx_invalidate(inode, this, stbuf); } UNLOCK(&inode->lock); return ret; } int __shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, uint64_t *block_size) { int ret = -1; uint64_t ctx_uint = 0; shard_inode_ctx_t *ctx = NULL; ret = __inode_ctx_get(inode, this, &ctx_uint); if (ret < 0) return ret; ctx = (shard_inode_ctx_t *)ctx_uint; *block_size = ctx->block_size; return 0; } int shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, uint64_t *block_size) { int ret = -1; LOCK(&inode->lock); { ret = __shard_inode_ctx_get_block_size(inode, this, block_size); } UNLOCK(&inode->lock); return ret; } int __shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, int *fsync_count) { int ret = -1; uint64_t ctx_uint = 0; shard_inode_ctx_t *ctx = NULL; ret = __inode_ctx_get(inode, this, &ctx_uint); if (ret < 0) return ret; ctx = (shard_inode_ctx_t 
*)ctx_uint; *fsync_count = ctx->fsync_needed; return 0; } int shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, int *fsync_count) { int ret = -1; LOCK(&inode->lock); { ret = __shard_inode_ctx_get_fsync_count(inode, this, fsync_count); } UNLOCK(&inode->lock); return ret; } int __shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, shard_inode_ctx_t *ctx_out) { int ret = -1; uint64_t ctx_uint = 0; shard_inode_ctx_t *ctx = NULL; ret = __inode_ctx_get(inode, this, &ctx_uint); if (ret < 0) return ret; ctx = (shard_inode_ctx_t *)ctx_uint; memcpy(ctx_out, ctx, sizeof(shard_inode_ctx_t)); return 0; } int shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, shard_inode_ctx_t *ctx_out) { int ret = -1; LOCK(&inode->lock); { ret = __shard_inode_ctx_get_all(inode, this, ctx_out); } UNLOCK(&inode->lock); return ret; } int __shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, struct iatt *buf, gf_boolean_t *need_refresh) { int ret = -1; uint64_t ctx_uint = 0; shard_inode_ctx_t *ctx = NULL; ret = __inode_ctx_get(inode, this, &ctx_uint); if (ret < 0) return ret; ctx = (shard_inode_ctx_t *)ctx_uint; if (ctx->refresh == _gf_false) *buf = ctx->stat; else *need_refresh = _gf_true; return 0; } int shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, struct iatt *buf, gf_boolean_t *need_refresh) { int ret = -1; LOCK(&inode->lock); { ret = __shard_inode_ctx_fill_iatt_from_cache(inode, this, buf, need_refresh); } UNLOCK(&inode->lock); return ret; } void shard_local_wipe(shard_local_t *local) { int i = 0; int count = 0; count = local->num_blocks; syncbarrier_destroy(&local->barrier); loc_wipe(&local->loc); loc_wipe(&local->dot_shard_loc); loc_wipe(&local->dot_shard_rm_loc); loc_wipe(&local->loc2); loc_wipe(&local->tmp_loc); loc_wipe(&local->int_inodelk.loc); loc_wipe(&local->int_entrylk.loc); loc_wipe(&local->newloc); if (local->int_entrylk.basename) GF_FREE(local->int_entrylk.basename); if (local->fd) fd_unref(local->fd); if 
(local->xattr_req) dict_unref(local->xattr_req); if (local->xattr_rsp) dict_unref(local->xattr_rsp); for (i = 0; i < count; i++) { if (!local->inode_list) break; if (local->inode_list[i]) inode_unref(local->inode_list[i]); } GF_FREE(local->inode_list); GF_FREE(local->vector); if (local->iobref) iobref_unref(local->iobref); if (local->list_inited) gf_dirent_free(&local->entries_head); if (local->inodelk_frame) SHARD_STACK_DESTROY(local->inodelk_frame); if (local->entrylk_frame) SHARD_STACK_DESTROY(local->entrylk_frame); } int shard_modify_size_and_block_count(struct iatt *stbuf, dict_t *dict) { int ret = -1; void *size_attr = NULL; uint64_t size_array[4]; ret = dict_get_ptr(dict, GF_XATTR_SHARD_FILE_SIZE, &size_attr); if (ret) { gf_msg_callingfn(THIS->name, GF_LOG_ERROR, 0, SHARD_MSG_INTERNAL_XATTR_MISSING, "Failed to " "get " GF_XATTR_SHARD_FILE_SIZE " for %s", uuid_utoa(stbuf->ia_gfid)); return ret; } memcpy(size_array, size_attr, sizeof(size_array)); stbuf->ia_size = ntoh64(size_array[0]); stbuf->ia_blocks = ntoh64(size_array[2]); return 0; } int shard_call_count_return(call_frame_t *frame) { int call_count = 0; shard_local_t *local = NULL; local = frame->local; LOCK(&frame->lock); { call_count = --local->call_count; } UNLOCK(&frame->lock); return call_count; } static char * shard_internal_dir_string(shard_internal_dir_type_t type) { char *str = NULL; switch (type) { case SHARD_INTERNAL_DIR_DOT_SHARD: str = GF_SHARD_DIR; break; case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: str = GF_SHARD_REMOVE_ME_DIR; break; default: break; } return str; } static int shard_init_internal_dir_loc(xlator_t *this, shard_local_t *local, shard_internal_dir_type_t type) { int ret = -1; char *bname = NULL; inode_t *parent = NULL; loc_t *internal_dir_loc = NULL; shard_priv_t *priv = NULL; priv = this->private; if (!local) return -1; switch (type) { case SHARD_INTERNAL_DIR_DOT_SHARD: internal_dir_loc = &local->dot_shard_loc; bname = GF_SHARD_DIR; parent = inode_ref(this->itable->root); 
break; case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: internal_dir_loc = &local->dot_shard_rm_loc; bname = GF_SHARD_REMOVE_ME_DIR; parent = inode_ref(priv->dot_shard_inode); break; default: break; } internal_dir_loc->inode = inode_new(this->itable); internal_dir_loc->parent = parent; ret = inode_path(internal_dir_loc->parent, bname, (char **)&internal_dir_loc->path); if (ret < 0 || !(internal_dir_loc->inode)) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, "Inode path failed on %s", bname); goto out; } internal_dir_loc->name = strrchr(internal_dir_loc->path, '/'); if (internal_dir_loc->name) internal_dir_loc->name++; ret = 0; out: return ret; } inode_t * __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this, inode_t *base_inode, int block_num, uuid_t gfid) { char block_bname[256] = { 0, }; inode_t *lru_inode = NULL; shard_priv_t *priv = NULL; shard_inode_ctx_t *ctx = NULL; shard_inode_ctx_t *lru_inode_ctx = NULL; shard_inode_ctx_t *lru_base_inode_ctx = NULL; inode_t *fsync_inode = NULL; inode_t *lru_base_inode = NULL; gf_boolean_t do_fsync = _gf_false; priv = this->private; shard_inode_ctx_get(linked_inode, this, &ctx); if (list_empty(&ctx->ilist)) { if (priv->inode_count + 1 <= priv->lru_limit) { /* If this inode was linked here for the first time (indicated * by empty list), and if there is still space in the priv list, * add this ctx to the tail of the list. */ /* For as long as an inode is in lru list, we try to * keep it alive by holding a ref on it. */ inode_ref(linked_inode); if (base_inode) gf_uuid_copy(ctx->base_gfid, base_inode->gfid); else gf_uuid_copy(ctx->base_gfid, gfid); ctx->block_num = block_num; list_add_tail(&ctx->ilist, &priv->ilist_head); priv->inode_count++; if (base_inode) ctx->base_inode = inode_ref(base_inode); } else { /*If on the other hand there is no available slot for this inode * in the list, delete the lru inode from the head of the list, * unlink it. 
And in its place add this new inode into the list. */ lru_inode_ctx = list_first_entry(&priv->ilist_head, shard_inode_ctx_t, ilist); GF_ASSERT(lru_inode_ctx->block_num > 0); lru_base_inode = lru_inode_ctx->base_inode; list_del_init(&lru_inode_ctx->ilist); lru_inode = inode_find(linked_inode->table, lru_inode_ctx->stat.ia_gfid); /* If the lru inode was part of the pending-fsync list, * the base inode needs to be unref'd, the lru inode * deleted from fsync list and fsync'd in a new frame, * and then unlinked in memory and forgotten. */ if (!lru_base_inode) goto after_fsync_check; LOCK(&lru_base_inode->lock); LOCK(&lru_inode->lock); { if (!list_empty(&lru_inode_ctx->to_fsync_list)) { list_del_init(&lru_inode_ctx->to_fsync_list); lru_inode_ctx->fsync_needed = 0; do_fsync = _gf_true; __shard_inode_ctx_get(lru_base_inode, this, &lru_base_inode_ctx); lru_base_inode_ctx->fsync_count--; } } UNLOCK(&lru_inode->lock); UNLOCK(&lru_base_inode->lock); after_fsync_check: if (!do_fsync) { shard_make_block_bname(lru_inode_ctx->block_num, lru_inode_ctx->base_gfid, block_bname, sizeof(block_bname)); /* The following unref corresponds to the ref held at * the time the shard was added to the lru list. */ inode_unref(lru_inode); inode_unlink(lru_inode, priv->dot_shard_inode, block_bname); inode_forget(lru_inode, 0); } else { /* The following unref corresponds to the ref * held when the shard was added to fsync list. */ inode_unref(lru_inode); fsync_inode = lru_inode; if (lru_base_inode) inode_unref(lru_base_inode); } /* The following unref corresponds to the ref * held by inode_find() above. */ inode_unref(lru_inode); /* The following unref corresponds to the ref held on the base shard * at the time of adding shard inode to lru list */ if (lru_base_inode) inode_unref(lru_base_inode); /* For as long as an inode is in lru list, we try to * keep it alive by holding a ref on it. 
*/ inode_ref(linked_inode); if (base_inode) gf_uuid_copy(ctx->base_gfid, base_inode->gfid); else gf_uuid_copy(ctx->base_gfid, gfid); ctx->block_num = block_num; if (base_inode) ctx->base_inode = inode_ref(base_inode); list_add_tail(&ctx->ilist, &priv->ilist_head); } } else { /* If this is not the first time this inode is being operated on, move * it to the most recently used end of the list. */ list_move_tail(&ctx->ilist, &priv->ilist_head); } return fsync_inode; } int shard_common_failure_unwind(glusterfs_fop_t fop, call_frame_t *frame, int32_t op_ret, int32_t op_errno) { switch (fop) { case GF_FOP_LOOKUP: SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, NULL, NULL, NULL, NULL); break; case GF_FOP_STAT: SHARD_STACK_UNWIND(stat, frame, op_ret, op_errno, NULL, NULL); break; case GF_FOP_FSTAT: SHARD_STACK_UNWIND(fstat, frame, op_ret, op_errno, NULL, NULL); break; case GF_FOP_TRUNCATE: SHARD_STACK_UNWIND(truncate, frame, op_ret, op_errno, NULL, NULL, NULL); break; case GF_FOP_FTRUNCATE: SHARD_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, NULL, NULL, NULL); break; case GF_FOP_MKNOD: SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, NULL); break; case GF_FOP_LINK: SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, NULL); break; case GF_FOP_CREATE: SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, NULL, NULL); break; case GF_FOP_UNLINK: SHARD_STACK_UNWIND(unlink, frame, op_ret, op_errno, NULL, NULL, NULL); break; case GF_FOP_RENAME: SHARD_STACK_UNWIND(rename, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, NULL, NULL); break; case GF_FOP_WRITE: SHARD_STACK_UNWIND(writev, frame, op_ret, op_errno, NULL, NULL, NULL); break; case GF_FOP_FALLOCATE: SHARD_STACK_UNWIND(fallocate, frame, op_ret, op_errno, NULL, NULL, NULL); break; case GF_FOP_ZEROFILL: SHARD_STACK_UNWIND(zerofill, frame, op_ret, op_errno, NULL, NULL, NULL); break; case GF_FOP_DISCARD: SHARD_STACK_UNWIND(discard, frame, op_ret, 
op_errno, NULL, NULL, NULL); break; case GF_FOP_READ: SHARD_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, -1, NULL, NULL, NULL); break; case GF_FOP_FSYNC: SHARD_STACK_UNWIND(fsync, frame, op_ret, op_errno, NULL, NULL, NULL); break; case GF_FOP_REMOVEXATTR: SHARD_STACK_UNWIND(removexattr, frame, op_ret, op_errno, NULL); break; case GF_FOP_FREMOVEXATTR: SHARD_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, NULL); break; case GF_FOP_FGETXATTR: SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, NULL, NULL); break; case GF_FOP_GETXATTR: SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, NULL, NULL); break; case GF_FOP_FSETXATTR: SHARD_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, NULL); break; case GF_FOP_SETXATTR: SHARD_STACK_UNWIND(setxattr, frame, op_ret, op_errno, NULL); break; case GF_FOP_SETATTR: SHARD_STACK_UNWIND(setattr, frame, op_ret, op_errno, NULL, NULL, NULL); break; case GF_FOP_FSETATTR: SHARD_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, NULL, NULL, NULL); break; case GF_FOP_SEEK: SHARD_STACK_UNWIND(seek, frame, op_ret, op_errno, 0, NULL); break; default: gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, "Invalid fop id = %d", fop); break; } return 0; } int shard_common_inode_write_success_unwind(glusterfs_fop_t fop, call_frame_t *frame, int32_t op_ret) { shard_local_t *local = NULL; local = frame->local; switch (fop) { case GF_FOP_WRITE: SHARD_STACK_UNWIND(writev, frame, op_ret, 0, &local->prebuf, &local->postbuf, local->xattr_rsp); break; case GF_FOP_FALLOCATE: SHARD_STACK_UNWIND(fallocate, frame, op_ret, 0, &local->prebuf, &local->postbuf, local->xattr_rsp); break; case GF_FOP_ZEROFILL: SHARD_STACK_UNWIND(zerofill, frame, op_ret, 0, &local->prebuf, &local->postbuf, local->xattr_rsp); break; case GF_FOP_DISCARD: SHARD_STACK_UNWIND(discard, frame, op_ret, 0, &local->prebuf, &local->postbuf, local->xattr_rsp); break; default: gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, "Invalid fop id = %d", fop); break; } 
return 0; } int shard_evicted_inode_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { char block_bname[256] = { 0, }; fd_t *anon_fd = cookie; inode_t *shard_inode = NULL; shard_inode_ctx_t *ctx = NULL; shard_priv_t *priv = NULL; priv = this->private; if (anon_fd == NULL || op_ret < 0) { gf_msg(this->name, GF_LOG_WARNING, op_errno, SHARD_MSG_MEMALLOC_FAILED, "fsync failed on shard"); goto out; } shard_inode = anon_fd->inode; LOCK(&priv->lock); LOCK(&shard_inode->lock); { __shard_inode_ctx_get(shard_inode, this, &ctx); if ((list_empty(&ctx->to_fsync_list)) && (list_empty(&ctx->ilist))) { shard_make_block_bname(ctx->block_num, shard_inode->gfid, block_bname, sizeof(block_bname)); inode_unlink(shard_inode, priv->dot_shard_inode, block_bname); /* The following unref corresponds to the ref held by * inode_link() at the time the shard was created or * looked up */ inode_unref(shard_inode); inode_forget(shard_inode, 0); } } UNLOCK(&shard_inode->lock); UNLOCK(&priv->lock); out: if (anon_fd) fd_unref(anon_fd); STACK_DESTROY(frame->root); return 0; } int shard_initiate_evicted_inode_fsync(xlator_t *this, inode_t *inode) { fd_t *anon_fd = NULL; call_frame_t *fsync_frame = NULL; fsync_frame = create_frame(this, this->ctx->pool); if (!fsync_frame) { gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, "Failed to create new frame " "to fsync shard"); return -1; } anon_fd = fd_anonymous(inode); if (!anon_fd) { gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, "Failed to create anon fd to" " fsync shard"); STACK_DESTROY(fsync_frame->root); return -1; } STACK_WIND_COOKIE(fsync_frame, shard_evicted_inode_fsync_cbk, anon_fd, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, anon_fd, 1, NULL); return 0; } int shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, shard_post_resolve_fop_handler_t post_res_handler) { int i = -1; uint32_t 
shard_idx_iter = 0; char path[PATH_MAX] = { 0, }; uuid_t gfid = { 0, }; inode_t *inode = NULL; inode_t *res_inode = NULL; inode_t *fsync_inode = NULL; shard_priv_t *priv = NULL; shard_local_t *local = NULL; priv = this->private; local = frame->local; local->call_count = 0; shard_idx_iter = local->first_block; res_inode = local->resolver_base_inode; if (res_inode) gf_uuid_copy(gfid, res_inode->gfid); else gf_uuid_copy(gfid, local->base_gfid); if ((local->op_ret < 0) || (local->resolve_not)) goto out; while (shard_idx_iter <= local->last_block) { i++; if (shard_idx_iter == 0) { local->inode_list[i] = inode_ref(res_inode); shard_idx_iter++; continue; } shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); inode = NULL; inode = inode_resolve(this->itable, path); if (inode) { gf_msg_debug(this->name, 0, "Shard %d already " "present. gfid=%s. Saving inode for future.", shard_idx_iter, uuid_utoa(inode->gfid)); local->inode_list[i] = inode; /* Let the ref on the inodes that are already present * in inode table still be held so that they don't get * forgotten by the time the fop reaches the actual * write stage. 
*/ LOCK(&priv->lock); { fsync_inode = __shard_update_shards_inode_list( inode, this, res_inode, shard_idx_iter, gfid); } UNLOCK(&priv->lock); shard_idx_iter++; if (fsync_inode) shard_initiate_evicted_inode_fsync(this, fsync_inode); continue; } else { local->call_count++; shard_idx_iter++; } } out: post_res_handler(frame, this); return 0; } int shard_update_file_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { inode_t *inode = NULL; shard_local_t *local = NULL; local = frame->local; if ((local->fd) && (local->fd->inode)) inode = local->fd->inode; else if (local->loc.inode) inode = local->loc.inode; if (op_ret < 0) { gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_UPDATE_FILE_SIZE_FAILED, "Update to file size" " xattr failed on %s", uuid_utoa(inode->gfid)); local->op_ret = op_ret; local->op_errno = op_errno; goto err; } if (shard_modify_size_and_block_count(&local->postbuf, dict)) { local->op_ret = -1; local->op_errno = ENOMEM; goto err; } err: local->post_update_size_handler(frame, this); return 0; } int shard_set_size_attrs(int64_t size, int64_t block_count, int64_t **size_attr_p) { int ret = -1; int64_t *size_attr = NULL; if (!size_attr_p) goto out; size_attr = GF_CALLOC(4, sizeof(int64_t), gf_shard_mt_int64_t); if (!size_attr) goto out; size_attr[0] = hton64(size); /* As sharding evolves, it _may_ be necessary to embed more pieces of * information within the same xattr. So allocating slots for them in * advance. For now, only bytes 0-63 and 128-191 which would make up the * current size and block count respectively of the file are valid. 
*/ size_attr[2] = hton64(block_count); *size_attr_p = size_attr; ret = 0; out: return ret; } int shard_update_file_size(call_frame_t *frame, xlator_t *this, fd_t *fd, loc_t *loc, shard_post_update_size_fop_handler_t handler) { int ret = -1; int64_t *size_attr = NULL; inode_t *inode = NULL; shard_local_t *local = NULL; dict_t *xattr_req = NULL; local = frame->local; local->post_update_size_handler = handler; xattr_req = dict_new(); if (!xattr_req) { local->op_ret = -1; local->op_errno = ENOMEM; goto out; } if (fd) inode = fd->inode; else inode = loc->inode; /* If both size and block count have not changed, then skip the xattrop. */ if ((local->delta_size + local->hole_size == 0) && (local->delta_blocks == 0)) { goto out; } ret = shard_set_size_attrs(local->delta_size + local->hole_size, local->delta_blocks, &size_attr); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SIZE_SET_FAILED, "Failed to set size attrs for %s", uuid_utoa(inode->gfid)); local->op_ret = -1; local->op_errno = ENOMEM; goto out; } ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_FILE_SIZE, size_attr, 8 * 4); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, "Failed to set key %s into dict. 
gfid=%s", GF_XATTR_SHARD_FILE_SIZE, uuid_utoa(inode->gfid)); GF_FREE(size_attr); local->op_ret = -1; local->op_errno = ENOMEM; goto out; } if (fd) STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fxattrop, fd, GF_XATTROP_ADD_ARRAY64, xattr_req, NULL); else STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop, loc, GF_XATTROP_ADD_ARRAY64, xattr_req, NULL); dict_unref(xattr_req); return 0; out: if (xattr_req) dict_unref(xattr_req); handler(frame, this); return 0; } static inode_t * shard_link_internal_dir_inode(shard_local_t *local, inode_t *inode, struct iatt *buf, shard_internal_dir_type_t type) { inode_t *linked_inode = NULL; shard_priv_t *priv = NULL; char *bname = NULL; inode_t **priv_inode = NULL; inode_t *parent = NULL; priv = THIS->private; switch (type) { case SHARD_INTERNAL_DIR_DOT_SHARD: bname = GF_SHARD_DIR; priv_inode = &priv->dot_shard_inode; parent = inode->table->root; break; case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: bname = GF_SHARD_REMOVE_ME_DIR; priv_inode = &priv->dot_shard_rm_inode; parent = priv->dot_shard_inode; break; default: break; } linked_inode = inode_link(inode, parent, bname, buf); inode_lookup(linked_inode); *priv_inode = linked_inode; return linked_inode; } int shard_refresh_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, struct iatt *postparent) { shard_local_t *local = NULL; inode_t *linked_inode = NULL; shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; local = frame->local; if (op_ret) { local->op_ret = op_ret; local->op_errno = op_errno; goto out; } /* To-Do: Fix refcount increment per call to * shard_link_internal_dir_inode(). 
*/ linked_inode = shard_link_internal_dir_inode(local, inode, buf, type); shard_inode_ctx_mark_dir_refreshed(linked_inode, this); out: shard_common_resolve_shards(frame, this, local->post_res_handler); return 0; } int shard_refresh_internal_dir(call_frame_t *frame, xlator_t *this, shard_internal_dir_type_t type) { loc_t loc = { 0, }; inode_t *inode = NULL; shard_priv_t *priv = NULL; shard_local_t *local = NULL; uuid_t gfid = { 0, }; local = frame->local; priv = this->private; switch (type) { case SHARD_INTERNAL_DIR_DOT_SHARD: gf_uuid_copy(gfid, priv->dot_shard_gfid); break; case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); break; default: break; } inode = inode_find(this->itable, gfid); if (!shard_inode_ctx_needs_lookup(inode, this)) { local->op_ret = 0; goto out; } /* Plain assignment because the ref is already taken above through * call to inode_find() */ loc.inode = inode; gf_uuid_copy(loc.gfid, gfid); STACK_WIND_COOKIE(frame, shard_refresh_internal_dir_cbk, (void *)(long)type, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, &loc, NULL); loc_wipe(&loc); return 0; out: shard_common_resolve_shards(frame, this, local->post_res_handler); return 0; } int shard_lookup_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, struct iatt *postparent) { inode_t *link_inode = NULL; shard_local_t *local = NULL; shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; local = frame->local; if (op_ret) { local->op_ret = op_ret; local->op_errno = op_errno; goto unwind; } if (!IA_ISDIR(buf->ia_type)) { gf_msg(this->name, GF_LOG_CRITICAL, 0, SHARD_MSG_DOT_SHARD_NODIR, "%s already exists and " "is not a directory. 
Please remove it from all bricks " "and try again", shard_internal_dir_string(type)); local->op_ret = -1; local->op_errno = EIO; goto unwind; } link_inode = shard_link_internal_dir_inode(local, inode, buf, type); if (link_inode != inode) { shard_refresh_internal_dir(frame, this, type); } else { shard_inode_ctx_mark_dir_refreshed(link_inode, this); shard_common_resolve_shards(frame, this, local->post_res_handler); } return 0; unwind: local->post_res_handler(frame, this); return 0; } int shard_lookup_internal_dir(call_frame_t *frame, xlator_t *this, shard_post_resolve_fop_handler_t post_res_handler, shard_internal_dir_type_t type) { int ret = -1; dict_t *xattr_req = NULL; shard_priv_t *priv = NULL; shard_local_t *local = NULL; uuid_t *gfid = NULL; loc_t *loc = NULL; gf_boolean_t free_gfid = _gf_true; local = frame->local; priv = this->private; local->post_res_handler = post_res_handler; gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); if (!gfid) goto err; xattr_req = dict_new(); if (!xattr_req) { local->op_ret = -1; local->op_errno = ENOMEM; goto err; } switch (type) { case SHARD_INTERNAL_DIR_DOT_SHARD: gf_uuid_copy(*gfid, priv->dot_shard_gfid); loc = &local->dot_shard_loc; break; case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); loc = &local->dot_shard_rm_loc; break; default: bzero(*gfid, sizeof(uuid_t)); break; } ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, "Failed to set gfid of %s into dict", shard_internal_dir_string(type)); local->op_ret = -1; local->op_errno = ENOMEM; goto err; } else { free_gfid = _gf_false; } STACK_WIND_COOKIE(frame, shard_lookup_internal_dir_cbk, (void *)(long)type, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, loc, xattr_req); dict_unref(xattr_req); return 0; err: if (xattr_req) dict_unref(xattr_req); if (free_gfid) GF_FREE(gfid); post_res_handler(frame, this); return 0; } static void 
shard_inode_ctx_update(inode_t *inode, xlator_t *this, dict_t *xdata, struct iatt *buf) { int ret = 0; uint64_t size = 0; void *bsize = NULL; if (shard_inode_ctx_get_block_size(inode, this, &size)) { /* Fresh lookup */ ret = dict_get_ptr(xdata, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); if (!ret) size = ntoh64(*((uint64_t *)bsize)); /* If the file is sharded, set its block size, otherwise just * set 0. */ shard_inode_ctx_set(inode, this, buf, size, SHARD_MASK_BLOCK_SIZE); } /* If the file is sharded, also set the remaining attributes, * except for ia_size and ia_blocks. */ if (size) { shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); (void)shard_inode_ctx_invalidate(inode, this, buf); } } int shard_delete_shards(void *opaque); int shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data); int shard_start_background_deletion(xlator_t *this) { int ret = 0; gf_boolean_t i_cleanup = _gf_true; shard_priv_t *priv = NULL; call_frame_t *cleanup_frame = NULL; priv = this->private; LOCK(&priv->lock); { switch (priv->bg_del_state) { case SHARD_BG_DELETION_NONE: i_cleanup = _gf_true; priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; break; case SHARD_BG_DELETION_LAUNCHING: i_cleanup = _gf_false; break; case SHARD_BG_DELETION_IN_PROGRESS: priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; i_cleanup = _gf_false; break; default: break; } } UNLOCK(&priv->lock); if (!i_cleanup) return 0; cleanup_frame = create_frame(this, this->ctx->pool); if (!cleanup_frame) { gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, "Failed to create " "new frame to delete shards"); ret = -ENOMEM; goto err; } set_lk_owner_from_ptr(&cleanup_frame->root->lk_owner, cleanup_frame->root); ret = synctask_new(this->ctx->env, shard_delete_shards, shard_delete_shards_cbk, cleanup_frame, cleanup_frame); if (ret < 0) { gf_msg(this->name, GF_LOG_WARNING, errno, SHARD_MSG_SHARDS_DELETION_FAILED, "failed to create task to do background " "cleanup of shards"); 
STACK_DESTROY(cleanup_frame->root);
        goto err;
    }
    return 0;

err:
    /* Launch failed: put the state back to NONE under priv->lock. */
    LOCK(&priv->lock);
    {
        priv->bg_del_state = SHARD_BG_DELETION_NONE;
    }
    UNLOCK(&priv->lock);
    return ret;
}

/*
 * Lookup callback. For successful lookups of non-directories it adjusts
 * @buf from the file-size xattr (except for gsyncd clients), updates the
 * shard inode ctx, and on the first lookup seen by this translator
 * instance starts background shard deletion. The reply is then unwound.
 */
int
shard_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                 int32_t op_ret, int32_t op_errno, inode_t *inode,
                 struct iatt *buf, dict_t *xdata, struct iatt *postparent)
{
    int ret = -1;
    shard_priv_t *priv = NULL;
    gf_boolean_t i_start_cleanup = _gf_false;

    priv = this->private;

    if (op_ret < 0)
        goto unwind;

    /* Directories are passed through untouched. */
    if (IA_ISDIR(buf->ia_type))
        goto unwind;

    /* Also, if the file is sharded, get the file size and block cnt xattr,
     * and store them in the stbuf appropriately.
     */

    if (dict_get(xdata, GF_XATTR_SHARD_FILE_SIZE) &&
        frame->root->pid != GF_CLIENT_PID_GSYNCD)
        shard_modify_size_and_block_count(buf, xdata);

    /* If this was a fresh lookup, there are two possibilities:
     * 1) If the file is sharded (indicated by the presence of block size
     *    xattr), store this block size, along with rdev and mode in its
     *    inode ctx.
     * 2) If the file is not sharded, store size along with rdev and mode
     *    (which are anyway don't cares) in inode ctx. Since @ctx_tmp is
     *    already initialised to all zeroes, nothing more needs to be done.
*/

    (void)shard_inode_ctx_update(inode, this, xdata, buf);

    /* Only the caller that flips first_lookup_done starts the cleanup. */
    LOCK(&priv->lock);
    {
        if (priv->first_lookup_done == _gf_false) {
            priv->first_lookup_done = _gf_true;
            i_start_cleanup = _gf_true;
        }
    }
    UNLOCK(&priv->lock);

    if (!i_start_cleanup)
        goto unwind;

    ret = shard_start_background_deletion(this);
    if (ret < 0) {
        /* Clear the flag again so that a later lookup retries. */
        LOCK(&priv->lock);
        {
            priv->first_lookup_done = _gf_false;
        }
        UNLOCK(&priv->lock);
    }

unwind:
    SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata,
                       postparent);
    return 0;
}

/*
 * Lookup fop. Builds the request dict (asking for the block-size xattr when
 * the inode ctx has no block size yet, and for the file-size xattr unless
 * the caller is gsyncd) and winds the lookup to the first child. Failures
 * are unwound with op_errno (ENOMEM unless SHARD_ENTRY_FOP_CHECK changes
 * it).
 */
int
shard_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc,
             dict_t *xattr_req)
{
    int ret = -1;
    int32_t op_errno = ENOMEM;
    uint64_t block_size = 0;
    shard_local_t *local = NULL;

    this->itable = loc->inode->table;
    if (frame->root->pid != GF_CLIENT_PID_GSYNCD) {
        SHARD_ENTRY_FOP_CHECK(loc, op_errno, err);
    }

    local = mem_get0(this->local_pool);
    if (!local)
        goto err;

    frame->local = local;

    loc_copy(&local->loc, loc);

    /* Use the caller's dict when given, otherwise a fresh one. */
    local->xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new();
    if (!local->xattr_req)
        goto err;

    if (shard_inode_ctx_get_block_size(loc->inode, this, &block_size)) {
        ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0);
        if (ret) {
            gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
                   "Failed to set dict"
                   " value: key:%s for path %s",
                   GF_XATTR_SHARD_BLOCK_SIZE, loc->path);
            goto err;
        }
    }

    if (frame->root->pid != GF_CLIENT_PID_GSYNCD) {
        ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE,
                              8 * 4);
        if (ret) {
            gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
                   "Failed to set dict value: key:%s for path %s.",
                   GF_XATTR_SHARD_FILE_SIZE, loc->path);
            goto err;
        }
    }

    /* GF_CONTENT_KEY is removed from the caller's dict before winding. */
    if ((xattr_req) && (dict_get(xattr_req, GF_CONTENT_KEY)))
        dict_del(xattr_req, GF_CONTENT_KEY);

    STACK_WIND(frame, shard_lookup_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->lookup, loc, local->xattr_req);
    return 0;
err:
    shard_common_failure_unwind(GF_FOP_LOOKUP, frame, -1, op_errno);
    return 0;
}

int
shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t
*this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, struct iatt *postparent) { int ret = -1; int32_t mask = SHARD_INODE_WRITE_MASK; shard_local_t *local = NULL; shard_inode_ctx_t ctx = { 0, }; local = frame->local; if (op_ret < 0) { gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_BASE_FILE_LOOKUP_FAILED, "Lookup on base file" " failed : %s", loc_gfid_utoa(&(local->loc))); local->op_ret = op_ret; local->op_errno = op_errno; goto unwind; } local->prebuf = *buf; if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { local->op_ret = -1; local->op_errno = EINVAL; goto unwind; } if (shard_inode_ctx_get_all(inode, this, &ctx)) mask = SHARD_ALL_MASK; ret = shard_inode_ctx_set(inode, this, &local->prebuf, 0, (mask | SHARD_MASK_REFRESH_RESET)); if (ret) { gf_msg(this->name, GF_LOG_ERROR, SHARD_MSG_INODE_CTX_SET_FAILED, 0, "Failed to set inode" " write params into inode ctx for %s", uuid_utoa(buf->ia_gfid)); local->op_ret = -1; local->op_errno = ENOMEM; goto unwind; } unwind: local->handler(frame, this); return 0; } int shard_lookup_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, shard_post_fop_handler_t handler) { int ret = -1; shard_local_t *local = NULL; dict_t *xattr_req = NULL; gf_boolean_t need_refresh = _gf_false; local = frame->local; local->handler = handler; ret = shard_inode_ctx_fill_iatt_from_cache(loc->inode, this, &local->prebuf, &need_refresh); /* By this time, inode ctx should have been created either in create, * mknod, readdirp or lookup. If not it is a bug! 
*/ if ((ret == 0) && (need_refresh == _gf_false)) { gf_msg_debug(this->name, 0, "Skipping lookup on base file: %s" "Serving prebuf off the inode ctx cache", uuid_utoa(loc->gfid)); goto out; } xattr_req = dict_new(); if (!xattr_req) { local->op_ret = -1; local->op_errno = ENOMEM; goto out; } SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, loc->gfid, local, out); STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, loc, xattr_req); dict_unref(xattr_req); return 0; out: if (xattr_req) dict_unref(xattr_req); handler(frame, this); return 0; } int shard_post_fstat_handler(call_frame_t *frame, xlator_t *this) { shard_local_t *local = NULL; local = frame->local; if (local->op_ret >= 0) shard_inode_ctx_set(local->fd->inode, this, &local->prebuf, 0, SHARD_LOOKUP_MASK); SHARD_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, &local->prebuf, local->xattr_rsp); return 0; } int shard_post_stat_handler(call_frame_t *frame, xlator_t *this) { shard_local_t *local = NULL; local = frame->local; if (local->op_ret >= 0) shard_inode_ctx_set(local->loc.inode, this, &local->prebuf, 0, SHARD_LOOKUP_MASK); SHARD_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, &local->prebuf, local->xattr_rsp); return 0; } int shard_common_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { inode_t *inode = NULL; shard_local_t *local = NULL; local = frame->local; if (op_ret < 0) { gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_STAT_FAILED, "stat failed: %s", local->fd ? 
uuid_utoa(local->fd->inode->gfid)
                         : uuid_utoa((local->loc.inode)->gfid));
        local->op_ret = op_ret;
        local->op_errno = op_errno;
        goto unwind;
    }

    local->prebuf = *buf;
    if (shard_modify_size_and_block_count(&local->prebuf, xdata)) {
        local->op_ret = -1;
        local->op_errno = EINVAL;
        goto unwind;
    }
    local->xattr_rsp = dict_ref(xdata);

    /* Pick the inode from the loc when present, otherwise from the fd. */
    if (local->loc.inode)
        inode = local->loc.inode;
    else
        inode = local->fd->inode;

    shard_inode_ctx_invalidate(inode, this, &local->prebuf);

unwind:
    local->handler(frame, this);
    return 0;
}

/*
 * Stat fop. Directories, symlinks, files with a zero block size in the
 * inode ctx and requests from gsyncd are wound straight through with
 * default_stat_cbk. Otherwise the stat is wound with
 * shard_common_stat_cbk and completed by shard_post_stat_handler.
 * Failures are unwound with ENOMEM.
 */
int
shard_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
    int ret = -1;
    uint64_t block_size = 0;
    shard_local_t *local = NULL;

    if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) {
        STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->stat, loc, xdata);
        return 0;
    }

    ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size);
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
               "Failed to get block "
               "size from inode ctx of %s",
               uuid_utoa(loc->inode->gfid));
        goto err;
    }

    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
        STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->stat, loc, xdata);
        return 0;
    }

    local = mem_get0(this->local_pool);
    if (!local)
        goto err;

    frame->local = local;

    local->handler = shard_post_stat_handler;
    loc_copy(&local->loc, loc);
    local->xattr_req = (xdata) ?
dict_ref(xdata) : dict_new();
    if (!local->xattr_req)
        goto err;

    SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid,
                                    local, err);

    STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->stat, loc, local->xattr_req);
    return 0;
err:
    shard_common_failure_unwind(GF_FOP_STAT, frame, -1, ENOMEM);
    return 0;
}

/*
 * Fstat fop. Directories, symlinks, files with a zero block size in the
 * inode ctx and requests from gsyncd are wound straight through with
 * default_fstat_cbk. Otherwise the fstat is wound with
 * shard_common_stat_cbk and completed by shard_post_fstat_handler.
 * Failures are unwound with ENOMEM.
 */
int
shard_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
    int ret = -1;
    uint64_t block_size = 0;
    shard_local_t *local = NULL;

    if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) {
        STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->fstat, fd, xdata);
        return 0;
    }

    ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size);
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
               "Failed to get block "
               "size from inode ctx of %s",
               uuid_utoa(fd->inode->gfid));
        goto err;
    }

    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
        STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->fstat, fd, xdata);
        return 0;
    }

    /* Remember the inode table if it has not been recorded yet. */
    if (!this->itable)
        this->itable = fd->inode->table;

    local = mem_get0(this->local_pool);
    if (!local)
        goto err;

    frame->local = local;

    local->handler = shard_post_fstat_handler;
    local->fd = fd_ref(fd);
    local->xattr_req = (xdata) ?
dict_ref(xdata) : dict_new();
    if (!local->xattr_req)
        goto err;

    SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid,
                                    local, err);

    STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->fstat, fd, local->xattr_req);
    return 0;
err:
    shard_common_failure_unwind(GF_FOP_FSTAT, frame, -1, ENOMEM);
    return 0;
}

/*
 * Final step of (f)truncate: unwind truncate or ftruncate, depending on
 * local->fop, with the results stored in the frame local.
 */
int
shard_post_update_size_truncate_handler(call_frame_t *frame, xlator_t *this)
{
    shard_local_t *local = NULL;

    local = frame->local;

    if (local->fop == GF_FOP_TRUNCATE)
        SHARD_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno,
                           &local->prebuf, &local->postbuf, NULL);
    else
        SHARD_STACK_UNWIND(ftruncate, frame, local->op_ret, local->op_errno,
                           &local->prebuf, &local->postbuf, NULL);
    return 0;
}

/*
 * Callback of the truncate wound on the last participating shard (or on
 * the base file). On success it computes the size and block deltas and
 * hands over to shard_update_file_size(); on failure it unwinds the fop.
 */
int
shard_truncate_last_shard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                              int32_t op_ret, int32_t op_errno,
                              struct iatt *prebuf, struct iatt *postbuf,
                              dict_t *xdata)
{
    inode_t *inode = NULL;
    shard_local_t *local = NULL;

    local = frame->local;

    SHARD_UNSET_ROOT_FS_ID(frame, local);

    inode = (local->fop == GF_FOP_TRUNCATE) ? local->loc.inode
                                            : local->fd->inode;
    if (op_ret < 0) {
        gf_msg(this->name, GF_LOG_ERROR, op_errno,
               SHARD_MSG_TRUNCATE_LAST_SHARD_FAILED,
               "truncate on last"
               " shard failed : %s",
               uuid_utoa(inode->gfid));
        local->op_ret = op_ret;
        local->op_errno = op_errno;
        goto err;
    }

    local->postbuf.ia_size = local->offset;
    /* Subtract the blocks released by this truncate from the total. */
    local->postbuf.ia_blocks -= (prebuf->ia_blocks - postbuf->ia_blocks);
    /* Let the delta be negative.
We want xattrop to do subtraction */ local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; local->delta_blocks = postbuf->ia_blocks - prebuf->ia_blocks; local->hole_size = 0; shard_inode_ctx_set(inode, this, postbuf, 0, SHARD_MASK_TIMES); shard_update_file_size(frame, this, NULL, &local->loc, shard_post_update_size_truncate_handler); return 0; err: shard_common_failure_unwind(local->fop, frame, local->op_ret, local->op_errno); return 0; } int shard_truncate_last_shard(call_frame_t *frame, xlator_t *this, inode_t *inode) { size_t last_shard_size_after = 0; loc_t loc = { 0, }; shard_local_t *local = NULL; local = frame->local; /* A NULL inode could be due to the fact that the last shard which * needs to be truncated does not exist due to it lying in a hole * region. So the only thing left to do in that case would be an * update to file size xattr. */ if (!inode) { gf_msg_debug(this->name, 0, "Last shard to be truncated absent" " in backend: %s. Directly proceeding to update " "file size", uuid_utoa(inode->gfid)); shard_update_file_size(frame, this, NULL, &local->loc, shard_post_update_size_truncate_handler); return 0; } SHARD_SET_ROOT_FS_ID(frame, local); loc.inode = inode_ref(inode); gf_uuid_copy(loc.gfid, inode->gfid); last_shard_size_after = (local->offset % local->block_size); STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, &loc, last_shard_size_after, NULL); loc_wipe(&loc); return 0; } void shard_unlink_block_inode(shard_local_t *local, int shard_block_num); int shard_truncate_htol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { int call_count = 0; int shard_block_num = (long)cookie; shard_local_t *local = NULL; local = frame->local; if (op_ret < 0) { local->op_ret = op_ret; local->op_errno = op_errno; goto done; } shard_unlink_block_inode(local, shard_block_num); done: call_count = 
shard_call_count_return(frame);
    if (call_count == 0) {
        SHARD_UNSET_ROOT_FS_ID(frame, local);
        shard_truncate_last_shard(frame, this, local->inode_list[0]);
    }
    return 0;
}

/*
 * Unlink every resolved shard after the first participating block
 * (entries 1 .. num_blocks - 1 of local->inode_list). @inode is the base
 * file's inode; its gfid is used to build shard names. If none of those
 * shards is present, the fop goes straight to the file-size update.
 * Always returns 0.
 */
int
shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode)
{
    int i = 1;
    int ret = -1;
    int call_count = 0;
    uint32_t cur_block = 0;
    uint32_t last_block = 0;
    char path[PATH_MAX] = {
        0,
    };
    char *bname = NULL;
    loc_t loc = {
        0,
    };
    gf_boolean_t wind_failed = _gf_false;
    shard_local_t *local = NULL;
    shard_priv_t *priv = NULL;

    local = frame->local;
    priv = this->private;

    cur_block = local->first_block + 1;
    last_block = local->last_block;

    /* Determine call count */
    for (i = 1; i < local->num_blocks; i++) {
        if (!local->inode_list[i])
            continue;
        call_count++;
    }

    if (!call_count) {
        /* Call count = 0 implies that all of the shards that need to be
         * unlinked do not exist. So shard xlator would now proceed to
         * do the final truncate + size updates.
         */
        gf_msg_debug(this->name, 0,
                     "Shards to be unlinked as part of "
                     "truncate absent in backend: %s. Directly "
                     "proceeding to update file size",
                     uuid_utoa(inode->gfid));
        local->postbuf.ia_size = local->offset;
        local->postbuf.ia_blocks = local->prebuf.ia_blocks;
        local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size;
        local->delta_blocks = 0;
        local->hole_size = 0;
        shard_update_file_size(frame, this, local->fd, &local->loc,
                               shard_post_update_size_truncate_handler);
        return 0;
    }

    local->call_count = call_count;
    i = 1;

    SHARD_SET_ROOT_FS_ID(frame, local);
    while (cur_block <= last_block) {
        /* Skip blocks that have no resolved inode. */
        if (!local->inode_list[i]) {
            cur_block++;
            i++;
            continue;
        }
        if (wind_failed) {
            /* After an earlier failure the remaining shards are reported
             * to the callback as failed instead of being wound. */
            shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1,
                                    ENOMEM, NULL, NULL, NULL);
            goto next;
        }

        shard_make_block_abspath(cur_block, inode->gfid, path, sizeof(path));
        bname = strrchr(path, '/') + 1;
        loc.parent = inode_ref(priv->dot_shard_inode);
        ret = inode_path(loc.parent, bname, (char **)&(loc.path));
        if (ret < 0) {
            gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
                   "Inode path failed"
                   " on %s. Base file gfid = %s",
                   bname, uuid_utoa(inode->gfid));
            local->op_ret = -1;
            local->op_errno = ENOMEM;
            loc_wipe(&loc);
            wind_failed = _gf_true;
            shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1,
                                    ENOMEM, NULL, NULL, NULL);
            goto next;
        }
        loc.name = strrchr(loc.path, '/');
        if (loc.name)
            loc.name++;
        loc.inode = inode_ref(local->inode_list[i]);

        STACK_WIND_COOKIE(frame, shard_truncate_htol_cbk,
                          (void *)(long)cur_block, FIRST_CHILD(this),
                          FIRST_CHILD(this)->fops->unlink, &loc, 0, NULL);
        loc_wipe(&loc);
    next:
        i++;
        cur_block++;
        /* Stop once every counted shard has been handled. */
        if (!--call_count)
            break;
    }
    return 0;
}

/*
 * Carry out the truncate once the participating shards are resolved:
 * truncate the single participating block, or first unlink the trailing
 * shards via shard_truncate_htol().
 */
int
shard_truncate_do(call_frame_t *frame, xlator_t *this)
{
    shard_local_t *local = NULL;

    local = frame->local;

    if (local->num_blocks == 1) {
        /* This means that there are no shards to be unlinked.
         * The fop boils down to truncating the last shard, updating
         * the size and unwinding.
*/
        shard_truncate_last_shard(frame, this, local->inode_list[0]);
        return 0;
    } else {
        shard_truncate_htol(frame, this, local->loc.inode);
    }
    return 0;
}

/*
 * Handler run after the shard lookups of a truncate: unwinds on failure,
 * otherwise proceeds with shard_truncate_do().
 */
int
shard_post_lookup_shards_truncate_handler(call_frame_t *frame, xlator_t *this)
{
    shard_local_t *local = NULL;

    local = frame->local;

    if (local->op_ret < 0) {
        shard_common_failure_unwind(local->fop, frame, local->op_ret,
                                    local->op_errno);
        return 0;
    }

    shard_truncate_do(frame, this);
    return 0;
}

/*
 * Link the inode of shard @block_num under priv->dot_shard_inode, store
 * the linked inode in local->inode_list and add it to the translator's
 * shard inode list. If that list update returns an inode, an fsync of
 * that inode is initiated.
 */
void
shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
                       struct iatt *buf)
{
    int list_index = 0;
    char block_bname[256] = {
        0,
    };
    uuid_t gfid = {
        0,
    };
    inode_t *linked_inode = NULL;
    xlator_t *this = NULL;
    inode_t *fsync_inode = NULL;
    shard_priv_t *priv = NULL;

    this = THIS;
    priv = this->private;
    /* Base file gfid: from the loc's inode when set, else local->base_gfid. */
    if (local->loc.inode)
        gf_uuid_copy(gfid, local->loc.inode->gfid);
    else
        gf_uuid_copy(gfid, local->base_gfid);

    shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));

    shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK);
    linked_inode = inode_link(inode, priv->dot_shard_inode, block_bname, buf);
    inode_lookup(linked_inode);
    /* inode_list is indexed relative to the first participating block. */
    list_index = block_num - local->first_block;
    local->inode_list[list_index] = linked_inode;

    LOCK(&priv->lock);
    {
        fsync_inode = __shard_update_shards_inode_list(
            linked_inode, this, local->loc.inode, block_num, gfid);
    }
    UNLOCK(&priv->lock);
    if (fsync_inode)
        shard_initiate_evicted_inode_fsync(this, fsync_inode);
}

/*
 * Callback of each shard lookup wound by shard_common_lookup_shards().
 * The shard block number travels in @cookie.
 */
int
shard_common_lookup_shards_cbk(call_frame_t *frame, void *cookie,
                               xlator_t *this, int32_t op_ret, int32_t op_errno,
                               inode_t *inode, struct iatt *buf, dict_t *xdata,
                               struct iatt *postparent)
{
    int call_count = 0;
    int shard_block_num = (long)cookie;
    uuid_t gfid = {
        0,
    };
    shard_local_t *local = NULL;

    local = frame->local;
    if (local->resolver_base_inode)
        gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
    else
        gf_uuid_copy(gfid, local->base_gfid);

    if (op_ret < 0) {
        /* Ignore absence of shards in the backend in truncate fop.
*/
        switch (local->fop) {
            case GF_FOP_TRUNCATE:
            case GF_FOP_FTRUNCATE:
            case GF_FOP_RENAME:
            case GF_FOP_UNLINK:
                if (op_errno == ENOENT)
                    goto done;
                break;
            case GF_FOP_WRITE:
            case GF_FOP_READ:
            case GF_FOP_ZEROFILL:
            case GF_FOP_DISCARD:
            case GF_FOP_FALLOCATE:
                /* On the first lookup pass a missing shard is only counted
                 * in create_count, not treated as an error. */
                if ((!local->first_lookup_done) && (op_errno == ENOENT)) {
                    LOCK(&frame->lock);
                    {
                        local->create_count++;
                    }
                    UNLOCK(&frame->lock);
                    goto done;
                }
                break;
            default:
                break;
        }

        /* else */
        gf_msg(this->name, GF_LOG_ERROR, op_errno,
               SHARD_MSG_LOOKUP_SHARD_FAILED,
               "Lookup on shard %d "
               "failed. Base file gfid = %s",
               shard_block_num, uuid_utoa(gfid));
        local->op_ret = op_ret;
        local->op_errno = op_errno;
        goto done;
    }

    shard_link_block_inode(local, shard_block_num, inode, buf);

done:
    if (local->lookup_shards_barriered) {
        /* Barriered mode: the winder waits on the barrier and runs the
         * handler itself. */
        syncbarrier_wake(&local->barrier);
        return 0;
    } else {
        call_count = shard_call_count_return(frame);
        if (call_count == 0) {
            if (!local->first_lookup_done)
                local->first_lookup_done = _gf_true;
            local->pls_fop_handler(frame, this);
        }
    }
    return 0;
}

/*
 * Return a copy of @dict that additionally carries a newly generated gfid
 * under "gfid-req", or NULL on failure.
 */
dict_t *
shard_create_gfid_dict(dict_t *dict)
{
    int ret = 0;
    dict_t *new = NULL;
    unsigned char *gfid = NULL;

    new = dict_copy_with_ref(dict, NULL);
    if (!new)
        return NULL;

    gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char);
    if (!gfid) {
        ret = -1;
        goto out;
    }

    gf_uuid_generate(gfid);

    ret = dict_set_gfuuid(new, "gfid-req", gfid, false);

out:
    if (ret) {
        /* On failure both the copied dict and the gfid buffer are released. */
        dict_unref(new);
        new = NULL;
        GF_FREE(gfid);
    }

    return new;
}

/*
 * Wind a lookup for every block in [local->first_block, local->last_block]
 * that has no entry in local->inode_list yet; @handler runs once all
 * replies are in. @inode is the base file's inode; when it is NULL,
 * local->base_gfid is used to name the shards.
 */
int
shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode,
                           shard_post_lookup_shards_fop_handler_t handler)
{
    int i = 0;
    int ret = 0;
    int count = 0;
    int call_count = 0;
    int32_t shard_idx_iter = 0;
    int last_block = 0;
    char path[PATH_MAX] = {
        0,
    };
    char *bname = NULL;
    uuid_t gfid = {
        0,
    };
    loc_t loc = {
        0,
    };
    shard_local_t *local = NULL;
    shard_priv_t *priv = NULL;
    gf_boolean_t wind_failed = _gf_false;
    dict_t *xattr_req = NULL;

    priv = this->private;
    local = frame->local;
    count = call_count = local->call_count;
    shard_idx_iter = local->first_block;
last_block = local->last_block;
    local->pls_fop_handler = handler;
    if (local->lookup_shards_barriered)
        local->barrier.waitfor = local->call_count;

    if (inode)
        gf_uuid_copy(gfid, inode->gfid);
    else
        gf_uuid_copy(gfid, local->base_gfid);

    while (shard_idx_iter <= last_block) {
        /* Blocks that already have an inode need no lookup. */
        if (local->inode_list[i]) {
            i++;
            shard_idx_iter++;
            continue;
        }

        if (wind_failed) {
            /* After an earlier failure the remaining blocks are reported
             * to the callback as failed instead of being wound. */
            shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter,
                                           this, -1, ENOMEM, NULL, NULL, NULL,
                                           NULL);
            goto next;
        }

        shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path));

        bname = strrchr(path, '/') + 1;
        loc.inode = inode_new(this->itable);
        loc.parent = inode_ref(priv->dot_shard_inode);
        gf_uuid_copy(loc.pargfid, priv->dot_shard_gfid);
        ret = inode_path(loc.parent, bname, (char **)&(loc.path));
        if (ret < 0 || !(loc.inode)) {
            gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
                   "Inode path failed"
                   " on %s, base file gfid = %s",
                   bname, uuid_utoa(gfid));
            local->op_ret = -1;
            local->op_errno = ENOMEM;
            loc_wipe(&loc);
            wind_failed = _gf_true;
            shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter,
                                           this, -1, ENOMEM, NULL, NULL, NULL,
                                           NULL);
            goto next;
        }

        loc.name = strrchr(loc.path, '/');
        if (loc.name)
            loc.name++;

        /* Each lookup gets its own request dict from
         * shard_create_gfid_dict(). */
        xattr_req = shard_create_gfid_dict(local->xattr_req);
        if (!xattr_req) {
            local->op_ret = -1;
            local->op_errno = ENOMEM;
            wind_failed = _gf_true;
            loc_wipe(&loc);
            shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter,
                                           this, -1, ENOMEM, NULL, NULL, NULL,
                                           NULL);
            goto next;
        }

        STACK_WIND_COOKIE(frame, shard_common_lookup_shards_cbk,
                          (void *)(long)shard_idx_iter, FIRST_CHILD(this),
                          FIRST_CHILD(this)->fops->lookup, &loc, xattr_req);
        loc_wipe(&loc);
        dict_unref(xattr_req);
    next:
        shard_idx_iter++;
        i++;

        /* Stop once every pending lookup has been handled. */
        if (!--call_count)
            break;
    }
    if (local->lookup_shards_barriered) {
        /* Barriered mode: wait for all replies, then run the handler from
         * this context. */
        syncbarrier_wait(&local->barrier, count);
        local->pls_fop_handler(frame, this);
    }
    return 0;
}

/*
 * Handler run after the internal directory has been resolved for a
 * truncate.
 */
int
shard_post_resolve_truncate_handler(call_frame_t *frame, xlator_t *this)
{
    shard_local_t *local = NULL;

    local =
frame->local;

    if (local->op_ret < 0) {
        if (local->op_errno == ENOENT) {
            /* If lookup on /.shard fails with ENOENT, it means that
             * the file was 0-byte in size but truncated sometime in
             * the past to a higher size which is reflected in the
             * size xattr, and now being truncated to a lower size.
             * In this case, the only thing that needs to be done is
             * to update the size xattr of the file and unwind.
             */
            local->first_block = local->last_block = 0;
            local->num_blocks = 1;
            local->call_count = 0;
            local->op_ret = 0;
            local->postbuf.ia_size = local->offset;
            shard_update_file_size(frame, this, local->fd, &local->loc,
                                   shard_post_update_size_truncate_handler);
            return 0;
        } else {
            shard_common_failure_unwind(local->fop, frame, local->op_ret,
                                        local->op_errno);
            return 0;
        }
    }

    /* A zero call_count means there are no shard lookups to wind. */
    if (!local->call_count)
        shard_truncate_do(frame, this);
    else
        shard_common_lookup_shards(frame, this, local->loc.inode,
                                   shard_post_lookup_shards_truncate_handler);

    return 0;
}

/*
 * Start a shrinking truncate: work out the participating block range and
 * either truncate the base file directly (only block 0 participates) or
 * resolve the internal shard directory first.
 */
int
shard_truncate_begin(call_frame_t *frame, xlator_t *this)
{
    int ret = 0;
    shard_local_t *local = NULL;
    shard_priv_t *priv = NULL;

    priv = this->private;
    local = frame->local;

    /* First participant block here is the lowest numbered block that would
     * hold the last byte of the file post successful truncation.
     * Last participant block is the block that contains the last byte in
     * the current state of the file.
     * If (first block == last_block):
     *         then that means that the file only needs truncation of the
     *         first (or last since both are same) block.
     * Else
     *         if (new_size % block_size == 0)
     *                 then that means there is no truncate to be done with
     *                 only shards from first_block + 1 through the last
     *                 block needing to be unlinked.
     *         else
     *                 both truncate of the first block and unlink of the
     *                 remaining shards until end of file is required.
     */
    local->first_block = (local->offset == 0) ?
0
                             : get_lowest_block(local->offset - 1,
                                                local->block_size);

    local->last_block = get_highest_block(0, local->prebuf.ia_size,
                                          local->block_size);

    local->num_blocks = local->last_block - local->first_block + 1;
    local->resolver_base_inode = (local->fop == GF_FOP_TRUNCATE)
                                     ? local->loc.inode
                                     : local->fd->inode;

    /* Only block 0 participates: wind the (f)truncate directly. */
    if ((local->first_block == 0) && (local->num_blocks == 1)) {
        if (local->fop == GF_FOP_TRUNCATE)
            STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this),
                       FIRST_CHILD(this)->fops->truncate, &local->loc,
                       local->offset, local->xattr_req);
        else
            STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this),
                       FIRST_CHILD(this)->fops->ftruncate, local->fd,
                       local->offset, local->xattr_req);
        return 0;
    }

    /* One slot per participating block. */
    local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *),
                                  gf_shard_mt_inode_list);
    if (!local->inode_list)
        goto err;

    local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid);
    if (!local->dot_shard_loc.inode) {
        /* Directory inode not found in the table: look it up first. */
        ret = shard_init_internal_dir_loc(this, local,
                                          SHARD_INTERNAL_DIR_DOT_SHARD);
        if (ret)
            goto err;
        shard_lookup_internal_dir(frame, this,
                                  shard_post_resolve_truncate_handler,
                                  SHARD_INTERNAL_DIR_DOT_SHARD);
    } else {
        local->post_res_handler = shard_post_resolve_truncate_handler;
        shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD);
    }
    return 0;

err:
    shard_common_failure_unwind(local->fop, frame, -1, ENOMEM);
    return 0;
}

/*
 * Handler run after the base-file lookup of a (f)truncate. It compares the
 * requested offset with the current size in local->prebuf and picks one of
 * three paths: no change, grow, or shrink.
 */
int
shard_post_lookup_truncate_handler(call_frame_t *frame, xlator_t *this)
{
    shard_local_t *local = NULL;
    struct iatt tmp_stbuf = {
        0,
    };

    local = frame->local;

    if (local->op_ret < 0) {
        shard_common_failure_unwind(local->fop, frame, local->op_ret,
                                    local->op_errno);
        return 0;
    }

    local->postbuf = tmp_stbuf = local->prebuf;

    if (local->prebuf.ia_size == local->offset) {
        /* If the file size is same as requested size, unwind the call
         * immediately.
*/
        if (local->fop == GF_FOP_TRUNCATE)
            SHARD_STACK_UNWIND(truncate, frame, 0, 0, &local->prebuf,
                               &local->postbuf, NULL);
        else
            SHARD_STACK_UNWIND(ftruncate, frame, 0, 0, &local->prebuf,
                               &local->postbuf, NULL);
    } else if (local->offset > local->prebuf.ia_size) {
        /* If the truncate is from a lower to a higher size, set the
         * new size xattr and unwind.
         */
        local->hole_size = local->offset - local->prebuf.ia_size;
        local->delta_size = 0;
        local->delta_blocks = 0;
        local->postbuf.ia_size = local->offset;
        tmp_stbuf.ia_size = local->offset;
        /* The new size is written to the inode ctx before the size update. */
        shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0,
                            SHARD_INODE_WRITE_MASK);
        shard_update_file_size(frame, this, NULL, &local->loc,
                               shard_post_update_size_truncate_handler);
    } else {
        /* ... else
         * i.   unlink all shards that need to be unlinked.
         * ii.  truncate the last of the shards.
         * iii. update the new size using setxattr.
         * and unwind the fop.
         */
        local->hole_size = 0;
        local->delta_size = (local->offset - local->prebuf.ia_size);
        local->delta_blocks = 0;
        tmp_stbuf.ia_size = local->offset;
        shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0,
                            SHARD_INODE_WRITE_MASK);
        shard_truncate_begin(frame, this);
    }
    return 0;
}

/* TO-DO:
 * Fix updates to size and block count with racing write(s) and truncate(s).
*/

/*
 * Truncate fop. Files with a zero block size in the inode ctx and
 * requests from gsyncd are wound straight through with
 * default_truncate_cbk. Otherwise the frame local is set up and the base
 * file is looked up, continuing in shard_post_lookup_truncate_handler.
 * Failures are unwound with ENOMEM.
 */
int
shard_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
               dict_t *xdata)
{
    int ret = -1;
    uint64_t block_size = 0;
    shard_local_t *local = NULL;

    ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size);
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
               "Failed to get block "
               "size from inode ctx of %s",
               uuid_utoa(loc->inode->gfid));
        goto err;
    }

    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
        STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
        return 0;
    }

    /* Remember the inode table if it has not been recorded yet. */
    if (!this->itable)
        this->itable = loc->inode->table;

    local = mem_get0(this->local_pool);
    if (!local)
        goto err;

    frame->local = local;

    ret = syncbarrier_init(&local->barrier);
    if (ret)
        goto err;
    loc_copy(&local->loc, loc);
    local->offset = offset;
    local->block_size = block_size;
    local->fop = GF_FOP_TRUNCATE;
    local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
    if (!local->xattr_req)
        goto err;
    local->resolver_base_inode = loc->inode;

    shard_lookup_base_file(frame, this, &local->loc,
                           shard_post_lookup_truncate_handler);
    return 0;

err:
    shard_common_failure_unwind(GF_FOP_TRUNCATE, frame, -1, ENOMEM);
    return 0;
}

/*
 * Ftruncate fop; the fd-based counterpart of shard_truncate(). Files with
 * a zero block size in the inode ctx and requests from gsyncd are wound
 * straight through with default_ftruncate_cbk.
 */
int
shard_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
                dict_t *xdata)
{
    int ret = -1;
    uint64_t block_size = 0;
    shard_local_t *local = NULL;

    ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size);
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
               "Failed to get block "
               "size from inode ctx of %s",
               uuid_utoa(fd->inode->gfid));
        goto err;
    }

    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
        STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
        return 0;
    }

    if (!this->itable)
        this->itable = fd->inode->table;

    local = mem_get0(this->local_pool);
    if (!local)
        goto err;

    frame->local = local;
    ret = syncbarrier_init(&local->barrier);
    if (ret)
goto err;
    local->fd = fd_ref(fd);
    local->offset = offset;
    local->block_size = block_size;
    local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
    if (!local->xattr_req)
        goto err;
    local->fop = GF_FOP_FTRUNCATE;

    /* Fill local->loc from the fd's inode so that the loc-based helpers
     * can be used. */
    local->loc.inode = inode_ref(fd->inode);
    gf_uuid_copy(local->loc.gfid, fd->inode->gfid);
    local->resolver_base_inode = fd->inode;

    shard_lookup_base_file(frame, this, &local->loc,
                           shard_post_lookup_truncate_handler);
    return 0;
err:
    shard_common_failure_unwind(GF_FOP_FTRUNCATE, frame, -1, ENOMEM);
    return 0;
}

/*
 * Mknod callback: on success stores @buf and local->block_size in the new
 * inode's ctx (a failure to do so is only logged), then unwinds.
 */
int
shard_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                int32_t op_ret, int32_t op_errno, inode_t *inode,
                struct iatt *buf, struct iatt *preparent,
                struct iatt *postparent, dict_t *xdata)
{
    int ret = -1;
    shard_local_t *local = NULL;

    local = frame->local;

    if (op_ret == -1)
        goto unwind;

    ret = shard_inode_ctx_set(inode, this, buf, local->block_size,
                              SHARD_ALL_MASK);
    if (ret)
        gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED,
               "Failed to set inode "
               "ctx for %s",
               uuid_utoa(inode->gfid));

unwind:
    SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent,
                       postparent, xdata);

    return 0;
}

/*
 * Mknod fop: records the configured block size in the frame local, runs
 * SHARD_INODE_CREATE_INIT unless the request is gsyncd operating on the
 * shard directory, and winds mknod to the first child.
 */
int
shard_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
            dev_t rdev, mode_t umask, dict_t *xdata)
{
    shard_priv_t *priv = NULL;
    shard_local_t *local = NULL;

    priv = this->private;
    local = mem_get0(this->local_pool);
    if (!local)
        goto err;

    frame->local = local;
    local->block_size = priv->block_size;
    if (!__is_gsyncd_on_shard_dir(frame, loc)) {
        SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err);
    }

    STACK_WIND(frame, shard_mknod_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
    return 0;
err:
    shard_common_failure_unwind(GF_FOP_MKNOD, frame, -1, ENOMEM);
    return 0;
}

int32_t
shard_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
               int32_t op_ret, int32_t op_errno, inode_t *inode,
               struct iatt *buf, struct iatt *preparent,
               struct iatt *postparent, dict_t
*xdata)
{
    /* Link callback: on success refreshes nlink and times in the inode ctx
     * and replaces the size and block count in @buf with the values held
     * in local->prebuf before unwinding. */
    shard_local_t *local = NULL;

    local = frame->local;
    if (op_ret < 0)
        goto err;

    shard_inode_ctx_set(inode, this, buf, 0,
                        SHARD_MASK_NLINK | SHARD_MASK_TIMES);
    buf->ia_size = local->prebuf.ia_size;
    buf->ia_blocks = local->prebuf.ia_blocks;

    SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent,
                       postparent, xdata);
    return 0;
err:
    shard_common_failure_unwind(GF_FOP_LINK, frame, op_ret, op_errno);
    return 0;
}

/*
 * Handler run after the base-file lookup of a link: unwinds on failure,
 * otherwise winds the link to the first child.
 */
int
shard_post_lookup_link_handler(call_frame_t *frame, xlator_t *this)
{
    shard_local_t *local = NULL;

    local = frame->local;

    if (local->op_ret < 0) {
        SHARD_STACK_UNWIND(link, frame, local->op_ret, local->op_errno, NULL,
                           NULL, NULL, NULL, NULL);
        return 0;
    }

    STACK_WIND(frame, shard_link_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->link, &local->loc, &local->loc2,
               local->xattr_req);
    return 0;
}

/*
 * Link fop. Files with a zero block size in the inode ctx are wound
 * straight through; otherwise the base file is looked up first and the
 * link is wound from shard_post_lookup_link_handler. Failures are unwound
 * with ENOMEM.
 */
int32_t
shard_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
           dict_t *xdata)
{
    int ret = -1;
    uint64_t block_size = 0;
    shard_local_t *local = NULL;

    ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size);
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
               "Failed to get block "
               "size from inode ctx of %s",
               uuid_utoa(oldloc->inode->gfid));
        goto err;
    }

    if (!block_size) {
        STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
                        oldloc, newloc, xdata);
        return 0;
    }

    /* Remember the inode table if it has not been recorded yet. */
    if (!this->itable)
        this->itable = oldloc->inode->table;

    local = mem_get0(this->local_pool);
    if (!local)
        goto err;

    frame->local = local;

    loc_copy(&local->loc, oldloc);
    loc_copy(&local->loc2, newloc);
    local->xattr_req = (xdata) ?
dict_ref(xdata) : dict_new();
    if (!local->xattr_req)
        goto err;

    shard_lookup_base_file(frame, this, &local->loc,
                           shard_post_lookup_link_handler);
    return 0;
err:
    shard_common_failure_unwind(GF_FOP_LINK, frame, -1, ENOMEM);
    return 0;
}

int
shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode);

/*
 * Handler run after the shard lookups of an unlink. Any error other than
 * ENOENT is logged and ends the chain here; otherwise the result is reset
 * to success and the shards are unlinked.
 */
int
shard_post_lookup_shards_unlink_handler(call_frame_t *frame, xlator_t *this)
{
    shard_local_t *local = NULL;

    local = frame->local;

    if ((local->op_ret < 0) && (local->op_errno != ENOENT)) {
        gf_msg(this->name, GF_LOG_ERROR, local->op_errno, SHARD_MSG_FOP_FAILED,
               "failed to delete shards of %s",
               uuid_utoa(local->resolver_base_inode->gfid));
        return 0;
    }
    local->op_ret = 0;
    local->op_errno = 0;

    shard_unlink_shards_do(frame, this, local->resolver_base_inode);
    return 0;
}

/*
 * Handler run after shard resolution for an unlink: switches the shard
 * lookups to barriered mode, then unlinks directly when nothing needs a
 * lookup or looks the remaining shards up first.
 */
int
shard_post_resolve_unlink_handler(call_frame_t *frame, xlator_t *this)
{
    shard_local_t *local = NULL;

    local = frame->local;
    local->lookup_shards_barriered = _gf_true;

    if (!local->call_count)
        shard_unlink_shards_do(frame, this, local->resolver_base_inode);
    else
        shard_common_lookup_shards(frame, this, local->resolver_base_inode,
                                   shard_post_lookup_shards_unlink_handler);
    return 0;
}

/*
 * Drop the in-memory state of shard @shard_block_num after it has been
 * unlinked: remove it from the ilist and to_fsync_list lists in its ctx,
 * unlink and forget its inode, and reduce the refs on the shard and base
 * inodes by the counts gathered along the way.
 */
void
shard_unlink_block_inode(shard_local_t *local, int shard_block_num)
{
    char block_bname[256] = {
        0,
    };
    uuid_t gfid = {
        0,
    };
    inode_t *inode = NULL;
    inode_t *base_inode = NULL;
    xlator_t *this = NULL;
    shard_priv_t *priv = NULL;
    shard_inode_ctx_t *ctx = NULL;
    shard_inode_ctx_t *base_ictx = NULL;
    int unref_base_inode = 0;
    int unref_shard_inode = 0;

    this = THIS;
    priv = this->private;

    /* inode_list is indexed relative to the first participating block. */
    inode = local->inode_list[shard_block_num - local->first_block];
    shard_inode_ctx_get(inode, this, &ctx);
    base_inode = ctx->base_inode;
    if (base_inode)
        gf_uuid_copy(gfid, base_inode->gfid);
    else
        gf_uuid_copy(gfid, ctx->base_gfid);
    shard_make_block_bname(shard_block_num, gfid, block_bname,
                           sizeof(block_bname));

    /* Locks are taken in the order priv, base inode, shard inode. */
    LOCK(&priv->lock);
    if (base_inode)
        LOCK(&base_inode->lock);
    LOCK(&inode->lock);
    {
        __shard_inode_ctx_get(inode, this, &ctx);
        if
(!list_empty(&ctx->ilist)) { list_del_init(&ctx->ilist); priv->inode_count--; unref_base_inode++; unref_shard_inode++; GF_ASSERT(priv->inode_count >= 0); } if (ctx->fsync_needed) { unref_base_inode++; unref_shard_inode++; list_del_init(&ctx->to_fsync_list); if (base_inode) { __shard_inode_ctx_get(base_inode, this, &base_ictx); base_ictx->fsync_count--; } } } UNLOCK(&inode->lock); if (base_inode) UNLOCK(&base_inode->lock); inode_unlink(inode, priv->dot_shard_inode, block_bname); inode_ref_reduce_by_n(inode, unref_shard_inode); inode_forget(inode, 0); if (base_inode && unref_base_inode) inode_ref_reduce_by_n(base_inode, unref_base_inode); UNLOCK(&priv->lock); } int shard_rename_cbk(call_frame_t *frame, xlator_t *this) { shard_local_t *local = NULL; local = frame->local; SHARD_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno, &local->prebuf, &local->preoldparent, &local->postoldparent, &local->prenewparent, &local->postnewparent, local->xattr_rsp); return 0; } int32_t shard_unlink_cbk(call_frame_t *frame, xlator_t *this) { shard_local_t *local = frame->local; SHARD_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, &local->preoldparent, &local->postoldparent, local->xattr_rsp); return 0; } int shard_unlink_shards_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { int shard_block_num = (long)cookie; shard_local_t *local = NULL; local = frame->local; if (op_ret < 0) { local->op_ret = op_ret; local->op_errno = op_errno; goto done; } shard_unlink_block_inode(local, shard_block_num); done: syncbarrier_wake(&local->barrier); return 0; } int shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) { int i = 0; int ret = -1; int count = 0; uint32_t cur_block = 0; uint32_t cur_block_idx = 0; /*this is idx into inode_list[] array */ char *bname = NULL; char path[PATH_MAX] = { 0, }; uuid_t gfid = { 0, }; loc_t loc = { 0, }; gf_boolean_t 
wind_failed = _gf_false; shard_local_t *local = NULL; shard_priv_t *priv = NULL; priv = this->private; local = frame->local; if (inode) gf_uuid_copy(gfid, inode->gfid); else gf_uuid_copy(gfid, local->base_gfid); for (i = 0; i < local->num_blocks; i++) { if (!local->inode_list[i]) continue; count++; } if (!count) { /* callcount = 0 implies that all of the shards that need to be * unlinked are non-existent (in other words the file is full of * holes). */ gf_msg_debug(this->name, 0, "All shards that need to be " "unlinked are non-existent: %s", uuid_utoa(gfid)); return 0; } SHARD_SET_ROOT_FS_ID(frame, local); local->barrier.waitfor = count; cur_block = cur_block_idx + local->first_block; while (cur_block_idx < local->num_blocks) { if (!local->inode_list[cur_block_idx]) goto next; if (wind_failed) { shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, ENOMEM, NULL, NULL, NULL); goto next; } shard_make_block_abspath(cur_block, gfid, path, sizeof(path)); bname = strrchr(path, '/') + 1; loc.parent = inode_ref(priv->dot_shard_inode); ret = inode_path(loc.parent, bname, (char **)&(loc.path)); if (ret < 0) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, "Inode path failed" " on %s, base file gfid = %s", bname, uuid_utoa(gfid)); local->op_ret = -1; local->op_errno = ENOMEM; loc_wipe(&loc); wind_failed = _gf_true; shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, ENOMEM, NULL, NULL, NULL); goto next; } loc.name = strrchr(loc.path, '/'); if (loc.name) loc.name++; loc.inode = inode_ref(local->inode_list[cur_block_idx]); STACK_WIND_COOKIE(frame, shard_unlink_shards_do_cbk, (void *)(long)cur_block, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, &loc, local->xflag, local->xattr_req); loc_wipe(&loc); next: cur_block++; cur_block_idx++; } syncbarrier_wait(&local->barrier, count); SHARD_UNSET_ROOT_FS_ID(frame, local); return 0; } int shard_regulated_shards_deletion(call_frame_t *cleanup_frame, xlator_t *this, int now, int 
first_block, gf_dirent_t *entry) { int i = 0; int ret = 0; shard_local_t *local = NULL; uuid_t gfid = { 0, }; local = cleanup_frame->local; local->inode_list = GF_CALLOC(now, sizeof(inode_t *), gf_shard_mt_inode_list); if (!local->inode_list) return -ENOMEM; local->first_block = first_block; local->last_block = first_block + now - 1; local->num_blocks = now; gf_uuid_parse(entry->d_name, gfid); gf_uuid_copy(local->base_gfid, gfid); local->resolver_base_inode = inode_find(this->itable, gfid); local->call_count = 0; syncbarrier_init(&local->barrier); shard_common_resolve_shards(cleanup_frame, this, shard_post_resolve_unlink_handler); for (i = 0; i < local->num_blocks; i++) { if (local->inode_list[i]) inode_unref(local->inode_list[i]); } GF_FREE(local->inode_list); local->inode_list = NULL; if (local->op_ret) ret = -local->op_errno; syncbarrier_destroy(&local->barrier); inode_unref(local->resolver_base_inode); local->resolver_base_inode = NULL; STACK_RESET(cleanup_frame->root); return ret; } int __shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, gf_dirent_t *entry, inode_t *inode) { int ret = 0; int shard_count = 0; int first_block = 0; int now = 0; uint64_t size = 0; uint64_t block_size = 0; uint64_t size_array[4] = { 0, }; void *bsize = NULL; void *size_attr = NULL; dict_t *xattr_rsp = NULL; loc_t loc = { 0, }; shard_local_t *local = NULL; shard_priv_t *priv = NULL; priv = this->private; local = cleanup_frame->local; ret = dict_reset(local->xattr_req); if (ret) { gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, "Failed to reset dict"); ret = -ENOMEM; goto err; } ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); if (ret) { gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, "Failed to set dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); ret = -ENOMEM; goto err; } ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, 8 * 4); if (ret) { gf_msg(this->name, GF_LOG_WARNING, 0, 
SHARD_MSG_DICT_OP_FAILED, "Failed to set dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); ret = -ENOMEM; goto err; } loc.inode = inode_ref(inode); loc.parent = inode_ref(priv->dot_shard_rm_inode); ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); if (ret < 0) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, "Inode path failed on %s", entry->d_name); ret = -ENOMEM; goto err; } loc.name = strrchr(loc.path, '/'); if (loc.name) loc.name++; ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, local->xattr_req, &xattr_rsp); if (ret) goto err; ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, "Failed to get dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); goto err; } block_size = ntoh64(*((uint64_t *)bsize)); ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_FILE_SIZE, &size_attr); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, "Failed to get dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); goto err; } memcpy(size_array, size_attr, sizeof(size_array)); size = ntoh64(size_array[0]); shard_count = (size / block_size) - 1; if (shard_count < 0) { gf_msg_debug(this->name, 0, "Size of %s hasn't grown beyond " "its shard-block-size. Nothing to delete. " "Returning", entry->d_name); /* File size < shard-block-size, so nothing to delete */ ret = 0; goto delete_marker; } if ((size % block_size) > 0) shard_count++; if (shard_count == 0) { gf_msg_debug(this->name, 0, "Size of %s is exactly equal to " "its shard-block-size. Nothing to delete. " "Returning", entry->d_name); ret = 0; goto delete_marker; } gf_msg_debug(this->name, 0, "base file = %s, " "shard-block-size=%" PRIu64 ", file-size=%" PRIu64 ", " "shard_count=%d", entry->d_name, block_size, size, shard_count); /* Perform a gfid-based lookup to see if gfid corresponding to marker * file's base name exists. 
*/ loc_wipe(&loc); loc.inode = inode_new(this->itable); if (!loc.inode) { ret = -ENOMEM; goto err; } gf_uuid_parse(entry->d_name, loc.gfid); ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); if (!ret) { gf_msg_debug(this->name, 0, "Base shard corresponding to gfid " "%s is present. Skipping shard deletion. " "Returning", entry->d_name); ret = 0; goto delete_marker; } first_block = 1; while (shard_count) { if (shard_count < local->deletion_rate) { now = shard_count; shard_count = 0; } else { now = local->deletion_rate; shard_count -= local->deletion_rate; } gf_msg_debug(this->name, 0, "deleting %d shards starting from " "block %d of gfid %s", now, first_block, entry->d_name); ret = shard_regulated_shards_deletion(cleanup_frame, this, now, first_block, entry); if (ret) goto err; first_block += now; } delete_marker: loc_wipe(&loc); loc.inode = inode_ref(inode); loc.parent = inode_ref(priv->dot_shard_rm_inode); ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); if (ret < 0) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, "Inode path failed on %s", entry->d_name); ret = -ENOMEM; goto err; } loc.name = strrchr(loc.path, '/'); if (loc.name) loc.name++; ret = syncop_unlink(FIRST_CHILD(this), &loc, NULL, NULL); if (ret) gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SHARDS_DELETION_FAILED, "Failed to delete %s " "from /%s", entry->d_name, GF_SHARD_REMOVE_ME_DIR); err: if (xattr_rsp) dict_unref(xattr_rsp); loc_wipe(&loc); return ret; } int shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, gf_dirent_t *entry, inode_t *inode) { int ret = -1; loc_t loc = { 0, }; shard_priv_t *priv = NULL; priv = this->private; loc.inode = inode_ref(priv->dot_shard_rm_inode); ret = syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL, NULL); if (ret < 0) { if (ret == -EAGAIN) { ret = 0; } goto out; } { ret = __shard_delete_shards_of_entry(cleanup_frame, this, entry, 
inode); } syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL, NULL); out: loc_wipe(&loc); return ret; } int shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data) { SHARD_STACK_DESTROY(frame); return 0; } int shard_resolve_internal_dir(xlator_t *this, shard_local_t *local, shard_internal_dir_type_t type) { int ret = 0; char *bname = NULL; loc_t *loc = NULL; shard_priv_t *priv = NULL; uuid_t gfid = { 0, }; struct iatt stbuf = { 0, }; priv = this->private; switch (type) { case SHARD_INTERNAL_DIR_DOT_SHARD: loc = &local->dot_shard_loc; gf_uuid_copy(gfid, priv->dot_shard_gfid); bname = GF_SHARD_DIR; break; case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: loc = &local->dot_shard_rm_loc; gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); bname = GF_SHARD_REMOVE_ME_DIR; break; default: break; } loc->inode = inode_find(this->itable, gfid); if (!loc->inode) { ret = shard_init_internal_dir_loc(this, local, type); if (ret) goto err; ret = dict_reset(local->xattr_req); if (ret) { gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, "Failed to reset " "dict"); ret = -ENOMEM; goto err; } ret = dict_set_gfuuid(local->xattr_req, "gfid-req", gfid, true); ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, local->xattr_req, NULL); if (ret < 0) { if (ret != -ENOENT) gf_msg(this->name, GF_LOG_ERROR, -ret, SHARD_MSG_SHARDS_DELETION_FAILED, "Lookup on %s failed, exiting", bname); goto err; } else { shard_link_internal_dir_inode(local, loc->inode, &stbuf, type); } } ret = 0; err: return ret; } int shard_lookup_marker_entry(xlator_t *this, shard_local_t *local, gf_dirent_t *entry) { int ret = 0; loc_t loc = { 0, }; loc.inode = inode_new(this->itable); if (!loc.inode) { ret = -ENOMEM; goto err; } loc.parent = inode_ref(local->fd->inode); ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); if (ret < 0) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, "Inode path failed on %s", 
/*
 * Body of the background shard-deletion task.
 *
 * @opaque: the cleanup call frame; a freshly allocated shard_local_t is
 *          attached to it as frame->local.
 *
 * After resolving .shard and .remove_me and opening an anonymous fd on
 * .remove_me, the function repeatedly crawls that directory and, for every
 * entry other than "." and "..", deletes the shards of the gfid named by the
 * entry via shard_delete_shards_of_entry().
 *
 * Returns 0 when the crawl loop ends normally; on a set-up failure returns
 * the negative errno (or 0 when .shard/.remove_me simply do not exist) after
 * resetting priv->bg_del_state to SHARD_BG_DELETION_NONE.
 */
int
shard_delete_shards(void *opaque)
{
    int ret = 0;
    off_t offset = 0;
    loc_t loc = {
        0,
    };
    inode_t *link_inode = NULL;
    xlator_t *this = NULL;
    shard_priv_t *priv = NULL;
    shard_local_t *local = NULL;
    gf_dirent_t entries;
    gf_dirent_t *entry = NULL;
    call_frame_t *cleanup_frame = NULL;
    gf_boolean_t done = _gf_false;

    this = THIS;
    priv = this->private;
    INIT_LIST_HEAD(&entries.list);

    cleanup_frame = opaque;

    local = mem_get0(this->local_pool);
    if (!local) {
        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED,
               "Failed to create local to "
               "delete shards");
        ret = -ENOMEM;
        goto err;
    }
    cleanup_frame->local = local;
    local->fop = GF_FOP_UNLINK;

    local->xattr_req = dict_new();
    if (!local->xattr_req) {
        ret = -ENOMEM;
        goto err;
    }
    /* Snapshot the configured batch size for this run. */
    local->deletion_rate = priv->deletion_rate;

    /* A missing .shard or .remove_me is not an error: nothing to clean. */
    ret = shard_resolve_internal_dir(this, local, SHARD_INTERNAL_DIR_DOT_SHARD);
    if (ret == -ENOENT) {
        gf_msg_debug(this->name, 0,
                     ".shard absent. Nothing to"
                     " delete. Exiting");
        ret = 0;
        goto err;
    } else if (ret < 0) {
        goto err;
    }

    ret = shard_resolve_internal_dir(this, local,
                                     SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME);
    if (ret == -ENOENT) {
        gf_msg_debug(this->name, 0,
                     ".remove_me absent. "
                     "Nothing to delete. Exiting");
        ret = 0;
        goto err;
    } else if (ret < 0) {
        goto err;
    }

    local->fd = fd_anonymous(local->dot_shard_rm_loc.inode);
    if (!local->fd) {
        ret = -ENOMEM;
        goto err;
    }

    for (;;) {
        /* Every crawl starts from the beginning of the directory. */
        offset = 0;
        LOCK(&priv->lock);
        {
            /* LAUNCHING -> IN_PROGRESS: start a crawl.
             * IN_PROGRESS -> NONE: the previous crawl finished and the state
             * was not moved back to LAUNCHING meanwhile, so stop.
             * NOTE(review): presumably another code path sets LAUNCHING again
             * to request a further crawl - confirm against
             * shard_start_background_deletion().
             */
            if (priv->bg_del_state == SHARD_BG_DELETION_LAUNCHING) {
                priv->bg_del_state = SHARD_BG_DELETION_IN_PROGRESS;
            } else if (priv->bg_del_state == SHARD_BG_DELETION_IN_PROGRESS) {
                priv->bg_del_state = SHARD_BG_DELETION_NONE;
                done = _gf_true;
            }
        }
        UNLOCK(&priv->lock);
        if (done)
            break;
        /* A zero return from readdirp ends the crawl; a negative one enters
         * the body with an empty list and leaves through `if (ret) break`.
         */
        while (
            (ret = syncop_readdirp(FIRST_CHILD(this), local->fd, 131072, offset,
                                   &entries, local->xattr_req, NULL))) {
            if (ret > 0)
                ret = 0;
            list_for_each_entry(entry, &entries.list, list)
            {
                /* Remember where to resume the next readdirp from. */
                offset = entry->d_off;

                if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
                    continue;

                /* readdirp gave no inode for this entry: look it up. */
                if (!entry->inode) {
                    ret = shard_lookup_marker_entry(this, local, entry);
                    if (ret < 0)
                        continue;
                }
                link_inode = inode_link(entry->inode, local->fd->inode,
                                        entry->d_name, &entry->d_stat);

                gf_msg_debug(this->name, 0,
                             "Initiating deletion of "
                             "shards of gfid %s",
                             entry->d_name);
                ret = shard_delete_shards_of_entry(cleanup_frame, this, entry,
                                                   link_inode);
                /* The marker inode is dropped from the table whether or not
                 * the deletion succeeded.
                 */
                inode_unlink(link_inode, local->fd->inode, entry->d_name);
                inode_unref(link_inode);
                if (ret) {
                    gf_msg(this->name, GF_LOG_ERROR, -ret,
                           SHARD_MSG_SHARDS_DELETION_FAILED,
                           "Failed to clean up shards of gfid %s",
                           entry->d_name);
                    continue;
                }
                gf_msg(this->name, GF_LOG_INFO, 0,
                       SHARD_MSG_SHARD_DELETION_COMPLETED,
                       "Deleted "
                       "shards of gfid=%s from backend",
                       entry->d_name);
            }
            gf_dirent_free(&entries);
            /* NOTE(review): ret holds the status of the LAST entry handled
             * in this batch, so a failure on that entry ends the current
             * crawl here even though earlier failures were skipped.
             */
            if (ret)
                break;
        }
    }
    ret = 0;
    loc_wipe(&loc);
    return ret;

err:
    /* Set-up failed: make sure the state does not stay in LAUNCHING. */
    LOCK(&priv->lock);
    {
        priv->bg_del_state = SHARD_BG_DELETION_NONE;
    }
    UNLOCK(&priv->lock);
    loc_wipe(&loc);
    return ret;
}
Please check brick logs for " "more details"); SHARD_STACK_DESTROY(frame); return 0; } int shard_unlock_inodelk(call_frame_t *frame, xlator_t *this) { loc_t *loc = NULL; call_frame_t *lk_frame = NULL; shard_local_t *local = NULL; shard_local_t *lk_local = NULL; shard_inodelk_t *lock = NULL; local = frame->local; lk_frame = local->inodelk_frame; lk_local = lk_frame->local; local->inodelk_frame = NULL; loc = &local->int_inodelk.loc; lock = &lk_local->int_inodelk; lock->flock.l_type = F_UNLCK; STACK_WIND(lk_frame, shard_unlock_inodelk_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->inodelk, lock->domain, loc, F_SETLK, &lock->flock, NULL); local->int_inodelk.acquired_lock = _gf_false; return 0; } int shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, struct iatt *postoldparent, struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata); int shard_rename_src_base_file(call_frame_t *frame, xlator_t *this) { int ret = 0; loc_t *dst_loc = NULL; loc_t tmp_loc = { 0, }; shard_local_t *local = frame->local; if (local->dst_block_size) { tmp_loc.parent = inode_ref(local->loc2.parent); ret = inode_path(tmp_loc.parent, local->loc2.name, (char **)&tmp_loc.path); if (ret < 0) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, "Inode path failed" " on pargfid=%s bname=%s", uuid_utoa(tmp_loc.parent->gfid), local->loc2.name); local->op_ret = -1; local->op_errno = ENOMEM; goto err; } tmp_loc.name = strrchr(tmp_loc.path, '/'); if (tmp_loc.name) tmp_loc.name++; dst_loc = &tmp_loc; } else { dst_loc = &local->loc2; } /* To-Do: Request open-fd count on dst base file */ STACK_WIND(frame, shard_rename_src_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, &local->loc, dst_loc, local->xattr_req); loc_wipe(&tmp_loc); return 0; err: loc_wipe(&tmp_loc); shard_common_failure_unwind(local->fop, frame, local->op_ret, local->op_errno); return 0; } int 
shard_unlink_base_file(call_frame_t *frame, xlator_t *this); int shard_set_size_attrs_on_marker_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { shard_priv_t *priv = NULL; shard_local_t *local = NULL; priv = this->private; local = frame->local; if (op_ret < 0) { gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, "Xattrop on marker file failed " "while performing %s; entry gfid=%s", gf_fop_string(local->fop), local->newloc.name); goto err; } inode_unlink(local->newloc.inode, priv->dot_shard_rm_inode, local->newloc.name); if (local->fop == GF_FOP_UNLINK) shard_unlink_base_file(frame, this); else if (local->fop == GF_FOP_RENAME) shard_rename_src_base_file(frame, this); return 0; err: shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); return 0; } int shard_set_size_attrs_on_marker_file(call_frame_t *frame, xlator_t *this) { int op_errno = ENOMEM; uint64_t bs = 0; dict_t *xdata = NULL; shard_local_t *local = NULL; local = frame->local; xdata = dict_new(); if (!xdata) goto err; if (local->fop == GF_FOP_UNLINK) bs = local->block_size; else if (local->fop == GF_FOP_RENAME) bs = local->dst_block_size; SHARD_INODE_CREATE_INIT(this, bs, xdata, &local->newloc, local->prebuf.ia_size, 0, err); STACK_WIND(frame, shard_set_size_attrs_on_marker_file_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop, &local->newloc, GF_XATTROP_GET_AND_SET, xdata, NULL); dict_unref(xdata); return 0; err: if (xdata) dict_unref(xdata); shard_common_failure_unwind(local->fop, frame, -1, op_errno); return 0; } int shard_lookup_marker_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, struct iatt *postparent) { inode_t *linked_inode = NULL; shard_priv_t *priv = NULL; shard_local_t *local = NULL; local = frame->local; priv = this->private; if (op_ret < 0) { gf_msg(this->name, GF_LOG_ERROR, op_errno, 
SHARD_MSG_FOP_FAILED, "Lookup on marker file failed " "while performing %s; entry gfid=%s", gf_fop_string(local->fop), local->newloc.name); goto err; } linked_inode = inode_link(inode, priv->dot_shard_rm_inode, local->newloc.name, buf); inode_unref(local->newloc.inode); local->newloc.inode = linked_inode; shard_set_size_attrs_on_marker_file(frame, this); return 0; err: shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); return 0; } int shard_lookup_marker_file(call_frame_t *frame, xlator_t *this) { int op_errno = ENOMEM; dict_t *xattr_req = NULL; shard_local_t *local = NULL; local = frame->local; xattr_req = shard_create_gfid_dict(local->xattr_req); if (!xattr_req) goto err; STACK_WIND(frame, shard_lookup_marker_file_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, &local->newloc, xattr_req); dict_unref(xattr_req); return 0; err: shard_common_failure_unwind(local->fop, frame, -1, op_errno); return 0; } int shard_create_marker_file_under_remove_me_cbk( call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { inode_t *linked_inode = NULL; shard_priv_t *priv = NULL; shard_local_t *local = NULL; local = frame->local; priv = this->private; SHARD_UNSET_ROOT_FS_ID(frame, local); if (op_ret < 0) { if ((op_errno != EEXIST) && (op_errno != ENODATA)) { local->op_ret = op_ret; local->op_errno = op_errno; gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, "Marker file creation " "failed while performing %s; entry gfid=%s", gf_fop_string(local->fop), local->newloc.name); goto err; } else { shard_lookup_marker_file(frame, this); return 0; } } linked_inode = inode_link(inode, priv->dot_shard_rm_inode, local->newloc.name, buf); inode_unref(local->newloc.inode); local->newloc.inode = linked_inode; if (local->fop == GF_FOP_UNLINK) shard_unlink_base_file(frame, this); else if (local->fop == GF_FOP_RENAME) 
shard_rename_src_base_file(frame, this); return 0; err: shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); return 0; } int shard_create_marker_file_under_remove_me(call_frame_t *frame, xlator_t *this, loc_t *loc) { int ret = 0; int op_errno = ENOMEM; uint64_t bs = 0; char g1[64] = { 0, }; char g2[64] = { 0, }; dict_t *xattr_req = NULL; shard_priv_t *priv = NULL; shard_local_t *local = NULL; priv = this->private; local = frame->local; SHARD_SET_ROOT_FS_ID(frame, local); xattr_req = shard_create_gfid_dict(local->xattr_req); if (!xattr_req) goto err; local->newloc.inode = inode_new(this->itable); local->newloc.parent = inode_ref(priv->dot_shard_rm_inode); ret = inode_path(local->newloc.parent, uuid_utoa(loc->inode->gfid), (char **)&local->newloc.path); if (ret < 0) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, "Inode path failed on " "pargfid=%s bname=%s", uuid_utoa_r(priv->dot_shard_rm_gfid, g1), uuid_utoa_r(loc->inode->gfid, g2)); goto err; } local->newloc.name = strrchr(local->newloc.path, '/'); if (local->newloc.name) local->newloc.name++; if (local->fop == GF_FOP_UNLINK) bs = local->block_size; else if (local->fop == GF_FOP_RENAME) bs = local->dst_block_size; SHARD_INODE_CREATE_INIT(this, bs, xattr_req, &local->newloc, local->prebuf.ia_size, 0, err); STACK_WIND(frame, shard_create_marker_file_under_remove_me_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, &local->newloc, 0, 0, 0644, xattr_req); dict_unref(xattr_req); return 0; err: if (xattr_req) dict_unref(xattr_req); shard_create_marker_file_under_remove_me_cbk(frame, 0, this, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } int shard_unlock_entrylk(call_frame_t *frame, xlator_t *this); int shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { int ret = 0; shard_local_t *local = NULL; local = frame->local; if (op_ret < 0) { 
local->op_ret = op_ret; local->op_errno = op_errno; } else { local->preoldparent = *preparent; local->postoldparent = *postparent; if (xdata) local->xattr_rsp = dict_ref(xdata); if (local->cleanup_required) shard_start_background_deletion(this); } if (local->entrylk_frame) { ret = shard_unlock_entrylk(frame, this); if (ret < 0) { local->op_ret = -1; local->op_errno = -ret; } } ret = shard_unlock_inodelk(frame, this); if (ret < 0) { local->op_ret = -1; local->op_errno = -ret; } shard_unlink_cbk(frame, this); return 0; } int shard_unlink_base_file(call_frame_t *frame, xlator_t *this) { shard_local_t *local = frame->local; /* To-Do: Request open-fd count on base file */ STACK_WIND(frame, shard_unlink_base_file_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, local->xattr_req); return 0; } int shard_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { if (op_ret) gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, "Unlock failed. 
Please check brick logs for " "more details"); SHARD_STACK_DESTROY(frame); return 0; } int shard_unlock_entrylk(call_frame_t *frame, xlator_t *this) { loc_t *loc = NULL; call_frame_t *lk_frame = NULL; shard_local_t *local = NULL; shard_local_t *lk_local = NULL; shard_entrylk_t *lock = NULL; local = frame->local; lk_frame = local->entrylk_frame; lk_local = lk_frame->local; local->entrylk_frame = NULL; lock = &lk_local->int_entrylk; loc = &lock->loc; STACK_WIND(lk_frame, shard_unlock_entrylk_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk, this->name, loc, lk_local->int_entrylk.basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); local->int_entrylk.acquired_lock = _gf_false; return 0; } int shard_post_entrylk_fop_handler(call_frame_t *frame, xlator_t *this) { shard_local_t *local = NULL; local = frame->local; switch (local->fop) { case GF_FOP_UNLINK: case GF_FOP_RENAME: shard_create_marker_file_under_remove_me(frame, this, &local->int_inodelk.loc); break; default: gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, "post-entrylk handler not defined. 
This case should not" " be hit"); break; } return 0; } int shard_acquire_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { call_frame_t *main_frame = NULL; shard_local_t *local = NULL; shard_local_t *main_local = NULL; local = frame->local; main_frame = local->main_frame; main_local = main_frame->local; if (local->op_ret < 0) { shard_common_failure_unwind(main_local->fop, main_frame, op_ret, op_errno); return 0; } main_local->int_entrylk.acquired_lock = _gf_true; shard_post_entrylk_fop_handler(main_frame, this); return 0; } int shard_acquire_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, uuid_t gfid) { char gfid_str[GF_UUID_BUF_SIZE] = { 0, }; shard_local_t *local = NULL; shard_local_t *entrylk_local = NULL; shard_entrylk_t *int_entrylk = NULL; call_frame_t *entrylk_frame = NULL; local = frame->local; entrylk_frame = create_frame(this, this->ctx->pool); if (!entrylk_frame) { gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, "Failed to create new frame " "to lock marker file"); goto err; } entrylk_local = mem_get0(this->local_pool); if (!entrylk_local) { STACK_DESTROY(entrylk_frame->root); goto err; } entrylk_frame->local = entrylk_local; entrylk_local->main_frame = frame; int_entrylk = &entrylk_local->int_entrylk; int_entrylk->loc.inode = inode_ref(inode); set_lk_owner_from_ptr(&entrylk_frame->root->lk_owner, entrylk_frame->root); local->entrylk_frame = entrylk_frame; gf_uuid_unparse(gfid, gfid_str); int_entrylk->basename = gf_strdup(gfid_str); STACK_WIND(entrylk_frame, shard_acquire_entrylk_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk, this->name, &int_entrylk->loc, int_entrylk->basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); return 0; err: shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); return 0; } int shard_post_lookup_base_shard_rm_handler(call_frame_t *frame, xlator_t *this) { shard_local_t *local = NULL; shard_priv_t *priv = NULL; priv = 
this->private; local = frame->local; if (local->op_ret < 0) { shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); return 0; } if (local->prebuf.ia_nlink > 1) { gf_msg_debug(this->name, 0, "link count on %s > 1:%d, " "performing rename()/unlink()", local->int_inodelk.loc.path, local->prebuf.ia_nlink); if (local->fop == GF_FOP_RENAME) shard_rename_src_base_file(frame, this); else if (local->fop == GF_FOP_UNLINK) shard_unlink_base_file(frame, this); } else { gf_msg_debug(this->name, 0, "link count on %s = 1, creating " "file under .remove_me", local->int_inodelk.loc.path); local->cleanup_required = _gf_true; shard_acquire_entrylk(frame, this, priv->dot_shard_rm_inode, local->prebuf.ia_gfid); } return 0; } int shard_post_inodelk_fop_handler(call_frame_t *frame, xlator_t *this) { shard_local_t *local = NULL; local = frame->local; switch (local->fop) { case GF_FOP_UNLINK: case GF_FOP_RENAME: shard_lookup_base_file(frame, this, &local->int_inodelk.loc, shard_post_lookup_base_shard_rm_handler); break; default: gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, "post-inodelk handler not defined. 
This case should not" " be hit"); break; } return 0; } int shard_acquire_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { call_frame_t *main_frame = NULL; shard_local_t *local = NULL; shard_local_t *main_local = NULL; local = frame->local; main_frame = local->main_frame; main_local = main_frame->local; if (local->op_ret < 0) { shard_common_failure_unwind(main_local->fop, main_frame, op_ret, op_errno); return 0; } main_local->int_inodelk.acquired_lock = _gf_true; shard_post_inodelk_fop_handler(main_frame, this); return 0; } int shard_acquire_inodelk(call_frame_t *frame, xlator_t *this, loc_t *loc) { call_frame_t *lk_frame = NULL; shard_local_t *local = NULL; shard_local_t *lk_local = NULL; shard_inodelk_t *int_inodelk = NULL; local = frame->local; lk_frame = create_frame(this, this->ctx->pool); if (!lk_frame) { gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, "Failed to create new frame " "to lock base shard"); goto err; } lk_local = mem_get0(this->local_pool); if (!lk_local) { STACK_DESTROY(lk_frame->root); goto err; } lk_frame->local = lk_local; lk_local->main_frame = frame; int_inodelk = &lk_local->int_inodelk; int_inodelk->flock.l_len = 0; int_inodelk->flock.l_start = 0; int_inodelk->domain = this->name; int_inodelk->flock.l_type = F_WRLCK; loc_copy(&local->int_inodelk.loc, loc); set_lk_owner_from_ptr(&lk_frame->root->lk_owner, lk_frame->root); local->inodelk_frame = lk_frame; STACK_WIND(lk_frame, shard_acquire_inodelk_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->inodelk, int_inodelk->domain, &local->int_inodelk.loc, F_SETLKW, &int_inodelk->flock, NULL); return 0; err: shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); return 0; } int shard_post_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) { loc_t *loc = NULL; shard_local_t *local = NULL; local = frame->local; if (local->op_ret < 0) { shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); 
return 0; } if (local->fop == GF_FOP_UNLINK) loc = &local->loc; else if (local->fop == GF_FOP_RENAME) loc = &local->loc2; shard_acquire_inodelk(frame, this, loc); return 0; } int shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, shard_post_resolve_fop_handler_t handler, shard_internal_dir_type_t type); int shard_pre_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) { shard_local_t *local = NULL; local = frame->local; if (local->op_ret < 0) { shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); return 0; } shard_mkdir_internal_dir(frame, this, shard_post_mkdir_rm_handler, SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); return 0; } void shard_begin_rm_resolution(call_frame_t *frame, xlator_t *this) { shard_priv_t *priv = NULL; shard_local_t *local = NULL; priv = this->private; local = frame->local; local->dot_shard_rm_loc.inode = inode_find(this->itable, priv->dot_shard_rm_gfid); if (!local->dot_shard_rm_loc.inode) { local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); if (!local->dot_shard_loc.inode) { shard_mkdir_internal_dir(frame, this, shard_pre_mkdir_rm_handler, SHARD_INTERNAL_DIR_DOT_SHARD); } else { local->post_res_handler = shard_pre_mkdir_rm_handler; shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); } } else { local->post_res_handler = shard_post_mkdir_rm_handler; shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); } } int shard_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, dict_t *xdata) { int ret = -1; uint64_t block_size = 0; shard_local_t *local = NULL; ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); if ((ret) && (!IA_ISLNK(loc->inode->ia_type))) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, "Failed to get block " "size from inode ctx of %s", uuid_utoa(loc->inode->gfid)); goto err; } if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { STACK_WIND(frame, default_unlink_cbk, 
FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); return 0; } local = mem_get0(this->local_pool); if (!local) goto err; frame->local = local; loc_copy(&local->loc, loc); local->xflag = xflag; local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); local->block_size = block_size; local->resolver_base_inode = loc->inode; local->fop = GF_FOP_UNLINK; if (!this->itable) this->itable = (local->loc.inode)->table; local->resolve_not = _gf_true; shard_begin_rm_resolution(frame, this); return 0; err: shard_common_failure_unwind(GF_FOP_UNLINK, frame, -1, ENOMEM); return 0; } int shard_post_rename_lookup_handler(call_frame_t *frame, xlator_t *this) { shard_rename_cbk(frame, this); return 0; } int shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, struct iatt *postoldparent, struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata) { int ret = 0; shard_local_t *local = NULL; local = frame->local; if (op_ret < 0) { local->op_ret = op_ret; local->op_errno = op_errno; goto err; } /* Set ctx->refresh to TRUE to force a lookup on disk when * shard_lookup_base_file() is called next to refresh the hard link * count in ctx. Note that this is applicable only to the case where * the rename dst is already existent and sharded. 
*/
    if ((local->dst_block_size) && (!local->cleanup_required))
        shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this);

    /* Keep copies of the iatts reported by the child in the frame's local. */
    local->prebuf = *buf;
    local->preoldparent = *preoldparent;
    local->postoldparent = *postoldparent;
    local->prenewparent = *prenewparent;
    local->postnewparent = *postnewparent;
    if (xdata)
        local->xattr_rsp = dict_ref(xdata);

    if (local->dst_block_size) {
        /* A failed entrylk unlock is recorded in local but does not stop
         * the inodelk from being released as well. */
        if (local->entrylk_frame) {
            ret = shard_unlock_entrylk(frame, this);
            if (ret < 0) {
                local->op_ret = -1;
                local->op_errno = -ret;
            }
        }

        /* The unlock helpers return a negated errno on failure. */
        ret = shard_unlock_inodelk(frame, this);
        if (ret < 0) {
            local->op_ret = -1;
            local->op_errno = -ret;
            goto err;
        }
        if (local->cleanup_required)
            shard_start_background_deletion(this);
    }

    /* Now the base file of src, if sharded, is looked up to gather ia_size
     * and ia_blocks.*/
    if (local->block_size) {
        /* The lookup is done by gfid on a freshly created inode. */
        local->tmp_loc.inode = inode_new(this->itable);
        gf_uuid_copy(local->tmp_loc.gfid, (local->loc.inode)->gfid);
        shard_lookup_base_file(frame, this, &local->tmp_loc,
                               shard_post_rename_lookup_handler);
    } else {
        shard_rename_cbk(frame, this);
    }
    return 0;
err:
    shard_common_failure_unwind(local->fop, frame, local->op_ret,
                                local->op_errno);
    return 0;
}

/*
 * Post-lookup handler for the base file of the rename destination: a failed
 * lookup unwinds the fop, otherwise the looked-up attributes are preserved
 * and the rename of the source base file is started.
 */
int
shard_post_lookup_dst_base_file_handler(call_frame_t *frame, xlator_t *this)
{
    shard_local_t *local = NULL;

    local = frame->local;

    if (local->op_ret < 0) {
        shard_common_failure_unwind(local->fop, frame, local->op_ret,
                                    local->op_errno);
        return 0;
    }

    /* Save dst base file attributes into postbuf so the information is not
     * lost when it is overwritten after lookup on base file of src in
     * shard_lookup_base_file_cbk().
*/
    local->postbuf = local->prebuf;
    shard_rename_src_base_file(frame, this);
    return 0;
}

/*
 * rename fop.
 *
 * Directories, gsyncd requests and renames in which neither source nor
 * destination carries a block size are wound to the child untouched. For the
 * rest a local is set up (loc = oldloc, loc2 = newloc) and the operation
 * continues either with the removal-resolution chain (sharded destination)
 * or directly with the rename of the source base file.
 * Every failure in this function is reported to the caller as ENOMEM.
 */
int
shard_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
             dict_t *xdata)
{
    int ret = -1;
    uint64_t block_size = 0;
    uint64_t dst_block_size = 0;
    shard_local_t *local = NULL;

    if (IA_ISDIR(oldloc->inode->ia_type)) {
        STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
        return 0;
    }

    /* A missing block size is tolerated only for symlinks. */
    ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size);
    if ((ret) && (!IA_ISLNK(oldloc->inode->ia_type))) {
        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
               "Failed to get block "
               "size from inode ctx of %s",
               uuid_utoa(oldloc->inode->gfid));
        goto err;
    }

    /* The return value of this second query is not examined; on failure
     * dst_block_size keeps its initial value of 0. */
    if (newloc->inode)
        ret = shard_inode_ctx_get_block_size(newloc->inode, this,
                                             &dst_block_size);

    /* The following stack_wind covers the case where:
     * a. the src file is not sharded and dst doesn't exist, OR
     * b. the src and dst both exist but are not sharded.
     */
    if (((!block_size) && (!dst_block_size)) ||
        frame->root->pid == GF_CLIENT_PID_GSYNCD) {
        STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
        return 0;
    }

    local = mem_get0(this->local_pool);
    if (!local)
        goto err;

    frame->local = local;
    loc_copy(&local->loc, oldloc);
    loc_copy(&local->loc2, newloc);
    local->resolver_base_inode = newloc->inode;
    local->fop = GF_FOP_RENAME;
    local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
    if (!local->xattr_req)
        goto err;

    local->block_size = block_size;
    local->dst_block_size = dst_block_size;
    if (!this->itable)
        this->itable = (local->loc.inode)->table;
    local->resolve_not = _gf_true;

    /* The following if-block covers the case where the dst file exists
     * and is sharded.
     */
    if (local->dst_block_size) {
        shard_begin_rm_resolution(frame, this);
    } else {
        /* The following block covers the case where the dst either doesn't
         * exist or is NOT sharded but the src is sharded.
In this case, shard * xlator would go ahead and rename src to dst. Once done, it would also * lookup the base shard of src to get the ia_size and ia_blocks xattr * values. */ shard_rename_src_base_file(frame, this); } return 0; err: shard_common_failure_unwind(GF_FOP_RENAME, frame, -1, ENOMEM); return 0; } int shard_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { int ret = -1; shard_local_t *local = NULL; local = frame->local; if (op_ret == -1) goto unwind; ret = shard_inode_ctx_set(inode, this, stbuf, local->block_size, SHARD_ALL_MASK); if (ret) gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, "Failed to set inode " "ctx for %s", uuid_utoa(inode->gfid)); unwind: SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, postparent, xdata); return 0; } int shard_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { shard_priv_t *priv = NULL; shard_local_t *local = NULL; priv = this->private; local = mem_get0(this->local_pool); if (!local) goto err; frame->local = local; local->block_size = priv->block_size; if (!__is_gsyncd_on_shard_dir(frame, loc)) { SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); } STACK_WIND(frame, shard_create_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, xdata); return 0; err: shard_common_failure_unwind(GF_FOP_CREATE, frame, -1, ENOMEM); return 0; } int shard_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { /* To-Do: Handle open with O_TRUNC under locks */ SHARD_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); return 0; } int shard_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) { 
STACK_WIND(frame, shard_open_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); return 0; } int shard_readv_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { int i = 0; int call_count = 0; void *address = NULL; uint64_t block_num = 0; off_t off = 0; struct iovec vec = { 0, }; shard_local_t *local = NULL; fd_t *anon_fd = cookie; shard_inode_ctx_t *ctx = NULL; local = frame->local; /* If shard has already seen a failure here before, there is no point * in aggregating subsequent reads, so just go to out. */ if (local->op_ret < 0) goto out; if (op_ret < 0) { local->op_ret = op_ret; local->op_errno = op_errno; goto out; } if (local->op_ret >= 0) local->op_ret += op_ret; shard_inode_ctx_get(anon_fd->inode, this, &ctx); block_num = ctx->block_num; if (block_num == local->first_block) { address = local->iobuf->ptr; } else { /* else * address to start writing to = beginning of buffer + * number of bytes until end of first block + * + block_size times number of blocks * between the current block and the first */ address = (char *)local->iobuf->ptr + (local->block_size - (local->offset % local->block_size)) + ((block_num - local->first_block - 1) * local->block_size); } for (i = 0; i < count; i++) { address = (char *)address + off; memcpy(address, vector[i].iov_base, vector[i].iov_len); off += vector[i].iov_len; } out: if (anon_fd) fd_unref(anon_fd); call_count = shard_call_count_return(frame); if (call_count == 0) { SHARD_UNSET_ROOT_FS_ID(frame, local); if (local->op_ret < 0) { shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, local->op_errno); } else { if (xdata) local->xattr_rsp = dict_ref(xdata); vec.iov_base = local->iobuf->ptr; vec.iov_len = local->total_size; SHARD_STACK_UNWIND(readv, frame, local->total_size, local->op_errno, &vec, 1, &local->prebuf, local->iobref, local->xattr_rsp); return 
0; } } return 0; } int shard_readv_do(call_frame_t *frame, xlator_t *this) { int i = 0; int call_count = 0; int last_block = 0; int cur_block = 0; off_t orig_offset = 0; off_t shard_offset = 0; size_t read_size = 0; size_t remaining_size = 0; fd_t *fd = NULL; fd_t *anon_fd = NULL; shard_local_t *local = NULL; gf_boolean_t wind_failed = _gf_false; local = frame->local; fd = local->fd; orig_offset = local->offset; cur_block = local->first_block; last_block = local->last_block; remaining_size = local->total_size; local->call_count = call_count = local->num_blocks; SHARD_SET_ROOT_FS_ID(frame, local); if (fd->flags & O_DIRECT) local->flags = O_DIRECT; while (cur_block <= last_block) { if (wind_failed) { shard_readv_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, NULL, 0, NULL, NULL, NULL); goto next; } shard_offset = orig_offset % local->block_size; read_size = local->block_size - shard_offset; if (read_size > remaining_size) read_size = remaining_size; remaining_size -= read_size; if (cur_block == 0) { anon_fd = fd_ref(fd); } else { anon_fd = fd_anonymous(local->inode_list[i]); if (!anon_fd) { local->op_ret = -1; local->op_errno = ENOMEM; wind_failed = _gf_true; shard_readv_do_cbk(frame, (void *)(long)anon_fd, this, -1, ENOMEM, NULL, 0, NULL, NULL, NULL); goto next; } } STACK_WIND_COOKIE(frame, shard_readv_do_cbk, anon_fd, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, anon_fd, read_size, shard_offset, local->flags, local->xattr_req); orig_offset += read_size; next: cur_block++; i++; call_count--; } return 0; } int shard_common_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { int shard_block_num = (long)cookie; int call_count = 0; shard_local_t *local = NULL; local = frame->local; if (op_ret < 0) { if (op_errno == EEXIST) { LOCK(&frame->lock); { local->eexist_count++; } UNLOCK(&frame->lock); } else { local->op_ret = 
op_ret;
            local->op_errno = op_errno;
        }
        gf_msg_debug(this->name, 0,
                     "mknod of shard %d "
                     "failed: %s",
                     shard_block_num, strerror(op_errno));
        goto done;
    }

    /* Success: associate the new inode with its block in local. */
    shard_link_block_inode(local, shard_block_num, inode, buf);

done:
    /* The reply that brings the outstanding count to zero hands control to
     * the handler registered by shard_common_resume_mknod(). */
    call_count = shard_call_count_return(frame);
    if (call_count == 0) {
        SHARD_UNSET_ROOT_FS_ID(frame, local);
        local->create_count = 0;
        local->post_mknod_handler(frame, this);
    }

    return 0;
}

/*
 * Wind a mknod for every block in first_block..last_block that has no inode
 * in local->inode_list yet, expecting local->create_count replies in total.
 * post_mknod_handler is invoked once all replies are in (from
 * shard_common_mknod_cbk()), or directly from here when the base file's
 * inode ctx cannot be read.
 */
int
shard_common_resume_mknod(call_frame_t *frame, xlator_t *this,
                          shard_post_mknod_fop_handler_t post_mknod_handler)
{
    int i = 0;
    int shard_idx_iter = 0;
    int last_block = 0;
    int ret = 0;
    int call_count = 0;
    char path[PATH_MAX] = {
        0,
    };
    mode_t mode = 0;
    char *bname = NULL;
    shard_priv_t *priv = NULL;
    shard_inode_ctx_t ctx_tmp = {
        0,
    };
    shard_local_t *local = NULL;
    gf_boolean_t wind_failed = _gf_false;
    fd_t *fd = NULL;
    loc_t loc = {
        0,
    };
    dict_t *xattr_req = NULL;

    local = frame->local;
    priv = this->private;
    fd = local->fd;
    shard_idx_iter = local->first_block;
    last_block = local->last_block;
    call_count = local->call_count = local->create_count;
    local->post_mknod_handler = post_mknod_handler;

    SHARD_SET_ROOT_FS_ID(frame, local);

    ret = shard_inode_ctx_get_all(fd->inode, this, &ctx_tmp);
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
               "Failed to get inode "
               "ctx for %s",
               uuid_utoa(fd->inode->gfid));
        local->op_ret = -1;
        local->op_errno = ENOMEM;
        goto err;
    }
    /* Shards are created with the protection bits and type recorded for
     * the base file. */
    mode = st_mode_from_ia(ctx_tmp.stat.ia_prot, ctx_tmp.stat.ia_type);

    while (shard_idx_iter <= last_block) {
        /* Blocks that already have an inode need no mknod. */
        if (local->inode_list[i]) {
            shard_idx_iter++;
            i++;
            continue;
        }

        /* After the first local failure the remaining blocks are reported
         * to the callback as failed instead of being wound. */
        if (wind_failed) {
            shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this,
                                   -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
            goto next;
        }

        shard_make_block_abspath(shard_idx_iter, fd->inode->gfid, path,
                                 sizeof(path));

        xattr_req = shard_create_gfid_dict(local->xattr_req);
        if (!xattr_req) {
            local->op_ret = -1;
            local->op_errno = ENOMEM;
            wind_failed = _gf_true;
            shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this,
-1, ENOMEM, NULL, NULL, NULL, NULL, NULL); goto next; } bname = strrchr(path, '/') + 1; loc.inode = inode_new(this->itable); loc.parent = inode_ref(priv->dot_shard_inode); ret = inode_path(loc.parent, bname, (char **)&(loc.path)); if (ret < 0 || !(loc.inode)) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, "Inode path failed" "on %s, base file gfid = %s", bname, uuid_utoa(fd->inode->gfid)); local->op_ret = -1; local->op_errno = ENOMEM; wind_failed = _gf_true; loc_wipe(&loc); dict_unref(xattr_req); shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); goto next; } loc.name = strrchr(loc.path, '/'); if (loc.name) loc.name++; STACK_WIND_COOKIE(frame, shard_common_mknod_cbk, (void *)(long)shard_idx_iter, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, &loc, mode, ctx_tmp.stat.ia_rdev, 0, xattr_req); loc_wipe(&loc); dict_unref(xattr_req); next: shard_idx_iter++; i++; if (!--call_count) break; } return 0; err: /* * This block is for handling failure in shard_inode_ctx_get_all(). * Failures in the while-loop are handled within the loop. 
*/ SHARD_UNSET_ROOT_FS_ID(frame, local); post_mknod_handler(frame, this); return 0; } int shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this); int shard_post_lookup_shards_readv_handler(call_frame_t *frame, xlator_t *this) { shard_local_t *local = NULL; local = frame->local; if (local->op_ret < 0) { shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, local->op_errno); return 0; } if (local->create_count) { shard_common_resume_mknod(frame, this, shard_post_mknod_readv_handler); } else { shard_readv_do(frame, this); } return 0; } int shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this) { shard_local_t *local = NULL; local = frame->local; if (local->op_ret < 0) { shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, local->op_errno); return 0; } if (!local->eexist_count) { shard_readv_do(frame, this); } else { local->call_count = local->eexist_count; shard_common_lookup_shards(frame, this, local->loc.inode, shard_post_lookup_shards_readv_handler); } return 0; } int shard_post_resolve_readv_handler(call_frame_t *frame, xlator_t *this) { shard_local_t *local = NULL; local = frame->local; if (local->op_ret < 0) { if (local->op_errno != ENOENT) { shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, local->op_errno); return 0; } else { struct iovec vec = { 0, }; vec.iov_base = local->iobuf->ptr; vec.iov_len = local->total_size; SHARD_STACK_UNWIND(readv, frame, local->total_size, 0, &vec, 1, &local->prebuf, local->iobref, NULL); return 0; } } if (local->call_count) { shard_common_lookup_shards(frame, this, local->resolver_base_inode, shard_post_lookup_shards_readv_handler); } else { shard_readv_do(frame, this); } return 0; } int shard_post_lookup_readv_handler(call_frame_t *frame, xlator_t *this) { int ret = 0; struct iobuf *iobuf = NULL; shard_local_t *local = NULL; shard_priv_t *priv = NULL; priv = this->private; local = frame->local; if (local->op_ret < 0) { shard_common_failure_unwind(GF_FOP_READ, frame, 
local->op_ret, local->op_errno); return 0; } if (local->offset >= local->prebuf.ia_size) { /* If the read is being performed past the end of the file, * unwind the FOP with 0 bytes read as status. */ struct iovec vec = { 0, }; iobuf = iobuf_get2(this->ctx->iobuf_pool, local->req_size); if (!iobuf) goto err; vec.iov_base = iobuf->ptr; vec.iov_len = 0; local->iobref = iobref_new(); iobref_add(local->iobref, iobuf); iobuf_unref(iobuf); SHARD_STACK_UNWIND(readv, frame, 0, 0, &vec, 1, &local->prebuf, local->iobref, NULL); return 0; } local->first_block = get_lowest_block(local->offset, local->block_size); local->total_size = local->req_size; local->last_block = get_highest_block(local->offset, local->total_size, local->block_size); local->num_blocks = local->last_block - local->first_block + 1; local->resolver_base_inode = local->loc.inode; local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), gf_shard_mt_inode_list); if (!local->inode_list) goto err; iobuf = iobuf_get2(this->ctx->iobuf_pool, local->total_size); if (!iobuf) goto err; local->iobref = iobref_new(); if (!local->iobref) { iobuf_unref(iobuf); goto err; } if (iobref_add(local->iobref, iobuf) != 0) { iobuf_unref(iobuf); goto err; } iobuf_unref(iobuf); local->iobuf = iobuf; memset(iobuf->ptr, 0, local->total_size); local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); if (!local->dot_shard_loc.inode) { ret = shard_init_internal_dir_loc(this, local, SHARD_INTERNAL_DIR_DOT_SHARD); if (ret) goto err; shard_lookup_internal_dir(frame, this, shard_post_resolve_readv_handler, SHARD_INTERNAL_DIR_DOT_SHARD); } else { local->post_res_handler = shard_post_resolve_readv_handler; shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); } return 0; err: shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); return 0; } int shard_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) { int ret = 0; uint64_t 
block_size = 0; shard_local_t *local = NULL; ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, "Failed to get block " "size for %s from its inode ctx", uuid_utoa(fd->inode->gfid)); goto err; } if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { /* block_size = 0 means that the file was created before * sharding was enabled on the volume. */ STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); return 0; } if (!this->itable) this->itable = fd->inode->table; local = mem_get0(this->local_pool); if (!local) goto err; frame->local = local; ret = syncbarrier_init(&local->barrier); if (ret) goto err; local->fd = fd_ref(fd); local->block_size = block_size; local->offset = offset; local->req_size = size; local->flags = flags; local->fop = GF_FOP_READ; local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); if (!local->xattr_req) goto err; local->loc.inode = inode_ref(fd->inode); gf_uuid_copy(local->loc.gfid, fd->inode->gfid); shard_lookup_base_file(frame, this, &local->loc, shard_post_lookup_readv_handler); return 0; err: shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); return 0; } int shard_common_inode_write_post_update_size_handler(call_frame_t *frame, xlator_t *this) { shard_local_t *local = NULL; local = frame->local; if (local->op_ret < 0) { shard_common_failure_unwind(local->fop, frame, local->op_ret, local->op_errno); } else { shard_common_inode_write_success_unwind(local->fop, frame, local->written_size); } return 0; } static gf_boolean_t shard_is_appending_write(shard_local_t *local) { if (local->fop != GF_FOP_WRITE) return _gf_false; if (local->flags & O_APPEND) return _gf_true; if (local->fd->flags & O_APPEND) return _gf_true; return _gf_false; } int __shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, xlator_t *this) { int ret = -1; uint64_t ctx_uint = 
0; shard_inode_ctx_t *ctx = NULL; ret = __inode_ctx_get(inode, this, &ctx_uint); if (ret < 0) return ret; ctx = (shard_inode_ctx_t *)ctx_uint; if (shard_is_appending_write(local)) { local->delta_size = local->total_size; } else if (local->offset + local->total_size > ctx->stat.ia_size) { local->delta_size = (local->offset + local->total_size) - ctx->stat.ia_size; } else { local->delta_size = 0; } ctx->stat.ia_size += (local->delta_size); local->postbuf = ctx->stat; return 0; } int shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, xlator_t *this) { int ret = -1; LOCK(&inode->lock); { ret = __shard_get_delta_size_from_inode_ctx(local, inode, this); } UNLOCK(&inode->lock); return ret; } int shard_common_inode_write_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *pre, struct iatt *post, dict_t *xdata) { int call_count = 0; fd_t *anon_fd = cookie; shard_local_t *local = NULL; glusterfs_fop_t fop = 0; local = frame->local; fop = local->fop; LOCK(&frame->lock); { if (op_ret < 0) { local->op_ret = op_ret; local->op_errno = op_errno; } else { local->written_size += op_ret; local->delta_blocks += (post->ia_blocks - pre->ia_blocks); local->delta_size += (post->ia_size - pre->ia_size); shard_inode_ctx_set(local->fd->inode, this, post, 0, SHARD_MASK_TIMES); if (local->fd->inode != anon_fd->inode) shard_inode_ctx_add_to_fsync_list(local->fd->inode, this, anon_fd->inode); } } UNLOCK(&frame->lock); if (anon_fd) fd_unref(anon_fd); call_count = shard_call_count_return(frame); if (call_count == 0) { SHARD_UNSET_ROOT_FS_ID(frame, local); if (local->op_ret < 0) { shard_common_failure_unwind(fop, frame, local->op_ret, local->op_errno); } else { shard_get_delta_size_from_inode_ctx(local, local->fd->inode, this); local->hole_size = 0; if (xdata) local->xattr_rsp = dict_ref(xdata); shard_update_file_size( frame, this, local->fd, NULL, shard_common_inode_write_post_update_size_handler); } } return 0; } int 
shard_common_inode_write_wind(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vec, int count, off_t shard_offset, size_t size) { shard_local_t *local = NULL; local = frame->local; switch (local->fop) { case GF_FOP_WRITE: STACK_WIND_COOKIE( frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, fd, vec, count, shard_offset, local->flags, local->iobref, local->xattr_req); break; case GF_FOP_FALLOCATE: STACK_WIND_COOKIE( frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fallocate, fd, local->flags, shard_offset, size, local->xattr_req); break; case GF_FOP_ZEROFILL: STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), FIRST_CHILD(this)->fops->zerofill, fd, shard_offset, size, local->xattr_req); break; case GF_FOP_DISCARD: STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard, fd, shard_offset, size, local->xattr_req); break; default: gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, "Invalid fop id = %d", local->fop); break; } return 0; } int shard_common_inode_write_do(call_frame_t *frame, xlator_t *this) { int i = 0; int count = 0; int call_count = 0; int last_block = 0; uint32_t cur_block = 0; fd_t *fd = NULL; fd_t *anon_fd = NULL; shard_local_t *local = NULL; struct iovec *vec = NULL; gf_boolean_t wind_failed = _gf_false; gf_boolean_t odirect = _gf_false; off_t orig_offset = 0; off_t shard_offset = 0; off_t vec_offset = 0; size_t remaining_size = 0; size_t shard_write_size = 0; local = frame->local; fd = local->fd; orig_offset = local->offset; remaining_size = local->total_size; cur_block = local->first_block; local->call_count = call_count = local->num_blocks; last_block = local->last_block; SHARD_SET_ROOT_FS_ID(frame, local); if (dict_set_uint32(local->xattr_req, GLUSTERFS_WRITE_UPDATE_ATOMIC, 4)) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, "Failed to set 
" GLUSTERFS_WRITE_UPDATE_ATOMIC " into " "dict: %s", uuid_utoa(fd->inode->gfid)); local->op_ret = -1; local->op_errno = ENOMEM; local->call_count = 1; shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, NULL, NULL, NULL); return 0; } if ((fd->flags & O_DIRECT) && (local->fop == GF_FOP_WRITE)) odirect = _gf_true; while (cur_block <= last_block) { if (wind_failed) { shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, NULL, NULL, NULL); goto next; } shard_offset = orig_offset % local->block_size; shard_write_size = local->block_size - shard_offset; if (shard_write_size > remaining_size) shard_write_size = remaining_size; remaining_size -= shard_write_size; if (local->fop == GF_FOP_WRITE) { count = iov_subset(local->vector, local->count, vec_offset, vec_offset + shard_write_size, NULL); vec = GF_CALLOC(count, sizeof(struct iovec), gf_shard_mt_iovec); if (!vec) { local->op_ret = -1; local->op_errno = ENOMEM; wind_failed = _gf_true; GF_FREE(vec); shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, NULL, NULL, NULL); goto next; } count = iov_subset(local->vector, local->count, vec_offset, vec_offset + shard_write_size, vec); } if (cur_block == 0) { anon_fd = fd_ref(fd); } else { anon_fd = fd_anonymous(local->inode_list[i]); if (!anon_fd) { local->op_ret = -1; local->op_errno = ENOMEM; wind_failed = _gf_true; GF_FREE(vec); shard_common_inode_write_do_cbk(frame, (void *)(long)anon_fd, this, -1, ENOMEM, NULL, NULL, NULL); goto next; } if (local->fop == GF_FOP_WRITE) { if (odirect) local->flags = O_DIRECT; else local->flags = GF_ANON_FD_FLAGS; } } shard_common_inode_write_wind(frame, this, anon_fd, vec, count, shard_offset, shard_write_size); if (vec) vec_offset += shard_write_size; orig_offset += shard_write_size; GF_FREE(vec); vec = NULL; next: cur_block++; i++; call_count--; } return 0; } int shard_common_inode_write_post_mknod_handler(call_frame_t *frame, xlator_t *this); int 
shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame,
                                                    xlator_t *this)
{
    /* Inode-write continuation after the shard lookups: unwind on failure,
     * create the shards still counted in create_count, or start writing. */
    shard_local_t *local = frame->local;

    if (local->op_ret < 0) {
        shard_common_failure_unwind(local->fop, frame, local->op_ret,
                                    local->op_errno);
        return 0;
    }

    if (!local->create_count) {
        shard_common_inode_write_do(frame, this);
        return 0;
    }

    shard_common_resume_mknod(frame, this,
                              shard_common_inode_write_post_mknod_handler);
    return 0;
}

/*
 * Inode-write continuation after the mknod round: unwind on failure; look up
 * the shards whose mknod returned EEXIST, if any; otherwise start writing.
 */
int
shard_common_inode_write_post_mknod_handler(call_frame_t *frame,
                                            xlator_t *this)
{
    shard_local_t *local = frame->local;

    if (local->op_ret < 0) {
        shard_common_failure_unwind(local->fop, frame, local->op_ret,
                                    local->op_errno);
        return 0;
    }

    if (local->eexist_count) {
        local->call_count = local->eexist_count;
        shard_common_lookup_shards(
            frame, this, local->loc.inode,
            shard_common_inode_write_post_lookup_shards_handler);
        return 0;
    }

    shard_common_inode_write_do(frame, this);
    return 0;
}

/*
 * Inode-write continuation after the shard directory has been resolved:
 * unwind on failure, look the shards up when call_count is non-zero,
 * otherwise start writing.
 */
int
shard_common_inode_write_post_resolve_handler(call_frame_t *frame,
                                              xlator_t *this)
{
    shard_local_t *local = frame->local;

    if (local->op_ret < 0) {
        shard_common_failure_unwind(local->fop, frame, local->op_ret,
                                    local->op_errno);
        return 0;
    }

    if (local->call_count) {
        shard_common_lookup_shards(
            frame, this, local->resolver_base_inode,
            shard_common_inode_write_post_lookup_shards_handler);
        return 0;
    }

    shard_common_inode_write_do(frame, this);
    return 0;
}

/*
 * Inode-write continuation after the base file lookup: determines the block
 * range touched by the request, allocates the inode list and resolves (or
 * creates) the shard directory.
 */
int
shard_common_inode_write_post_lookup_handler(call_frame_t *frame,
                                             xlator_t *this)
{
    shard_local_t *local = frame->local;
    shard_priv_t *priv = this->private;

    if (local->op_ret < 0) {
        shard_common_failure_unwind(local->fop, frame, local->op_ret,
                                    local->op_errno);
        return 0;
    }

    local->postbuf = local->prebuf;

    /*Adjust offset to EOF so that correct shard is chosen for append*/
    if (shard_is_appending_write(local))
        local->offset = local->prebuf.ia_size;

    local->first_block = get_lowest_block(local->offset, local->block_size);
    local->last_block = get_highest_block(local->offset,
local->total_size, local->block_size); local->num_blocks = local->last_block - local->first_block + 1; local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), gf_shard_mt_inode_list); if (!local->inode_list) { shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); return 0; } gf_msg_trace(this->name, 0, "%s: gfid=%s first_block=%" PRIu32 " " "last_block=%" PRIu32 " num_blocks=%" PRIu32 " offset=%" PRId64 " total_size=%zu flags=%" PRId32 "", gf_fop_list[local->fop], uuid_utoa(local->resolver_base_inode->gfid), local->first_block, local->last_block, local->num_blocks, local->offset, local->total_size, local->flags); local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); if (!local->dot_shard_loc.inode) { /*change handler*/ shard_mkdir_internal_dir(frame, this, shard_common_inode_write_post_resolve_handler, SHARD_INTERNAL_DIR_DOT_SHARD); } else { /*change handler*/ local->post_res_handler = shard_common_inode_write_post_resolve_handler; shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); } return 0; } int shard_mkdir_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { inode_t *link_inode = NULL; shard_local_t *local = NULL; shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; local = frame->local; SHARD_UNSET_ROOT_FS_ID(frame, local); if (op_ret == -1) { if (op_errno != EEXIST) { local->op_ret = op_ret; local->op_errno = op_errno; goto unwind; } else { gf_msg_debug(this->name, 0, "mkdir on %s failed " "with EEXIST. 
Attempting lookup now", shard_internal_dir_string(type)); shard_lookup_internal_dir(frame, this, local->post_res_handler, type); return 0; } } link_inode = shard_link_internal_dir_inode(local, inode, buf, type); if (link_inode != inode) { shard_refresh_internal_dir(frame, this, type); } else { shard_inode_ctx_mark_dir_refreshed(link_inode, this); shard_common_resolve_shards(frame, this, local->post_res_handler); } return 0; unwind: shard_common_resolve_shards(frame, this, local->post_res_handler); return 0; } int shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, shard_post_resolve_fop_handler_t handler, shard_internal_dir_type_t type) { int ret = -1; shard_local_t *local = NULL; shard_priv_t *priv = NULL; dict_t *xattr_req = NULL; uuid_t *gfid = NULL; loc_t *loc = NULL; gf_boolean_t free_gfid = _gf_true; local = frame->local; priv = this->private; local->post_res_handler = handler; gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); if (!gfid) goto err; switch (type) { case SHARD_INTERNAL_DIR_DOT_SHARD: gf_uuid_copy(*gfid, priv->dot_shard_gfid); loc = &local->dot_shard_loc; break; case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); loc = &local->dot_shard_rm_loc; break; default: bzero(*gfid, sizeof(uuid_t)); break; } xattr_req = dict_new(); if (!xattr_req) goto err; ret = shard_init_internal_dir_loc(this, local, type); if (ret) goto err; ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, "Failed to set gfid-req for %s", shard_internal_dir_string(type)); goto err; } else { free_gfid = _gf_false; } SHARD_SET_ROOT_FS_ID(frame, local); STACK_WIND_COOKIE(frame, shard_mkdir_internal_dir_cbk, (void *)(long)type, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc, 0755, 0, xattr_req); dict_unref(xattr_req); return 0; err: if (xattr_req) dict_unref(xattr_req); local->op_ret = -1; local->op_errno = ENOMEM; if (free_gfid) GF_FREE(gfid); 
handler(frame, this); return 0; } int shard_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { /* To-Do: Wind flush on all shards of the file */ SHARD_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata); return 0; } int shard_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { STACK_WIND(frame, shard_flush_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->flush, fd, xdata); return 0; } int __shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, xlator_t *this) { int ret = -1; uint64_t ctx_uint = 0; shard_inode_ctx_t *ctx = NULL; ret = __inode_ctx_get(inode, this, &ctx_uint); if (ret < 0) return ret; ctx = (shard_inode_ctx_t *)ctx_uint; local->postbuf.ia_ctime = ctx->stat.ia_ctime; local->postbuf.ia_ctime_nsec = ctx->stat.ia_ctime_nsec; local->postbuf.ia_atime = ctx->stat.ia_atime; local->postbuf.ia_atime_nsec = ctx->stat.ia_atime_nsec; local->postbuf.ia_mtime = ctx->stat.ia_mtime; local->postbuf.ia_mtime_nsec = ctx->stat.ia_mtime_nsec; return 0; } int shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, xlator_t *this) { int ret = 0; LOCK(&inode->lock); { ret = __shard_get_timestamps_from_inode_ctx(local, inode, this); } UNLOCK(&inode->lock); return ret; } int shard_fsync_shards_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { int call_count = 0; uint64_t fsync_count = 0; fd_t *anon_fd = cookie; shard_local_t *local = NULL; shard_inode_ctx_t *ctx = NULL; shard_inode_ctx_t *base_ictx = NULL; inode_t *base_inode = NULL; gf_boolean_t unref_shard_inode = _gf_false; local = frame->local; base_inode = local->fd->inode; if (local->op_ret < 0) goto out; LOCK(&frame->lock); { if (op_ret < 0) { local->op_ret = op_ret; local->op_errno = op_errno; UNLOCK(&frame->lock); goto out; } shard_inode_ctx_set(local->fd->inode, this, postbuf, 0, SHARD_MASK_TIMES); } 
UNLOCK(&frame->lock); fd_ctx_get(anon_fd, this, &fsync_count); out: if (anon_fd && (base_inode != anon_fd->inode)) { LOCK(&base_inode->lock); LOCK(&anon_fd->inode->lock); { __shard_inode_ctx_get(anon_fd->inode, this, &ctx); __shard_inode_ctx_get(base_inode, this, &base_ictx); if (op_ret == 0) ctx->fsync_needed -= fsync_count; GF_ASSERT(ctx->fsync_needed >= 0); if (ctx->fsync_needed != 0) { list_add_tail(&ctx->to_fsync_list, &base_ictx->to_fsync_list); base_ictx->fsync_count++; } else { unref_shard_inode = _gf_true; } } UNLOCK(&anon_fd->inode->lock); UNLOCK(&base_inode->lock); } if (unref_shard_inode) inode_unref(anon_fd->inode); if (anon_fd) fd_unref(anon_fd); call_count = shard_call_count_return(frame); if (call_count != 0) return 0; if (local->op_ret < 0) { shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, local->op_errno); } else { shard_get_timestamps_from_inode_ctx(local, base_inode, this); SHARD_STACK_UNWIND(fsync, frame, local->op_ret, local->op_errno, &local->prebuf, &local->postbuf, local->xattr_rsp); } return 0; } int shard_post_lookup_fsync_handler(call_frame_t *frame, xlator_t *this) { int ret = 0; int call_count = 0; int fsync_count = 0; fd_t *anon_fd = NULL; inode_t *base_inode = NULL; shard_local_t *local = NULL; shard_inode_ctx_t *ctx = NULL; shard_inode_ctx_t *iter = NULL; struct list_head copy = { 0, }; shard_inode_ctx_t *tmp = NULL; local = frame->local; base_inode = local->fd->inode; local->postbuf = local->prebuf; INIT_LIST_HEAD(©); if (local->op_ret < 0) { shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, local->op_errno); return 0; } LOCK(&base_inode->lock); { __shard_inode_ctx_get(base_inode, this, &ctx); list_splice_init(&ctx->to_fsync_list, ©); call_count = ctx->fsync_count; ctx->fsync_count = 0; } UNLOCK(&base_inode->lock); local->call_count = ++call_count; /* Send fsync() on the base shard first */ anon_fd = fd_ref(local->fd); STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, FIRST_CHILD(this), 
FIRST_CHILD(this)->fops->fsync, anon_fd, local->datasync, local->xattr_req); call_count--; anon_fd = NULL; list_for_each_entry_safe(iter, tmp, ©, to_fsync_list) { list_del_init(&iter->to_fsync_list); fsync_count = 0; shard_inode_ctx_get_fsync_count(iter->inode, this, &fsync_count); GF_ASSERT(fsync_count > 0); anon_fd = fd_anonymous(iter->inode); if (!anon_fd) { local->op_ret = -1; local->op_errno = ENOMEM; gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, "Failed to create " "anon fd to fsync shard"); shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, ENOMEM, NULL, NULL, NULL); continue; } ret = fd_ctx_set(anon_fd, this, fsync_count); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_FD_CTX_SET_FAILED, "Failed to set fd " "ctx for shard inode gfid=%s", uuid_utoa(iter->inode->gfid)); local->op_ret = -1; local->op_errno = ENOMEM; shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, ENOMEM, NULL, NULL, NULL); continue; } STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, anon_fd, local->datasync, local->xattr_req); call_count--; } return 0; } int shard_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, dict_t *xdata) { int ret = 0; uint64_t block_size = 0; shard_local_t *local = NULL; ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, "Failed to get block " "size for %s from its inode ctx", uuid_utoa(fd->inode->gfid)); goto err; } if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); return 0; } if (!this->itable) this->itable = fd->inode->table; local = mem_get0(this->local_pool); if (!local) goto err; frame->local = local; local->fd = fd_ref(fd); local->fop = GF_FOP_FSYNC; local->datasync = datasync; local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); if (!local->xattr_req) goto err; local->loc.inode = inode_ref(fd->inode); gf_uuid_copy(local->loc.gfid, fd->inode->gfid); shard_lookup_base_file(frame, this, &local->loc, shard_post_lookup_fsync_handler); return 0; err: shard_common_failure_unwind(GF_FOP_FSYNC, frame, -1, ENOMEM); return 0; } int shard_readdir_past_dot_shard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *orig_entries, dict_t *xdata) { gf_dirent_t *entry = NULL; gf_dirent_t *tmp = NULL; shard_local_t *local = NULL; local = frame->local; if (op_ret < 0) goto unwind; list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) { list_del_init(&entry->list); list_add_tail(&entry->list, &local->entries_head.list); if (!entry->dict) continue; if (IA_ISDIR(entry->d_stat.ia_type)) continue; if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE)) shard_modify_size_and_block_count(&entry->d_stat, entry->dict); if (!entry->inode) continue; shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); } local->op_ret += op_ret; unwind: if (local->fop == GF_FOP_READDIR) SHARD_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, &local->entries_head, xdata); else SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &local->entries_head, xdata); return 0; } int32_t shard_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *orig_entries, dict_t *xdata) { fd_t *fd = NULL; gf_dirent_t *entry = NULL; gf_dirent_t *tmp = NULL; shard_local_t *local = NULL; gf_boolean_t last_entry = _gf_false; local = frame->local; fd = local->fd; if (op_ret < 0) goto unwind; list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) { if (last_entry) last_entry = _gf_false; if (__is_root_gfid(fd->inode->gfid) && !(strcmp(entry->d_name, GF_SHARD_DIR))) { local->offset = entry->d_off; op_ret--; last_entry = _gf_true; continue; } list_del_init(&entry->list); 
list_add_tail(&entry->list, &local->entries_head.list);

        if (!entry->dict)
            continue;

        if (IA_ISDIR(entry->d_stat.ia_type))
            continue;

        /* gsyncd gets the stat as returned by the child. */
        if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE) &&
            frame->root->pid != GF_CLIENT_PID_GSYNCD)
            shard_modify_size_and_block_count(&entry->d_stat, entry->dict);

        if (!entry->inode)
            continue;

        shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat);
    }

    local->op_ret = op_ret;

    /* The hidden shard directory was the final entry of this reply: ask the
     * child for the entries that follow it before answering the caller. */
    if (last_entry) {
        if (local->fop == GF_FOP_READDIR)
            STACK_WIND(frame, shard_readdir_past_dot_shard_cbk,
                       FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir,
                       local->fd, local->readdir_size, local->offset,
                       local->xattr_req);
        else
            STACK_WIND(frame, shard_readdir_past_dot_shard_cbk,
                       FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp,
                       local->fd, local->readdir_size, local->offset,
                       local->xattr_req);
        return 0;
    }

unwind:
    if (local->fop == GF_FOP_READDIR)
        SHARD_STACK_UNWIND(readdir, frame, op_ret, op_errno,
                           &local->entries_head, xdata);
    else
        SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno,
                           &local->entries_head, xdata);
    return 0;
}

/*
 * Common implementation of readdir and readdirp.
 *
 * @whichop is GF_FOP_READDIR or GF_FOP_READDIRP and is saved in local->fop
 * so that the callbacks unwind the matching fop. For readdir the caller's
 * @xdata is wound as is; for readdirp a request dict is prepared in
 * local->xattr_req first.
 *
 * Always returns 0.
 */
int
shard_readdir_do(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
                 off_t offset, int whichop, dict_t *xdata)
{
    int ret = 0;
    shard_local_t *local = NULL;

    local = mem_get0(this->local_pool);
    if (!local) {
        goto err;
    }

    frame->local = local;

    local->fd = fd_ref(fd);
    local->fop = whichop;
    local->readdir_size = size;
    INIT_LIST_HEAD(&local->entries_head.list);
    local->list_inited = _gf_true;

    if (whichop == GF_FOP_READDIR) {
        STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata);
    } else {
        local->xattr_req = (xdata) ?
dict_ref(xdata) : dict_new(); SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, local, err); ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); if (ret) { gf_log(this->name, GF_LOG_WARNING, "Failed to set " "dict value: key:%s, directory gfid=%s", GF_XATTR_SHARD_BLOCK_SIZE, uuid_utoa(fd->inode->gfid)); goto err; } STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, fd, size, offset, local->xattr_req); } return 0; err: STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); return 0; } int32_t shard_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, dict_t *xdata) { shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); return 0; } int32_t shard_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, dict_t *xdata) { shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIRP, xdata); return 0; } int32_t shard_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { int op_errno = EINVAL; if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { GF_IF_NATIVE_XATTR_GOTO(SHARD_XATTR_PREFIX "*", name, op_errno, out); } if (xdata && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { dict_del(xdata, GF_XATTR_SHARD_BLOCK_SIZE); dict_del(xdata, GF_XATTR_SHARD_FILE_SIZE); } STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); return 0; out: shard_common_failure_unwind(GF_FOP_REMOVEXATTR, frame, -1, op_errno); return 0; } int32_t shard_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { int op_errno = EINVAL; if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { GF_IF_NATIVE_XATTR_GOTO(SHARD_XATTR_PREFIX "*", name, op_errno, out); } if (xdata && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { dict_del(xdata, GF_XATTR_SHARD_BLOCK_SIZE); dict_del(xdata, GF_XATTR_SHARD_FILE_SIZE); } STACK_WIND_TAIL(frame, 
FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); return 0; out: shard_common_failure_unwind(GF_FOP_FREMOVEXATTR, frame, -1, op_errno); return 0; } int32_t shard_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { if (op_ret < 0) goto unwind; if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); } unwind: SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata); return 0; } int32_t shard_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { int op_errno = EINVAL; if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && (!strncmp(name, SHARD_XATTR_PREFIX, SLEN(SHARD_XATTR_PREFIX)))) { op_errno = ENODATA; goto out; } STACK_WIND(frame, shard_fgetxattr_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); return 0; out: shard_common_failure_unwind(GF_FOP_FGETXATTR, frame, -1, op_errno); return 0; } int32_t shard_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { if (op_ret < 0) goto unwind; if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); } unwind: SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } int32_t shard_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { int op_errno = EINVAL; if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && (!strncmp(name, SHARD_XATTR_PREFIX, sizeof(SHARD_XATTR_PREFIX) - 1))) { op_errno = ENODATA; goto out; } STACK_WIND(frame, shard_getxattr_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); return 0; out: shard_common_failure_unwind(GF_FOP_GETXATTR, frame, -1, op_errno); return 0; } int32_t 
shard_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
                int32_t flags, dict_t *xdata)
{
    /*
     * fsetxattr fop: for callers other than gsyncd, @dict is checked against
     * the shard xattr prefix (the macro jumps to "out" on a match; it is
     * defined outside this file). Otherwise the call is passed to the first
     * child. Always returns 0.
     */
    int op_errno = EINVAL;

    if (frame->root->pid != GF_CLIENT_PID_GSYNCD) {
        GF_IF_INTERNAL_XATTR_GOTO(SHARD_XATTR_PREFIX "*", dict, op_errno, out);
    }

    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
                    FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
    return 0;
out:
    shard_common_failure_unwind(GF_FOP_FSETXATTR, frame, -1, op_errno);
    return 0;
}

/*
 * setxattr fop: path-based counterpart of shard_fsetxattr() with the same
 * check on @dict for non-gsyncd callers. Always returns 0.
 */
int32_t
shard_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
               int32_t flags, dict_t *xdata)
{
    int op_errno = EINVAL;

    if (frame->root->pid != GF_CLIENT_PID_GSYNCD) {
        GF_IF_INTERNAL_XATTR_GOTO(SHARD_XATTR_PREFIX "*", dict, op_errno, out);
    }

    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr,
                    loc, dict, flags, xdata);
    return 0;
out:
    shard_common_failure_unwind(GF_FOP_SETXATTR, frame, -1, op_errno);
    return 0;
}

/*
 * Final step of setattr/fsetattr (installed as local->handler): on success
 * the inode ctx is refreshed from local->postbuf, then the fop recorded in
 * local->fop is unwound with local->prebuf/postbuf. Nothing is unwound for
 * any other local->fop value. Always returns 0.
 */
int
shard_post_setattr_handler(call_frame_t *frame, xlator_t *this)
{
    shard_local_t *local = NULL;

    local = frame->local;

    if (local->fop == GF_FOP_SETATTR) {
        if (local->op_ret >= 0)
            shard_inode_ctx_set(local->loc.inode, this, &local->postbuf, 0,
                                SHARD_LOOKUP_MASK);
        SHARD_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
                           &local->prebuf, &local->postbuf, local->xattr_rsp);
    } else if (local->fop == GF_FOP_FSETATTR) {
        if (local->op_ret >= 0)
            shard_inode_ctx_set(local->fd->inode, this, &local->postbuf, 0,
                                SHARD_LOOKUP_MASK);
        SHARD_STACK_UNWIND(fsetattr, frame, local->op_ret, local->op_errno,
                           &local->prebuf, &local->postbuf, local->xattr_rsp);
    }

    return 0;
}

/*
 * Callback shared by setattr and fsetattr.
 *
 * Records a failure in local, or saves prebuf/postbuf with the size and
 * block count of prebuf adjusted by shard_modify_size_and_block_count()
 * and copied into postbuf. In every case local->handler is called to
 * finish the fop. Always returns 0.
 */
int
shard_common_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                         int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
                         struct iatt *postbuf, dict_t *xdata)
{
    shard_local_t *local = NULL;

    local = frame->local;

    if (op_ret < 0) {
        local->op_ret = op_ret;
        local->op_errno = op_errno;
        goto unwind;
    }

    local->prebuf = *prebuf;
    if (shard_modify_size_and_block_count(&local->prebuf, xdata)) {
        local->op_ret =
-1;
        local->op_errno = EINVAL;
        goto unwind;
    }
    if (xdata)
        local->xattr_rsp = dict_ref(xdata);
    local->postbuf = *postbuf;
    /* setattr does not change the file size: report the same adjusted size
     * and block count before and after. */
    local->postbuf.ia_size = local->prebuf.ia_size;
    local->postbuf.ia_blocks = local->prebuf.ia_blocks;

unwind:
    local->handler(frame, this);
    return 0;
}

/*
 * setattr fop.
 *
 * Directories and symlinks, files with no shard block size in their inode
 * ctx, and requests from gsyncd are passed to the first child with the
 * default callback. Otherwise a shard_local_t is set up and the setattr is
 * wound with shard_common_setattr_cbk().
 *
 * Always returns 0; local failures are unwound with -1/ENOMEM.
 */
int
shard_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
              struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
    int ret = -1;
    uint64_t block_size = 0;
    shard_local_t *local = NULL;

    if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) {
        STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
        return 0;
    }

    ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size);
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
               "Failed to get block size from inode ctx of %s",
               uuid_utoa(loc->inode->gfid));
        goto err;
    }

    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
        STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
        return 0;
    }

    local = mem_get0(this->local_pool);
    if (!local)
        goto err;

    frame->local = local;

    local->handler = shard_post_setattr_handler;
    local->xattr_req = (xdata) ?
dict_ref(xdata) : dict_new();
    if (!local->xattr_req)
        goto err;
    local->fop = GF_FOP_SETATTR;
    loc_copy(&local->loc, loc);

    /* Macro defined outside this file; it takes the request dict, the gfid
     * and the label to jump to on failure. */
    SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid,
                                    local, err);

    STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid,
               local->xattr_req);
    return 0;
err:
    shard_common_failure_unwind(GF_FOP_SETATTR, frame, -1, ENOMEM);
    return 0;
}

/*
 * fsetattr fop: fd-based counterpart of shard_setattr().
 *
 * Directories and symlinks, files with no shard block size in their inode
 * ctx, and requests from gsyncd are passed to the first child with the
 * default callback. Otherwise a shard_local_t is set up and the fsetattr is
 * wound with shard_common_setattr_cbk().
 *
 * Always returns 0; local failures are unwound with -1/ENOMEM.
 */
int
shard_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
               struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
    int ret = -1;
    uint64_t block_size = 0;
    shard_local_t *local = NULL;

    if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) {
        STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
        return 0;
    }

    ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size);
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
               "Failed to get block size from inode ctx of %s",
               uuid_utoa(fd->inode->gfid));
        goto err;
    }

    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
        STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this),
                   FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
        return 0;
    }

    /* Remember the inode table the first time a sharded file is seen. */
    if (!this->itable)
        this->itable = fd->inode->table;

    local = mem_get0(this->local_pool);
    if (!local)
        goto err;

    frame->local = local;

    local->handler = shard_post_setattr_handler;
    local->xattr_req = (xdata) ?
dict_ref(xdata) : dict_new();
    if (!local->xattr_req)
        goto err;
    local->fop = GF_FOP_FSETATTR;
    local->fd = fd_ref(fd);

    /* Macro defined outside this file; it takes the request dict, the gfid
     * and the label to jump to on failure. */
    SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid,
                                    local, err);

    STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid,
               local->xattr_req);
    return 0;
err:
    shard_common_failure_unwind(GF_FOP_FSETATTR, frame, -1, ENOMEM);
    return 0;
}

/*
 * Common entry point for the fops that modify file data: writev, fallocate,
 * zerofill and discard (selected by @fop).
 *
 * @vector/@count/@iobref are used by writev only; @len is used by the other
 * three. @flags carries the write flags for writev and the mode for
 * fallocate.
 *
 * Files with no shard block size in their inode ctx, and requests from
 * gsyncd, are passed to the first child untouched. Otherwise the request is
 * saved in a shard_local_t and the base file is looked up.
 *
 * Always returns 0; local failures are unwound with -1/ENOMEM.
 */
int
shard_common_inode_write_begin(call_frame_t *frame, xlator_t *this,
                               glusterfs_fop_t fop, fd_t *fd,
                               struct iovec *vector, int32_t count,
                               off_t offset, uint32_t flags, size_t len,
                               struct iobref *iobref, dict_t *xdata)
{
    int ret = 0;
    int i = 0;
    uint64_t block_size = 0;
    shard_local_t *local = NULL;

    ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size);
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
               "Failed to get block "
               "size for %s from its inode ctx",
               uuid_utoa(fd->inode->gfid));
        goto out;
    }

    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
        /* block_size = 0 means that the file was created before
         * sharding was enabled on the volume.
*/
        switch (fop) {
            case GF_FOP_WRITE:
                STACK_WIND_TAIL(frame, FIRST_CHILD(this),
                                FIRST_CHILD(this)->fops->writev, fd, vector,
                                count, offset, flags, iobref, xdata);
                break;
            case GF_FOP_FALLOCATE:
                STACK_WIND_TAIL(frame, FIRST_CHILD(this),
                                FIRST_CHILD(this)->fops->fallocate, fd, flags,
                                offset, len, xdata);
                break;
            case GF_FOP_ZEROFILL:
                STACK_WIND_TAIL(frame, FIRST_CHILD(this),
                                FIRST_CHILD(this)->fops->zerofill, fd, offset,
                                len, xdata);
                break;
            case GF_FOP_DISCARD:
                STACK_WIND_TAIL(frame, FIRST_CHILD(this),
                                FIRST_CHILD(this)->fops->discard, fd, offset,
                                len, xdata);
                break;
            default:
                /* NOTE(review): the frame is neither wound nor unwound on
                 * this path; the callers in this file pass only the four
                 * fops handled above. */
                gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP,
                       "Invalid fop id = %d", fop);
                break;
        }
        return 0;
    }

    /* Remember the inode table the first time a sharded file is seen. */
    if (!this->itable)
        this->itable = fd->inode->table;

    local = mem_get0(this->local_pool);
    if (!local)
        goto out;

    frame->local = local;

    ret = syncbarrier_init(&local->barrier);
    if (ret)
        goto out;
    local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
    if (!local->xattr_req)
        goto out;

    if (vector) {
        /* writev: keep a private copy of the iovec and total its length. */
        local->vector = iov_dup(vector, count);
        if (!local->vector)
            goto out;
        for (i = 0; i < count; i++)
            local->total_size += vector[i].iov_len;
        local->count = count;
    } else {
        local->total_size = len;
    }

    local->fop = fop;
    local->offset = offset;
    local->flags = flags;
    if (iobref)
        local->iobref = iobref_ref(iobref);
    local->fd = fd_ref(fd);
    local->block_size = block_size;
    local->resolver_base_inode = local->fd->inode;

    local->loc.inode = inode_ref(fd->inode);
    gf_uuid_copy(local->loc.gfid, fd->inode->gfid);

    shard_lookup_base_file(frame, this, &local->loc,
                           shard_common_inode_write_post_lookup_handler);
    return 0;
out:
    shard_common_failure_unwind(fop, frame, -1, ENOMEM);
    return 0;
}

/*
 * writev fop: hands the request to shard_common_inode_write_begin() as
 * GF_FOP_WRITE (len is passed as 0; the size comes from @vector). Always
 * returns 0.
 */
int
shard_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
             struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
             struct iobref *iobref, dict_t *xdata)
{
    shard_common_inode_write_begin(frame, this, GF_FOP_WRITE, fd, vector, count,
                                   offset, flags, 0, iobref, xdata);
    return 0;
}

/*
 * fallocate fop: only three values of @keep_size are accepted (0,
 * FALLOC_FL_ZERO_RANGE, and FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE);
 * anything else is unwound with ENOTSUP. Accepted requests go to
 * shard_common_inode_write_begin() as GF_FOP_FALLOCATE. Always returns 0.
 */
int
shard_fallocate(call_frame_t *frame,
xlator_t *this, fd_t *fd, int32_t keep_size, off_t offset, size_t len, dict_t *xdata) { if ((keep_size != 0) && (keep_size != FALLOC_FL_ZERO_RANGE) && (keep_size != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))) goto out; shard_common_inode_write_begin(frame, this, GF_FOP_FALLOCATE, fd, NULL, 0, offset, keep_size, len, NULL, xdata); return 0; out: shard_common_failure_unwind(GF_FOP_FALLOCATE, frame, -1, ENOTSUP); return 0; } int shard_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, off_t len, dict_t *xdata) { shard_common_inode_write_begin(frame, this, GF_FOP_ZEROFILL, fd, NULL, 0, offset, 0, len, NULL, xdata); return 0; } int shard_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, size_t len, dict_t *xdata) { shard_common_inode_write_begin(frame, this, GF_FOP_DISCARD, fd, NULL, 0, offset, 0, len, NULL, xdata); return 0; } int32_t shard_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, gf_seek_what_t what, dict_t *xdata) { /* TBD */ gf_msg(this->name, GF_LOG_INFO, ENOTSUP, SHARD_MSG_FOP_NOT_SUPPORTED, "seek called on %s.", uuid_utoa(fd->inode->gfid)); shard_common_failure_unwind(GF_FOP_SEEK, frame, -1, ENOTSUP); return 0; } int32_t mem_acct_init(xlator_t *this) { int ret = -1; if (!this) return ret; ret = xlator_mem_acct_init(this, gf_shard_mt_end + 1); if (ret != 0) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_MEM_ACCT_INIT_FAILED, "Memory accounting init" "failed"); return ret; } return ret; } int init(xlator_t *this) { int ret = -1; shard_priv_t *priv = NULL; if (!this) { gf_msg("shard", GF_LOG_ERROR, 0, SHARD_MSG_NULL_THIS, "this is NULL. init() failed"); return -1; } if (!this->parents) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, "Dangling volume. Check volfile"); goto out; } if (!this->children || this->children->next) { gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, "shard not configured with exactly one sub-volume. 
" "Check volfile"); goto out; } priv = GF_CALLOC(1, sizeof(shard_priv_t), gf_shard_mt_priv_t); if (!priv) goto out; GF_OPTION_INIT("shard-block-size", priv->block_size, size_uint64, out); GF_OPTION_INIT("shard-deletion-rate", priv->deletion_rate, uint32, out); GF_OPTION_INIT("shard-lru-limit", priv->lru_limit, uint64, out); this->local_pool = mem_pool_new(shard_local_t, 128); if (!this->local_pool) { ret = -1; goto out; } gf_uuid_parse(SHARD_ROOT_GFID, priv->dot_shard_gfid); gf_uuid_parse(DOT_SHARD_REMOVE_ME_GFID, priv->dot_shard_rm_gfid); this->private = priv; LOCK_INIT(&priv->lock); INIT_LIST_HEAD(&priv->ilist_head); ret = 0; out: if (ret) { GF_FREE(priv); mem_pool_destroy(this->local_pool); } return ret; } void fini(xlator_t *this) { shard_priv_t *priv = NULL; GF_VALIDATE_OR_GOTO("shard", this, out); mem_pool_destroy(this->local_pool); this->local_pool = NULL; priv = this->private; if (!priv) goto out; this->private = NULL; LOCK_DESTROY(&priv->lock); GF_FREE(priv); out: return; } int reconfigure(xlator_t *this, dict_t *options) { int ret = -1; shard_priv_t *priv = NULL; priv = this->private; GF_OPTION_RECONF("shard-block-size", priv->block_size, options, size, out); GF_OPTION_RECONF("shard-deletion-rate", priv->deletion_rate, options, uint32, out); ret = 0; out: return ret; } int shard_forget(xlator_t *this, inode_t *inode) { uint64_t ctx_uint = 0; shard_inode_ctx_t *ctx = NULL; shard_priv_t *priv = NULL; priv = this->private; if (!priv) return 0; inode_ctx_del(inode, this, &ctx_uint); if (!ctx_uint) return 0; ctx = (shard_inode_ctx_t *)ctx_uint; /* When LRU limit reaches inode will be forcefully removed from the * table, inode needs to be removed from LRU of shard as well. 
*/
    if (!list_empty(&ctx->ilist)) {
        LOCK(&priv->lock);
        {
            list_del_init(&ctx->ilist);
            priv->inode_count--;
        }
        UNLOCK(&priv->lock);
    }
    GF_FREE(ctx);

    return 0;
}

/* release callback: nothing to do for file fds yet. Always returns 0. */
int
shard_release(xlator_t *this, fd_t *fd)
{
    /* TBD */
    return 0;
}

/*
 * Statedump callback for the private structure: writes a section named
 * "<type>.<name>" with the block size (human readable), the inode count,
 * the address of the LRU list head and the LRU limit. Always returns 0.
 */
int
shard_priv_dump(xlator_t *this)
{
    shard_priv_t *priv = NULL;
    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
        0,
    };
    char *str = NULL;

    priv = this->private;

    snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
    gf_proc_dump_add_section("%s", key_prefix);
    /* The returned string is released with GF_FREE() below. */
    str = gf_uint64_2human_readable(priv->block_size);
    gf_proc_dump_write("shard-block-size", "%s", str);
    gf_proc_dump_write("inode-count", "%d", priv->inode_count);
    gf_proc_dump_write("ilist_head", "%p", &priv->ilist_head);
    gf_proc_dump_write("lru-max-limit", "%" PRIu64, priv->lru_limit);

    GF_FREE(str);

    return 0;
}

/* releasedir callback: nothing to do. Always returns 0. */
int
shard_releasedir(xlator_t *this, fd_t *fd)
{
    return 0;
}

/* File operations implemented by this translator. */
struct xlator_fops fops = {
    .lookup = shard_lookup,
    .open = shard_open,
    .flush = shard_flush,
    .fsync = shard_fsync,
    .stat = shard_stat,
    .fstat = shard_fstat,
    .getxattr = shard_getxattr,
    .fgetxattr = shard_fgetxattr,
    .readv = shard_readv,
    .writev = shard_writev,
    .truncate = shard_truncate,
    .ftruncate = shard_ftruncate,
    .setxattr = shard_setxattr,
    .fsetxattr = shard_fsetxattr,
    .setattr = shard_setattr,
    .fsetattr = shard_fsetattr,
    .removexattr = shard_removexattr,
    .fremovexattr = shard_fremovexattr,
    .fallocate = shard_fallocate,
    .discard = shard_discard,
    .zerofill = shard_zerofill,
    .readdir = shard_readdir,
    .readdirp = shard_readdirp,
    .create = shard_create,
    .mknod = shard_mknod,
    .link = shard_link,
    .unlink = shard_unlink,
    .rename = shard_rename,
    .seek = shard_seek,
};

/* Inode and fd lifecycle callbacks. */
struct xlator_cbks cbks = {
    .forget = shard_forget,
    .release = shard_release,
    .releasedir = shard_releasedir,
};

/* Statedump hooks. */
struct xlator_dumpops dumpops = {
    .priv = shard_priv_dump,
};

/* Options accepted by this translator; the list ends with a NULL key. */
struct volume_options options[] = {
    {
        .key = {"shard-block-size"},
        .type = GF_OPTION_TYPE_SIZET,
        .op_version = {GD_OP_VERSION_3_7_0},
        .flags = OPT_FLAG_SETTABLE |
OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
        .tags = {"shard"},
        .default_value = "64MB",
        .min = SHARD_MIN_BLOCK_SIZE,
        .max = SHARD_MAX_BLOCK_SIZE,
        .description = "The size unit used to break a file into multiple "
                       "chunks",
    },
    {
        /* Read into priv->deletion_rate by init() and reconfigure(). */
        .key = {"shard-deletion-rate"},
        .type = GF_OPTION_TYPE_INT,
        .op_version = {GD_OP_VERSION_5_0},
        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
        .tags = {"shard"},
        .default_value = "100",
        .min = 100,
        .max = INT_MAX,
        .description = "The number of shards to send deletes on at a time",
    },
    {
        /* Read into priv->lru_limit by init() only; unlike the two options
         * above it does not carry OPT_FLAG_DOC. */
        .key = {"shard-lru-limit"},
        .type = GF_OPTION_TYPE_INT,
        .op_version = {GD_OP_VERSION_5_0},
        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT,
        .tags = {"shard"},
        .default_value = "16384",
        .min = 20,
        .max = INT_MAX,
        .description = "The number of resolved shard inodes to keep in "
                       "memory. A higher number means shards that are "
                       "resolved will remain in memory longer, avoiding "
                       "frequent lookups on them when they participate in "
                       "file operations. The option also has a bearing on "
                       "amount of memory consumed by these inodes and their "
                       "internal metadata",
    },
    /* Terminator entry. */
    {.key = {NULL}},
};