diff options
Diffstat (limited to 'xlators/cluster/afr/src')
| -rw-r--r-- | xlators/cluster/afr/src/afr-dir-write.c | 225 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-read.c | 305 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 85 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 8 | 
4 files changed, 513 insertions, 110 deletions
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index af72c6440b6..faaf75e45b6 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -72,6 +72,40 @@ afr_build_parent_loc (loc_t *parent, loc_t *child)  } +afr_inode_ctx_t * +afr_get_inode_ctx (xlator_t *this, inode_t *inode) +{ +        afr_inode_ctx_t * inode_ctx = NULL; +        uint64_t          ctx; + +        int               ret = 0; + +        LOCK (&inode->lock); +        { +                ret = __inode_ctx_get (inode, this, &ctx); + +                if (ret < 0) { +                        inode_ctx = CALLOC (1, sizeof (afr_inode_ctx_t)); +                         +                        ret = __inode_ctx_put (inode, this, +                                             (uint64_t)(long) inode_ctx); + +                        if (ret < 0) { +                                gf_log (this->name, GF_LOG_ERROR, +                                        "could not set inode ctx"); +                                FREE (inode_ctx); +                                inode_ctx = NULL; +                        } +                } else { +                        inode_ctx = (afr_inode_ctx_t *)(long) ctx; +                } +        } +        UNLOCK (&inode->lock); + +        return inode_ctx; +} + +  /* {{{ create */  int @@ -91,11 +125,13 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this)  	}  	UNLOCK (&frame->lock); -	if (main_frame) +	if (main_frame) {  		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,  				  local->cont.create.fd,  				  local->cont.create.inode,  				  &local->cont.create.buf); +        } +          	return 0;  } @@ -107,6 +143,8 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  {  	afr_local_t *   local = NULL;  	afr_private_t * priv  = NULL; +         +        afr_inode_ctx_t * inode_ctx = NULL;  	int call_count = -1;  	int child_index = -1; @@ -124,14 +162,36 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  		if (op_ret != -1) {  			local->op_ret = op_ret; -			if ((local->success_count == 0) -			    || (child_index == priv->read_child)) { +			if (local->success_count == 0) {  				local->cont.create.buf        = *buf;  				local->cont.create.buf.st_ino =   					afr_itransform (buf->st_ino,  							priv->child_count,  							child_index); + +                                inode_ctx = afr_get_inode_ctx (this, inode); +                                 +                                if (inode_ctx) { +                                        if (priv->read_child >= 0) { +                                                inode_ctx->read_child = priv->read_child; +                                        } else { +                                                inode_ctx->read_child = local->read_child_index; +                                        } +                                }  			} +                         +                        if (child_index == local->read_child_index) { +                                inode_ctx = afr_get_inode_ctx (this, inode); +                                 +                                if (inode_ctx) { +                                        if (priv->read_child >= 0) { +                                                inode_ctx->read_child = priv->read_child; +                                        } else { +                                                inode_ctx->read_child = local->read_child_index; +                                        } +                                } +                        } +  			local->cont.create.inode = inode;  			local->success_count++; @@ -246,6 +306,13 @@ afr_create (call_frame_t *frame, xlator_t *this,  	loc_copy (&local->loc, loc); +        LOCK (&priv->read_child_lock); +        { +                local->read_child_index = (++priv->read_child_rr)  +                        % (priv->child_count); +        } +        UNLOCK (&priv->read_child_lock); +  	local->cont.create.flags = flags;  	local->cont.create.mode  = mode;  	local->cont.create.fd    = fd_ref (fd); @@ -294,10 +361,12 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this)  	}  	UNLOCK (&frame->lock); -	if (main_frame) +	if (main_frame) {  		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,  				  local->cont.mknod.inode,  				  &local->cont.mknod.buf); +        } +  	return 0;  } @@ -310,6 +379,8 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  	afr_local_t *   local = NULL;  	afr_private_t * priv  = NULL; +        afr_inode_ctx_t * inode_ctx = NULL; +  	int call_count = -1;  	int child_index = -1; @@ -326,14 +397,36 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  		if (op_ret != -1) {  			local->op_ret = op_ret; -			if ((local->success_count == 0) -			    || (child_index == priv->read_child)) {	 +			if (local->success_count == 0){  				local->cont.mknod.buf   = *buf;  				local->cont.mknod.buf.st_ino =   					afr_itransform (buf->st_ino,  							priv->child_count,  							child_index); + +                                inode_ctx = afr_get_inode_ctx (this, inode); +                                 +                                if (inode_ctx) { +                                        if (priv->read_child >= 0) { +                                                inode_ctx->read_child = priv->read_child; +                                        } else { +                                                inode_ctx->read_child = local->read_child_index; +                                        } +                                }  			} + +                        if (child_index == local->read_child_index) { +                                inode_ctx = afr_get_inode_ctx (this, inode); +                                 +                                if (inode_ctx) { +                                        if (priv->read_child >= 0) { +                                                inode_ctx->read_child = priv->read_child; +                                        } else { +                                                inode_ctx->read_child = local->read_child_index; +                                        } +                                } +                        } +                          			local->cont.mknod.inode = inode;  			local->success_count++; @@ -444,6 +537,13 @@ afr_mknod (call_frame_t *frame, xlator_t *this,  	loc_copy (&local->loc, loc); +        LOCK (&priv->read_child_lock); +        { +                local->read_child_index = (++priv->read_child_rr)  +                        % (priv->child_count); +        } +        UNLOCK (&priv->read_child_lock); +  	local->cont.mknod.mode  = mode;  	local->cont.mknod.dev   = dev; @@ -492,10 +592,12 @@ afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)  	}  	UNLOCK (&frame->lock); -	if (main_frame) +	if (main_frame) {  		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,  				  local->cont.mkdir.inode,  				  &local->cont.mkdir.buf); +        } +  	return 0;  } @@ -508,6 +610,8 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  	afr_local_t *   local = NULL;  	afr_private_t * priv  = NULL; +        afr_inode_ctx_t * inode_ctx = NULL; +  	int call_count = -1;  	int child_index = -1; @@ -524,13 +628,35 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  		if (op_ret != -1) {  			local->op_ret           = op_ret; -			if ((local->success_count == 0) -			    || (child_index == priv->read_child)) { +			if (local->success_count == 0) {  				local->cont.mkdir.buf   = *buf;  				local->cont.mkdir.buf.st_ino =   					afr_itransform (buf->st_ino, priv->child_count,  							child_index); +                                 +                                inode_ctx = afr_get_inode_ctx (this, inode); +                                 +                                if (inode_ctx) { +                                        if (priv->read_child >= 0) { +                                                inode_ctx->read_child = priv->read_child; +                                        } else { +                                                inode_ctx->read_child = local->read_child_index; +                                        } +                                }  			} +                         +                        if (child_index == local->read_child_index) { +                                inode_ctx = afr_get_inode_ctx (this, inode); +                                 +                                if (inode_ctx) { +                                        if (priv->read_child >= 0) { +                                                inode_ctx->read_child = priv->read_child; +                                        } else { +                                                inode_ctx->read_child = local->read_child_index; +                                        } +                                } +                        } +  			local->cont.mkdir.inode = inode;  			local->success_count++; @@ -642,6 +768,13 @@ afr_mkdir (call_frame_t *frame, xlator_t *this,  	loc_copy (&local->loc, loc); +        LOCK (&priv->read_child_lock); +        { +                local->read_child_index = (++priv->read_child_rr)  +                        % (priv->child_count); +        } +        UNLOCK (&priv->read_child_lock); +  	local->cont.mkdir.mode  = mode;  	local->transaction.fop    = afr_mkdir_wind; @@ -710,6 +843,8 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  	afr_local_t *   local = NULL;  	afr_private_t * priv  = NULL; +        afr_inode_ctx_t * inode_ctx = NULL; +  	int call_count = -1;  	int child_index = -1; @@ -726,13 +861,35 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  		if (op_ret != -1) {  			local->op_ret   = op_ret; -			if ((local->success_count == 0) -			    || (child_index == priv->read_child)) { +			if (local->success_count == 0) {  				local->cont.link.buf        = *buf;  				local->cont.link.buf.st_ino =   					afr_itransform (buf->st_ino, priv->child_count,  							child_index); +                                 +                                inode_ctx = afr_get_inode_ctx (this, inode); +                                 +                                if (inode_ctx) { +                                        if (priv->read_child >= 0) { +                                                inode_ctx->read_child = priv->read_child; +                                        } else { +                                                inode_ctx->read_child = local->read_child_index; +                                        } +                                }  			} +                         +                        if (child_index == local->read_child_index) { +                                inode_ctx = afr_get_inode_ctx (this, inode); +                                 +                                if (inode_ctx) { +                                        if (priv->read_child >= 0) { +                                                inode_ctx->read_child = priv->read_child; +                                        } else { +                                                inode_ctx->read_child = local->read_child_index; +                                        } +                                } +                        } +  			local->cont.link.inode    = inode;  			local->success_count++; @@ -844,6 +1001,13 @@ afr_link (call_frame_t *frame, xlator_t *this,  	loc_copy (&local->loc,    oldloc);  	loc_copy (&local->newloc, newloc); +        LOCK (&priv->read_child_lock); +        { +                local->read_child_index = (++priv->read_child_rr)  +                        % (priv->child_count); +        } +        UNLOCK (&priv->read_child_lock); +  	local->cont.link.ino = oldloc->inode->ino;  	local->transaction.fop    = afr_link_wind; @@ -892,10 +1056,12 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this)  	}  	UNLOCK (&frame->lock); -	if (main_frame) +	if (main_frame) {  		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,  				  local->cont.symlink.inode,  				  &local->cont.symlink.buf); +        } +  	return 0;  } @@ -908,6 +1074,8 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  	afr_local_t *   local = NULL;  	afr_private_t * priv  = NULL; +        afr_inode_ctx_t * inode_ctx = NULL; +  	int call_count = -1;  	int child_index = -1; @@ -924,13 +1092,35 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  		if (op_ret != -1) {  			local->op_ret   = op_ret; -			if ((local->success_count == 0) -			    || (child_index == priv->read_child)) { +			if (local->success_count == 0) {  				local->cont.symlink.buf        = *buf;  				local->cont.symlink.buf.st_ino =   					afr_itransform (buf->st_ino, priv->child_count,  							child_index); +                                 +                                inode_ctx = afr_get_inode_ctx (this, inode); +                                 +                                if (inode_ctx) { +                                        if (priv->read_child >= 0) { +                                                inode_ctx->read_child = priv->read_child; +                                        } else { +                                                inode_ctx->read_child = local->read_child_index; +                                        } +                                }  			} + +                        if (child_index == local->read_child_index) { +                                inode_ctx = afr_get_inode_ctx (this, inode); +                                 +                                if (inode_ctx) { +                                        if (priv->read_child >= 0) { +                                                inode_ctx->read_child = priv->read_child; +                                        } else { +                                                inode_ctx->read_child = local->read_child_index; +                                        } +                                } +                        } +  			local->cont.symlink.inode    = inode;  			local->success_count++; @@ -1043,6 +1233,13 @@ afr_symlink (call_frame_t *frame, xlator_t *this,  	loc_copy (&local->loc, loc); +        LOCK (&priv->read_child_lock); +        { +                local->read_child_index = (++priv->read_child_rr)  +                        % (priv->child_count); +        } +        UNLOCK (&priv->read_child_lock); +  	local->cont.symlink.ino      = loc->inode->ino;  	local->cont.symlink.linkpath = strdup (linkpath); diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index fd1edc3b593..97b429049c7 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -49,7 +49,7 @@  /**   * Common algorithm for inode read calls: - *  + *   * - Try the fop on the first child that is up   * - if we have failed due to ENOTCONN:   *     try the next child @@ -70,13 +70,17 @@ afr_access_cbk (call_frame_t *frame, void *cookie,  	int unwind     = 1;  	int last_tried = -1;  	int this_try = -1; +        int read_child = -1;  	priv     = this->private;  	children = priv->children;  	local = frame->local; +        read_child = (long) cookie; +  	if (op_ret == -1) { +        retry:  		last_tried = local->cont.access.last_tried;  		if (all_tried (last_tried, priv->child_count)) { @@ -84,11 +88,15 @@ afr_access_cbk (call_frame_t *frame, void *cookie,  		}  		this_try    = ++local->cont.access.last_tried; +                if (this_try == read_child) { +                        goto retry; +                } +  		unwind = 0;  		STACK_WIND_COOKIE (frame, afr_access_cbk, -				   (void *) (long) this_try, -				   children[this_try],  +				   (void *) (long) read_child, +				   children[this_try],  				   children[this_try]->fops->access,  				   &local->loc, local->cont.access.mask);  	} @@ -111,6 +119,10 @@ afr_access (call_frame_t *frame, xlator_t *this,  	int             call_child = 0;  	afr_local_t     *local     = NULL; +        afr_inode_ctx_t * inode_ctx = NULL; +        uint64_t          ctx; +        int               ret       = 0; +  	int32_t op_ret   = -1;  	int32_t op_errno = 0; @@ -125,15 +137,34 @@ afr_access (call_frame_t *frame, xlator_t *this,  	ALLOC_OR_GOTO (local, afr_local_t, out); -	call_child = afr_first_up_child (priv); -	if (call_child == -1) { -		op_errno = ENOTCONN; -		gf_log (this->name, GF_LOG_ERROR, -			"no child is up :("); -		goto out; -	} +        ret = inode_ctx_get (loc->inode, this, +                             &ctx); +        if (ret < 0) { +                op_errno = EINVAL; +                gf_log (this->name, GF_LOG_ERROR, +                        "inode ctx not set!"); +                goto out; +        } + +        inode_ctx = (afr_inode_ctx_t *)(long) ctx; + +        if (inode_ctx->read_child >= 0) { +                call_child = inode_ctx->read_child; + +                local->cont.access.last_tried = -1; + +        } else { +                call_child = afr_first_up_child (priv); +                if (call_child == -1) { +                        op_errno = ENOTCONN; +                        gf_log (this->name, GF_LOG_ERROR, +                                "no child is up :("); +                        goto out; +                } + +                local->cont.access.last_tried = call_child; +        } -	local->cont.access.last_tried = call_child;  	loc_copy (&local->loc, loc);  	local->cont.access.mask       = mask; @@ -164,16 +195,15 @@ afr_stat_cbk (call_frame_t *frame, void *cookie,  	afr_local_t *   local    = NULL;  	xlator_t **     children = NULL; -	int deitransform_child = -1; -  	int unwind     = 1;  	int last_tried = -1;  	int this_try = -1; +	int read_child = -1;  	priv     = this->private;  	children = priv->children; -	deitransform_child = (long) cookie; +	read_child = (long) cookie;  	local = frame->local; @@ -186,15 +216,15 @@ afr_stat_cbk (call_frame_t *frame, void *cookie,  		}  		this_try = ++local->cont.stat.last_tried; -		if (this_try == deitransform_child) { +		if (this_try == read_child) {  			goto retry;  		}  		unwind = 0;  		STACK_WIND_COOKIE (frame, afr_stat_cbk, -				   (void *) (long) deitransform_child, -				   children[this_try],  +				   (void *) (long) read_child, +				   children[this_try],  				   children[this_try]->fops->stat,  				   &local->loc);  	} @@ -219,6 +249,10 @@ afr_stat (call_frame_t *frame, xlator_t *this,  	afr_local_t   * local      = NULL;  	xlator_t **     children   = NULL; +        afr_inode_ctx_t * inode_ctx = NULL; +        uint64_t          ctx; +        int               ret       = 0; +  	int             call_child = 0;  	int32_t         op_ret     = -1; @@ -237,14 +271,36 @@ afr_stat (call_frame_t *frame, xlator_t *this,  	frame->local = local; -	call_child = afr_deitransform (loc->inode->ino, priv->child_count); +        ret = inode_ctx_get (loc->inode, this, +                             &ctx); +        if (ret < 0) { +                op_errno = EINVAL; +                gf_log (this->name, GF_LOG_ERROR, +                        "inode ctx not set!"); +                goto out; +        } + +        inode_ctx = (afr_inode_ctx_t *)(long) ctx; + +        if (inode_ctx->read_child >= 0) { +                call_child = inode_ctx->read_child; + +                local->cont.stat.last_tried = -1; + +        } else { +		call_child = afr_first_up_child (priv); +		if (call_child == -1) { +			op_errno = ENOTCONN; +			gf_log (this->name, GF_LOG_ERROR, +				"no child is up :("); +			goto out; +		} + +		local->cont.stat.last_tried = call_child; +	} +  	loc_copy (&local->loc, loc); -	/*  -	   if stat fails from the deitranform'd child, we try -	   all children starting with the first one -	*/ -	local->cont.stat.last_tried = -1;  	local->cont.stat.ino = loc->inode->ino;  	STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, @@ -275,19 +331,18 @@ afr_fstat_cbk (call_frame_t *frame, void *cookie,  	afr_local_t *   local    = NULL;  	xlator_t **     children = NULL; -	int deitransform_child = -1; -  	int unwind     = 1;  	int last_tried = -1;  	int this_try = -1; +        int read_child = -1;  	priv     = this->private;  	children = priv->children; -	deitransform_child = (long) cookie; -  	local = frame->local; +	read_child = (long) cookie; +  	if (op_ret == -1) {  	retry:  		last_tried = local->cont.fstat.last_tried; @@ -297,20 +352,15 @@ afr_fstat_cbk (call_frame_t *frame, void *cookie,  		}  		this_try   = ++local->cont.fstat.last_tried; -		if (this_try == deitransform_child) { -			/*  -			   skip the deitransform'd child since if we are here -			   we must have already tried that child -			*/ +		if (this_try == read_child) {  			goto retry;  		} -	         		unwind = 0;  		STACK_WIND_COOKIE (frame, afr_fstat_cbk, -				   (void *) (long) deitransform_child, -				   children[this_try],  +				   (void *) (long) read_child, +				   children[this_try],  				   children[this_try]->fops->fstat,  				   local->fd);  	} @@ -337,6 +387,10 @@ afr_fstat (call_frame_t *frame, xlator_t *this,  	int             call_child = 0; +        afr_inode_ctx_t * inode_ctx = NULL; +        uint64_t          ctx; +        int               ret       = 0; +  	int32_t         op_ret     = -1;  	int32_t         op_errno   = 0; @@ -356,13 +410,35 @@ afr_fstat (call_frame_t *frame, xlator_t *this,  	VALIDATE_OR_GOTO (fd->inode, out); -	call_child = afr_deitransform (fd->inode->ino, priv->child_count); +        ret = inode_ctx_get (fd->inode, this, +                             &ctx); + +        if (ret < 0) { +                op_errno = EINVAL; +                gf_log (this->name, GF_LOG_ERROR, +                        "inode ctx not set!"); +                goto out; +        } + +        inode_ctx = (afr_inode_ctx_t *)(long) ctx; + +        if (inode_ctx->read_child >= 0) { +                call_child = inode_ctx->read_child; + +                local->cont.fstat.last_tried = -1; +        } else { +                call_child = afr_first_up_child (priv); + +                if (call_child == -1) { +                        op_errno = ENOTCONN; +                        gf_log (this->name, GF_LOG_ERROR, +				"no child is up :("); +			goto out; +		} + +                local->cont.fstat.last_tried = call_child; +        } -	/*  -	   if fstat fails from the deitranform'd child, we try -	   all children starting with the first one -	*/ -	local->cont.fstat.last_tried = -1;  	local->cont.fstat.ino = fd->inode->ino;  	local->fd = fd_ref (fd); @@ -396,13 +472,17 @@ afr_readlink_cbk (call_frame_t *frame, void *cookie,  	int unwind     = 1;  	int last_tried = -1;  	int this_try = -1; +        int read_child = -1;  	priv     = this->private;  	children = priv->children;  	local = frame->local; +        read_child = (long) cookie; +  	if (op_ret == -1) { +        retry:  		last_tried = local->cont.readlink.last_tried;  		if (all_tried (last_tried, priv->child_count)) { @@ -410,10 +490,14 @@ afr_readlink_cbk (call_frame_t *frame, void *cookie,  		}  		this_try = ++local->cont.readlink.last_tried; +                if (this_try == read_child) { +                        goto retry; +                } +  		unwind = 0;  		STACK_WIND_COOKIE (frame, afr_readlink_cbk, -				   (void *) (long) this_try, -				   children[this_try],  +				   (void *) (long) read_child, +				   children[this_try],  				   children[this_try]->fops->readlink,  				   &local->loc,  				   local->cont.readlink.size); @@ -437,6 +521,10 @@ afr_readlink (call_frame_t *frame, xlator_t *this,  	int             call_child = 0;  	afr_local_t     *local     = NULL; +        afr_inode_ctx_t * inode_ctx = NULL; +        uint64_t          ctx; +        int               ret       = 0; +  	int32_t op_ret   = -1;  	int32_t op_errno = 0; @@ -453,15 +541,35 @@ afr_readlink (call_frame_t *frame, xlator_t *this,  	frame->local = local; -	call_child = afr_first_up_child (priv); -	if (call_child == -1) { -		op_errno = ENOTCONN; -		gf_log (this->name, GF_LOG_ERROR, -			"no child is up :("); -		goto out; -	} +        ret = inode_ctx_get (loc->inode, this, +                             &ctx); +        if (ret < 0) { +                op_errno = EINVAL; +                gf_log (this->name, GF_LOG_ERROR, +                        "inode ctx not set!"); +                goto out; +        } + +        inode_ctx = (afr_inode_ctx_t *)(long) ctx; + +        if (inode_ctx->read_child >= 0) { +                call_child = inode_ctx->read_child; + +                local->cont.readlink.last_tried = -1; + +        } else { +                call_child = afr_first_up_child (priv); + +                if (call_child == -1) { +                        op_errno = ENOTCONN; +                        gf_log (this->name, GF_LOG_ERROR, +                                "no child is up :("); +                        goto out; +                } + +                local->cont.readlink.last_tried = call_child; +        } -	local->cont.readlink.last_tried = call_child;  	loc_copy (&local->loc, loc);  	local->cont.readlink.size       = size; @@ -495,13 +603,17 @@ afr_getxattr_cbk (call_frame_t *frame, void *cookie,  	int unwind     = 1;  	int last_tried = -1;  	int this_try = -1; +        int read_child = -1;  	priv     = this->private;  	children = priv->children;  	local = frame->local; +        read_child = (long) cookie; +  	if (op_ret == -1) { +        retry:  		last_tried = local->cont.getxattr.last_tried;  		if (all_tried (last_tried, priv->child_count)) { @@ -509,10 +621,14 @@ afr_getxattr_cbk (call_frame_t *frame, void *cookie,  		}  		this_try = ++local->cont.getxattr.last_tried; +                if (this_try == read_child) { +                        goto retry; +                } +  		unwind = 0;  		STACK_WIND_COOKIE (frame, afr_getxattr_cbk, -				   (void *) (long) this_try, -				   children[this_try],  +				   (void *) (long) read_child, +				   children[this_try],  				   children[this_try]->fops->getxattr,  				   &local->loc,  				   local->cont.getxattr.name); @@ -536,6 +652,10 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,  	int               call_child = 0;  	afr_local_t     * local      = NULL; +        afr_inode_ctx_t * inode_ctx  = NULL; +        uint64_t          ctx; +        int               ret        = 0; +  	int32_t op_ret   = -1;  	int32_t op_errno = 0; @@ -551,15 +671,34 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,  	ALLOC_OR_GOTO (local, afr_local_t, out);  	frame->local = local; -	call_child = afr_first_up_child (priv); -	if (call_child == -1) { -		op_errno = ENOTCONN; -		gf_log (this->name, GF_LOG_ERROR, -			"no child is up :("); -		goto out; -	} +        ret = inode_ctx_get (loc->inode, this, &ctx); + +        if (ret < 0) { +                op_errno = EINVAL; +                gf_log (this->name, GF_LOG_ERROR, +                        "inode ctx not set!"); +                goto out; +        } + +        inode_ctx = (afr_inode_ctx_t *)(long) ctx; + +        if (inode_ctx->read_child >= 0) { +                call_child = inode_ctx->read_child; + +                local->cont.getxattr.last_tried = -1; +        } else { +                call_child = afr_first_up_child (priv); + +                if (call_child == -1) { +                        op_errno = ENOTCONN; +                        gf_log (this->name, GF_LOG_ERROR, +                                "no child is up :("); +                        goto out; +                } + +                local->cont.getxattr.last_tried = call_child; +        } -	local->cont.getxattr.last_tried = call_child;  	loc_copy (&local->loc, loc);  	if (name)  	  local->cont.getxattr.name       = strdup (name); @@ -584,7 +723,7 @@ out:  /**   * read algorithm: - *  + *   * if the user has specified a read subvolume, use it   * otherwise -   *   use the inode number to hash it to one of the subvolumes, and @@ -593,7 +732,7 @@ out:   * if any of the above read's fail, try the children in sequence   * beginning at the beginning   */ -  +  int32_t  afr_readv_cbk (call_frame_t *frame, void *cookie,  	       xlator_t *this, int32_t op_ret, int32_t op_errno, @@ -605,7 +744,8 @@ afr_readv_cbk (call_frame_t *frame, void *cookie,  	int unwind     = 1;  	int last_tried = -1; -	int this_try = -1; +	int this_try   = -1; +        int read_child = -1;  	VALIDATE_OR_GOTO (frame, out);  	VALIDATE_OR_GOTO (this, out); @@ -618,6 +758,8 @@ afr_readv_cbk (call_frame_t *frame, void *cookie,  	local = frame->local; +        read_child = (long) cookie; +  	if (op_ret == -1) {  	retry:  		last_tried = local->cont.readv.last_tried; @@ -627,8 +769,8 @@ afr_readv_cbk (call_frame_t *frame, void *cookie,  		}  		this_try = ++local->cont.readv.last_tried; -		if (this_try == priv->read_child) { -			/*  +		if (this_try == read_child) { +			/*  			   skip the read child since if we are here  			   we must have already tried that child  			*/ @@ -638,8 +780,8 @@ afr_readv_cbk (call_frame_t *frame, void *cookie,  		unwind = 0;  		STACK_WIND_COOKIE (frame, afr_readv_cbk, -				   (void *) (long) this_try, -				   children[this_try],  +				   (void *) (long) read_child, +				   children[this_try],  				   children[this_try]->fops->readv,  				   local->fd, local->cont.readv.size,  				   local->cont.readv.offset); @@ -662,6 +804,10 @@ afr_readv (call_frame_t *frame, xlator_t *this,  	afr_local_t   * local      = NULL;  	xlator_t **     children   = NULL; +        afr_inode_ctx_t * inode_ctx = NULL; +        uint64_t          ctx; +        int               ret       = 0; +  	int             call_child = 0;  	int32_t         op_ret     = -1; @@ -679,15 +825,28 @@ afr_readv (call_frame_t *frame, xlator_t *this,  	frame->local = local; -	if (priv->read_child != -1) { -		call_child = priv->read_child; +        ret = inode_ctx_get (fd->inode, this, +                             &ctx); -		/*  +        if (ret < 0) { +                op_errno = EINVAL; +                gf_log (this->name, GF_LOG_ERROR, +                        "inode ctx not set!"); +                goto out; +        } + +        inode_ctx = (afr_inode_ctx_t *)(long) ctx; + +        if (inode_ctx->read_child >= 0) { +                call_child = inode_ctx->read_child; + +		/*  		   if read fails from the read child, we try  		   all children starting with the first one  		*/ -		local->cont.readv.last_tried = -1; -	} else { +                local->cont.readv.last_tried = -1; + +        } else {  		call_child = afr_first_up_child (priv);  		if (call_child == -1) {  			op_errno = ENOTCONN; diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 4ae302deb7c..e3526087ae7 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -345,10 +345,13 @@ afr_lookup_cbk (call_frame_t *frame, void *cookie,  	struct stat *   lookup_buf = NULL;  	int             call_count = -1;  	int             child_index = -1; -	int             prev_child_index = -1; +  	uint32_t        open_fd_count = 0;  	int             ret = 0; +        afr_inode_ctx_t * inode_ctx = NULL; +        uint64_t          ctx; +  	child_index = (long) cookie;  	priv = this->private; @@ -409,7 +412,52 @@ afr_lookup_cbk (call_frame_t *frame, void *cookie,  			lookup_buf->st_ino = afr_itransform (buf->st_ino,  							     priv->child_count,  							     child_index); + +                        ret = inode_ctx_get (local->cont.lookup.inode, this, +                                             &ctx); + +                        inode_ctx = (afr_inode_ctx_t *)(long) ctx; + +                        if (priv->read_child >= 0) { +                                inode_ctx->read_child = priv->read_child; +                        } else { +                                inode_ctx->read_child = child_index; +                        } +  		} else { +                        if ((local->op_ret == 0) +                            && (child_index == local->read_child_index)) { +                                 +                                /*  +                                   lookup has succeeded on the read child. +                                   So use its inode number +                                */ + +                                local->op_ret = op_ret; + +                                if (local->cont.lookup.xattr) +                                        dict_unref (local->cont.lookup.xattr); +                                 +                                local->cont.lookup.inode = inode; +                                local->cont.lookup.xattr = dict_ref (xattr); + +                                *lookup_buf = *buf; +                                lookup_buf->st_ino = afr_itransform (buf->st_ino, +                                                                     priv->child_count, +                                                                     child_index); + +                                ret = inode_ctx_get (local->cont.lookup.inode, this, +                                                     &ctx); + +                                inode_ctx = (afr_inode_ctx_t *)(long) ctx; + +                                if (priv->read_child >= 0) { +                                        inode_ctx->read_child = priv->read_child; +                                } else { +                                        inode_ctx->read_child = local->read_child_index; +                                } +                        } +  			if (FILETYPE_DIFFERS (buf, lookup_buf)) {  				/* mismatching filetypes with same name  				   -- Govinda !! GOvinda !!! @@ -431,15 +479,6 @@ afr_lookup_cbk (call_frame_t *frame, void *cookie,  			    && S_ISREG (buf->st_mode)) {  				local->need_data_self_heal = 1;  			} - -			prev_child_index = afr_deitransform_orig (lookup_buf->st_ino,  -								  priv->child_count); -			if (child_index < prev_child_index) { -				*lookup_buf = *buf; -				lookup_buf->st_ino = afr_itransform (buf->st_ino, -								     priv->child_count, -								     child_index); -			}  		}  		local->success_count++; @@ -465,9 +504,13 @@ unlock:  		}  		if (local->success_count) { -			/* check for govinda_gOvinda case in previous lookup */ -			if (!inode_ctx_get (local->cont.lookup.inode,  -					   this, NULL)) +			/* check for split-brain case in previous lookup */ +                        ret = inode_ctx_get (local->cont.lookup.inode, this, +                                             &ctx); + +                        inode_ctx = (afr_inode_ctx_t *)(long) ctx; + +			if (inode_ctx->split_brain)  				local->need_data_self_heal = 1;  		} @@ -544,7 +587,12 @@ afr_lookup (call_frame_t *frame, xlator_t *this,                  }          } -	local->reval_child_index = 0; +        LOCK (&priv->read_child_lock); +        { +                local->read_child_index = (++priv->read_child_rr)  +                        % (priv->child_count); +        } +        UNLOCK (&priv->read_child_lock);  	local->call_count = priv->child_count; @@ -2187,17 +2235,12 @@ init (xlator_t *this)  		trav = trav->next;  	} -	/* XXX: return inode numbers from 1st subvolume till -	   afr supports read-subvolume based on inode's ctx  -	   (and not itransform) for this reason afr_deitransform()  -	   returns 0 always -	*/ -	priv->read_child = 0; -  	priv->wait_count = 1;  	priv->child_count = child_count; +  	LOCK_INIT (&priv->lock); +        LOCK_INIT (&priv->read_child_lock);  	priv->child_up = CALLOC (sizeof (unsigned char), child_count);  	if (!priv->child_up) { diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 5db6e98092a..a447b74f47b 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -35,6 +35,9 @@ typedef struct _afr_private {  	gf_lock_t lock;               /* to guard access to child_count, etc */  	unsigned int child_count;     /* total number of children   */ +        unsigned int read_child_rr;   /* round-robin index of the read_child */ +        gf_lock_t read_child_lock;    /* lock to protect above */ +          	xlator_t **children;  	unsigned char *child_up; @@ -48,7 +51,7 @@ typedef struct _afr_private {  	gf_boolean_t metadata_change_log;   /* on/off */  	gf_boolean_t entry_change_log;      /* on/off */ -	unsigned int read_child;      /* read-subvolume */ +	int read_child;               /* read-subvolume */  	unsigned int favorite_child;  /* subvolume to be preferred in resolving  					 split-brain cases */ @@ -110,7 +113,8 @@ typedef struct _afr_local {  	unsigned int need_data_self_heal;  	unsigned int govinda_gOvinda; -	unsigned int reval_child_index; +	unsigned int read_child_index; +          	int32_t op_ret;  	int32_t op_errno;  | 
